diff options
Diffstat (limited to 'drivers/staging/lustre/lustre/include')
76 files changed, 33997 insertions, 0 deletions
diff --git a/drivers/staging/lustre/lustre/include/cl_object.h b/drivers/staging/lustre/lustre/include/cl_object.h new file mode 100644 index 000000000000..4bb68801d3a9 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/cl_object.h @@ -0,0 +1,3279 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#ifndef _LUSTRE_CL_OBJECT_H +#define _LUSTRE_CL_OBJECT_H + +/** \defgroup clio clio + * + * Client objects implement io operations and cache pages. + * + * Examples: lov and osc are implementations of cl interface. + * + * Big Theory Statement. + * + * Layered objects. 
+ * + * Client implementation is based on the following data-types: + * + * - cl_object + * + * - cl_page + * + * - cl_lock represents an extent lock on an object. + * + * - cl_io represents high-level i/o activity such as whole read/write + * system call, or write-out of pages from under the lock being + * canceled. cl_io has sub-ios that can be stopped and resumed + * independently, thus achieving high degree of transfer + * parallelism. Single cl_io can be advanced forward by + * the multiple threads (although in the most usual case of + * read/write system call it is associated with the single user + * thread, that issued the system call). + * + * - cl_req represents a collection of pages for a transfer. cl_req is + * constructed by req-forming engine that tries to saturate + * transport with large and continuous transfers. + * + * Terminology + * + * - to avoid confusion high-level I/O operation like read or write system + * call is referred to as "an io", whereas low-level I/O operation, like + * RPC, is referred to as "a transfer" + * + * - "generic code" means generic (not file system specific) code in the + * hosting environment. "cl-code" means code (mostly in cl_*.c files) that + * is not layer specific. + * + * Locking. + * + * - i_mutex + * - PG_locked + * - cl_object_header::coh_page_guard + * - cl_object_header::coh_lock_guard + * - lu_site::ls_guard + * + * See the top comment in cl_object.c for the description of overall locking and + * reference-counting design. + * + * See comments below for the description of i/o, page, and dlm-locking + * design. + * + * @{ + */ + +/* + * super-class definitions. 
+ */ +#include <lu_object.h> +#include <lvfs.h> +# include <linux/mutex.h> +# include <linux/radix-tree.h> + +struct inode; + +struct cl_device; +struct cl_device_operations; + +struct cl_object; +struct cl_object_page_operations; +struct cl_object_lock_operations; + +struct cl_page; +struct cl_page_slice; +struct cl_lock; +struct cl_lock_slice; + +struct cl_lock_operations; +struct cl_page_operations; + +struct cl_io; +struct cl_io_slice; + +struct cl_req; +struct cl_req_slice; + +/** + * Operations for each data device in the client stack. + * + * \see vvp_cl_ops, lov_cl_ops, lovsub_cl_ops, osc_cl_ops + */ +struct cl_device_operations { + /** + * Initialize cl_req. This method is called top-to-bottom on all + * devices in the stack to give them a chance to allocate layer-private + * data, and to attach them to the cl_req by calling + * cl_req_slice_add(). + * + * \see osc_req_init(), lov_req_init(), lovsub_req_init() + * \see ccc_req_init() + */ + int (*cdo_req_init)(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req); +}; + +/** + * Device in the client stack. + * + * \see ccc_device, lov_device, lovsub_device, osc_device + */ +struct cl_device { + /** Super-class. */ + struct lu_device cd_lu_dev; + /** Per-layer operation vector. */ + const struct cl_device_operations *cd_ops; +}; + +/** \addtogroup cl_object cl_object + * @{ */ +/** + * "Data attributes" of cl_object. Data attributes can be updated + * independently for a sub-object, and top-object's attributes are calculated + * from sub-objects' ones. + */ +struct cl_attr { + /** Object size, in bytes */ + loff_t cat_size; + /** + * Known minimal size, in bytes. + * + * This is only valid when at least one DLM lock is held. + */ + loff_t cat_kms; + /** Modification time. Measured in seconds since epoch. */ + time_t cat_mtime; + /** Access time. Measured in seconds since epoch. */ + time_t cat_atime; + /** Change time. Measured in seconds since epoch. 
*/ + time_t cat_ctime; + /** + * Blocks allocated to this cl_object on the server file system. + * + * \todo XXX An interface for block size is needed. + */ + __u64 cat_blocks; + /** + * User identifier for quota purposes. + */ + uid_t cat_uid; + /** + * Group identifier for quota purposes. + */ + gid_t cat_gid; +}; + +/** + * Fields in cl_attr that are being set. + */ +enum cl_attr_valid { + CAT_SIZE = 1 << 0, + CAT_KMS = 1 << 1, /* NOTE(review): bit 2 is not assigned to any flag here -- confirm it is intentionally unused. */ + CAT_MTIME = 1 << 3, + CAT_ATIME = 1 << 4, + CAT_CTIME = 1 << 5, + CAT_BLOCKS = 1 << 6, + CAT_UID = 1 << 7, + CAT_GID = 1 << 8 +}; + +/** + * Sub-class of lu_object with methods common for objects on the client + * stacks. + * + * cl_object: represents a regular file system object, both a file and a + * stripe. cl_object is based on lu_object: it is identified by a fid, + * layered, cached, hashed, and lrued. Important distinction with the server + * side, where md_object and dt_object are used, is that cl_object "fans out" + * at the lov/sns level: depending on the file layout, single file is + * represented as a set of "sub-objects" (stripes). At the implementation + * level, struct lov_object contains an array of cl_objects. Each sub-object + * is a full-fledged cl_object, having its fid, living in the lru and hash + * table. + * + * This leads to the next important difference with the server side: on the + * client, it's quite usual to have objects with the different sequence of + * layers. For example, typical top-object is composed of the following + * layers: + * + * - vvp + * - lov + * + * whereas its sub-objects are composed of + * + * - lovsub + * - osc + * + * layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep + * track of the object-subobject relationship. + * + * Sub-objects are not cached independently: when top-object is about to + * be discarded from the memory, all its sub-objects are torn-down and + * destroyed too. 
+ * + * \see ccc_object, lov_object, lovsub_object, osc_object + */ +struct cl_object { + /** super class */ + struct lu_object co_lu; + /** per-object-layer operations */ + const struct cl_object_operations *co_ops; + /** offset of page slice in cl_page buffer */ + int co_slice_off; +}; + +/** + * Description of the client object configuration. This is used for the + * creation of a new client object that is identified by a more state than + * fid. + */ +struct cl_object_conf { + /** Super-class. */ + struct lu_object_conf coc_lu; + union { + /** + * Object layout. This is consumed by lov. + */ + struct lustre_md *coc_md; + /** + * Description of particular stripe location in the + * cluster. This is consumed by osc. + */ + struct lov_oinfo *coc_oinfo; + } u; + /** + * VFS inode. This is consumed by vvp. + */ + struct inode *coc_inode; + /** + * Layout lock handle. + */ + struct ldlm_lock *coc_lock; + /** + * Operation to handle layout, OBJECT_CONF_XYZ. + */ + int coc_opc; +}; + +enum { + /** configure layout, set up a new stripe, must be called while + * holding layout lock. */ + OBJECT_CONF_SET = 0, + /** invalidate the current stripe configuration due to losing + * layout lock. */ + OBJECT_CONF_INVALIDATE = 1, + /** wait for old layout to go away so that new layout can be + * set up. */ + OBJECT_CONF_WAIT = 2 +}; + +/** + * Operations implemented for each cl object layer. + * + * \see vvp_ops, lov_ops, lovsub_ops, osc_ops + */ +struct cl_object_operations { + /** + * Initialize page slice for this layer. Called top-to-bottom through + * every object layer when a new cl_page is instantiated. Layer + * keeping private per-page data, or requiring its own page operations + * vector should allocate these data here, and attach then to the page + * by calling cl_page_slice_add(). \a vmpage is locked (in the VM + * sense). Optional. + * + * \retval NULL success. + * + * \retval ERR_PTR(errno) failure code. 
+ * + * \retval valid-pointer pointer to already existing referenced page + * to be used instead of newly created. + * + * NOTE(review): coo_page_init() is declared to return int, while the + * \retval list for it is written in terms of pointers (NULL, ERR_PTR(), + * valid-pointer); the list looks stale -- confirm the actual convention + * against the callers. + */ + int (*coo_page_init)(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct page *vmpage); + /** + * Initialize lock slice for this layer. Called top-to-bottom through + * every object layer when a new cl_lock is instantiated. Layer + * keeping private per-lock data, or requiring its own lock operations + * vector should allocate these data here, and attach them to the lock + * by calling cl_lock_slice_add(). Mandatory. + */ + int (*coo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + /** + * Initialize io state for a given layer. + * + * Called top-to-bottom once per io existence to initialize io + * state. If layer wants to keep some state for this type of io, it + * has to embed struct cl_io_slice in lu_env::le_ses, and register + * slice with cl_io_slice_add(). It is guaranteed that all threads + * participating in this io share the same session. + */ + int (*coo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + /** + * Fill portion of \a attr that this layer controls. This method is + * called top-to-bottom through all object layers. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return 0: to continue + * \return +ve: to stop iterating through layers (but 0 is returned + * from enclosing cl_object_attr_get()) + * \return -ve: to signal error + */ + int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); + /** + * Update attributes. + * + * \a valid is a bitmask composed from enum #cl_attr_valid, and + * indicating what attributes are to be set. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return the same convention as for + * cl_object_operations::coo_attr_get() is used. 
+ */ + int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); + /** + * Update object configuration. Called top-to-bottom to modify object + * configuration. + * + * XXX error conditions and handling. + */ + int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); + /** + * Glimpse ast. Executed when glimpse ast arrives for a lock on this + * object. Layers are supposed to fill parts of \a lvb that will be + * shipped to the glimpse originator as a glimpse result. + * + * \see ccc_object_glimpse(), lovsub_object_glimpse(), + * \see osc_object_glimpse() + */ + int (*coo_glimpse)(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb); +}; + +/** + * Extended header for client object. + */ +struct cl_object_header { + /** Standard lu_object_header. cl_object::co_lu::lo_header points + * here. */ + struct lu_object_header coh_lu; + /** \name locks + * \todo XXX move locks below to the separate cache-lines, they are + * mostly useless otherwise. + */ + /** @{ */ + /** Lock protecting page tree. */ + spinlock_t coh_page_guard; + /** Lock protecting lock list. */ + spinlock_t coh_lock_guard; + /** @} locks */ + /** Radix tree of cl_page's, cached for this object. */ + struct radix_tree_root coh_tree; + /** # of pages in radix tree. */ + unsigned long coh_pages; + /** List of cl_lock's granted for this object. */ + struct list_head coh_locks; + + /** + * Parent object. It is assumed that an object has a well-defined + * parent, but not a well-defined child (there may be multiple + * sub-objects, for the same top-object). cl_object_header::coh_parent + * field allows certain code to be written generically, without + * limiting possible cl_object layouts unduly. 
+ */ + struct cl_object_header *coh_parent; + /** + * Protects consistency between cl_attr of parent object and + * attributes of sub-objects, that the former is calculated ("merged") + * from. + * + * \todo XXX this can be read/write lock if needed. + */ + spinlock_t coh_attr_guard; + /** + * Size of cl_page + page slices + */ + unsigned short coh_page_bufsize; + /** + * Number of objects above this one: 0 for a top-object, 1 for its + * sub-object, etc. + */ + unsigned char coh_nesting; +}; + +/** + * Helper macro: iterate over all layers of the object \a obj, assigning every + * layer top-to-bottom to \a slice. + */ +#define cl_object_for_each(slice, obj) \ + list_for_each_entry((slice), \ + &(obj)->co_lu.lo_header->loh_layers, \ + co_lu.lo_linkage) +/** + * Helper macro: iterate over all layers of the object \a obj, assigning every + * layer bottom-to-top to \a slice. + */ +#define cl_object_for_each_reverse(slice, obj) \ + list_for_each_entry_reverse((slice), \ + &(obj)->co_lu.lo_header->loh_layers, \ + co_lu.lo_linkage) +/** @} cl_object */ + +#ifndef pgoff_t +#define pgoff_t unsigned long +#endif + +#define CL_PAGE_EOF ((pgoff_t)~0ull) + +/** \addtogroup cl_page cl_page + * @{ */ + +/** \struct cl_page + * Layered client page. + * + * cl_page: represents a portion of a file, cached in the memory. All pages + * of the given file are of the same size, and are kept in the radix tree + * hanging off the cl_object. cl_page doesn't fan out, but as sub-objects + * of the top-level file object are first class cl_objects, they have their + * own radix trees of pages and hence page is implemented as a sequence of + * struct cl_pages's, linked into double-linked list through + * cl_page::cp_parent and cl_page::cp_child pointers, each residing in the + * corresponding radix tree at the corresponding logical offset. + * + * cl_page is associated with VM page of the hosting environment (struct + * page in Linux kernel, for example), struct page. 
It is assumed, that this + * association is implemented by one of cl_page layers (top layer in the + * current design) that + * + * - intercepts per-VM-page call-backs made by the environment (e.g., + * memory pressure), + * + * - translates state (page flag bits) and locking between lustre and + * environment. + * + * The association between cl_page and struct page is immutable and + * established when cl_page is created. + * + * cl_page can be "owned" by a particular cl_io (see below), guaranteeing + * this io an exclusive access to this page w.r.t. other io attempts and + * various events changing page state (such as transfer completion, or + * eviction of the page from the memory). Note, that in general cl_io + * cannot be identified with a particular thread, and page ownership is not + * exactly equal to the current thread holding a lock on the page. Layer + * implementing association between cl_page and struct page has to implement + * ownership on top of available synchronization mechanisms. + * + * While lustre client maintains the notion of an page ownership by io, + * hosting MM/VM usually has its own page concurrency control + * mechanisms. For example, in Linux, page access is synchronized by the + * per-page PG_locked bit-lock, and generic kernel code (generic_file_*()) + * takes care to acquire and release such locks as necessary around the + * calls to the file system methods (->readpage(), ->prepare_write(), + * ->commit_write(), etc.). This leads to the situation when there are two + * different ways to own a page in the client: + * + * - client code explicitly and voluntary owns the page (cl_page_own()); + * + * - VM locks a page and then calls the client, that has "to assume" + * the ownership from the VM (cl_page_assume()). + * + * Dual methods to release ownership are cl_page_disown() and + * cl_page_unassume(). + * + * cl_page is reference counted (cl_page::cp_ref). 
When reference counter + * drops to 0, the page is returned to the cache, unless it is in + * cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * The general logic guaranteeing the absence of "existential races" for + * pages is the following: + * + * - there are fixed known ways for a thread to obtain a new reference + * to a page: + * + * - by doing a lookup in the cl_object radix tree, protected by the + * spin-lock; + * + * - by starting from VM-locked struct page and following some + * hosting environment method (e.g., following ->private pointer in + * the case of Linux kernel), see cl_vmpage_page(); + * + * - when the page enters cl_page_state::CPS_FREEING state, all these + * ways are severed with the proper synchronization + * (cl_page_delete()); + * + * - entry into cl_page_state::CPS_FREEING is serialized by the VM page + * lock; + * + * - no new references to the page in cl_page_state::CPS_FREEING state + * are allowed (checked in cl_page_get()). + * + * Together this guarantees that when the last reference to a + * cl_page_state::CPS_FREEING page is released, it is safe to destroy the + * page, as no references to it exist at that point and no new ones can + * be acquired. + * + * cl_page is a state machine. States are enumerated in enum + * cl_page_state. Possible state transitions are enumerated in + * cl_page_state_set(). State transition process (i.e., actual changing of + * cl_page::cp_state field) is protected by the lock on the underlying VM + * page. + * + * Linux Kernel implementation. + * + * Binding between cl_page and struct page is implemented in the vvp + * layer. cl_page is attached to the + * ->private pointer of the struct page, together with the setting of + * PG_private bit in page->flags, and acquiring additional reference on the + * struct page (much like struct buffer_head, or any similar file system + * private data structures). 
+ * + * PG_locked lock is used to implement both ownership and transfer + * synchronization, that is, page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}} + * states. No additional references are acquired for the duration of the + * transfer. + * + * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where + * write-out is "protected" by the special PG_writeback bit. + */ + +/** + * States of cl_page. cl_page.c assumes particular order here. + * + * The page state machine is rather crude, as it doesn't recognize finer page + * states like "dirty" or "up to date". This is because such states are not + * always well defined for the whole stack (see, for example, the + * implementation of the read-ahead, that hides page up-to-dateness to track + * cache hits accurately). Such sub-states are maintained by the layers that + * are interested in them. + */ +enum cl_page_state { + /** + * Page is in the cache, un-owned. Page leaves cached state in the + * following cases: + * + * - [cl_page_state::CPS_OWNED] io comes across the page and + * owns it; + * + * - [cl_page_state::CPS_PAGEOUT] page is dirty, the + * req-formation engine decides that it wants to include this page + * into a cl_req being constructed, and yanks it from the cache; + * + * - [cl_page_state::CPS_FREEING] VM callback is executed to + * evict the page from the memory; + * + * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL + */ + CPS_CACHED, + /** + * Page is exclusively owned by some cl_io. 
Page may end up in this + * state as a result of + * + * - io creating new page and immediately owning it; + * + * - [cl_page_state::CPS_CACHED] io finding existing cached page + * and owning it; + * + * - [cl_page_state::CPS_OWNED] io finding existing owned page + * and waiting for owner to release the page; + * + * Page leaves owned state in the following cases: + * + * - [cl_page_state::CPS_CACHED] io decides to leave the page in + * the cache, doing nothing; + * + * - [cl_page_state::CPS_PAGEIN] io starts read transfer for + * this page; + * + * - [cl_page_state::CPS_PAGEOUT] io starts immediate write + * transfer for this page; + * + * - [cl_page_state::CPS_FREEING] io decides to destroy this + * page (e.g., as part of truncate or extent lock cancellation). + * + * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL + */ + CPS_OWNED, + /** + * Page is being written out, as a part of a transfer. This state is + * entered when req-formation logic decided that it wants this page to + * be sent through the wire _now_. Specifically, it means that once + * this state is achieved, transfer completion handler (with either + * success or failure indication) is guaranteed to be executed against + * this page independently of any locks and any scheduling decisions + * made by the hosting environment (that effectively means that the + * page is never put into cl_page_state::CPS_PAGEOUT state "in + * advance". This property is mentioned, because it is important when + * reasoning about possible dead-locks in the system). The page can + * enter this state as a result of + * + * - [cl_page_state::CPS_OWNED] an io requesting an immediate + * write-out of this page, or + * + * - [cl_page_state::CPS_CACHED] req-forming engine deciding + * that it has enough dirty pages cached to issue a "good" + * transfer. + * + * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer + * is completed---it is moved into cl_page_state::CPS_CACHED state. 
+ * + * Underlying VM page is locked for the duration of transfer. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL + */ + CPS_PAGEOUT, + /** + * Page is being read in, as a part of a transfer. This is quite + * similar to the cl_page_state::CPS_PAGEOUT state, except that + * read-in is always "immediate"---there is no such thing a sudden + * construction of read cl_req from cached, presumably not up to date, + * pages. + * + * Underlying VM page is locked for the duration of transfer. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL + */ + CPS_PAGEIN, + /** + * Page is being destroyed. This state is entered when client decides + * that page has to be deleted from its host object, as, e.g., a part + * of truncate. + * + * Once this state is reached, there is no way to escape it. + * + * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL + */ + CPS_FREEING, + CPS_NR +}; + +enum cl_page_type { + /** Host page, the page is from the host inode which the cl_page + * belongs to. */ + CPT_CACHEABLE = 1, + + /** Transient page, the transient cl_page is used to bind a cl_page + * to vmpage which is not belonging to the same object of cl_page. + * it is used in DirectIO, lockless IO and liblustre. */ + CPT_TRANSIENT, +}; + +/** + * Flags maintained for every cl_page. + */ +enum cl_page_flags { + /** + * Set when pagein completes. Used for debugging (read completes at + * most once for a page). + */ + CPF_READ_COMPLETED = 1 << 0 +}; + +/** + * Fields are protected by the lock on struct page, except for atomics and + * immutables. + * + * \invariant Data type invariants are in cl_page_invariant(). Basically: + * cl_page::cp_parent and cl_page::cp_child are a well-formed double-linked + * list, consistent with the parent/child pointers in the cl_page::cp_obj and + * cl_page::cp_owner (when set). + */ +struct cl_page { + /** Reference counter. */ + atomic_t cp_ref; + /** An object this page is a part of. 
Immutable after creation. */ + struct cl_object *cp_obj; + /** Logical page index within the object. Immutable after creation. */ + pgoff_t cp_index; + /** List of slices. Immutable after creation. */ + struct list_head cp_layers; + /** Parent page, NULL for top-level page. Immutable after creation. */ + struct cl_page *cp_parent; + /** Lower-layer page. NULL for bottommost page. Immutable after + * creation. */ + struct cl_page *cp_child; + /** + * Page state. This field is const to avoid accidental update, it is + * modified only internally within cl_page.c. Protected by a VM lock. + */ + const enum cl_page_state cp_state; + /** Linkage of pages within group. Protected by cl_page::cp_mutex. */ + struct list_head cp_batch; + /** Mutex serializing membership of a page in a batch. */ + struct mutex cp_mutex; + /** Linkage of pages within cl_req. */ + struct list_head cp_flight; + /** Transfer error. */ + int cp_error; + + /** + * Page type. Only CPT_TRANSIENT is used so far. Immutable after + * creation. + */ + enum cl_page_type cp_type; + + /** + * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned + * by sub-io. Protected by a VM lock. + */ + struct cl_io *cp_owner; + /** + * Debug information, the task is owning the page. + */ + task_t *cp_task; + /** + * Owning IO request in cl_page_state::CPS_PAGEOUT and + * cl_page_state::CPS_PAGEIN states. This field is maintained only in + * the top-level pages. Protected by a VM lock. + */ + struct cl_req *cp_req; + /** List of references to this page, for debugging. */ + struct lu_ref cp_reference; + /** Link to an object, for debugging. */ + struct lu_ref_link *cp_obj_ref; + /** Link to a queue, for debugging. */ + struct lu_ref_link *cp_queue_ref; + /** Per-page flags from enum cl_page_flags. Protected by a VM lock. */ + unsigned cp_flags; + /** Assigned if doing a sync_io */ + struct cl_sync_io *cp_sync_io; +}; + +/** + * Per-layer part of cl_page. 
+ * + * \see ccc_page, lov_page, osc_page + */ +struct cl_page_slice { + struct cl_page *cpl_page; + /** + * Object slice corresponding to this page slice. Immutable after + * creation. + */ + struct cl_object *cpl_obj; + const struct cl_page_operations *cpl_ops; + /** Linkage into cl_page::cp_layers. Immutable after creation. */ + struct list_head cpl_linkage; +}; + +/** + * Lock mode. For the client extent locks. + * + * \warning: cl_lock_mode_match() assumes particular ordering here. + * \ingroup cl_lock + */ +enum cl_lock_mode { + /** + * Mode of a lock that protects no data, and exists only as a + * placeholder. This is used for `glimpse' requests. A phantom lock + * might get promoted to real lock at some point. + */ + CLM_PHANTOM, + CLM_READ, + CLM_WRITE, + CLM_GROUP +}; + +/** + * Requested transfer type. + * \ingroup cl_req + */ +enum cl_req_type { + CRT_READ, + CRT_WRITE, + CRT_NR +}; + +/** + * Per-layer page operations. + * + * Methods taking an \a io argument are for the activity happening in the + * context of given \a io. Page is assumed to be owned by that io, except for + * the obvious cases (like cl_page_operations::cpo_own()). + * + * \see vvp_page_ops, lov_page_ops, osc_page_ops + */ +struct cl_page_operations { + /** + * cl_page<->struct page methods. Only one layer in the stack has to + * implement these. Current code assumes that this functionality is + * provided by the topmost layer, see cl_page_disown0() as an example. + */ + + /** + * \return the underlying VM page. Optional. + */ + struct page *(*cpo_vmpage)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Called when \a io acquires this page into the exclusive + * ownership. When this method returns, it is guaranteed that the is + * not owned by other io, and no transfer is going on against + * it. Optional. 
+ * + * \see cl_page_own() + * \see vvp_page_own(), lov_page_own() + */ + int (*cpo_own)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock); + /** Called when ownership it yielded. Optional. + * + * \see cl_page_disown() + * \see vvp_page_disown() + */ + void (*cpo_disown)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** + * Called for a page that is already "owned" by \a io from VM point of + * view. Optional. + * + * \see cl_page_assume() + * \see vvp_page_assume(), lov_page_assume() + */ + void (*cpo_assume)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** Dual to cl_page_operations::cpo_assume(). Optional. Called + * bottom-to-top when IO releases a page without actually unlocking + * it. + * + * \see cl_page_unassume() + * \see vvp_page_unassume() + */ + void (*cpo_unassume)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Announces whether the page contains valid data or not by \a uptodate. + * + * \see cl_page_export() + * \see vvp_page_export() + */ + void (*cpo_export)(const struct lu_env *env, + const struct cl_page_slice *slice, int uptodate); + /** + * Unmaps page from the user space (if it is mapped). + * + * \see cl_page_unmap() + * \see vvp_page_unmap() + */ + int (*cpo_unmap)(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); + /** + * Checks whether underlying VM page is locked (in the suitable + * sense). Used for assertions. + * + * \retval -EBUSY: page is protected by a lock of a given mode; + * \retval -ENODATA: page is not protected by a lock; + * \retval 0: this layer cannot decide. (Should never happen.) + */ + int (*cpo_is_vmlocked)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Page destruction. + */ + + /** + * Called when page is truncated from the object. Optional. 
+ * + * \see cl_page_discard() + * \see vvp_page_discard(), osc_page_discard() + */ + void (*cpo_discard)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Called when page is removed from the cache, and is about to being + * destroyed. Optional. + * + * \see cl_page_delete() + * \see vvp_page_delete(), osc_page_delete() + */ + void (*cpo_delete)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** Destructor. Frees resources and slice itself. */ + void (*cpo_fini)(const struct lu_env *env, + struct cl_page_slice *slice); + + /** + * Checks whether the page is protected by a cl_lock. This is a + * per-layer method, because certain layers have ways to check for the + * lock much more efficiently than through the generic locks scan, or + * implement locking mechanisms separate from cl_lock, e.g., + * LL_FILE_GROUP_LOCKED in vvp. If \a pending is true, check for locks + * being canceled, or scheduled for cancellation as soon as the last + * user goes away, too. + * + * \retval -EBUSY: page is protected by a lock of a given mode; + * \retval -ENODATA: page is not protected by a lock; + * \retval 0: this layer cannot decide. + * + * \see cl_page_is_under_lock() + */ + int (*cpo_is_under_lock)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + + /** + * Optional debugging helper. Prints given page slice. + * + * \see cl_page_print() + */ + int (*cpo_print)(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t p); + /** + * \name transfer + * + * Transfer methods. See comment on cl_req for a description of + * transfer formation and life-cycle. + * + * @{ + */ + /** + * Request type dependent vector of operations. + * + * Transfer operations depend on transfer mode (cl_req_type). 
To avoid + * passing transfer mode to each and every of these methods, and to + * avoid branching on request type inside of the methods, separate + * methods for cl_req_type:CRT_READ and cl_req_type:CRT_WRITE are + * provided. That is, method invocation usually looks like + * + * slice->cp_ops.io[req->crq_type].cpo_method(env, slice, ...); + */ + struct { + /** + * Called when a page is submitted for a transfer as a part of + * cl_page_list. + * + * \return 0 : page is eligible for submission; + * \return -EALREADY : skip this page; + * \return -ve : error. + * + * \see cl_page_prep() + */ + int (*cpo_prep)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** + * Completion handler. This is guaranteed to be eventually + * fired after cl_page_operations::cpo_prep() or + * cl_page_operations::cpo_make_ready() call. + * + * This method can be called in a non-blocking context. It is + * guaranteed however, that the page involved and its object + * are pinned in memory (and, hence, calling cl_page_put() is + * safe). + * + * \see cl_page_completion() + */ + void (*cpo_completion)(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret); + /** + * Called when cached page is about to be added to the + * cl_req as a part of req formation. + * + * \return 0 : proceed with this page; + * \return -EAGAIN : skip this page; + * \return -ve : error. + * + * \see cl_page_make_ready() + */ + int (*cpo_make_ready)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Announce that this page is to be written out + * opportunistically, that is, page is dirty, it is not + * necessary to start write-out transfer right now, but + * eventually page has to be written out. + * + * Main caller of this is the write path (see + * vvp_io_commit_write()), using this method to build a + * "transfer cache" from which large transfers are then + * constructed by the req-formation engine. 
+ * + * \todo XXX it would make sense to add page-age tracking + * semantics here, and to oblige the req-formation engine to + * send the page out not later than it is too old. + * + * \see cl_page_cache_add() + */ + int (*cpo_cache_add)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + } io[CRT_NR]; + /** + * Tell transfer engine that only [to, from] part of a page should be + * transmitted. + * + * This is used for immediate transfers. + * + * \todo XXX this is not very good interface. It would be much better + * if all transfer parameters were supplied as arguments to + * cl_io_operations::cio_submit() call, but it is not clear how to do + * this for page queues. + * + * \see cl_page_clip() + */ + void (*cpo_clip)(const struct lu_env *env, + const struct cl_page_slice *slice, + int from, int to); + /** + * \pre the page was queued for transferring. + * \post page is removed from client's pending list, or -EBUSY + * is returned if it has already been in transferring. + * + * This is one of seldom page operation which is: + * 0. called from top level; + * 1. don't have vmpage locked; + * 2. every layer should synchronize execution of its ->cpo_cancel() + * with completion handlers. Osc uses client obd lock for this + * purpose. Based on there is no vvp_page_cancel and + * lov_page_cancel(), cpo_cancel is defacto protected by client lock. + * + * \see osc_page_cancel(). + */ + int (*cpo_cancel)(const struct lu_env *env, + const struct cl_page_slice *slice); + /** + * Write out a page by kernel. This is only called by ll_writepage + * right now. + * + * \see cl_page_flush() + */ + int (*cpo_flush)(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); + /** @} transfer */ +}; + +/** + * Helper macro, dumping detailed information about \a page into a log. + */ +#define CL_PAGE_DEBUG(mask, env, page, format, ...) 
\ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + cl_page_print(env, &msgdata, lu_cdebug_printer, page); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Helper macro, dumping shorter information about \a page into a log. + */ +#define CL_PAGE_HEADER(mask, env, page, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + cl_page_header_print(env, &msgdata, lu_cdebug_printer, page); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +static inline int __page_in_use(const struct cl_page *page, int refc) +{ + if (page->cp_type == CPT_CACHEABLE) + ++refc; + LASSERT(atomic_read(&page->cp_ref) > 0); + return (atomic_read(&page->cp_ref) > refc); +} +#define cl_page_in_use(pg) __page_in_use(pg, 1) +#define cl_page_in_use_noref(pg) __page_in_use(pg, 0) + +/** @} cl_page */ + +/** \addtogroup cl_lock cl_lock + * @{ */ +/** \struct cl_lock + * + * Extent locking on the client. + * + * LAYERING + * + * The locking model of the new client code is built around + * + * struct cl_lock + * + * data-type representing an extent lock on a regular file. cl_lock is a + * layered object (much like cl_object and cl_page), it consists of a header + * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to + * cl_lock::cll_layers list through cl_lock_slice::cls_linkage. + * + * All locks for a given object are linked into cl_object_header::coh_locks + * list (protected by cl_object_header::coh_lock_guard spin-lock) through + * cl_lock::cll_linkage. Currently this list is not sorted in any way. We can + * sort it in starting lock offset, or use altogether different data structure + * like a tree. + * + * Typical cl_lock consists of the two layers: + * + * - vvp_lock (vvp specific data), and + * - lov_lock (lov specific data). + * + * lov_lock contains an array of sub-locks. 
Each of these sub-locks is a + * normal cl_lock: it has a header (struct cl_lock) and a list of layers: + * + * - lovsub_lock, and + * - osc_lock + * + * Each sub-lock is associated with a cl_object (representing stripe + * sub-object or the file to which top-level cl_lock is associated), and is + * linked into that cl_object::coh_locks. In this respect cl_lock is similar to + * cl_object (that at lov layer also fans out into multiple sub-objects), and + * is different from cl_page, that doesn't fan out (there is usually exactly + * one osc_page for every vvp_page). We shall call vvp-lov portion of the lock + * a "top-lock" and its lovsub-osc portion a "sub-lock". + * + * LIFE CYCLE + * + * cl_lock is reference counted. When reference counter drops to 0, lock is + * placed in the cache, except when lock is in CLS_FREEING state. CLS_FREEING + * lock is destroyed when last reference is released. Referencing between + * top-lock and its sub-locks is described in the lov documentation module. + * + * STATE MACHINE + * + * Also, cl_lock is a state machine. This requires some clarification. One of + * the goals of client IO re-write was to make IO path non-blocking, or at + * least to make it easier to make it non-blocking in the future. Here + * `non-blocking' means that when a system call (read, write, truncate) + * reaches a situation where it has to wait for a communication with the + * server, it should --instead of waiting-- remember its current state and + * switch to some other work. E.g., instead of waiting for a lock enqueue, + * client should proceed doing IO on the next stripe, etc. Obviously this is + * rather radical redesign, and it is not planned to be fully implemented at + * this time, instead we are putting some infrastructure in place, that would + * make it easier to do asynchronous non-blocking IO in the + * future.
Specifically, where old locking code goes to sleep (waiting for + * enqueue, for example), new code returns cl_lock_transition::CLO_WAIT. When + * enqueue reply comes, its completion handler signals that lock state-machine + * is ready to transit to the next state. There is some generic code in + * cl_lock.c that sleeps, waiting for these signals. As a result, for users of + * this cl_lock.c code, it looks like locking is done in normal blocking + * fashion, and at the same time it is possible to switch to the non-blocking + * locking (simply by returning cl_lock_transition::CLO_WAIT from cl_lock.c + * functions). + * + * For a description of state machine states and transitions see enum + * cl_lock_state. + * + * There are two ways to restrict a set of states which lock might move to: + * + * - placing a "hold" on a lock guarantees that lock will not be moved + * into cl_lock_state::CLS_FREEING state until hold is released. Hold + * can be only acquired on a lock that is not in + * cl_lock_state::CLS_FREEING. All holds on a lock are counted in + * cl_lock::cll_holds. Hold protects lock from cancellation and + * destruction. Requests to cancel and destroy a lock on hold will be + * recorded, but only honored when last hold on a lock is released; + * + * - placing a "user" on a lock guarantees that lock will not leave + * cl_lock_state::CLS_NEW, cl_lock_state::CLS_QUEUING, + * cl_lock_state::CLS_ENQUEUED and cl_lock_state::CLS_HELD set of + * states, once it enters this set. That is, if a user is added onto a + * lock in a state not from this set, it doesn't immediately enforce + * lock to move to this set, but once lock enters this set it will + * remain there until all users are removed. Lock users are counted in + * cl_lock::cll_users. + * + * User is used to assure that lock is not canceled or destroyed while + * it is being enqueued, or actively used by some IO.
+ * + * Currently, a user always comes with a hold (cl_lock_invariant() + * checks that a number of holds is not less than a number of users). + * + * CONCURRENCY + * + * This is how lock state-machine operates. struct cl_lock contains a mutex + * cl_lock::cll_guard that protects struct fields. + * + * - mutex is taken, and cl_lock::cll_state is examined. + * + * - for every state there are possible target states where lock can move + * into. They are tried in order. Attempts to move into next state are + * done by _try() functions in cl_lock.c:cl_{enqueue,unlock,wait}_try(). + * + * - if the transition can be performed immediately, state is changed, + * and mutex is released. + * + * - if the transition requires blocking, _try() function returns + * cl_lock_transition::CLO_WAIT. Caller unlocks mutex and goes to + * sleep, waiting for possibility of lock state change. It is woken + * up when some event occurs, that makes lock state change possible + * (e.g., the reception of the reply from the server), and repeats + * the loop. + * + * Top-lock and sub-lock have separate mutexes and the latter has to be taken + * first to avoid dead-lock. + * + * To see an example of interaction of all these issues, take a look at the + * lov_cl.c:lov_lock_enqueue() function. It is called as a part of + * cl_enqueue_try(), and tries to advance top-lock to ENQUEUED state, by + * advancing state-machines of its sub-locks (lov_lock_enqueue_one()). Note + * also, that it uses trylock to grab sub-lock mutex to avoid dead-lock. It + * also has to handle CEF_ASYNC enqueue, when sub-locks enqueues have to be + * done in parallel, rather than one after another (this is used for glimpse + * locks, that cannot dead-lock). + * + * INTERFACE AND USAGE + * + * struct cl_lock_operations provides a number of call-backs that are invoked + * when events of interest occur. Layers can intercept and handle glimpse, + * blocking, cancel ASTs and a reception of the reply from the server.
+ * + * One important difference with the old client locking model is that new + * client has a representation for the top-lock, whereas in the old code only + * sub-locks existed as real data structures and file-level locks were + * represented by "request sets" that are created and destroyed on each and + * every lock creation. + * + * Top-locks are cached, and can be found in the cache by the system calls. It + * is possible that top-lock is in cache, but some of its sub-locks were + * canceled and destroyed. In that case top-lock has to be enqueued again + * before it can be used. + * + * Overall process of the locking during IO operation is as follows: + * + * - once parameters for IO are set up in cl_io, cl_io_operations::cio_lock() + * is called on each layer. Responsibility of this method is to add locks, + * needed by a given layer into cl_io.ci_lockset. + * + * - once locks for all layers were collected, they are sorted to avoid + * dead-locks (cl_io_locks_sort()), and enqueued. + * + * - when all locks are acquired, IO is performed; + * + * - locks are released into cache. + * + * Striping introduces major additional complexity into locking. The + * fundamental problem is that it is generally unsafe to actively use (hold) + * two locks on the different OST servers at the same time, as this introduces + * inter-server dependency and can lead to cascading evictions. + * + * Basic solution is to sub-divide large read/write IOs into smaller pieces so + * that no multi-stripe locks are taken (note that this design abandons POSIX + * read/write semantics). Such pieces ideally can be executed concurrently. At + * the same time, certain types of IO cannot be sub-divided, without + * sacrificing correctness. This includes: + * + * - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee + * atomicity; + * + * - ftruncate(fd, offset), where [offset, EOF] lock has to be taken.
+ * + * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where + * buf is a part of memory mapped Lustre file, a lock or locks protecting buf + * has to be held together with the usual lock on [offset, offset + count]. + * + * As multi-stripe locks have to be allowed, it makes sense to cache them, so + * that, for example, a sequence of O_APPEND writes can proceed quickly + * without going down to the individual stripes to do lock matching. On the + * other hand, multi-stripe locks shouldn't be used by normal read/write + * calls. To achieve this, every layer can implement ->clo_fits_into() method, + * that is called by lock matching code (cl_lock_lookup()), and that can be + * used to selectively disable matching of certain locks for certain IOs. For + * example, lov layer implements lov_lock_fits_into() that allows multi-stripe + * locks to be matched only for truncates and O_APPEND writes. + * + * Interaction with DLM + * + * In the expected setup, cl_lock is ultimately backed up by a collection of + * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is + * implemented in osc layer, that also matches DLM events (ASTs, cancellation, + * etc.) into cl_lock_operations calls. See struct osc_lock for a more detailed + * description of interaction with DLM. + */ + +/** + * Lock description. + */ +struct cl_lock_descr { + /** Object this lock is granted for. */ + struct cl_object *cld_obj; + /** Index of the first page protected by this lock. */ + pgoff_t cld_start; + /** Index of the last page (inclusive) protected by this lock. */ + pgoff_t cld_end; + /** Group ID, for group lock */ + __u64 cld_gid; + /** Lock mode. */ + enum cl_lock_mode cld_mode; + /** + * flags to enqueue lock. A combination of bit-flags from + * enum cl_enq_flags.
+ */ + __u32 cld_enq_flags; +}; + +#define DDESCR "%s(%d):[%lu, %lu]" +#define PDESCR(descr) \ + cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode, \ + (descr)->cld_start, (descr)->cld_end + +const char *cl_lock_mode_name(const enum cl_lock_mode mode); + +/** + * Lock state-machine states. + * + * \htmlonly + * <pre> + * + * Possible state transitions: + * + * +------------------>NEW + * | | + * | | cl_enqueue_try() + * | | + * | cl_unuse_try() V + * | +--------------QUEUING (*) + * | | | + * | | | cl_enqueue_try() + * | | | + * | | cl_unuse_try() V + * sub-lock | +-------------ENQUEUED (*) + * canceled | | | + * | | | cl_wait_try() + * | | | + * | | (R) + * | | | + * | | V + * | | HELD<---------+ + * | | | | + * | | | | cl_use_try() + * | | cl_unuse_try() | | + * | | | | + * | | V ---+ + * | +------------>INTRANSIT (D) <--+ + * | | | + * | cl_unuse_try() | | cached lock found + * | | | cl_use_try() + * | | | + * | V | + * +------------------CACHED---------+ + * | + * (C) + * | + * V + * FREEING + * + * Legend: + * + * In states marked with (*) transition to the same state (i.e., a loop + * in the diagram) is possible. + * + * (R) is the point where Receive call-back is invoked: it allows layers + * to handle arrival of lock reply. + * + * (C) is the point where Cancellation call-back is invoked. + * + * (D) is the transit state which means the lock is changing. + * + * Transition to FREEING state is possible from any other state in the + * diagram in case of unrecoverable error. + * </pre> + * \endhtmlonly + * + * These states are for individual cl_lock object. Top-lock and its sub-locks + * can be in the different states. Another way to say this is that we have + * nested state-machines. + * + * Separate QUEUING and ENQUEUED states are needed to support non-blocking + * operation for locks with multiple sub-locks. Imagine lock on a file F, that + * intersects 3 stripes S0, S1, and S2. 
To enqueue F client has to send + * enqueue to S0, wait for its completion, then send enqueue for S1, wait for + * its completion and at last enqueue lock for S2, and wait for its + * completion. In that case, top-lock is in QUEUING state while S0, S1 are + * handled, and is in ENQUEUED state after enqueue to S2 has been sent (note + * that in this case, sub-locks move from state to state, and top-lock remains + * in the same state). + */ +enum cl_lock_state { + /** + * Lock that wasn't yet enqueued + */ + CLS_NEW, + /** + * Enqueue is in progress, blocking for some intermediate interaction + * with the other side. + */ + CLS_QUEUING, + /** + * Lock is fully enqueued, waiting for server to reply when it is + * granted. + */ + CLS_ENQUEUED, + /** + * Lock granted, actively used by some IO. + */ + CLS_HELD, + /** + * This state is used to mark that the lock is being used, or unused. + * We need this state because the lock may have several sublocks, + * so it's impossible to have an atomic way to bring all sublocks + * into CLS_HELD state at use case, or all sublocks to CLS_CACHED + * at unuse case. + * If a thread is referring to a lock, and it sees the lock is in this + * state, it must wait for the lock. + * See state diagram for details. + */ + CLS_INTRANSIT, + /** + * Lock granted, not used. + */ + CLS_CACHED, + /** + * Lock is being destroyed. + */ + CLS_FREEING, + /** + * Number of the states listed above (it follows the last state). + * NOTE(review): presumably a count/bound only, not a valid lock + * state -- confirm against users of CLS_NR. + */ + CLS_NR +}; + +/** + * Lock flags, kept as a bit-mask in cl_lock::cll_flags. Each flag occupies + * its own bit. + */ +enum cl_lock_flags { + /** + * lock has been cancelled. This flag is never cleared once set (by + * cl_lock_cancel0()). + */ + CLF_CANCELLED = 1 << 0, + /** cancellation is pending for this lock. */ + CLF_CANCELPEND = 1 << 1, + /** destruction is pending for this lock. */ + CLF_DOOMED = 1 << 2, + /** from enqueue RPC reply upcall. */ + CLF_FROM_UPCALL= 1 << 3, +}; + +/** + * Lock closure. + * + * Lock closure is a collection of locks (both top-locks and sub-locks) that + * might be updated in a result of an operation on a certain lock (which lock + * this is a closure of).
+ * + * Closures are needed to guarantee dead-lock freedom in the presence of + * + * - nested state-machines (top-lock state-machine composed of sub-lock + * state-machines), and + * + * - shared sub-locks. + * + * Specifically, many operations, such as lock enqueue, wait, unlock, + * etc. start from a top-lock, and then operate on a sub-locks of this + * top-lock, holding a top-lock mutex. When sub-lock state changes as a result + * of such operation, this change has to be propagated to all top-locks that + * share this sub-lock. Obviously, no natural lock ordering (e.g., + * top-to-bottom or bottom-to-top) captures this scenario, so try-locking has + * to be used. Lock closure systematizes this try-and-repeat logic. + */ +struct cl_lock_closure { + /** + * Lock that is mutexed when closure construction is started. When + * closure is in `wait' mode (cl_lock_closure::clc_wait), mutex on + * origin is released before waiting. + */ + struct cl_lock *clc_origin; + /** + * List of enclosed locks, so far. Locks are linked here through + * cl_lock::cll_inclosure. + */ + struct list_head clc_list; + /** + * True iff closure is in a `wait' mode. This determines what + * cl_lock_enclosure() does when a lock L to be added to the closure + * is currently mutexed by some other thread. + * + * If cl_lock_closure::clc_wait is not set, then closure construction + * fails with CLO_REPEAT immediately. + * + * In wait mode, cl_lock_enclosure() waits until next attempt to build + * a closure might succeed. To this end it releases an origin mutex + * (cl_lock_closure::clc_origin), that has to be the only lock mutex + * owned by the current thread, and then waits on L mutex (by grabbing + * it and immediately releasing), before returning CLO_REPEAT to the + * caller. + */ + int clc_wait; + /** Number of locks in the closure. */ + int clc_nr; +}; + +/** + * Layered client lock. + */ +struct cl_lock { + /** Reference counter. */ + atomic_t cll_ref; + /** List of slices.
Immutable after creation. */ + struct list_head cll_layers; + /** + * Linkage into cl_lock::cll_descr::cld_obj::coh_locks list. Protected + * by cl_lock::cll_descr::cld_obj::coh_lock_guard. + */ + struct list_head cll_linkage; + /** + * Parameters of this lock. Protected by + * cl_lock::cll_descr::cld_obj::coh_lock_guard nested within + * cl_lock::cll_guard. Modified only on lock creation and in + * cl_lock_modify(). + */ + struct cl_lock_descr cll_descr; + /** Current state of the lock state machine. Protected by cl_lock::cll_guard. */ + enum cl_lock_state cll_state; + /** signals state changes. */ + wait_queue_head_t cll_wq; + /** + * Recursive lock, most fields in cl_lock{} are protected by this. + * + * Locking rules: this mutex is never held across network + * communication, except when lock is being canceled. + * + * Lock ordering: a mutex of a sub-lock is taken first, then a mutex + * on a top-lock. Other direction is implemented through a + * try-lock-repeat loop. Mutexes of unrelated locks can be taken only + * by try-locking. + * + * \see osc_lock_enqueue_wait(), lov_lock_cancel(), lov_sublock_wait(). + */ + struct mutex cll_guard; + /** + * NOTE(review): presumably the task currently holding cll_guard and + * its recursion depth (cll_guard is described as recursive) -- confirm + * against cl_lock.c. + */ + task_t *cll_guarder; + int cll_depth; + + /** + * the owner for INTRANSIT state + */ + task_t *cll_intransit_owner; + /** + * Error set when a state transition method fails with a negative + * value (see cl_lock_operations). + */ + int cll_error; + /** + * Number of holds on a lock. A hold prevents a lock from being + * canceled and destroyed. Protected by cl_lock::cll_guard. + * + * \see cl_lock_hold(), cl_lock_unhold(), cl_lock_release() + */ + int cll_holds; + /** + * Number of lock users. Valid in cl_lock_state::CLS_HELD state + * only. Lock user pins lock in CLS_HELD state. Protected by + * cl_lock::cll_guard. + * + * \see cl_wait(), cl_unuse(). + */ + int cll_users; + /** + * Flag bit-mask. Values from enum cl_lock_flags. Updates are + * protected by cl_lock::cll_guard. + */ + unsigned long cll_flags; + /** + * A linkage into a list of locks in a closure. + * + * \see cl_lock_closure + */ + struct list_head cll_inclosure; + /** + * Conflict lock at queuing time.
+ */ + struct cl_lock *cll_conflict; + /** + * A list of references to this lock, for debugging. + */ + struct lu_ref cll_reference; + /** + * A list of holds on this lock, for debugging. + */ + struct lu_ref cll_holders; + /** + * A reference for cl_lock::cll_descr::cld_obj. For debugging. + */ + struct lu_ref_link *cll_obj_ref; +#ifdef CONFIG_LOCKDEP + /* "dep_map" name is assumed by lockdep.h macros. */ + struct lockdep_map dep_map; +#endif +}; + +/** + * Per-layer part of cl_lock + * + * \see ccc_lock, lov_lock, lovsub_lock, osc_lock + */ +struct cl_lock_slice { + /** NOTE(review): presumably the cl_lock this slice is a layer of -- confirm. */ + struct cl_lock *cls_lock; + /** Object slice corresponding to this lock slice. Immutable after + * creation. */ + struct cl_object *cls_obj; + /** Operations vector of this layer. */ + const struct cl_lock_operations *cls_ops; + /** Linkage into cl_lock::cll_layers. Immutable after creation. */ + struct list_head cls_linkage; +}; + +/** + * Possible (non-error) return values of ->clo_{enqueue,wait,unuse}(). + * + * NOTE: lov_subresult() depends on ordering here. + */ +enum cl_lock_transition { + /** operation cannot be completed immediately. Wait for state change. */ + CLO_WAIT = 1, + /** operation had to release lock mutex, restart. */ + CLO_REPEAT = 2, + /** lower layer re-enqueued. */ + CLO_REENQUEUED = 3, +}; + +/** + * Per-layer call-backs, invoked on lock state transitions and other lock + * events. + * + * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops + */ +struct cl_lock_operations { + /** + * \name statemachine + * + * State machine transitions. These methods are called to transfer + * lock from one state to another, as described in the commentary + * above enum #cl_lock_state. + * + * \retval 0 this layer has nothing more to do before + * transition to the target state happens; + * + * \retval CLO_REPEAT method had to release and re-acquire cl_lock + * mutex, repeat invocation of transition method + * across all layers; + * + * \retval CLO_WAIT this layer cannot move to the target state + * immediately, as it has to wait for certain event + * (e.g., the communication with the server).
It + * is guaranteed, that when the state transfer + * becomes possible, cl_lock::cll_wq wait-queue + * is signaled. Caller can wait for this event by + * calling cl_lock_state_wait(); + * + * \retval -ve failure, abort state transition, move the lock + * into cl_lock_state::CLS_FREEING state, and set + * cl_lock::cll_error. + * + * Once all layers voted to agree to transition (by returning 0), lock + * is moved into corresponding target state. All state transition + * methods are optional. + */ + /** @{ */ + /** + * Attempts to enqueue the lock. Called top-to-bottom. + * + * \see ccc_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(), + * \see osc_lock_enqueue() + */ + int (*clo_enqueue)(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, __u32 enqflags); + /** + * Attempts to wait for enqueue result. Called top-to-bottom. + * + * \see ccc_lock_wait(), lov_lock_wait(), osc_lock_wait() + */ + int (*clo_wait)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Attempts to unlock the lock. Called bottom-to-top. In addition to + * usual return values of lock state-machine methods, this can return + * -ESTALE to indicate that lock cannot be returned to the cache, and + * has to be re-initialized. + * unuse is a one-shot operation, so it must NOT return CLO_WAIT. + * + * \see ccc_lock_unuse(), lov_lock_unuse(), osc_lock_unuse() + */ + int (*clo_unuse)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Notifies layer that cached lock is started being used. + * + * \pre lock->cll_state == CLS_CACHED + * + * \see lov_lock_use(), osc_lock_use() + */ + int (*clo_use)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** @} statemachine */ + /** + * A method invoked when lock state is changed (as a result of state + * transition). This is used, for example, to track when the state of + * a sub-lock changes, to propagate this change to the corresponding + * top-lock.
Optional + * + * \see lovsub_lock_state() + */ + void (*clo_state)(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state st); + /** + * Returns true, iff given lock is suitable for the given io, idea + * being, that there are certain "unsafe" locks, e.g., ones acquired + * for O_APPEND writes, that we don't want to re-use for a normal + * write, to avoid the danger of cascading evictions. Optional. Runs + * under cl_object_header::coh_lock_guard. + * + * XXX this should take more information about lock needed by + * io. Probably lock description or something similar. + * + * \see lov_fits_into() + */ + int (*clo_fits_into)(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io); + /** + * \name ast + * Asynchronous System Traps. All of them are optional, all are + * executed bottom-to-top. + */ + /** @{ */ + + /** + * Cancellation callback. Cancel a lock voluntarily, or under + * the request of server. + */ + void (*clo_cancel)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Lock weighting ast. Executed to estimate how precious this lock + * is. The sum of results across all layers is used to determine + * whether lock is worth keeping in cache given present memory usage. + * + * \see osc_lock_weigh(), vvp_lock_weigh(), lovsub_lock_weigh(). + */ + unsigned long (*clo_weigh)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** @} ast */ + + /** + * NOTE(review): presumably called while a lock closure is being built, + * letting the layer add related locks to \a closure -- confirm. + * + * \see lovsub_lock_closure() + */ + int (*clo_closure)(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_lock_closure *closure); + /** + * Executed bottom-to-top when lock description changes (e.g., as a + * result of server granting more generous lock than was requested).
+ * + * \see lovsub_lock_modify() + */ + int (*clo_modify)(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *updated); + /** + * Notifies layers (bottom-to-top) that lock is going to be + * destroyed. Responsibility of layers is to prevent new references on + * this lock from being acquired once this method returns. + * + * This can be called multiple times due to the races. + * + * \see cl_lock_delete() + * \see osc_lock_delete(), lovsub_lock_delete() + */ + void (*clo_delete)(const struct lu_env *env, + const struct cl_lock_slice *slice); + /** + * Destructor. Frees resources and the slice. + * + * \see ccc_lock_fini(), lov_lock_fini(), lovsub_lock_fini(), + * \see osc_lock_fini() + */ + void (*clo_fini)(const struct lu_env *env, struct cl_lock_slice *slice); + /** + * Optional debugging helper. Prints given lock slice. + */ + int (*clo_print)(const struct lu_env *env, + void *cookie, lu_printer_t p, + const struct cl_lock_slice *slice); +}; + +/** + * Helper macro, printing \a lock (through cl_lock_print()) and then a + * formatted message into a log, when \a mask is enabled for DEBUG_SUBSYSTEM. + */ +#define CL_LOCK_DEBUG(mask, env, lock, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + cl_lock_print(env, &msgdata, lu_cdebug_printer, lock); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Asserts that \a expr holds. On failure, dumps \a lock together with the + * text of the failed expression at D_ERROR level and calls LBUG(). + */ +#define CL_LOCK_ASSERT(expr, env, lock) do { \ + if (likely(expr)) \ + break; \ + \ + CL_LOCK_DEBUG(D_ERROR, env, lock, "failed at %s.\n", #expr); \ + LBUG(); \ +} while (0) + +/** @} cl_lock */ + +/** \addtogroup cl_page_list cl_page_list + * Page list used to perform collective operations on a group of pages. + * + * Pages are added to the list one by one. cl_page_list acquires a reference + * for every page in it. Page list is used to perform collective operations on + * pages: + * + * - submit pages for an immediate transfer, + * + * - own pages on behalf of certain io (waiting for each page in turn), + * + * - discard pages.
+ * + * When list is finalized, it releases references on all pages it still has. + * + * \todo XXX concurrency control. + * + * @{ + */ +struct cl_page_list { + /** NOTE(review): presumably the number of pages linked on pl_pages -- confirm. */ + unsigned pl_nr; + /** Head of the list of pages held by this page list. */ + struct list_head pl_pages; + /** NOTE(review): presumably the task that owns this list -- confirm. */ + task_t *pl_owner; +}; + +/** + * A 2-queue of pages. A convenience data-type for common use case, 2-queue + * contains an incoming page list and an outgoing page list. + */ +struct cl_2queue { + /** Incoming page list. */ + struct cl_page_list c2_qin; + /** Outgoing page list. */ + struct cl_page_list c2_qout; +}; + +/** @} cl_page_list */ + +/** \addtogroup cl_io cl_io + * @{ */ +/** \struct cl_io + * I/O + * + * cl_io represents a high level I/O activity like + * read(2)/write(2)/truncate(2) system call, or cancellation of an extent + * lock. + * + * cl_io is a layered object, much like cl_{object,page,lock} but with one + * important distinction. We want to minimize number of calls to the allocator + * in the fast path, e.g., in the case of read(2) when everything is cached: + * client already owns the lock over region being read, and data are cached + * due to read-ahead. To avoid allocation of cl_io layers in such situations, + * per-layer io state is stored in the session, associated with the io, see + * struct {vvp,lov,osc}_io for example. Sessions allocation is amortized + * by using free-lists, see cl_env_get(). + * + * There is a small predefined number of possible io types, enumerated in enum + * cl_io_type. + * + * cl_io is a state machine, that can be advanced concurrently by the multiple + * threads. It is up to these threads to control the concurrency and, + * specifically, to detect when io is done, and its state can be safely + * released.
+ * + * For read/write io overall execution plan is as following: + * + * (0) initialize io state through all layers; + * + * (1) loop: prepare chunk of work to do + * + * (2) call all layers to collect locks they need to process current chunk + * + * (3) sort all locks to avoid dead-locks, and acquire them + * + * (4) process the chunk: call per-page methods + * (cl_io_operations::cio_read_page() for read, + * cl_io_operations::cio_prepare_write(), + * cl_io_operations::cio_commit_write() for write) + * + * (5) release locks + * + * (6) repeat loop. + * + * To implement the "parallel IO mode", lov layer creates sub-io's (lazily to + * address allocation efficiency issues mentioned above), and returns with the + * special error condition from per-page method when current sub-io has to + * block. This causes io loop to be repeated, and lov switches to the next + * sub-io in its cl_io_operations::cio_iter_init() implementation. + */ + +/** IO types */ +enum cl_io_type { + /** read system call */ + CIT_READ, + /** write system call */ + CIT_WRITE, + /** truncate, utime system calls */ + CIT_SETATTR, + /** + * page fault handling + */ + CIT_FAULT, + /** + * fsync system call handling + * To write out a range of file + */ + CIT_FSYNC, + /** + * Miscellaneous io. This is used for occasional io activity that + * doesn't fit into other types. Currently this is used for: + * + * - cancellation of an extent lock. This io exists as a context + * to write dirty pages from under the lock being canceled back + * to the server; + * + * - VM induced page write-out. An io context for writing page out + * for memory cleansing; + * + * - glimpse. An io context to acquire glimpse lock. + * + * - grouplock. An io context to acquire group lock. + * + * CIT_MISC io is used simply as a context in which locks and pages + * are manipulated. Such io has no internal "process", that is, + * cl_io_loop() is never called for it. 
+ */ + CIT_MISC, + CIT_OP_NR +}; + +/** + * States of cl_io state machine + */ +enum cl_io_state { + /** Not initialized. */ + CIS_ZERO, + /** Initialized. */ + CIS_INIT, + /** IO iteration started. */ + CIS_IT_STARTED, + /** Locks taken. */ + CIS_LOCKED, + /** Actual IO is in progress. */ + CIS_IO_GOING, + /** IO for the current iteration finished. */ + CIS_IO_FINISHED, + /** Locks released. */ + CIS_UNLOCKED, + /** Iteration completed. */ + CIS_IT_ENDED, + /** cl_io finalized. */ + CIS_FINI +}; + +/** + * IO state private for a layer. + * + * This is usually embedded into layer session data, rather than allocated + * dynamically. + * + * \see vvp_io, lov_io, osc_io, ccc_io + */ +struct cl_io_slice { + struct cl_io *cis_io; + /** corresponding object slice. Immutable after creation. */ + struct cl_object *cis_obj; + /** io operations. Immutable after creation. */ + const struct cl_io_operations *cis_iop; + /** + * linkage into a list of all slices for a given cl_io, hanging off + * cl_io::ci_layers. Immutable after creation. + */ + struct list_head cis_linkage; +}; + + +/** + * Per-layer io operations. + * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops + */ +struct cl_io_operations { + /** + * Vector of io state transition methods for every io type. + * + * \see cl_page_operations::io + */ + struct { + /** + * Prepare io iteration at a given layer. + * + * Called top-to-bottom at the beginning of each iteration of + * "io loop" (if it makes sense for this type of io). Here + * layer selects what work it will do during this iteration. + * + * \see cl_io_operations::cio_iter_fini() + */ + int (*cio_iter_init) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Finalize io iteration. + * + * Called bottom-to-top at the end of each iteration of "io + * loop". Here layers can decide whether IO has to be + * continued. 
+ * + * \see cl_io_operations::cio_iter_init() + */ + void (*cio_iter_fini) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Collect locks for the current iteration of io. + * + * Called top-to-bottom to collect all locks necessary for + * this iteration. This methods shouldn't actually enqueue + * anything, instead it should post a lock through + * cl_io_lock_add(). Once all locks are collected, they are + * sorted and enqueued in the proper order. + */ + int (*cio_lock) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Finalize unlocking. + * + * Called bottom-to-top to finish layer specific unlocking + * functionality, after generic code released all locks + * acquired by cl_io_operations::cio_lock(). + */ + void (*cio_unlock)(const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Start io iteration. + * + * Once all locks are acquired, called top-to-bottom to + * commence actual IO. In the current implementation, + * top-level vvp_io_{read,write}_start() does all the work + * synchronously by calling generic_file_*(), so other layers + * are called when everything is done. + */ + int (*cio_start)(const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Called top-to-bottom at the end of io loop. Here layer + * might wait for an unfinished asynchronous io. + */ + void (*cio_end) (const struct lu_env *env, + const struct cl_io_slice *slice); + /** + * Called bottom-to-top to notify layers that read/write IO + * iteration finished, with \a nob bytes transferred. + */ + void (*cio_advance)(const struct lu_env *env, + const struct cl_io_slice *slice, + size_t nob); + /** + * Called once per io, bottom-to-top to release io resources. + */ + void (*cio_fini) (const struct lu_env *env, + const struct cl_io_slice *slice); + } op[CIT_OP_NR]; + struct { + /** + * Submit pages from \a queue->c2_qin for IO, and move + * successfully submitted pages into \a queue->c2_qout. 
Return + * non-zero if failed to submit even the single page. If + * submission failed after some pages were moved into \a + * queue->c2_qout, completion callback with non-zero ioret is + * executed on them. + */ + int (*cio_submit)(const struct lu_env *env, + const struct cl_io_slice *slice, + enum cl_req_type crt, + struct cl_2queue *queue); + } req_op[CRT_NR]; + /** + * Read missing page. + * + * Called by a top-level cl_io_operations::op[CIT_READ]::cio_start() + * method, when it hits not-up-to-date page in the range. Optional. + * + * \pre io->ci_type == CIT_READ + */ + int (*cio_read_page)(const struct lu_env *env, + const struct cl_io_slice *slice, + const struct cl_page_slice *page); + /** + * Prepare write of a \a page. Called bottom-to-top by a top-level + * cl_io_operations::op[CIT_WRITE]::cio_start() to prepare page for + * get data from user-level buffer. + * + * \pre io->ci_type == CIT_WRITE + * + * \see vvp_io_prepare_write(), lov_io_prepare_write(), + * osc_io_prepare_write(). + */ + int (*cio_prepare_write)(const struct lu_env *env, + const struct cl_io_slice *slice, + const struct cl_page_slice *page, + unsigned from, unsigned to); + /** + * + * \pre io->ci_type == CIT_WRITE + * + * \see vvp_io_commit_write(), lov_io_commit_write(), + * osc_io_commit_write(). + */ + int (*cio_commit_write)(const struct lu_env *env, + const struct cl_io_slice *slice, + const struct cl_page_slice *page, + unsigned from, unsigned to); + /** + * Optional debugging helper. Print given io slice. + */ + int (*cio_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_io_slice *slice); +}; + +/** + * Flags to lock enqueue procedure. + * \ingroup cl_lock + */ +enum cl_enq_flags { + /** + * instruct server to not block, if conflicting lock is found. Instead + * -EWOULDBLOCK is returned immediately. + */ + CEF_NONBLOCK = 0x00000001, + /** + * take lock asynchronously (out of order), as it cannot + * deadlock. 
This is for LDLM_FL_HAS_INTENT locks used for glimpsing. + */ + CEF_ASYNC = 0x00000002, + /** + * tell the server to instruct (through a flag in the blocking ast) an + * owner of the conflicting lock, that it can drop dirty pages + * protected by this lock, without sending them to the server. + */ + CEF_DISCARD_DATA = 0x00000004, + /** + * tell the sub layers that it must be a `real' lock. This is used for + * mmapped-buffer locks and glimpse locks that must never be converted + * into lockless mode. + * + * \see vvp_mmap_locks(), cl_glimpse_lock(). + */ + CEF_MUST = 0x00000008, + /** + * tell the sub layers to never request a `real' lock. This flag is + * not used currently. + * + * cl_io::ci_lockreq and CEF_{MUST,NEVER} flags specify lockless + * conversion policy: ci_lockreq describes generic information of lock + * requirement for this IO, especially for locks which belong to the + * object doing IO; however, lock itself may have precise requirements + * that are described by the enqueue flags. + */ + CEF_NEVER = 0x00000010, + /** + * for async glimpse lock. + */ + CEF_AGL = 0x00000020, + /** + * mask of enq_flags. + */ + CEF_MASK = 0x0000003f, +}; + +/** + * Link between lock and io. Intermediate structure is needed, because the + * same lock can be part of multiple io's simultaneously. + */ +struct cl_io_lock_link { + /** linkage into one of cl_lockset lists. */ + struct list_head cill_linkage; + struct cl_lock_descr cill_descr; + struct cl_lock *cill_lock; + /** optional destructor */ + void (*cill_fini)(const struct lu_env *env, + struct cl_io_lock_link *link); +}; + +/** + * Lock-set represents a collection of locks, that io needs at a + * time. 
Generally speaking, client tries to avoid holding multiple locks when + * possible, because + * + * - holding extent locks over multiple ost's introduces the danger of + * "cascading timeouts"; + * + * - holding multiple locks over the same ost is still dead-lock prone, + * see comment in osc_lock_enqueue(), + * + * but there are certain situations where this is unavoidable: + * + * - O_APPEND writes have to take [0, EOF] lock for correctness; + * + * - truncate has to take [new-size, EOF] lock for correctness; + * + * - SNS has to take locks across full stripe for correctness; + * + * - in the case when user level buffer, supplied to {read,write}(file0), + * is a part of a memory mapped lustre file, client has to take a dlm + * locks on file0, and all files that back up the buffer (or a part of + * the buffer, that is being processed in the current chunk, in any + * case, there are situations where at least 2 locks are necessary). + * + * In such cases we at least try to take locks in the same consistent + * order. To this end, all locks are first collected, then sorted, and then + * enqueued. + */ +struct cl_lockset { + /** locks to be acquired. */ + struct list_head cls_todo; + /** locks currently being processed. */ + struct list_head cls_curr; + /** locks acquired. */ + struct list_head cls_done; +}; + +/** + * Lock requirements(demand) for IO. It should be cl_io_lock_req, + * but 'req' is always to be thought as 'request' :-) + */ +enum cl_io_lock_dmd { + /** Always lock data (e.g., O_APPEND). */ + CILR_MANDATORY = 0, + /** Layers are free to decide between local and global locking. */ + CILR_MAYBE, + /** Never lock: there is no cache (e.g., liblustre). 
*/ + CILR_NEVER +}; + +enum cl_fsync_mode { + /** start writeback, do not wait for them to finish */ + CL_FSYNC_NONE = 0, + /** start writeback and wait for them to finish */ + CL_FSYNC_LOCAL = 1, + /** discard all of dirty pages in a specific file range */ + CL_FSYNC_DISCARD = 2, + /** start writeback and make sure they have reached storage before + * return. OST_SYNC RPC must be issued and finished */ + CL_FSYNC_ALL = 3 +}; + +struct cl_io_rw_common { + loff_t crw_pos; + size_t crw_count; + int crw_nonblock; +}; + + +/** + * State for io. + * + * cl_io is shared by all threads participating in this IO (in current + * implementation only one thread advances IO, but parallel IO design and + * concurrent copy_*_user() require multiple threads acting on the same IO). It + * is up to these threads to serialize their activities, including updates to + * mutable cl_io fields. + */ +struct cl_io { + /** type of this IO. Immutable after creation. */ + enum cl_io_type ci_type; + /** current state of cl_io state machine. */ + enum cl_io_state ci_state; + /** main object this io is against. Immutable after creation. */ + struct cl_object *ci_obj; + /** + * Upper layer io, of which this io is a part. Immutable after + * creation. + */ + struct cl_io *ci_parent; + /** List of slices. Immutable after creation. */ + struct list_head ci_layers; + /** list of locks (to be) acquired by this io. */ + struct cl_lockset ci_lockset; + /** lock requirements, this is just a help info for sublayers. */ + enum cl_io_lock_dmd ci_lockreq; + union { + struct cl_rd_io { + struct cl_io_rw_common rd; + } ci_rd; + struct cl_wr_io { + struct cl_io_rw_common wr; + int wr_append; + int wr_sync; + } ci_wr; + struct cl_io_rw_common ci_rw; + struct cl_setattr_io { + struct ost_lvb sa_attr; + unsigned int sa_valid; + struct obd_capa *sa_capa; + } ci_setattr; + struct cl_fault_io { + /** page index within file. */ + pgoff_t ft_index; + /** number of valid bytes on a faulted page. 
*/ + int ft_nob; + /** writable page? for nopage() only */ + int ft_writable; + /** page of an executable? */ + int ft_executable; + /** page_mkwrite() */ + int ft_mkwrite; + /** resulting page */ + struct cl_page *ft_page; + } ci_fault; + struct cl_fsync_io { + loff_t fi_start; + loff_t fi_end; + struct obd_capa *fi_capa; + /** file system level fid */ + struct lu_fid *fi_fid; + enum cl_fsync_mode fi_mode; + /* how many pages were written/discarded */ + unsigned int fi_nr_written; + } ci_fsync; + } u; + struct cl_2queue ci_queue; + size_t ci_nob; + int ci_result; + unsigned int ci_continue:1, + /** + * This io has held grouplock, to inform sublayers that + * don't do lockless i/o. + */ + ci_no_srvlock:1, + /** + * The whole IO needs to be restarted because layout has been changed + */ + ci_need_restart:1, + /** + * to not refresh layout - the IO issuer knows that the layout won't + * change(page operations, layout change causes all pages to be + * discarded), or it doesn't matter if it changes(sync). + */ + ci_ignore_layout:1, + /** + * Check if layout changed after the IO finishes. Mainly for HSM + * requirement. If IO occurs to open files, it doesn't need to + * verify layout because HSM won't release open files. + * Right now, only two operations need to verify layout: glimpse + * and setattr. + */ + ci_verify_layout:1; + /** + * Number of pages owned by this IO. For invariant checking. + */ + unsigned ci_owned_nr; +}; + +/** @} cl_io */ + +/** \addtogroup cl_req cl_req + * @{ */ +/** \struct cl_req + * Transfer. + * + * There are two possible modes of transfer initiation on the client: + * + * - immediate transfer: this is started when a high level io wants a page + * or a collection of pages to be transferred right away. Examples: + * read-ahead, synchronous read in the case of non-page aligned write, + * page write-out as a part of extent lock cancellation, page write-out + * as a part of memory cleansing. 
Immediate transfer can be both + * cl_req_type::CRT_READ and cl_req_type::CRT_WRITE; + * + * - opportunistic transfer (cl_req_type::CRT_WRITE only), that happens + * when io wants to transfer a page to the server some time later, when + * it can be done efficiently. Example: pages dirtied by the write(2) + * path. + * + * In any case, transfer takes place in the form of a cl_req, which is a + * representation for a network RPC. + * + * Pages queued for an opportunistic transfer are cached until it is decided + * that an efficient RPC can be composed of them. This decision is made by "a + * req-formation engine", currently implemented as a part of osc + * layer. Req-formation depends on many factors: the size of the resulting + * RPC, whether or not multi-object RPCs are supported by the server, + * max-rpc-in-flight limitations, size of the dirty cache, etc. + * + * For the immediate transfer io submits a cl_page_list, that req-formation + * engine slices into cl_req's, possibly adding cached pages to some of + * the resulting req's. + * + * Whenever a page from cl_page_list is added to a newly constructed req, its + * cl_page_operations::cpo_prep() layer methods are called. At that moment, + * page state is atomically changed from cl_page_state::CPS_OWNED to + * cl_page_state::CPS_PAGEOUT or cl_page_state::CPS_PAGEIN, cl_page::cp_owner + * is zeroed, and cl_page::cp_req is set to the + * req. cl_page_operations::cpo_prep() method at the particular layer might + * return -EALREADY to indicate that it does not need to submit this page + * at all. This is possible, for example, if page, submitted for read, + * became up-to-date in the meantime; and for write, the page doesn't have its + * dirty bit marked. \see cl_io_submit_rw() + * + * Whenever a cached page is added to a newly constructed req, its + * cl_page_operations::cpo_make_ready() layer methods are called. 
At that + * moment, page state is atomically changed from cl_page_state::CPS_CACHED to + * cl_page_state::CPS_PAGEOUT, and cl_page::cp_req is set to + * req. cl_page_operations::cpo_make_ready() method at the particular layer + * might return -EAGAIN to indicate that this page is not eligible for the + * transfer right now. + * + * FUTURE + * + * Plan is to divide transfers into "priority bands" (indicated when + * submitting cl_page_list, and queuing a page for the opportunistic transfer) + * and allow glueing of cached pages to immediate transfers only within single + * band. This would make high priority transfers (like lock cancellation or + * memory pressure induced write-out) really high priority. + * + */ + +/** + * Per-transfer attributes. + */ +struct cl_req_attr { + /** Generic attributes for the server consumption. */ + struct obdo *cra_oa; + /** Capability. */ + struct obd_capa *cra_capa; + /** Jobid */ + char cra_jobid[JOBSTATS_JOBID_SIZE]; +}; + +/** + * Transfer request operations definable at every layer. + * + * Concurrency: transfer formation engine synchronizes calls to all transfer + * methods. + */ +struct cl_req_operations { + /** + * Invoked top-to-bottom by cl_req_prep() when transfer formation is + * complete (all pages are added). + * + * \see osc_req_prep() + */ + int (*cro_prep)(const struct lu_env *env, + const struct cl_req_slice *slice); + /** + * Called top-to-bottom to fill in \a oa fields. This is called twice + * with different flags, see bug 10150 and osc_build_req(). + * + * \param obj an object from cl_req whose attributes are to be set in + * \a oa. + * + * \param attr per-transfer attributes; \a oa presumably refers to + * cl_req_attr::cra_oa, the struct obdo where attributes + * are placed (TODO confirm against osc_build_req()) + * + * \param flags \a oa fields to be filled. 
+ */ + void (*cro_attr_set)(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, obd_valid flags); + /** + * Called top-to-bottom from cl_req_completion() to notify layers that + * transfer completed. 
Has to free all state allocated by + * cl_device_operations::cdo_req_init(). + */ + void (*cro_completion)(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret); +}; + +/** + * A per-object state that (potentially multi-object) transfer request keeps. + */ +struct cl_req_obj { + /** object itself */ + struct cl_object *ro_obj; + /** reference to cl_req_obj::ro_obj. For debugging. */ + struct lu_ref_link *ro_obj_ref; + /* something else? Number of pages for a given object? */ +}; + +/** + * Transfer request. + * + * Transfer requests are not reference counted, because IO sub-system owns + * them exclusively and knows when to free them. + * + * Life cycle. + * + * cl_req is created by cl_req_alloc() that calls + * cl_device_operations::cdo_req_init() device methods to allocate per-req + * state in every layer. + * + * Then pages are added (cl_req_page_add()), req keeps track of all objects it + * contains pages for. + * + * Once all pages were collected, cl_req_operations::cro_prep() method is + * called top-to-bottom. At that point layers can modify req, let it pass, or + * deny it completely. This is to support things like SNS that have transfer + * ordering requirements invisible to the individual req-formation engine. + * + * On transfer completion (or transfer timeout, or failure to initiate the + * transfer of an allocated req), cl_req_operations::cro_completion() method + * is called, after execution of cl_page_operations::cpo_completion() of all + * req's pages. + */ +struct cl_req { + enum cl_req_type crq_type; + /** A list of pages being transferred */ + struct list_head crq_pages; + /** Number of pages in cl_req::crq_pages */ + unsigned crq_nrpages; + /** An array of objects whose pages are in ->crq_pages */ + struct cl_req_obj *crq_o; + /** Number of elements in cl_req::crq_o[] */ + unsigned crq_nrobjs; + struct list_head crq_layers; +}; + +/** + * Per-layer state for request. 
+ */ +struct cl_req_slice { + struct cl_req *crs_req; + struct cl_device *crs_dev; + struct list_head crs_linkage; + const struct cl_req_operations *crs_ops; +}; + +/** @} cl_req */ + +enum cache_stats_item { + /** how many cache lookups were performed */ + CS_lookup = 0, + /** how many times cache lookup resulted in a hit */ + CS_hit, + /** how many entities are in the cache right now */ + CS_total, + /** how many entities in the cache are actively used (and cannot be + * evicted) right now */ + CS_busy, + /** how many entities were created at all */ + CS_create, + CS_NR +}; + +#define CS_NAMES { "lookup", "hit", "total", "busy", "create" } + +/** + * Stats for a generic cache (similar to inode, lu_object, etc. caches). + */ +struct cache_stats { + const char *cs_name; + atomic_t cs_stats[CS_NR]; +}; + +/** These are not exported so far */ +void cache_stats_init (struct cache_stats *cs, const char *name); + +/** + * Client-side site. This represents a particular client stack. "Global" + * variables should (directly or indirectly) be added here to allow multiple + * clients to co-exist in the single address space. + */ +struct cl_site { + struct lu_site cs_lu; + /** + * Statistical counters. Atomics do not scale, something better like + * per-cpu counters is needed. + * + * These are exported as /proc/fs/lustre/llite/.../site + * + * When interpreting keep in mind that both sub-locks (and sub-pages) + * and top-locks (and top-pages) are accounted here. + */ + struct cache_stats cs_pages; + struct cache_stats cs_locks; + atomic_t cs_pages_state[CPS_NR]; + atomic_t cs_locks_state[CLS_NR]; +}; + +int cl_site_init (struct cl_site *s, struct cl_device *top); +void cl_site_fini (struct cl_site *s); +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl); + +/** + * Output client site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. 
+ */ +int cl_site_stats_print(const struct cl_site *site, struct seq_file *m); + +/** + * \name helpers + * + * Type conversion and accessory functions. + */ +/** @{ */ + +static inline struct cl_site *lu2cl_site(const struct lu_site *site) +{ + return container_of(site, struct cl_site, cs_lu); +} + +static inline int lu_device_is_cl(const struct lu_device *d) +{ + return d->ld_type->ldt_tags & LU_DEVICE_CL; +} + +static inline struct cl_device *lu2cl_dev(const struct lu_device *d) +{ + LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d)); + return container_of0(d, struct cl_device, cd_lu_dev); +} + +static inline struct lu_device *cl2lu_dev(struct cl_device *d) +{ + return &d->cd_lu_dev; +} + +static inline struct cl_object *lu2cl(const struct lu_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev)); + return container_of0(o, struct cl_object, co_lu); +} + +static inline const struct cl_object_conf * +lu2cl_conf(const struct lu_object_conf *conf) +{ + return container_of0(conf, struct cl_object_conf, coc_lu); +} + +static inline struct cl_object *cl_object_next(const struct cl_object *obj) +{ + return obj ? 
lu2cl(lu_object_next(&obj->co_lu)) : NULL; +} + +static inline struct cl_device *cl_object_device(const struct cl_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev)); + return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev); +} + +static inline struct cl_object_header *luh2coh(const struct lu_object_header *h) +{ + return container_of0(h, struct cl_object_header, coh_lu); +} + +static inline struct cl_site *cl_object_site(const struct cl_object *obj) +{ + return lu2cl_site(obj->co_lu.lo_dev->ld_site); +} + +static inline +struct cl_object_header *cl_object_header(const struct cl_object *obj) +{ + return luh2coh(obj->co_lu.lo_header); +} + +static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t) +{ + return lu_device_init(&d->cd_lu_dev, t); +} + +static inline void cl_device_fini(struct cl_device *d) +{ + lu_device_fini(&d->cd_lu_dev); +} + +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, + const struct cl_page_operations *ops); +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops); +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, const struct cl_io_operations *ops); +void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice, + struct cl_device *dev, + const struct cl_req_operations *ops); +/** @} helpers */ + +/** \defgroup cl_object cl_object + * @{ */ +struct cl_object *cl_object_top (struct cl_object *o); +struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd, + const struct lu_fid *fid, + const struct cl_object_conf *c); + +int cl_object_header_init(struct cl_object_header *h); +void cl_object_header_fini(struct cl_object_header *h); +void cl_object_put (const struct lu_env *env, struct cl_object *o); +void cl_object_get (struct cl_object *o); +void cl_object_attr_lock (struct 
cl_object *o); +void cl_object_attr_unlock(struct cl_object *o); +int cl_object_attr_get (const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +int cl_object_attr_set (const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int cl_object_glimpse (const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb); +int cl_conf_set (const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); +void cl_object_prune (const struct lu_env *env, struct cl_object *obj); +void cl_object_kill (const struct lu_env *env, struct cl_object *obj); +int cl_object_has_locks (struct cl_object *obj); + +/** + * Returns true, iff \a o0 and \a o1 are slices of the same object. + */ +static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1) +{ + return cl_object_header(o0) == cl_object_header(o1); +} + +static inline void cl_object_page_init(struct cl_object *clob, int size) +{ + clob->co_slice_off = cl_object_header(clob)->coh_page_bufsize; + cl_object_header(clob)->coh_page_bufsize += ALIGN(size, 8); +} + +static inline void *cl_object_page_slice(struct cl_object *clob, + struct cl_page *page) +{ + return (void *)((char *)page + clob->co_slice_off); +} + +/** @} cl_object */ + +/** \defgroup cl_page cl_page + * @{ */ +enum { + CLP_GANG_OKAY = 0, + CLP_GANG_RESCHED, + CLP_GANG_AGAIN, + CLP_GANG_ABORT +}; + +/* callback of cl_page_gang_lookup() */ +typedef int (*cl_page_gang_cb_t) (const struct lu_env *, struct cl_io *, + struct cl_page *, void *); +int cl_page_gang_lookup (const struct lu_env *env, + struct cl_object *obj, + struct cl_io *io, + pgoff_t start, pgoff_t end, + cl_page_gang_cb_t cb, void *cbdata); +struct cl_page *cl_page_lookup (struct cl_object_header *hdr, + pgoff_t index); +struct cl_page *cl_page_find (const struct lu_env *env, + struct cl_object *obj, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type); +struct cl_page *cl_page_find_sub (const 
struct lu_env *env, + struct cl_object *obj, + pgoff_t idx, struct page *vmpage, + struct cl_page *parent); +void cl_page_get (struct cl_page *page); +void cl_page_put (const struct lu_env *env, + struct cl_page *page); +void cl_page_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +struct page *cl_page_vmpage (const struct lu_env *env, + struct cl_page *page); +struct cl_page *cl_vmpage_page (struct page *vmpage, struct cl_object *obj); +struct cl_page *cl_page_top (struct cl_page *page); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype); + +/** + * \name ownership + * + * Functions dealing with the ownership of page by io. + */ +/** @{ */ + +int cl_page_own (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_own_try (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_unassume (const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); +void cl_page_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_is_owned (const struct cl_page *pg, const struct cl_io *io); + +/** @} ownership */ + +/** + * \name transfer + * + * Functions dealing with the preparation of a page for a transfer, and + * tracking transfer state. 
+ */ +/** @{ */ +int cl_page_prep (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_completion (const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt, int ioret); +int cl_page_make_ready (const struct lu_env *env, struct cl_page *pg, + enum cl_req_type crt); +int cl_page_cache_add (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_clip (const struct lu_env *env, struct cl_page *pg, + int from, int to); +int cl_page_cancel (const struct lu_env *env, struct cl_page *page); +int cl_page_flush (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); + +/** @} transfer */ + + +/** + * \name helper routines + * Functions to discard, delete and export a cl_page. + */ +/** @{ */ +void cl_page_discard (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); +void cl_page_delete (const struct lu_env *env, struct cl_page *pg); +int cl_page_unmap (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); +int cl_page_is_vmlocked (const struct lu_env *env, + const struct cl_page *pg); +void cl_page_export (const struct lu_env *env, + struct cl_page *pg, int uptodate); +int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io, + struct cl_page *page); +loff_t cl_offset (const struct cl_object *obj, pgoff_t idx); +pgoff_t cl_index (const struct cl_object *obj, loff_t offset); +int cl_page_size (const struct cl_object *obj); +int cl_pages_prune (const struct lu_env *env, struct cl_object *obj); + +void cl_lock_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock); +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr); +/** @} helper */ + +/** @} cl_page */ + +/** \defgroup cl_lock cl_lock + * @{ */ + +struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io, + const struct 
cl_lock_descr *need, + const char *scope, const void *source); +struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source); +struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source); +struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env, + struct cl_object *obj, pgoff_t index, + struct cl_lock *except, int pending, + int canceld); +static inline struct cl_lock *cl_lock_at_page(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, + struct cl_lock *except, + int pending, int canceld) +{ + LASSERT(cl_object_header(obj) == cl_object_header(page->cp_obj)); + return cl_lock_at_pgoff(env, obj, page->cp_index, except, + pending, canceld); +} + +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype); + +void cl_lock_get (struct cl_lock *lock); +void cl_lock_get_trust (struct cl_lock *lock); +void cl_lock_put (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_hold_add (const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_unhold (const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_release (const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_user_add (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_user_del (const struct lu_env *env, struct cl_lock *lock); + +enum cl_lock_state cl_lock_intransit(const struct lu_env *env, + struct cl_lock *lock); +void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state); +int cl_lock_is_intransit(struct cl_lock *lock); + +int 
cl_lock_enqueue_wait(const struct lu_env *env, struct cl_lock *lock, + int keep_mutex); + +/** \name statemachine statemachine + * Interface to lock state machine consists of 3 parts: + * + * - "try" functions that attempt to effect a state transition. If state + * transition is not possible right now (e.g., if it has to wait for some + * asynchronous event to occur), these functions return + * cl_lock_transition::CLO_WAIT. + * + * - "non-try" functions that implement synchronous blocking interface on + * top of non-blocking "try" functions. These functions repeatedly call + * corresponding "try" versions, and if state transition is not possible + * immediately, wait for lock state change. + * + * - methods from cl_lock_operations, called by "try" functions. Lock can + * be advanced to the target state only when all layers voted that they + * are ready for this transition. "Try" functions call methods under lock + * mutex. If a layer had to release a mutex, it re-acquires it and returns + * cl_lock_transition::CLO_REPEAT, causing "try" function to call all + * layers again. 
+ * + * TRY NON-TRY METHOD FINAL STATE + * + * cl_enqueue_try() cl_enqueue() cl_lock_operations::clo_enqueue() CLS_ENQUEUED + * + * cl_wait_try() cl_wait() cl_lock_operations::clo_wait() CLS_HELD + * + * cl_unuse_try() cl_unuse() cl_lock_operations::clo_unuse() CLS_CACHED + * + * cl_use_try() NONE cl_lock_operations::clo_use() CLS_HELD + * + * @{ */ + +int cl_enqueue (const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 flags); +int cl_wait (const struct lu_env *env, struct cl_lock *lock); +void cl_unuse (const struct lu_env *env, struct cl_lock *lock); +int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 flags); +int cl_unuse_try (const struct lu_env *env, struct cl_lock *lock); +int cl_wait_try (const struct lu_env *env, struct cl_lock *lock); +int cl_use_try (const struct lu_env *env, struct cl_lock *lock, int atomic); + +/** @} statemachine */ + +void cl_lock_signal (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_state_wait (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_state_set (const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state); +int cl_queue_match (const struct list_head *queue, + const struct cl_lock_descr *need); + +void cl_lock_mutex_get (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_mutex_try (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_mutex_put (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_is_mutexed (struct cl_lock *lock); +int cl_lock_nr_mutexed (const struct lu_env *env); +int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock); +int cl_lock_ext_match (const struct cl_lock_descr *has, + const struct cl_lock_descr *need); +int cl_lock_descr_match(const struct cl_lock_descr *has, + const struct cl_lock_descr *need); +int cl_lock_mode_match (enum cl_lock_mode has, enum cl_lock_mode need); +int cl_lock_modify (const struct lu_env *env, struct cl_lock *lock, + const 
struct cl_lock_descr *desc); + +void cl_lock_closure_init (const struct lu_env *env, + struct cl_lock_closure *closure, + struct cl_lock *origin, int wait); +void cl_lock_closure_fini (struct cl_lock_closure *closure); +int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure); +void cl_lock_disclosure (const struct lu_env *env, + struct cl_lock_closure *closure); +int cl_lock_enclosure (const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure); + +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock); +void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock); +void cl_lock_error (const struct lu_env *env, struct cl_lock *lock, int error); +void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int wait); + +unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock); + +/** @} cl_lock */ + +/** \defgroup cl_io cl_io + * @{ */ + +int cl_io_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_sub_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_rw_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count); +int cl_io_loop (const struct lu_env *env, struct cl_io *io); + +void cl_io_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_iter_init (const struct lu_env *env, struct cl_io *io); +void cl_io_iter_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_lock (const struct lu_env *env, struct cl_io *io); +void cl_io_unlock (const struct lu_env *env, struct cl_io *io); +int cl_io_start (const struct lu_env *env, struct cl_io *io); +void cl_io_end (const struct lu_env *env, struct cl_io *io); +int cl_io_lock_add (const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link); +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct 
cl_lock_descr *descr); +int cl_io_read_page (const struct lu_env *env, struct cl_io *io, + struct cl_page *page); +int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to); +int cl_io_commit_write (const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to); +int cl_io_submit_rw (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue); +int cl_io_submit_sync (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout); +void cl_io_rw_advance (const struct lu_env *env, struct cl_io *io, + size_t nob); +int cl_io_cancel (const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue); +int cl_io_is_going (const struct lu_env *env); + +/** + * True, iff \a io is an O_APPEND write(2). + */ +static inline int cl_io_is_append(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append; +} +/** True, iff \a io is a CIT_WRITE io with wr_sync set. */ +static inline int cl_io_is_sync_write(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_sync; +} +/** True, iff \a io is a CIT_FAULT io with ft_mkwrite set. */ +static inline int cl_io_is_mkwrite(const struct cl_io *io) +{ + return io->ci_type == CIT_FAULT && io->u.ci_fault.ft_mkwrite; +} + +/** + * True, iff \a io is a truncate(2). + */ +static inline int cl_io_is_trunc(const struct cl_io *io) +{ + return io->ci_type == CIT_SETATTR && + (io->u.ci_setattr.sa_valid & ATTR_SIZE); +} + +struct cl_io *cl_io_top(struct cl_io *io); + +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io); +/** Zero the bytes of the object foo_io points to that follow \a base, which must be its first member. */ +#define CL_IO_SLICE_CLEAN(foo_io, base) \ +do { \ + typeof(foo_io) __foo_io = (foo_io); \ + \ + CLASSERT(offsetof(typeof(*__foo_io), base) == 0); \ + memset(&__foo_io->base + 1, 0, \ + (sizeof *__foo_io) - sizeof __foo_io->base); \ +} while (0) + +/** @} cl_io */ + +/** \defgroup cl_page_list cl_page_list + * @{ */ + +/** + * Last page in the page list. 
+ */ +static inline struct cl_page *cl_page_list_last(struct cl_page_list *plist) +{ + LASSERT(plist->pl_nr > 0); + return list_entry(plist->pl_pages.prev, struct cl_page, cp_batch); +} + +/** + * Iterate over pages in a page list. + */ +#define cl_page_list_for_each(page, list) \ + list_for_each_entry((page), &(list)->pl_pages, cp_batch) + +/** + * Iterate over pages in a page list, taking possible removals into account. + */ +#define cl_page_list_for_each_safe(page, temp, list) \ + list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch) + +void cl_page_list_init (struct cl_page_list *plist); +void cl_page_list_add (struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_move (struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page); +void cl_page_list_splice (struct cl_page_list *list, + struct cl_page_list *head); +void cl_page_list_del (const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +int cl_page_list_own (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_discard(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +int cl_page_list_unmap (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_fini (const struct lu_env *env, struct cl_page_list *plist); + +void cl_2queue_init (struct cl_2queue *queue); +void cl_2queue_add (struct cl_2queue *queue, struct cl_page *page); +void cl_2queue_disown (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_assume (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_discard (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_fini (const 
struct lu_env *env, struct cl_2queue *queue); +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page); + +/** @} cl_page_list */ + +/** \defgroup cl_req cl_req + * @{ */ +struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page, + enum cl_req_type crt, int nr_objects); + +void cl_req_page_add (const struct lu_env *env, struct cl_req *req, + struct cl_page *page); +void cl_req_page_done (const struct lu_env *env, struct cl_page *page); +int cl_req_prep (const struct lu_env *env, struct cl_req *req); +void cl_req_attr_set (const struct lu_env *env, struct cl_req *req, + struct cl_req_attr *attr, obd_valid flags); +void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret); + +/** \defgroup cl_sync_io cl_sync_io + * @{ */ + +/** + * Anchor for synchronous transfer. This is allocated on the stack by the + * thread doing the synchronous transfer, and a pointer to this structure is + * set up in every page submitted for transfer. Transfer completion routine + * updates anchor and wakes up waiting thread when transfer is complete. + */ +struct cl_sync_io { + /** number of pages yet to be transferred. */ + atomic_t csi_sync_nr; + /** error code. */ + int csi_sync_rc; + /** barrier for the destruction of this structure */ + atomic_t csi_barrier; + /** wait queue to be woken up when transfer is complete. */ + wait_queue_head_t csi_waitq; +}; + +void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages); +int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, struct cl_sync_io *anchor, + long timeout); +void cl_sync_io_note(struct cl_sync_io *anchor, int ioret); + +/** @} cl_sync_io */ + +/** @} cl_req */ + +/** \defgroup cl_env cl_env + * + * lu_env handling for a client. + * + * lu_env is an environment within which lustre code executes. Its major part + * is lu_context---a fast memory allocation mechanism that is used to conserve + * precious kernel stack space. 
Originally lu_env was designed for a server, + * where + * + * - there is a (mostly) fixed number of threads, and + * + * - call chains have no non-lustre portions inserted between lustre code. + * + * On a client both these assumptions fail, because every user thread can + * potentially execute lustre code as part of a system call, and lustre calls + * into VFS or MM that call back into lustre. + * + * To deal with that, cl_env wrapper functions implement the following + * optimizations: + * + * - allocation and destruction of environments is amortized by caching no + * longer used environments instead of destroying them; + * + * - there is a notion of "current" environment, attached to the kernel + * data structure representing current thread. Top-level lustre code + * allocates an environment and makes it current, then calls into + * non-lustre code, that in turn calls lustre back. Low-level lustre + * code thus called can fetch environment created by the top-level code + * and reuse it, avoiding additional environment allocation. 
+ * Right now, three interfaces can attach the cl_env to running thread: + * - cl_env_get + * - cl_env_implant + * - cl_env_reexit (cl_env_reenter must have been called beforehand) + * + * \see lu_env, lu_context, lu_context_key + * @{ */ + +struct cl_env_nest { + int cen_refcheck; + void *cen_cookie; +}; + +struct lu_env *cl_env_peek (int *refcheck); +struct lu_env *cl_env_get (int *refcheck); +struct lu_env *cl_env_alloc (int *refcheck, __u32 tags); +struct lu_env *cl_env_nested_get (struct cl_env_nest *nest); +void cl_env_put (struct lu_env *env, int *refcheck); +void cl_env_nested_put (struct cl_env_nest *nest, struct lu_env *env); +void *cl_env_reenter (void); +void cl_env_reexit (void *cookie); +void cl_env_implant (struct lu_env *env, int *refcheck); +void cl_env_unplant (struct lu_env *env, int *refcheck); + +/** @} cl_env */ + +/* + * Misc + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr); +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb); + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next); +/** @} clio */ + +int cl_global_init(void); +void cl_global_fini(void); + +#endif /* _LUSTRE_CL_OBJECT_H */ diff --git a/drivers/staging/lustre/lustre/include/dt_object.h b/drivers/staging/lustre/lustre/include/dt_object.h new file mode 100644 index 000000000000..e116bb21b529 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/dt_object.h @@ -0,0 +1,1498 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_DT_OBJECT_H +#define __LUSTRE_DT_OBJECT_H + +/** \defgroup dt dt + * Sub-class of lu_object with methods common for "data" objects in OST stack. + * + * Data objects behave like regular files: you can read/write them, get and + * set their attributes. Implementation of dt interface is supposed to + * implement some form of garbage collection, normally reference counting + * (nlink) based one. + * + * Examples: osd (lustre/osd) is an implementation of dt interface. + * @{ + */ + + +/* + * super-class definitions. 
+ */ +#include <lu_object.h> + +#include <linux/libcfs/libcfs.h> + +struct seq_file; +struct proc_dir_entry; +struct lustre_cfg; + +struct thandle; +struct dt_device; +struct dt_object; +struct dt_index_features; +struct niobuf_local; +struct niobuf_remote; +struct ldlm_enqueue_info; + +typedef enum { + MNTOPT_USERXATTR = 0x00000001, + MNTOPT_ACL = 0x00000002, +} mntopt_t; + +struct dt_device_param { + unsigned ddp_max_name_len; + unsigned ddp_max_nlink; + unsigned ddp_block_shift; + mntopt_t ddp_mntopts; + unsigned ddp_max_ea_size; + void *ddp_mnt; /* XXX: old code can retrieve mnt -bzzz */ + int ddp_mount_type; + unsigned long long ddp_maxbytes; + /* percentage of available space to reserve for grant error margin */ + int ddp_grant_reserved; + /* per-inode space consumption */ + short ddp_inodespace; + /* per-fragment grant overhead to be used by client for grant + * calculation */ + int ddp_grant_frag; +}; + +/** + * Per-transaction commit callback function + */ +struct dt_txn_commit_cb; +typedef void (*dt_cb_t)(struct lu_env *env, struct thandle *th, + struct dt_txn_commit_cb *cb, int err); +/** + * Special per-transaction callback for cases when just a commit callback + * is needed and per-device callbacks are not convenient to use + */ +#define TRANS_COMMIT_CB_MAGIC 0xa0a00a0a +#define MAX_COMMIT_CB_STR_LEN 32 + +struct dt_txn_commit_cb { + struct list_head dcb_linkage; + dt_cb_t dcb_func; + __u32 dcb_magic; + char dcb_name[MAX_COMMIT_CB_STR_LEN]; +}; + +/** + * Operations on dt device. + */ +struct dt_device_operations { + /** + * Return device-wide statistics. + */ + int (*dt_statfs)(const struct lu_env *env, + struct dt_device *dev, struct obd_statfs *osfs); + /** + * Create a transaction on \a dev. + */ + struct thandle *(*dt_trans_create)(const struct lu_env *env, + struct dt_device *dev); + /** + * Start transaction \a th on \a dev. 
+ */ + int (*dt_trans_start)(const struct lu_env *env, + struct dt_device *dev, struct thandle *th); + /** + * Finish previously started transaction. + */ + int (*dt_trans_stop)(const struct lu_env *env, + struct thandle *th); + /** + * Add commit callback to the transaction. + */ + int (*dt_trans_cb_add)(struct thandle *th, + struct dt_txn_commit_cb *dcb); + /** + * Return fid of root index object. + */ + int (*dt_root_get)(const struct lu_env *env, + struct dt_device *dev, struct lu_fid *f); + /** + * Return device configuration data. + */ + void (*dt_conf_get)(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param); + /** + * handling device state, mostly for tests + */ + int (*dt_sync)(const struct lu_env *env, struct dt_device *dev); + int (*dt_ro)(const struct lu_env *env, struct dt_device *dev); + /** + * Start a transaction commit asynchronously + * + * \param env environment + * \param dev dt_device to start commit on + * + * \return 0 success, negative value if error + */ + int (*dt_commit_async)(const struct lu_env *env, + struct dt_device *dev); + /** + * Initialize capability context. 
+ */ + int (*dt_init_capa_ctxt)(const struct lu_env *env, + struct dt_device *dev, + int mode, unsigned long timeout, + __u32 alg, struct lustre_capa_key *keys); +}; + +struct dt_index_features { + /** required feature flags from enum dt_index_flags */ + __u32 dif_flags; + /** minimal required key size */ + size_t dif_keysize_min; + /** maximal required key size, 0 if no limit */ + size_t dif_keysize_max; + /** minimal required record size */ + size_t dif_recsize_min; + /** maximal required record size, 0 if no limit */ + size_t dif_recsize_max; + /** pointer size for record */ + size_t dif_ptrsize; +}; + +enum dt_index_flags { + /** index supports variable sized keys */ + DT_IND_VARKEY = 1 << 0, + /** index supports variable sized records */ + DT_IND_VARREC = 1 << 1, + /** index can be modified */ + DT_IND_UPDATE = 1 << 2, + /** index supports records with non-unique (duplicate) keys */ + DT_IND_NONUNQ = 1 << 3, + /** + * index support fixed-size keys sorted with natural numerical way + * and is able to return left-side value if no exact value found + */ + DT_IND_RANGE = 1 << 4, +}; + +/** + * Features, required from index to support file system directories (mapping + * names to fids). + */ +extern const struct dt_index_features dt_directory_features; +extern const struct dt_index_features dt_otable_features; +extern const struct dt_index_features dt_lfsck_features; + +/* index features supported by the accounting objects */ +extern const struct dt_index_features dt_acct_features; + +/* index features supported by the quota global indexes */ +extern const struct dt_index_features dt_quota_glb_features; + +/* index features supported by the quota slave indexes */ +extern const struct dt_index_features dt_quota_slv_features; + +/** + * This is a general purpose dt allocation hint. + * It now contains the parent object. + * It can contain any allocation hint in the future. 
+ */ +struct dt_allocation_hint { + struct dt_object *dah_parent; + __u32 dah_mode; +}; + +/** + * object type specifier. + */ + +enum dt_format_type { + DFT_REGULAR, + DFT_DIR, + /** for mknod */ + DFT_NODE, + /** for special index */ + DFT_INDEX, + /** for symbolic link */ + DFT_SYM, +}; + +/** + * object format specifier. + */ +struct dt_object_format { + /** type for dt object */ + enum dt_format_type dof_type; + union { + struct dof_regular { + int striped; + } dof_reg; + struct dof_dir { + } dof_dir; + struct dof_node { + } dof_node; + /** + * special index need feature as parameter to create + * special idx + */ + struct dof_index { + const struct dt_index_features *di_feat; + } dof_idx; + } u; +}; + +enum dt_format_type dt_mode_to_dft(__u32 mode); + +typedef __u64 dt_obj_version_t; + +/** + * Per-dt-object operations. + */ +struct dt_object_operations { + void (*do_read_lock)(const struct lu_env *env, + struct dt_object *dt, unsigned role); + void (*do_write_lock)(const struct lu_env *env, + struct dt_object *dt, unsigned role); + void (*do_read_unlock)(const struct lu_env *env, + struct dt_object *dt); + void (*do_write_unlock)(const struct lu_env *env, + struct dt_object *dt); + int (*do_write_locked)(const struct lu_env *env, + struct dt_object *dt); + /** + * Note: following ->do_{x,}attr_{set,get}() operations are very + * similar to ->moo_{x,}attr_{set,get}() operations in struct + * md_object_operations (see md_object.h). These operations are not in + * lu_object_operations, because ->do_{x,}attr_set() versions take + * transaction handle as an argument (this transaction is started by + * caller). We might factor ->do_{x,}attr_get() into + * lu_object_operations, but that would break existing symmetry. + */ + + /** + * Return standard attributes. 
+ * + * precondition: lu_object_exists(&dt->do_lu); + */ + int (*do_attr_get)(const struct lu_env *env, + struct dt_object *dt, struct lu_attr *attr, + struct lustre_capa *capa); + /** + * Set standard attributes. + * + * precondition: dt_object_exists(dt); + */ + int (*do_declare_attr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *attr, + struct thandle *handle); + int (*do_attr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *attr, + struct thandle *handle, + struct lustre_capa *capa); + /** + * Return a value of an extended attribute. + * + * precondition: dt_object_exists(dt); + */ + int (*do_xattr_get)(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, const char *name, + struct lustre_capa *capa); + /** + * Set value of an extended attribute. + * + * \a fl - flags from enum lu_xattr_flags + * + * precondition: dt_object_exists(dt); + */ + int (*do_declare_xattr_set)(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, int fl, + struct thandle *handle); + int (*do_xattr_set)(const struct lu_env *env, + struct dt_object *dt, const struct lu_buf *buf, + const char *name, int fl, struct thandle *handle, + struct lustre_capa *capa); + /** + * Delete existing extended attribute. + * + * precondition: dt_object_exists(dt); + */ + int (*do_declare_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, struct thandle *handle); + int (*do_xattr_del)(const struct lu_env *env, + struct dt_object *dt, + const char *name, struct thandle *handle, + struct lustre_capa *capa); + /** + * Place list of existing extended attributes into \a buf (which has + * length len). + * + * precondition: dt_object_exists(dt); + */ + int (*do_xattr_list)(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + struct lustre_capa *capa); + /** + * Init allocation hint using parent object and child mode. 
+ * (1) The \a parent might be NULL if this is a partial creation for + * remote object. + * (2) The type of child is in \a child_mode. + * (3) The result hint is stored in \a ah. + */ + void (*do_ah_init)(const struct lu_env *env, + struct dt_allocation_hint *ah, + struct dt_object *parent, + struct dt_object *child, + umode_t child_mode); + /** + * Create new object on this device. + * + * precondition: !dt_object_exists(dt); + * postcondition: ergo(result == 0, dt_object_exists(dt)); + */ + int (*do_declare_create)(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + int (*do_create)(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); + + /** + * Destroy object on this device (the inverse of create). + * precondition: dt_object_exists(dt); + * postcondition: ergo(result == 0, !dt_object_exists(dt)); + */ + int (*do_declare_destroy)(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th); + int (*do_destroy)(const struct lu_env *env, struct dt_object *dt, + struct thandle *th); + + /** + * Announce that this object is going to be used as an index. This + * operation checks that object supports indexing operations and + * installs appropriate dt_index_operations vector on success. + * + * Also probes for features. Operation is successful if all required + * features are supported. 
+ */ + int (*do_index_try)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_index_features *feat); + /** + * Add nlink of the object + * precondition: dt_object_exists(dt); + */ + int (*do_declare_ref_add)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + int (*do_ref_add)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + /** + * Del nlink of the object + * precondition: dt_object_exists(dt); + */ + int (*do_declare_ref_del)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + int (*do_ref_del)(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); + + struct obd_capa *(*do_capa_get)(const struct lu_env *env, + struct dt_object *dt, + struct lustre_capa *old, + __u64 opc); + int (*do_object_sync)(const struct lu_env *, struct dt_object *); + /** + * Get object info of next level. Currently, only get inode from osd. + * This is only used by quota b=16542 + * precondition: dt_object_exists(dt); + */ + int (*do_data_get)(const struct lu_env *env, struct dt_object *dt, + void **data); + + /** + * Lock object. + */ + int (*do_object_lock)(const struct lu_env *env, struct dt_object *dt, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + void *policy); +}; + +/** + * Per-dt-object operations on "file body". 
+ */ +struct dt_body_operations { + /** + * precondition: dt_object_exists(dt); + */ + ssize_t (*dbo_read)(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos, + struct lustre_capa *capa); + /** + * precondition: dt_object_exists(dt); + */ + ssize_t (*dbo_declare_write)(const struct lu_env *env, + struct dt_object *dt, + const loff_t size, loff_t pos, + struct thandle *handle); + ssize_t (*dbo_write)(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, + struct thandle *handle, struct lustre_capa *capa, + int ignore_quota); + /* + * methods for zero-copy IO + */ + + /* + * precondition: dt_object_exists(dt); + * returns: + * < 0 - error code + * = 0 - illegal + * > 0 - number of local buffers prepared + */ + int (*dbo_bufs_get)(const struct lu_env *env, struct dt_object *dt, + loff_t pos, ssize_t len, struct niobuf_local *lb, + int rw, struct lustre_capa *capa); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_bufs_put)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lb, int nr); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_write_prep)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lb, int nr); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_declare_write_commit)(const struct lu_env *env, + struct dt_object *dt, + struct niobuf_local *, + int, struct thandle *); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_write_commit)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *, int, struct thandle *); + /* + * precondition: dt_object_exists(dt); + */ + int (*dbo_read_prep)(const struct lu_env *env, struct dt_object *dt, + struct niobuf_local *lnb, int nr); + int (*dbo_fiemap_get)(const struct lu_env *env, struct dt_object *dt, + struct ll_user_fiemap *fm); + /** + * Punch object's content + * precondition: regular object, not index + */ + int (*dbo_declare_punch)(const struct 
lu_env *, struct dt_object *, + __u64, __u64, struct thandle *th); + int (*dbo_punch)(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, struct thandle *th, + struct lustre_capa *capa); +}; + +/** + * Incomplete type of index record. + */ +struct dt_rec; + +/** + * Incomplete type of index key. + */ +struct dt_key; + +/** + * Incomplete type of dt iterator. + */ +struct dt_it; + +/** + * Per-dt-object operations on object as index. + */ +struct dt_index_operations { + /** + * precondition: dt_object_exists(dt); + */ + int (*dio_lookup)(const struct lu_env *env, struct dt_object *dt, + struct dt_rec *rec, const struct dt_key *key, + struct lustre_capa *capa); + /** + * precondition: dt_object_exists(dt); + */ + int (*dio_declare_insert)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *handle); + int (*dio_insert)(const struct lu_env *env, struct dt_object *dt, + const struct dt_rec *rec, const struct dt_key *key, + struct thandle *handle, struct lustre_capa *capa, + int ignore_quota); + /** + * precondition: dt_object_exists(dt); + */ + int (*dio_declare_delete)(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *handle); + int (*dio_delete)(const struct lu_env *env, struct dt_object *dt, + const struct dt_key *key, struct thandle *handle, + struct lustre_capa *capa); + /** + * Iterator interface + */ + struct dt_it_ops { + /** + * Allocate and initialize new iterator. 
+ * + * precondition: dt_object_exists(dt); + */ + struct dt_it *(*init)(const struct lu_env *env, + struct dt_object *dt, + __u32 attr, + struct lustre_capa *capa); + void (*fini)(const struct lu_env *env, + struct dt_it *di); + int (*get)(const struct lu_env *env, + struct dt_it *di, + const struct dt_key *key); + void (*put)(const struct lu_env *env, + struct dt_it *di); + int (*next)(const struct lu_env *env, + struct dt_it *di); + struct dt_key *(*key)(const struct lu_env *env, + const struct dt_it *di); + int (*key_size)(const struct lu_env *env, + const struct dt_it *di); + int (*rec)(const struct lu_env *env, + const struct dt_it *di, + struct dt_rec *rec, + __u32 attr); + __u64 (*store)(const struct lu_env *env, + const struct dt_it *di); + int (*load)(const struct lu_env *env, + const struct dt_it *di, __u64 hash); + int (*key_rec)(const struct lu_env *env, + const struct dt_it *di, void* key_rec); + } dio_it; +}; + +enum dt_otable_it_valid { + DOIV_ERROR_HANDLE = 0x0001, +}; + +enum dt_otable_it_flags { + /* Exit on failure. */ + DOIF_FAILOUT = 0x0001, + + /* Reset iteration position to the device beginning. */ + DOIF_RESET = 0x0002, + + /* There is an upper-layer component that uses the iteration. */ + DOIF_OUTUSED = 0x0004, +}; + +/* otable based iteration needs to use the common DT iteration APIs. + * To initialize the iteration, it needs to call dio_it::init() first. + * Here is how the otable based iteration should prepare arguments to + * call dt_it_ops::init(). + * + * For otable based iteration, the 32-bit 'attr' for dt_it_ops::init() + * is composed of two parts: + * the low 16 bits are the valid bits, the high 16 bits are the flags bits. */ +#define DT_OTABLE_IT_FLAGS_SHIFT 16 +#define DT_OTABLE_IT_FLAGS_MASK 0xffff0000 + +struct dt_device { + struct lu_device dd_lu_dev; + const struct dt_device_operations *dd_ops; + + /** + * List of dt_txn_callback (see below). 
This is not protected in any + * way, because callbacks are supposed to be added/deleted only during + * single-threaded start-up shut-down procedures. + */ + struct list_head dd_txn_callbacks; +}; + +int dt_device_init(struct dt_device *dev, struct lu_device_type *t); +void dt_device_fini(struct dt_device *dev); + +static inline int lu_device_is_dt(const struct lu_device *d) +{ + return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_DT); +} + +static inline struct dt_device * lu2dt_dev(struct lu_device *l) +{ + LASSERT(lu_device_is_dt(l)); + return container_of0(l, struct dt_device, dd_lu_dev); +} + +struct dt_object { + struct lu_object do_lu; + const struct dt_object_operations *do_ops; + const struct dt_body_operations *do_body_ops; + const struct dt_index_operations *do_index_ops; +}; + +/* + * In-core representation of per-device local object OID storage + */ +struct local_oid_storage { + /* all initialized llog systems on this node linked by this */ + struct list_head los_list; + + /* how many handle's reference this los has */ + atomic_t los_refcount; + struct dt_device *los_dev; + struct dt_object *los_obj; + + /* data used to generate new fids */ + struct mutex los_id_lock; + __u64 los_seq; + __u32 los_last_oid; +}; + +static inline struct dt_object *lu2dt(struct lu_object *l) +{ + LASSERT(l == NULL || IS_ERR(l) || lu_device_is_dt(l->lo_dev)); + return container_of0(l, struct dt_object, do_lu); +} + +int dt_object_init(struct dt_object *obj, + struct lu_object_header *h, struct lu_device *d); + +void dt_object_fini(struct dt_object *obj); + +static inline int dt_object_exists(const struct dt_object *dt) +{ + return lu_object_exists(&dt->do_lu); +} + +static inline int dt_object_remote(const struct dt_object *dt) +{ + return lu_object_remote(&dt->do_lu); +} + +static inline struct dt_object *lu2dt_obj(struct lu_object *o) +{ + LASSERT(ergo(o != NULL, lu_device_is_dt(o->lo_dev))); + return container_of0(o, struct dt_object, do_lu); +} + +/** + * This is 
the general purpose transaction handle. + * 1. Transaction Life Cycle + * This transaction handle is allocated upon starting a new transaction, + * and deallocated after this transaction is committed. + * 2. Transaction Nesting + * We do _NOT_ support nested transaction. So, every thread should only + * have one active transaction, and a transaction only belongs to one + * thread. Due to this, transaction handle need no reference count. + * 3. Transaction & dt_object locking + * dt_object locks should be taken inside transaction. + * 4. Transaction & RPC + * No RPC request should be issued inside transaction. + */ +struct thandle { + /** the dt device on which the transactions are executed */ + struct dt_device *th_dev; + + /** context for this transaction, tag is LCT_TX_HANDLE */ + struct lu_context th_ctx; + + /** additional tags (layers can add in declare) */ + __u32 th_tags; + + /** the last operation result in this transaction. + * this value is used in recovery */ + __s32 th_result; + + /** whether we need sync commit */ + unsigned int th_sync:1; + + /* local transaction, no need to inform other layers */ + unsigned int th_local:1; + + /* In DNE, one transaction can be disassembled into + * updates on several different MDTs, and these updates + * will be attached to th_remote_update_list per target. + * Only a single thread will access the list, so no lock is needed. + */ + struct list_head th_remote_update_list; + struct update_request *th_current_request; +}; + +/** + * Transaction call-backs. + * + * These are invoked by osd (or underlying transaction engine) when + * transaction changes state. + * + * Call-backs are used by upper layers to modify transaction parameters and to + * perform some actions on each transaction state transition. Typical + * example is mdt registering call-back to write into last-received file + * before each transaction commit.
+ */ +struct dt_txn_callback { + int (*dtc_txn_start)(const struct lu_env *env, + struct thandle *txn, void *cookie); + int (*dtc_txn_stop)(const struct lu_env *env, + struct thandle *txn, void *cookie); + void (*dtc_txn_commit)(struct thandle *txn, void *cookie); + void *dtc_cookie; + __u32 dtc_tag; + struct list_head dtc_linkage; +}; + +void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb); +void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb); + +int dt_txn_hook_start(const struct lu_env *env, + struct dt_device *dev, struct thandle *txn); +int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn); +void dt_txn_hook_commit(struct thandle *txn); + +int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj); + +/** + * Callback function used for parsing path. + * \see llo_store_resolve + */ +typedef int (*dt_entry_func_t)(const struct lu_env *env, + const char *name, + void *pvt); + +#define DT_MAX_PATH 1024 + +int dt_path_parser(const struct lu_env *env, + char *local, dt_entry_func_t entry_func, + void *data); + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid); + +struct dt_object *dt_store_open(const struct lu_env *env, + struct dt_device *dt, + const char *dirname, + const char *filename, + struct lu_fid *fid); + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *attr); + +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, + const struct lu_fid *fid, + struct lu_device *top_dev); +static inline struct dt_object * +dt_locate(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *fid) +{ + return dt_locate_at(env, dev, fid, dev->dd_lu_dev.ld_site->ls_top_dev); +} + + +int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid 
*first_fid, + struct local_oid_storage **los); +void local_oid_storage_fini(const struct lu_env *env, + struct local_oid_storage *los); +int local_object_fid_generate(const struct lu_env *env, + struct local_oid_storage *los, + struct lu_fid *fid); +int local_object_declare_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, + struct dt_object_format *dof, + struct thandle *th); +int local_object_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct lu_attr *attr, struct dt_object_format *dof, + struct thandle *th); +struct dt_object *local_file_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode); +struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, + __u32 mode); +struct dt_object * +local_index_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +struct dt_object * +local_index_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft); +int local_object_unlink(const struct lu_env *env, struct dt_device *dt, + struct dt_object *parent, const char *name); + +static inline int dt_object_lock(const struct lu_env *env, + struct dt_object *o, struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + void *policy) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_object_lock); + return o->do_ops->do_object_lock(env, o, lh, einfo, policy); +} + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid); + +static inline int 
dt_object_sync(const struct lu_env *env, + struct dt_object *o) +{ + LASSERT(o); + LASSERT(o->do_ops); + LASSERT(o->do_ops->do_object_sync); + return o->do_ops->do_object_sync(env, o); +} + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th); +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th); +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o); + + +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos); +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th); +typedef int (*dt_index_page_build_t)(const struct lu_env *env, + union lu_page *lp, int nob, + const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg); +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg); +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg); + +static inline struct thandle *dt_trans_create(const struct lu_env *env, + struct dt_device *d) +{ + LASSERT(d->dd_ops->dt_trans_create); + return d->dd_ops->dt_trans_create(env, d); +} + +static inline int dt_trans_start(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + return d->dd_ops->dt_trans_start(env, d, th); +} + +/* for this transaction hooks shouldn't be called */ +static inline int dt_trans_start_local(const struct lu_env *env, + struct dt_device *d, struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_start); + th->th_local = 1; + return d->dd_ops->dt_trans_start(env, d, th); +} + +static inline int dt_trans_stop(const struct lu_env *env, + struct dt_device *d, 
struct thandle *th) +{ + LASSERT(d->dd_ops->dt_trans_stop); + return d->dd_ops->dt_trans_stop(env, th); +} + +static inline int dt_trans_cb_add(struct thandle *th, + struct dt_txn_commit_cb *dcb) +{ + LASSERT(th->th_dev->dd_ops->dt_trans_cb_add); + dcb->dcb_magic = TRANS_COMMIT_CB_MAGIC; + return th->th_dev->dd_ops->dt_trans_cb_add(th, dcb); +} +/** @} dt */ + + +static inline int dt_declare_record_write(const struct lu_env *env, + struct dt_object *dt, + int size, loff_t pos, + struct thandle *th) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_write); + rc = dt->do_body_ops->dbo_declare_write(env, dt, size, pos, th); + return rc; +} + +static inline int dt_declare_create(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_create); + return dt->do_ops->do_declare_create(env, dt, attr, hint, dof, th); +} + +static inline int dt_create(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_create); + return dt->do_ops->do_create(env, dt, attr, hint, dof, th); +} + +static inline int dt_declare_destroy(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_destroy); + return dt->do_ops->do_declare_destroy(env, dt, th); +} + +static inline int dt_destroy(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_destroy); + return dt->do_ops->do_destroy(env, dt, th); +} + +static inline void dt_read_lock(const struct lu_env 
*env, + struct dt_object *dt, + unsigned role) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_read_lock); + dt->do_ops->do_read_lock(env, dt, role); +} + +static inline void dt_write_lock(const struct lu_env *env, + struct dt_object *dt, + unsigned role) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_lock); + dt->do_ops->do_write_lock(env, dt, role); +} + +static inline void dt_read_unlock(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_read_unlock); + dt->do_ops->do_read_unlock(env, dt); +} + +static inline void dt_write_unlock(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_unlock); + dt->do_ops->do_write_unlock(env, dt); +} + +static inline int dt_write_locked(const struct lu_env *env, + struct dt_object *dt) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_write_locked); + return dt->do_ops->do_write_locked(env, dt); +} + +static inline int dt_attr_get(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *la, void *arg) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_attr_get); + return dt->do_ops->do_attr_get(env, dt, la, arg); +} + +static inline int dt_declare_attr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_attr *la, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_attr_set); + return dt->do_ops->do_declare_attr_set(env, dt, la, th); +} + +static inline int dt_attr_set(const struct lu_env *env, struct dt_object *dt, + const struct lu_attr *la, struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_attr_set); + return dt->do_ops->do_attr_set(env, dt, la, th, capa); +} + +static inline int dt_declare_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + 
LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_add); + return dt->do_ops->do_declare_ref_add(env, dt, th); +} + +/** + * Call do_ops->do_ref_add() on \a dt. Asserts that the object, its + * operation table and the method are non-NULL. + */ +static inline int dt_ref_add(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_add); + return dt->do_ops->do_ref_add(env, dt, th); +} + +/** + * Call do_ops->do_declare_ref_del() on \a dt. Asserts that the object, + * its operation table and the method are non-NULL. + */ +static inline int dt_declare_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_ref_del); + return dt->do_ops->do_declare_ref_del(env, dt, th); +} + +/** + * Call do_ops->do_ref_del() on \a dt. Asserts that the object, its + * operation table and the method are non-NULL. + */ +static inline int dt_ref_del(const struct lu_env *env, + struct dt_object *dt, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_ref_del); + return dt->do_ops->do_ref_del(env, dt, th); +} + +/** + * Call do_ops->do_capa_get() on \a dt, passing \a old and \a opc through. + * + * The assertion guards do_capa_get, i.e. the method that is actually + * invoked (it used to test do_ref_del, so a missing do_capa_get was + * not caught before the call through a NULL pointer). + */ +static inline struct obd_capa *dt_capa_get(const struct lu_env *env, + struct dt_object *dt, + struct lustre_capa *old, __u64 opc) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_capa_get); + return dt->do_ops->do_capa_get(env, dt, old, opc); +} + +/** + * Call do_body_ops->dbo_bufs_get() on \a d for the range given by + * rnb->offset and rnb->len. \a rnb is dereferenced without a NULL check. + */ +static inline int dt_bufs_get(const struct lu_env *env, struct dt_object *d, + struct niobuf_remote *rnb, + struct niobuf_local *lnb, int rw, + struct lustre_capa *capa) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_get); + return d->do_body_ops->dbo_bufs_get(env, d, rnb->offset, + rnb->len, lnb, rw, capa); +} + +/** + * Call do_body_ops->dbo_bufs_put() on \a d. Asserts that the object, + * its body operation table and the method are non-NULL. + */ +static inline int dt_bufs_put(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_bufs_put); + return d->do_body_ops->dbo_bufs_put(env, d, lnb, n); +} + +/** + * Call do_body_ops->dbo_write_prep() on \a d. Asserts that the object, + * its body operation table and the method are non-NULL. + */ +static inline int dt_write_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_write_prep); + return d->do_body_ops->dbo_write_prep(env, d, lnb, n); +} + +static inline int
dt_declare_write_commit(const struct lu_env *env, + struct dt_object *d, + struct niobuf_local *lnb, + int n, struct thandle *th) +{ + LASSERTF(d != NULL, "dt is NULL when we want to declare write\n"); + LASSERT(th != NULL); + /* check the operation table and the method before calling through + * them, like dt_write_commit() does */ + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_declare_write_commit); + return d->do_body_ops->dbo_declare_write_commit(env, d, lnb, n, th); +} + + +/** + * Call do_body_ops->dbo_write_commit() on \a d. Asserts that the object, + * its body operation table and the method are non-NULL. + */ +static inline int dt_write_commit(const struct lu_env *env, + struct dt_object *d, struct niobuf_local *lnb, + int n, struct thandle *th) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_write_commit); + return d->do_body_ops->dbo_write_commit(env, d, lnb, n, th); +} + +/** + * Call do_body_ops->dbo_read_prep() on \a d. Asserts that the object, + * its body operation table and the method are non-NULL. + */ +static inline int dt_read_prep(const struct lu_env *env, struct dt_object *d, + struct niobuf_local *lnb, int n) +{ + LASSERT(d); + LASSERT(d->do_body_ops); + LASSERT(d->do_body_ops->dbo_read_prep); + return d->do_body_ops->dbo_read_prep(env, d, lnb, n); +} + +/** + * Call do_body_ops->dbo_declare_punch() on \a dt for [\a start, \a end]. + * Asserts that the object, its body operation table and the method are + * non-NULL. + */ +static inline int dt_declare_punch(const struct lu_env *env, + struct dt_object *dt, __u64 start, + __u64 end, struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_declare_punch); + return dt->do_body_ops->dbo_declare_punch(env, dt, start, end, th); +} + +/** + * Call do_body_ops->dbo_punch() on \a dt for [\a start, \a end]. + * Asserts that the object, its body operation table and the method are + * non-NULL. + */ +static inline int dt_punch(const struct lu_env *env, struct dt_object *dt, + __u64 start, __u64 end, struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_punch); + return dt->do_body_ops->dbo_punch(env, dt, start, end, th, capa); +} + +/** + * Call do_body_ops->dbo_fiemap_get() on \a d. + * + * Unlike the other wrappers, a missing table or method is reported to the + * caller instead of asserted. + * + * \retval -EPROTO \a d has no body operation table + * \retval -EOPNOTSUPP the table has no dbo_fiemap_get method + */ +static inline int dt_fiemap_get(const struct lu_env *env, struct dt_object *d, + struct ll_user_fiemap *fm) +{ + LASSERT(d); + if (d->do_body_ops == NULL) + return -EPROTO; + if (d->do_body_ops->dbo_fiemap_get == NULL) + return -EOPNOTSUPP; + return d->do_body_ops->dbo_fiemap_get(env, d, fm); +} + +static inline int dt_statfs(const struct lu_env *env, struct dt_device *dev, + struct obd_statfs *osfs) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_statfs); + return dev->dd_ops->dt_statfs(env, dev, osfs);
+} + +/** + * Call dd_ops->dt_root_get() on \a dev. Asserts that the device, its + * operation table and the method are non-NULL. + */ +static inline int dt_root_get(const struct lu_env *env, struct dt_device *dev, + struct lu_fid *f) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_root_get); + return dev->dd_ops->dt_root_get(env, dev, f); +} + +/** + * Call dd_ops->dt_conf_get() on \a dev with \a param. Asserts that the + * device, its operation table and the method are non-NULL. + * + * This function returns void, so the method is invoked as a plain + * statement: "return <expression>;" is not allowed in a void function. + */ +static inline void dt_conf_get(const struct lu_env *env, + const struct dt_device *dev, + struct dt_device_param *param) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_conf_get); + dev->dd_ops->dt_conf_get(env, dev, param); +} + +/** + * Call dd_ops->dt_sync() on \a dev. Asserts that the device, its + * operation table and the method are non-NULL. + */ +static inline int dt_sync(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_sync); + return dev->dd_ops->dt_sync(env, dev); +} + +/** + * Call dd_ops->dt_ro() on \a dev. Asserts that the device, its + * operation table and the method are non-NULL. + */ +static inline int dt_ro(const struct lu_env *env, struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_ro); + return dev->dd_ops->dt_ro(env, dev); +} + +/** + * Call do_index_ops->dio_declare_insert() on \a dt. Asserts that the + * object, its index operation table and the method are non-NULL. + */ +static inline int dt_declare_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_insert); + return dt->do_index_ops->dio_declare_insert(env, dt, rec, key, th); +} + +/** + * Call do_index_ops->dio_insert() on \a dt; \a noquota is passed as the + * method's last argument. Asserts that the object, its index operation + * table and the method are non-NULL. + */ +static inline int dt_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *th, + struct lustre_capa *capa, + int noquota) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_insert); + return dt->do_index_ops->dio_insert(env, dt, rec, key, th, + capa, noquota); +} + +/** + * Call do_ops->do_declare_xattr_del() on \a dt for attribute \a name. + * Asserts that the object, its operation table and the method are + * non-NULL. + */ +static inline int dt_declare_xattr_del(const struct lu_env *env, + struct dt_object *dt, + const char *name, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_del); + return dt->do_ops->do_declare_xattr_del(env, dt, name, th); +} + +static inline int dt_xattr_del(const struct lu_env *env, + struct dt_object *dt, const char *name, + struct thandle
*th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_del); + return dt->do_ops->do_xattr_del(env, dt, name, th, capa); +} + +static inline int dt_declare_xattr_set(const struct lu_env *env, + struct dt_object *dt, + const struct lu_buf *buf, + const char *name, int fl, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_declare_xattr_set); + return dt->do_ops->do_declare_xattr_set(env, dt, buf, name, fl, th); +} + +static inline int dt_xattr_set(const struct lu_env *env, + struct dt_object *dt, const struct lu_buf *buf, + const char *name, int fl, struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_set); + return dt->do_ops->do_xattr_set(env, dt, buf, name, fl, th, capa); +} + +static inline int dt_xattr_get(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + const char *name, struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_get); + return dt->do_ops->do_xattr_get(env, dt, buf, name, capa); +} + +static inline int dt_xattr_list(const struct lu_env *env, + struct dt_object *dt, struct lu_buf *buf, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_ops); + LASSERT(dt->do_ops->do_xattr_list); + return dt->do_ops->do_xattr_list(env, dt, buf, capa); +} + +static inline int dt_declare_delete(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_declare_delete); + return dt->do_index_ops->dio_declare_delete(env, dt, key, th); +} + +static inline int dt_delete(const struct lu_env *env, + struct dt_object *dt, + const struct dt_key *key, + struct thandle *th, + struct lustre_capa *capa) +{ + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_delete); + return dt->do_index_ops->dio_delete(env, dt, 
key, th, capa); +} + +static inline int dt_commit_async(const struct lu_env *env, + struct dt_device *dev) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_commit_async); + return dev->dd_ops->dt_commit_async(env, dev); +} + +static inline int dt_init_capa_ctxt(const struct lu_env *env, + struct dt_device *dev, + int mode, unsigned long timeout, + __u32 alg, struct lustre_capa_key *keys) +{ + LASSERT(dev); + LASSERT(dev->dd_ops); + LASSERT(dev->dd_ops->dt_init_capa_ctxt); + return dev->dd_ops->dt_init_capa_ctxt(env, dev, mode, + timeout, alg, keys); +} + +static inline int dt_lookup(const struct lu_env *env, + struct dt_object *dt, + struct dt_rec *rec, + const struct dt_key *key, + struct lustre_capa *capa) +{ + int ret; + + LASSERT(dt); + LASSERT(dt->do_index_ops); + LASSERT(dt->do_index_ops->dio_lookup); + + ret = dt->do_index_ops->dio_lookup(env, dt, rec, key, capa); + if (ret > 0) + ret = 0; + else if (ret == 0) + ret = -ENOENT; + return ret; +} + +#define LU221_BAD_TIME (0x80000000U + 24 * 3600) + +struct dt_find_hint { + struct lu_fid *dfh_fid; + struct dt_device *dfh_dt; + struct dt_object *dfh_o; +}; + +struct dt_thread_info { + char dti_buf[DT_MAX_PATH]; + struct dt_find_hint dti_dfh; + struct lu_attr dti_attr; + struct lu_fid dti_fid; + struct dt_object_format dti_dof; + struct lustre_mdt_attrs dti_lma; + struct lu_buf dti_lb; + loff_t dti_off; +}; + +extern struct lu_context_key dt_key; + +static inline struct dt_thread_info *dt_info(const struct lu_env *env) +{ + struct dt_thread_info *dti; + + dti = lu_context_key_get(&env->le_ctx, &dt_key); + LASSERT(dti); + return dti; +} + +int dt_global_init(void); +void dt_global_fini(void); + +# ifdef LPROCFS +int lprocfs_dt_rd_blksize(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off, + int count, int 
*eof, void *data); +int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off, + int count, int *eof, void *data); +int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off, + int count, int *eof, void *data); +# endif /* LPROCFS */ + +#endif /* __LUSTRE_DT_OBJECT_H */ diff --git a/drivers/staging/lustre/lustre/include/interval_tree.h b/drivers/staging/lustre/lustre/include/interval_tree.h new file mode 100644 index 000000000000..dfdb8aa4e035 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/interval_tree.h @@ -0,0 +1,124 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/include/interval_tree.h + * + * Author: Huang Wei <huangwei@clusterfs.com> + * Author: Jay Xiong <jinshan.xiong@sun.com> + */ + +#ifndef _INTERVAL_H__ +#define _INTERVAL_H__ + +#include <linux/libcfs/libcfs.h> /* LASSERT. */ + +struct interval_node { + struct interval_node *in_left; + struct interval_node *in_right; + struct interval_node *in_parent; + unsigned in_color:1, + in_intree:1, /** set if the node is in tree */ + in_res1:30; + __u8 in_res2[4]; /** tags, 8-bytes aligned */ + __u64 in_max_high; + struct interval_node_extent { + __u64 start; + __u64 end; + } in_extent; +}; + +enum interval_iter { + INTERVAL_ITER_CONT = 1, + INTERVAL_ITER_STOP = 2 +}; + +static inline int interval_is_intree(struct interval_node *node) +{ + return node->in_intree == 1; +} + +static inline __u64 interval_low(struct interval_node *node) +{ + return node->in_extent.start; +} + +static inline __u64 interval_high(struct interval_node *node) +{ + return node->in_extent.end; +} + +static inline void interval_set(struct interval_node *node, + __u64 start, __u64 end) +{ + LASSERT(start <= end); + node->in_extent.start = start; + node->in_extent.end = end; + node->in_max_high = end; +} + +/* Rules to write an interval callback. + * - the callback returns INTERVAL_ITER_STOP when it thinks the iteration + * should be stopped. It will then cause the iteration function to return + * immediately with return value INTERVAL_ITER_STOP. + * - callbacks for interval_iterate and interval_iterate_reverse: Every + * nodes in the tree will be set to @node before the callback being called + * - callback for interval_search: Only overlapped node will be set to @node + * before the callback being called. 
+ */ +typedef enum interval_iter (*interval_callback_t)(struct interval_node *node, + void *args); + +struct interval_node *interval_insert(struct interval_node *node, + struct interval_node **root); +void interval_erase(struct interval_node *node, struct interval_node **root); + +/* Search the extents in the tree and call @func for each overlapped + * extents. */ +enum interval_iter interval_search(struct interval_node *root, + struct interval_node_extent *ex, + interval_callback_t func, void *data); + +/* Iterate every node in the tree - by reverse order or regular order. */ +enum interval_iter interval_iterate(struct interval_node *root, + interval_callback_t func, void *data); +enum interval_iter interval_iterate_reverse(struct interval_node *root, + interval_callback_t func,void *data); + +void interval_expand(struct interval_node *root, + struct interval_node_extent *ext, + struct interval_node_extent *limiter); +int interval_is_overlapped(struct interval_node *root, + struct interval_node_extent *ex); +struct interval_node *interval_find(struct interval_node *root, + struct interval_node_extent *ex); +#endif diff --git a/drivers/staging/lustre/lustre/include/ioctl.h b/drivers/staging/lustre/lustre/include/ioctl.h new file mode 100644 index 000000000000..227c261b2ae9 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/ioctl.h @@ -0,0 +1,106 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _IOWR + +/* On i386 and x86_64, _ASM_I386_IOCTL_H is defined by the kernel's ioctl.h, + * and on newer kernels this header is shared as _ASM_GENERIC_IOCTL_H. + * + * We can avoid any problems with the kernel header being included again by + * defining _ASM_I386_IOCTL_H here so that a later occurrence of <asm/ioctl.h> + * does not include the kernel's ioctl.h after this one. b=14746 */ +#define _ASM_I386_IOCTL_H +#define _ASM_GENERIC_IOCTL_H + +/* ioctl command encoding: 32 bits total, command in lower 16 bits, + * size of the parameter structure in the lower 14 bits of the + * upper 16 bits. + * The size of the parameter structure is thus encoded in the ioctl request. + * The highest 2 bits are reserved for indicating the ``access mode''. + * NOTE: This limits the max parameter size to 16kB -1 ! + */ + +/* + * The following is for compatibility across the various Linux + * platforms. The i386 ioctl numbering scheme doesn't really enforce + * a type field. De facto, however, the top 8 bits of the lower 16 + * bits are indeed used as a type field, so we might just as well make + * this explicit here. Please be sure to use the decoding macros + * below from now on.
+ */ +#define _IOC_NRBITS 8 +#define _IOC_TYPEBITS 8 +#define _IOC_SIZEBITS 14 +#define _IOC_DIRBITS 2 + +#define _IOC_NRMASK ((1 << _IOC_NRBITS)-1) +#define _IOC_TYPEMASK ((1 << _IOC_TYPEBITS)-1) +#define _IOC_SIZEMASK ((1 << _IOC_SIZEBITS)-1) +#define _IOC_DIRMASK ((1 << _IOC_DIRBITS)-1) + +#define _IOC_NRSHIFT 0 +#define _IOC_TYPESHIFT (_IOC_NRSHIFT+_IOC_NRBITS) +#define _IOC_SIZESHIFT (_IOC_TYPESHIFT+_IOC_TYPEBITS) +#define _IOC_DIRSHIFT (_IOC_SIZESHIFT+_IOC_SIZEBITS) + +/* + * Direction bits. + */ +#define _IOC_NONE 0U +#define _IOC_WRITE 1U +#define _IOC_READ 2U + +#define _IOC(dir,type,nr,size) (((dir) << _IOC_DIRSHIFT) | ((type) << _IOC_TYPESHIFT) | ((nr) << _IOC_NRSHIFT) | ((size) << _IOC_SIZESHIFT)) + +/* used to create numbers */ +#define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0) +#define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),sizeof(size)) +#define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),sizeof(size)) +#define _IOWR(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),sizeof(size)) + +/* used to decode ioctl numbers.. */ +#define _IOC_DIR(nr) (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK) +#define _IOC_TYPE(nr) (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK) +#define _IOC_NR(nr) (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK) +#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK) + +/* ...and for the drivers/sound files... */ + +#define IOC_IN (_IOC_WRITE << _IOC_DIRSHIFT) +#define IOC_OUT (_IOC_READ << _IOC_DIRSHIFT) +#define IOC_INOUT ((_IOC_WRITE|_IOC_READ) << _IOC_DIRSHIFT) +#define IOCSIZE_MASK (_IOC_SIZEMASK << _IOC_SIZESHIFT) +#define IOCSIZE_SHIFT (_IOC_SIZESHIFT) + +#endif /* _IOWR */ diff --git a/drivers/staging/lustre/lustre/include/lclient.h b/drivers/staging/lustre/lustre/include/lclient.h new file mode 100644 index 000000000000..9d4011f2908b --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lclient.h @@ -0,0 +1,437 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Definitions shared between vvp and liblustre, and other clients in the + * future. + * + * Author: Oleg Drokin <oleg.drokin@sun.com> + * Author: Nikita Danilov <nikita.danilov@sun.com> + */ + +#ifndef LCLIENT_H +#define LCLIENT_H + +blkcnt_t dirty_cnt(struct inode *inode); + +int cl_glimpse_size0(struct inode *inode, int agl); +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob, int agl); + +static inline int cl_glimpse_size(struct inode *inode) +{ + return cl_glimpse_size0(inode, 0); +} + +static inline int cl_agl(struct inode *inode) +{ + return cl_glimpse_size0(inode, 1); +} + +/** + * Locking policy for setattr. 
+ */ +enum ccc_setattr_lock_type { + /** Locking is done by server */ + SETATTR_NOLOCK, + /** Extent lock is enqueued */ + SETATTR_EXTENT_LOCK, + /** Existing local extent lock is used */ + SETATTR_MATCH_LOCK +}; + + +/** + * IO state private to vvp or slp layers. + */ +struct ccc_io { + /** super class */ + struct cl_io_slice cui_cl; + struct cl_io_lock_link cui_link; + /** + * I/O vector information to or from which read/write is going. + */ + struct iovec *cui_iov; + unsigned long cui_nrsegs; + /** + * Total iov count for left IO. + */ + unsigned long cui_tot_nrsegs; + /** + * Old length for iov that was truncated partially. + */ + size_t cui_iov_olen; + /** + * Total size for the left IO. + */ + size_t cui_tot_count; + + union { + struct { + enum ccc_setattr_lock_type cui_local_lock; + } setattr; + } u; + /** + * True iff io is processing glimpse right now. + */ + int cui_glimpse; + /** + * Layout version when this IO is initialized + */ + __u32 cui_layout_gen; + /** + * File descriptor against which IO is done. + */ + struct ll_file_data *cui_fd; + struct kiocb *cui_iocb; +}; + +/** + * True, if \a io is a normal io, False for other (sendfile, splice*). + * must be implemented in arch specific code. 
+ */ +int cl_is_normalio(const struct lu_env *env, const struct cl_io *io); + +extern struct lu_context_key ccc_key; +extern struct lu_context_key ccc_session_key; + +struct ccc_thread_info { + struct cl_lock_descr cti_descr; + struct cl_io cti_io; + struct cl_attr cti_attr; +}; + +static inline struct ccc_thread_info *ccc_env_info(const struct lu_env *env) +{ + struct ccc_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &ccc_key); + LASSERT(info != NULL); + return info; +} + +static inline struct cl_attr *ccc_env_thread_attr(const struct lu_env *env) +{ + struct cl_attr *attr = &ccc_env_info(env)->cti_attr; + memset(attr, 0, sizeof(*attr)); + return attr; +} + +static inline struct cl_io *ccc_env_thread_io(const struct lu_env *env) +{ + struct cl_io *io = &ccc_env_info(env)->cti_io; + memset(io, 0, sizeof(*io)); + return io; +} + +struct ccc_session { + struct ccc_io cs_ios; +}; + +static inline struct ccc_session *ccc_env_session(const struct lu_env *env) +{ + struct ccc_session *ses; + + ses = lu_context_key_get(env->le_ses, &ccc_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct ccc_io *ccc_env_io(const struct lu_env *env) +{ + return &ccc_env_session(env)->cs_ios; +} + +/** + * ccc-private object state. + */ +struct ccc_object { + struct cl_object_header cob_header; + struct cl_object cob_cl; + struct inode *cob_inode; + + /** + * A list of dirty pages pending IO in the cache. Used by + * SOM. Protected by ll_inode_info::lli_lock. + * + * \see ccc_page::cpg_pending_linkage + */ + struct list_head cob_pending_list; + + /** + * Access this counter is protected by inode->i_sem. Now that + * the lifetime of transient pages must be covered by inode sem, + * we don't need to hold any lock.. + */ + int cob_transient_pages; + /** + * Number of outstanding mmaps on this file. + * + * \see ll_vm_open(), ll_vm_close(). 
+ */ + atomic_t cob_mmap_cnt; + + /** + * various flags + * cob_discard_page_warned + * if pages belonging to this object are discarded when a client + * is evicted, some debug info will be printed, this flag will be set + * during processing the first discarded page, then avoid flooding + * debug message for lots of discarded pages. + * + * \see ll_dirty_page_discard_warn. + */ + unsigned int cob_discard_page_warned:1; +}; + +/** + * ccc-private page state. + */ +struct ccc_page { + struct cl_page_slice cpg_cl; + int cpg_defer_uptodate; + int cpg_ra_used; + int cpg_write_queued; + /** + * Non-empty iff this page is already counted in + * ccc_object::cob_pending_list. Protected by + * ccc_object::cob_pending_guard. This list is only used as a flag, + * that is, never iterated through, only checked for list_empty(), but + * having a list is useful for debugging. + */ + struct list_head cpg_pending_linkage; + /** VM page */ + struct page *cpg_page; +}; + +static inline struct ccc_page *cl2ccc_page(const struct cl_page_slice *slice) +{ + return container_of(slice, struct ccc_page, cpg_cl); +} + +struct cl_page *ccc_vmpage_page_transient(struct page *vmpage); + +struct ccc_device { + struct cl_device cdv_cl; + struct super_block *cdv_sb; + struct cl_device *cdv_next; +}; + +struct ccc_lock { + struct cl_lock_slice clk_cl; +}; + +struct ccc_req { + struct cl_req_slice crq_cl; +}; + +void *ccc_key_init (const struct lu_context *ctx, + struct lu_context_key *key); +void ccc_key_fini (const struct lu_context *ctx, + struct lu_context_key *key, void *data); +void *ccc_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key); +void ccc_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + +int ccc_device_init (const struct lu_env *env, + struct lu_device *d, + const char *name, struct lu_device *next); +struct lu_device *ccc_device_fini (const struct lu_env *env, + struct lu_device *d); +struct lu_device 
*ccc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg, + const struct lu_device_operations *luops, + const struct cl_device_operations *clops); +struct lu_device *ccc_device_free (const struct lu_env *env, + struct lu_device *d); +struct lu_object *ccc_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev, + const struct cl_object_operations *clops, + const struct lu_object_operations *luops); + +int ccc_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req); +void ccc_umount(const struct lu_env *env, struct cl_device *dev); +int ccc_global_init(struct lu_device_type *device_type); +void ccc_global_fini(struct lu_device_type *device_type); +int ccc_object_init0(const struct lu_env *env,struct ccc_object *vob, + const struct cl_object_conf *conf); +int ccc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +void ccc_object_free(const struct lu_env *env, struct lu_object *obj); +int ccc_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io, + const struct cl_lock_operations *lkops); +int ccc_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int ccc_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb); +int ccc_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); +struct page *ccc_page_vmpage(const struct lu_env *env, + const struct cl_page_slice *slice); +int ccc_page_is_under_lock(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io); +int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice); +void ccc_transient_page_verify(const struct cl_page *page); +int ccc_transient_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io, int nonblock); 
+void ccc_transient_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +int ccc_transient_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice); +void ccc_lock_fini(const struct lu_env *env,struct cl_lock_slice *slice); +int ccc_lock_enqueue(const struct lu_env *env,const struct cl_lock_slice *slice, + struct cl_io *io, __u32 enqflags); +int ccc_lock_unuse(const struct lu_env *env,const struct cl_lock_slice *slice); +int ccc_lock_wait(const struct lu_env *env,const struct cl_lock_slice *slice); +int ccc_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io); +void ccc_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state); + +void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios); +int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + pgoff_t start, pgoff_t end); +int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + loff_t start, loff_t end); +void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios); +void ccc_io_advance(const struct lu_env *env, const struct cl_io_slice *ios, + size_t nob); +void ccc_io_update_iov(const struct lu_env *env, struct ccc_io *cio, + struct cl_io *io); +int ccc_prep_size(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, loff_t 
start, size_t count, int *exceed); +void ccc_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret); +void ccc_req_attr_set(const struct lu_env *env,const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *oa, obd_valid flags); + +struct lu_device *ccc2lu_dev (struct ccc_device *vdv); +struct lu_object *ccc2lu (struct ccc_object *vob); +struct ccc_device *lu2ccc_dev (const struct lu_device *d); +struct ccc_device *cl2ccc_dev (const struct cl_device *d); +struct ccc_object *lu2ccc (const struct lu_object *obj); +struct ccc_object *cl2ccc (const struct cl_object *obj); +struct ccc_lock *cl2ccc_lock (const struct cl_lock_slice *slice); +struct ccc_io *cl2ccc_io (const struct lu_env *env, + const struct cl_io_slice *slice); +struct ccc_req *cl2ccc_req (const struct cl_req_slice *slice); +struct page *cl2vm_page (const struct cl_page_slice *slice); +struct inode *ccc_object_inode(const struct cl_object *obj); +struct ccc_object *cl_inode2ccc (struct inode *inode); + +int cl_setattr_ost(struct inode *inode, const struct iattr *attr, + struct obd_capa *capa); + +struct cl_page *ccc_vmpage_page_transient(struct page *vmpage); +int ccc_object_invariant(const struct cl_object *obj); +int cl_file_inode_init(struct inode *inode, struct lustre_md *md); +void cl_inode_fini(struct inode *inode); +int cl_local_size(struct inode *inode); + +__u16 ll_dirent_type_get(struct lu_dirent *ent); +__u64 cl_fid_build_ino(const struct lu_fid *fid, int api32); +__u32 cl_fid_build_gen(const struct lu_fid *fid); + +# define CLOBINVRNT(env, clob, expr) \ + ((void)sizeof(env), (void)sizeof(clob), (void)sizeof !!(expr)) + +int cl_init_ea_size(struct obd_export *md_exp, struct obd_export *dt_exp); +int cl_ocd_update(struct obd_device *host, + struct obd_device *watched, + enum obd_notify_event ev, void *owner, void *data); + +struct ccc_grouplock { + struct lu_env *cg_env; + struct cl_io *cg_io; + struct cl_lock *cg_lock; + unsigned 
long cg_gid; +}; + +int cl_get_grouplock(struct cl_object *obj, unsigned long gid, int nonblock, + struct ccc_grouplock *cg); +void cl_put_grouplock(struct ccc_grouplock *cg); + +/** + * New interfaces to get and put lov_stripe_md from lov layer. This violates + * layering because lov_stripe_md is supposed to be a private data in lov. + * + * NB: If you find you have to use these interfaces for your new code, please + * think about it again. These interfaces may be removed in the future for + * better layering. */ +struct lov_stripe_md *lov_lsm_get(struct cl_object *clobj); +void lov_lsm_put(struct cl_object *clobj, struct lov_stripe_md *lsm); +int lov_read_and_clear_async_rc(struct cl_object *clob); + +struct lov_stripe_md *ccc_inode_lsm_get(struct inode *inode); +void ccc_inode_lsm_put(struct inode *inode, struct lov_stripe_md *lsm); + +/** + * Data structure managing a client's cached clean pages. An LRU of + * pages is maintained, along with other statistics. + */ +struct cl_client_cache { + atomic_t ccc_users; /* # of users (OSCs) of this data */ + struct list_head ccc_lru; /* LRU list of cached clean pages */ + spinlock_t ccc_lru_lock; /* lock for list */ + atomic_t ccc_lru_left; /* # of LRU entries available */ + unsigned long ccc_lru_max; /* Max # of LRU entries possible */ + unsigned int ccc_lru_shrinkers; /* # of threads reclaiming */ +}; + +#endif /*LCLIENT_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h b/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h new file mode 100644 index 000000000000..586692272d78 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lprocfs_status.h @@ -0,0 +1,58 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/linux/lprocfs_status.h + * + * Top level header file for LProc SNMP + * + * Author: Hariharan Thantry thantry@users.sourceforge.net + */ +#ifndef _LINUX_LPROCFS_SNMP_H +#define _LINUX_LPROCFS_SNMP_H + +#ifndef _LPROCFS_SNMP_H +#error Do not #include this file directly. #include <lprocfs_status.h> instead +#endif + +#include <linux/proc_fs.h> +#include <linux/seq_file.h> +#include <linux/version.h> +#include <linux/smp.h> +#include <linux/rwsem.h> +#include <linux/libcfs/libcfs.h> +#include <linux/statfs.h> + + +#endif /* LPROCFS_SNMP_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_acl.h b/drivers/staging/lustre/lustre/include/linux/lustre_acl.h new file mode 100644 index 000000000000..ff4fc4ff2894 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_acl.h @@ -0,0 +1,66 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/include/lustre_acl.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_LINUX_ACL_H +#define _LUSTRE_LINUX_ACL_H + +#ifndef _LUSTRE_ACL_H +#error Shoud not include direectly. 
use #include <lustre_acl.h> instead +#endif + +# include <linux/fs.h> +# include <linux/dcache.h> +# ifdef CONFIG_FS_POSIX_ACL +# include <linux/posix_acl_xattr.h> +# define LUSTRE_POSIX_ACL_MAX_ENTRIES 32 +# define LUSTRE_POSIX_ACL_MAX_SIZE \ + (sizeof(posix_acl_xattr_header) + \ + LUSTRE_POSIX_ACL_MAX_ENTRIES * sizeof(posix_acl_xattr_entry)) +# endif /* CONFIG_FS_POSIX_ACL */ +# include <linux/lustre_intent.h> +# include <linux/xattr.h> /* XATTR_{REPLACE,CREATE} */ + +#ifndef LUSTRE_POSIX_ACL_MAX_SIZE +# define LUSTRE_POSIX_ACL_MAX_SIZE 0 +#endif + +#endif /* _LUSTRE_LINUX_ACL_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_common.h b/drivers/staging/lustre/lustre/include/linux/lustre_common.h new file mode 100644 index 000000000000..d1783a33d8ca --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_common.h @@ -0,0 +1,22 @@ +#ifndef LUSTRE_COMMON_H +#define LUSTRE_COMMON_H + +#include <linux/sched.h> + +static inline int cfs_cleanup_group_info(void) +{ + struct group_info *ginfo; + + ginfo = groups_alloc(0); + if (!ginfo) + return -ENOMEM; + + set_current_groups(ginfo); + put_group_info(ginfo); + + return 0; +} + +#define ll_inode_blksize(a) (1<<(a)->i_blkbits) + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h new file mode 100644 index 000000000000..dff04688945b --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_compat25.h @@ -0,0 +1,349 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _LINUX_COMPAT25_H +#define _LINUX_COMPAT25_H + +#include <linux/fs_struct.h> +#include <linux/namei.h> +#include <linux/libcfs/linux/portals_compat25.h> + +#include <linux/lustre_patchless_compat.h> + +# define LOCK_FS_STRUCT(fs) spin_lock(&(fs)->lock) +# define UNLOCK_FS_STRUCT(fs) spin_unlock(&(fs)->lock) + +/* + * Make (mnt, dentry) the working directory recorded in \a fs. + * A reference on the new path is taken and installed under fs->lock; + * the reference on the previous pwd is dropped after the lock is released. + */ +static inline void ll_set_fs_pwd(struct fs_struct *fs, struct vfsmount *mnt, + struct dentry *dentry) +{ + struct path path; + struct path old_pwd; + + path.mnt = mnt; + path.dentry = dentry; + LOCK_FS_STRUCT(fs); + old_pwd = fs->pwd; + path_get(&path); + fs->pwd = path; + UNLOCK_FS_STRUCT(fs); + + if (old_pwd.dentry) + path_put(&old_pwd); +} + + +/* + * set ATTR_BLOCKS to a high value to avoid any risk of collision with other + * ATTR_* attributes (see bug 13828) + */ +#define ATTR_BLOCKS (1 << 27) + +#define current_ngroups current_cred()->group_info->ngroups +#define current_groups current_cred()->group_info->small_block + +/* + * OBD need working random driver, thus all our + * initialization routines must be called after device + * driver initialization + */ +#ifndef MODULE +#undef module_init +#define module_init(a) late_initcall(a) +#endif + + +#define LTIME_S(time) (time.tv_sec) + +#define ll_permission(inode,mask,nd) inode_permission(inode,mask) + +# define ll_generic_permission(inode, mask, flags, check_acl) \ + generic_permission(inode, mask) + +#define ll_blkdev_put(a, b) blkdev_put(a, b) + +#define ll_dentry_open(a,b,c) dentry_open(a,b,c) + +#define ll_vfs_symlink(dir, dentry, mnt, path, mode) \ + vfs_symlink(dir, dentry, path) + + +/* + * Expands to a plain call expression: no trailing ';' so that it can be used + * in a return statement, a condition or as a function argument. + */ +#define ll_generic_file_llseek_size(file, offset, origin, maxbytes, eof) \ + generic_file_llseek_size(file, offset, origin, maxbytes, eof) + +/* inode_dio_wait(i) use as-is for write lock */ +# define inode_dio_write_done(i) do {} while (0) /* for write unlock */ +# define inode_dio_read(i) atomic_inc(&(i)->i_dio_count) +/* inode_dio_done(i) use as-is for read unlock */ + +#define 
TREE_READ_LOCK_IRQ(mapping) spin_lock_irq(&(mapping)->tree_lock) +#define TREE_READ_UNLOCK_IRQ(mapping) spin_unlock_irq(&(mapping)->tree_lock) + +static inline +int ll_unregister_blkdev(unsigned int dev, const char *name) +{ + unregister_blkdev(dev, name); + return 0; +} + +#define ll_invalidate_bdev(a,b) invalidate_bdev((a)) + +#ifndef FS_HAS_FIEMAP +#define FS_HAS_FIEMAP (0) +#endif + + + +/* add a lustre compatible layer for crypto API */ +#include <linux/crypto.h> +#define ll_crypto_hash crypto_hash +#define ll_crypto_cipher crypto_blkcipher +#define ll_crypto_alloc_hash(name, type, mask) crypto_alloc_hash(name, type, mask) +#define ll_crypto_hash_setkey(tfm, key, keylen) crypto_hash_setkey(tfm, key, keylen) +#define ll_crypto_hash_init(desc) crypto_hash_init(desc) +#define ll_crypto_hash_update(desc, sl, bytes) crypto_hash_update(desc, sl, bytes) +#define ll_crypto_hash_final(desc, out) crypto_hash_final(desc, out) +#define ll_crypto_blkcipher_setkey(tfm, key, keylen) \ + crypto_blkcipher_setkey(tfm, key, keylen) +#define ll_crypto_blkcipher_set_iv(tfm, src, len) \ + crypto_blkcipher_set_iv(tfm, src, len) +#define ll_crypto_blkcipher_get_iv(tfm, dst, len) \ + crypto_blkcipher_get_iv(tfm, dst, len) +#define ll_crypto_blkcipher_encrypt(desc, dst, src, bytes) \ + crypto_blkcipher_encrypt(desc, dst, src, bytes) +#define ll_crypto_blkcipher_decrypt(desc, dst, src, bytes) \ + crypto_blkcipher_decrypt(desc, dst, src, bytes) +#define ll_crypto_blkcipher_encrypt_iv(desc, dst, src, bytes) \ + crypto_blkcipher_encrypt_iv(desc, dst, src, bytes) +#define ll_crypto_blkcipher_decrypt_iv(desc, dst, src, bytes) \ + crypto_blkcipher_decrypt_iv(desc, dst, src, bytes) + +static inline +struct ll_crypto_cipher *ll_crypto_alloc_blkcipher(const char *name, + u32 type, u32 mask) +{ + struct ll_crypto_cipher *rtn = crypto_alloc_blkcipher(name, type, mask); + + return (rtn == NULL ? 
ERR_PTR(-ENOMEM) : rtn); +} + +static inline int ll_crypto_hmac(struct ll_crypto_hash *tfm, + u8 *key, unsigned int *keylen, + struct scatterlist *sg, + unsigned int size, u8 *result) +{ + struct hash_desc desc; + int rv; + desc.tfm = tfm; + desc.flags = 0; + rv = crypto_hash_setkey(desc.tfm, key, *keylen); + if (rv) { + CERROR("failed to hash setkey: %d\n", rv); + return rv; + } + return crypto_hash_digest(&desc, sg, size, result); +} +static inline +unsigned int ll_crypto_tfm_alg_max_keysize(struct crypto_blkcipher *tfm) +{ + return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.max_keysize; +} +static inline +unsigned int ll_crypto_tfm_alg_min_keysize(struct crypto_blkcipher *tfm) +{ + return crypto_blkcipher_tfm(tfm)->__crt_alg->cra_blkcipher.min_keysize; +} + +#define ll_crypto_hash_blocksize(tfm) crypto_hash_blocksize(tfm) +#define ll_crypto_hash_digestsize(tfm) crypto_hash_digestsize(tfm) +#define ll_crypto_blkcipher_ivsize(tfm) crypto_blkcipher_ivsize(tfm) +#define ll_crypto_blkcipher_blocksize(tfm) crypto_blkcipher_blocksize(tfm) +#define ll_crypto_free_hash(tfm) crypto_free_hash(tfm) +#define ll_crypto_free_blkcipher(tfm) crypto_free_blkcipher(tfm) + +#define ll_vfs_rmdir(dir,entry,mnt) vfs_rmdir(dir,entry) +#define ll_vfs_mkdir(inode,dir,mnt,mode) vfs_mkdir(inode,dir,mode) +#define ll_vfs_link(old,mnt,dir,new,mnt1) vfs_link(old,dir,new) +#define ll_vfs_unlink(inode,entry,mnt) vfs_unlink(inode,entry) +#define ll_vfs_mknod(dir,entry,mnt,mode,dev) vfs_mknod(dir,entry,mode,dev) +#define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry) +#define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \ + vfs_rename(old,old_dir,new,new_dir) + +#ifdef for_each_possible_cpu +#define cfs_for_each_possible_cpu(cpu) for_each_possible_cpu(cpu) +#elif defined(for_each_cpu) +#define cfs_for_each_possible_cpu(cpu) for_each_cpu(cpu) +#endif + +#define cfs_bio_io_error(a,b) bio_io_error((a)) +#define cfs_bio_endio(a,b,c) bio_endio((a),(c)) + +#define 
cfs_fs_pwd(fs) ((fs)->pwd.dentry) +#define cfs_fs_mnt(fs) ((fs)->pwd.mnt) +#define cfs_path_put(nd) path_put(&(nd)->path) + + +#ifndef SLAB_DESTROY_BY_RCU +#define SLAB_DESTROY_BY_RCU 0 +#endif + + + +static inline int +ll_quota_on(struct super_block *sb, int off, int ver, char *name, int remount) +{ + int rc; + + if (sb->s_qcop->quota_on) { + struct path path; + + rc = kern_path(name, LOOKUP_FOLLOW, &path); + if (!rc) + return rc; + rc = sb->s_qcop->quota_on(sb, off, ver + , &path + ); + path_put(&path); + return rc; + } + else + return -ENOSYS; +} + +static inline int ll_quota_off(struct super_block *sb, int off, int remount) +{ + if (sb->s_qcop->quota_off) { + return sb->s_qcop->quota_off(sb, off + ); + } + else + return -ENOSYS; +} + + +# define ll_vfs_dq_init dquot_initialize +# define ll_vfs_dq_drop dquot_drop +# define ll_vfs_dq_transfer dquot_transfer +# define ll_vfs_dq_off(sb, remount) dquot_suspend(sb, -1) + + + + + +#define queue_max_phys_segments(rq) queue_max_segments(rq) +#define queue_max_hw_segments(rq) queue_max_segments(rq) + +#define ll_kmap_atomic(a, b) kmap_atomic(a) +#define ll_kunmap_atomic(a, b) kunmap_atomic(a) + + +#define ll_d_hlist_node hlist_node +#define ll_d_hlist_empty(list) hlist_empty(list) +#define ll_d_hlist_entry(ptr, type, name) hlist_entry(ptr.first, type, name) +#define ll_d_hlist_for_each(tmp, i_dentry) hlist_for_each(tmp, i_dentry) +#define ll_d_hlist_for_each_entry(dentry, p, i_dentry, alias) \ + p = NULL; hlist_for_each_entry(dentry, i_dentry, alias) + + +#define bio_hw_segments(q, bio) 0 + + +#define ll_pagevec_init(pv, cold) do {} while (0) +#define ll_pagevec_add(pv, pg) (0) +#define ll_pagevec_lru_add_file(pv) do {} while (0) + + +#ifndef QUOTA_OK +# define QUOTA_OK 0 +#endif +#ifndef NO_QUOTA +# define NO_QUOTA (-EDQUOT) +#endif + +#ifndef SEEK_DATA +#define SEEK_DATA 3 /* seek to the next data */ +#endif +#ifndef SEEK_HOLE +#define SEEK_HOLE 4 /* seek to the next hole */ +#endif + +#ifndef FMODE_UNSIGNED_OFFSET 
+#define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) +#endif + +#if !defined(_ASM_GENERIC_BITOPS_EXT2_NON_ATOMIC_H_) && !defined(ext2_set_bit) +# define ext2_set_bit __test_and_set_bit_le +# define ext2_clear_bit __test_and_clear_bit_le +# define ext2_test_bit test_bit_le +# define ext2_find_first_zero_bit find_first_zero_bit_le +# define ext2_find_next_zero_bit find_next_zero_bit_le +#endif + +#ifdef ATTR_TIMES_SET +# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) +#else +# define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET) +#endif + + + +/* + * After 3.1, kernel's nameidata.intent.open.flags is different + * with lustre's lookup_intent.it_flags, as lustre's it_flags' + * lower bits equal to FMODE_xxx while kernel doesn't transliterate + * lower bits of nameidata.intent.open.flags to FMODE_xxx. + * */ +#include <linux/version.h> +static inline int ll_namei_to_lookup_intent_flag(int flag) +{ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 1, 0) + flag = (flag & ~O_ACCMODE) | OPEN_FMODE(flag); +#endif + return flag; +} + +# define ll_mrf_ret void +# define LL_MRF_RETURN(rc) + +#include <linux/fs.h> + +# define ll_umode_t umode_t + +#include <linux/dcache.h> + +# define ll_dirty_inode(inode, flag) (inode)->i_sb->s_op->dirty_inode((inode), flag) + +#endif /* _COMPAT25_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_debug.h b/drivers/staging/lustre/lustre/include/linux/lustre_debug.h new file mode 100644 index 000000000000..11deac7248ae --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_debug.h @@ -0,0 +1,47 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LINUX_LUSTRE_DEBUG_H +#define _LINUX_LUSTRE_DEBUG_H + +#ifndef _LUSTRE_DEBUG_H +#error Do not #include this file directly. #include <lprocfs_status.h> instead +#endif + +#define LL_CDEBUG_PAGE(mask, page, fmt, arg...) \ + CDEBUG(mask, "page %p map %p index %lu flags %lx count %u priv %0lx: "\ + fmt, page, page->mapping, page->index, (long)page->flags, \ + page_count(page), page_private(page), ## arg) + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h b/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h new file mode 100644 index 000000000000..207df03f6149 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_dlm.h @@ -0,0 +1,46 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LINUX_LUSTRE_DLM_H__ +#define _LINUX_LUSTRE_DLM_H__ + +#ifndef _LUSTRE_DLM_H__ +#error Do not #include this file directly. #include <lprocfs_status.h> instead +#endif + +# include <linux/proc_fs.h> +# include <asm/processor.h> +# include <linux/bit_spinlock.h> + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h b/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h new file mode 100644 index 000000000000..6c7260957383 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_fsfilt.h @@ -0,0 +1,181 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/linux/lustre_fsfilt.h + * + * Filesystem interface helper. + */ + +#ifndef _LINUX_LUSTRE_FSFILT_H +#define _LINUX_LUSTRE_FSFILT_H + +#ifndef _LUSTRE_FSFILT_H +#error Do not #include this file directly. 
#include <lustre_fsfilt.h> instead +#endif + + +#include <obd.h> +#include <obd_class.h> + +typedef void (*fsfilt_cb_t)(struct obd_device *obd, __u64 last_rcvd, + void *data, int error); + +struct fsfilt_operations { + struct list_head fs_list; + module_t *fs_owner; + char *fs_type; + char *(* fs_getlabel)(struct super_block *sb); + void *(* fs_start)(struct inode *inode, int op, void *desc_private, + int logs); + int (* fs_commit)(struct inode *inode, void *handle,int force_sync); + int (* fs_map_inode_pages)(struct inode *inode, struct page **page, + int pages, unsigned long *blocks, + int create, struct mutex *sem); + int (* fs_write_record)(struct file *, void *, int size, loff_t *, + int force_sync); + int (* fs_read_record)(struct file *, void *, int size, loff_t *); + int (* fs_setup)(struct super_block *sb); +}; + +extern int fsfilt_register_ops(struct fsfilt_operations *fs_ops); +extern void fsfilt_unregister_ops(struct fsfilt_operations *fs_ops); +extern struct fsfilt_operations *fsfilt_get_ops(const char *type); +extern void fsfilt_put_ops(struct fsfilt_operations *fs_ops); + +static inline char *fsfilt_get_label(struct obd_device *obd, + struct super_block *sb) +{ + if (obd->obd_fsops->fs_getlabel == NULL) + return NULL; + if (obd->obd_fsops->fs_getlabel(sb)[0] == '\0') + return NULL; + + return obd->obd_fsops->fs_getlabel(sb); +} + +#define FSFILT_OP_UNLINK 1 +#define FSFILT_OP_CANCEL_UNLINK 10 + +#define __fsfilt_check_slow(obd, start, msg) \ +do { \ + if (cfs_time_before(jiffies, start + 15 * HZ)) \ + break; \ + else if (cfs_time_before(jiffies, start + 30 * HZ)) \ + CDEBUG(D_VFSTRACE, "%s: slow %s %lus\n", obd->obd_name, \ + msg, (jiffies-start) / HZ); \ + else if (cfs_time_before(jiffies, start + DISK_TIMEOUT * HZ)) \ + CWARN("%s: slow %s %lus\n", obd->obd_name, msg, \ + (jiffies - start) / HZ); \ + else \ + CERROR("%s: slow %s %lus\n", obd->obd_name, msg, \ + (jiffies - start) / HZ); \ +} while (0) + +#define fsfilt_check_slow(obd, start, msg) \ 
+do { \ + __fsfilt_check_slow(obd, start, msg); \ + start = jiffies; \ +} while (0) + +static inline void *fsfilt_start_log(struct obd_device *obd, + struct inode *inode, int op, + struct obd_trans_info *oti, int logs) +{ + unsigned long now = jiffies; + void *parent_handle = oti ? oti->oti_handle : NULL; + void *handle; + + handle = obd->obd_fsops->fs_start(inode, op, parent_handle, logs); + CDEBUG(D_INFO, "started handle %p (%p)\n", handle, parent_handle); + + if (oti != NULL) { + if (parent_handle == NULL) { + oti->oti_handle = handle; + } else if (handle != parent_handle) { + CERROR("mismatch: parent %p, handle %p, oti %p\n", + parent_handle, handle, oti); + LBUG(); + } + } + fsfilt_check_slow(obd, now, "journal start"); + return handle; +} + +static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode, + void *handle, int force_sync) +{ + unsigned long now = jiffies; + int rc = obd->obd_fsops->fs_commit(inode, handle, force_sync); + CDEBUG(D_INFO, "committing handle %p\n", handle); + + fsfilt_check_slow(obd, now, "journal start"); + + return rc; +} + +static inline int fsfilt_map_inode_pages(struct obd_device *obd, + struct inode *inode, + struct page **page, int pages, + unsigned long *blocks, + int create, struct mutex *mutex) +{ + return obd->obd_fsops->fs_map_inode_pages(inode, page, pages, blocks, + create, mutex); +} + +static inline int fsfilt_read_record(struct obd_device *obd, struct file *file, + void *buf, loff_t size, loff_t *offs) +{ + return obd->obd_fsops->fs_read_record(file, buf, size, offs); +} + +static inline int fsfilt_write_record(struct obd_device *obd, struct file *file, + void *buf, loff_t size, loff_t *offs, + int force_sync) +{ + return obd->obd_fsops->fs_write_record(file, buf, size,offs,force_sync); +} + +static inline int fsfilt_setup(struct obd_device *obd, struct super_block *fs) +{ + if (obd->obd_fsops->fs_setup) + return obd->obd_fsops->fs_setup(fs); + return 0; +} + + + + +#endif diff --git 
a/drivers/staging/lustre/lustre/include/linux/lustre_handles.h b/drivers/staging/lustre/lustre/include/linux/lustre_handles.h new file mode 100644 index 000000000000..ecf184051252 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_handles.h @@ -0,0 +1,53 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_LUSTRE_HANDLES_H_ +#define __LINUX_LUSTRE_HANDLES_H_ + +#ifndef __LUSTRE_HANDLES_H_ +#error Do not #include this file directly. 
#include <lustre_handles.h> instead +#endif + +#include <asm/types.h> +#include <asm/atomic.h> +#include <linux/list.h> +#include <linux/version.h> +#include <linux/spinlock.h> +#include <linux/types.h> + +#include <linux/rcupdate.h> /* for rcu_head{} */ +typedef struct rcu_head cfs_rcu_head_t; + + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_intent.h b/drivers/staging/lustre/lustre/include/linux/lustre_intent.h new file mode 100644 index 000000000000..b10ddfa7df29 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_intent.h @@ -0,0 +1,62 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef LUSTRE_INTENT_H +#define LUSTRE_INTENT_H + +/* intent IT_XXX are defined in lustre/include/obd.h */ +struct lustre_intent_data { + int it_disposition; + int it_status; + __u64 it_lock_handle; + __u64 it_lock_bits; + int it_lock_mode; + int it_remote_lock_mode; + __u64 it_remote_lock_handle; + void *it_data; + unsigned int it_lock_set:1; +}; + +struct lookup_intent { + int it_op; + int it_flags; + int it_create_mode; + union { + struct lustre_intent_data lustre; + } d; +}; + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lib.h b/drivers/staging/lustre/lustre/include/linux/lustre_lib.h new file mode 100644 index 000000000000..b2f755acadf6 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_lib.h @@ -0,0 +1,87 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/linux/lustre_lib.h + * + * Basic Lustre library routines. + */ + +#ifndef _LINUX_LUSTRE_LIB_H +#define _LINUX_LUSTRE_LIB_H + +#ifndef _LUSTRE_LIB_H +#error Do not #include this file directly. #include <lustre_lib.h> instead +#endif + +# include <linux/rwsem.h> +# include <linux/sched.h> +# include <linux/signal.h> +# include <linux/types.h> +# include <linux/lustre_compat25.h> +# include <linux/lustre_common.h> + +#ifndef LP_POISON +#if BITS_PER_LONG > 32 +# define LI_POISON ((int)0x5a5a5a5a5a5a5a5a) +# define LL_POISON ((long)0x5a5a5a5a5a5a5a5a) +# define LP_POISON ((void *)(long)0x5a5a5a5a5a5a5a5a) +#else +# define LI_POISON ((int)0x5a5a5a5a) +# define LL_POISON ((long)0x5a5a5a5a) +# define LP_POISON ((void *)(long)0x5a5a5a5a) +#endif +#endif + +/* This macro is only for compatibility reasons with older Linux Lustre user + * tools. New ioctls should NOT use this macro as the ioctl "size". Instead + * the ioctl should get a "size" argument which is the actual data type used + * by the ioctl, to ensure the ioctl interface is versioned correctly. 
*/ +#define OBD_IOC_DATA_TYPE long + +#define LUSTRE_FATAL_SIGS (sigmask(SIGKILL) | sigmask(SIGINT) | \ + sigmask(SIGTERM) | sigmask(SIGQUIT) | \ + sigmask(SIGALRM)) + +/* initialize ost_lvb according to inode */ +static inline void inode_init_lvb(struct inode *inode, struct ost_lvb *lvb) +{ + lvb->lvb_size = i_size_read(inode); + lvb->lvb_blocks = inode->i_blocks; + lvb->lvb_mtime = LTIME_S(inode->i_mtime); + lvb->lvb_atime = LTIME_S(inode->i_atime); + lvb->lvb_ctime = LTIME_S(inode->i_ctime); +} + +#endif /* _LUSTRE_LIB_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_lite.h b/drivers/staging/lustre/lustre/include/linux/lustre_lite.h new file mode 100644 index 000000000000..c95dff900b58 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_lite.h @@ -0,0 +1,100 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LINUX_LL_H +#define _LINUX_LL_H + +#ifndef _LL_H +#error Do not #include this file directly. #include <lustre_lite.h> instead +#endif + + +#include <linux/version.h> + +#include <asm/statfs.h> + +#include <linux/fs.h> +#include <linux/dcache.h> +#include <linux/proc_fs.h> + +#include <obd_class.h> +#include <lustre_net.h> +#include <lustre_ha.h> + +#include <linux/rbtree.h> +#include <linux/lustre_compat25.h> +#include <linux/lustre_common.h> +#include <linux/pagemap.h> + +/* lprocfs.c */ +enum { + LPROC_LL_DIRTY_HITS = 0, + LPROC_LL_DIRTY_MISSES, + LPROC_LL_READ_BYTES, + LPROC_LL_WRITE_BYTES, + LPROC_LL_BRW_READ, + LPROC_LL_BRW_WRITE, + LPROC_LL_OSC_READ, + LPROC_LL_OSC_WRITE, + LPROC_LL_IOCTL, + LPROC_LL_OPEN, + LPROC_LL_RELEASE, + LPROC_LL_MAP, + LPROC_LL_LLSEEK, + LPROC_LL_FSYNC, + LPROC_LL_READDIR, + LPROC_LL_SETATTR, + LPROC_LL_TRUNC, + LPROC_LL_FLOCK, + LPROC_LL_GETATTR, + LPROC_LL_CREATE, + LPROC_LL_LINK, + LPROC_LL_UNLINK, + LPROC_LL_SYMLINK, + LPROC_LL_MKDIR, + LPROC_LL_RMDIR, + LPROC_LL_MKNOD, + LPROC_LL_RENAME, + LPROC_LL_STAFS, + LPROC_LL_ALLOC_INODE, + LPROC_LL_SETXATTR, + LPROC_LL_GETXATTR, + LPROC_LL_LISTXATTR, + LPROC_LL_REMOVEXATTR, + LPROC_LL_INODE_PERM, + LPROC_LL_FILE_OPCODES +}; + + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_log.h b/drivers/staging/lustre/lustre/include/linux/lustre_log.h new file mode 100644 index 000000000000..e9c8e56737d2 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_log.h @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/linux/lustre_log.h + * + * Generic infrastructure for managing a collection of logs. + * These logs are used for: + * - orphan recovery: OST adds record on create + * - mtime/size consistency: the OST adds a record on first write + * - open/unlinked objects: OST adds a record on destroy + * + * - mds unlink log: the MDS adds an entry upon delete + * + * - raid1 replication log between OST's + * - MDS replication logs + */ + +#ifndef _LINUX_LUSTRE_LOG_H +#define _LINUX_LUSTRE_LOG_H + +#ifndef _LUSTRE_LOG_H +#error Do not #include this file directly. #include <lustre_log.h> instead +#endif + +#define LUSTRE_LOG_SERVER + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_net.h b/drivers/staging/lustre/lustre/include/linux/lustre_net.h new file mode 100644 index 000000000000..2d7c425d7012 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_net.h @@ -0,0 +1,50 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LINUX_LUSTRE_NET_H +#define _LINUX_LUSTRE_NET_H + +#ifndef _LUSTRE_NET_H +#error Do not #include this file directly. #include <lustre_net.h> instead +#endif + +#include <linux/version.h> +#include <linux/workqueue.h> + +/* XXX Liang: should be moved to other header instead of here */ +#ifndef WITH_GROUP_INFO +#define WITH_GROUP_INFO +#endif + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h new file mode 100644 index 000000000000..a8e9c0c8ffd2 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h @@ -0,0 +1,83 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef LUSTRE_PATCHLESS_COMPAT_H +#define LUSTRE_PATCHLESS_COMPAT_H + +#include <linux/fs.h> + +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/hash.h> + + +#define ll_delete_from_page_cache(page) delete_from_page_cache(page) + +static inline void +truncate_complete_page(struct address_space *mapping, struct page *page) +{ + if (page->mapping != mapping) + return; + + if (PagePrivate(page)) + page->mapping->a_ops->invalidatepage(page, 0); + + cancel_dirty_page(page, PAGE_SIZE); + ClearPageMappedToDisk(page); + ll_delete_from_page_cache(page); +} + +#ifdef ATTR_OPEN +# define ATTR_FROM_OPEN ATTR_OPEN +#else +# ifndef ATTR_FROM_OPEN +# define ATTR_FROM_OPEN 0 +# endif +#endif /* ATTR_OPEN */ + +#ifndef ATTR_RAW +#define ATTR_RAW 0 +#endif + +#ifndef ATTR_CTIME_SET +/* + * set ATTR_CTIME_SET to a high value to avoid any risk of collision with other + * ATTR_* attributes (see bug 13828) + */ +#define ATTR_CTIME_SET (1 << 28) +#endif + +#endif /* LUSTRE_PATCHLESS_COMPAT_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_quota.h b/drivers/staging/lustre/lustre/include/linux/lustre_quota.h new file mode 100644 index 000000000000..421866b004cf --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_quota.h @@ -0,0 +1,47 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LINUX_LUSTRE_QUOTA_H +#define _LINUX_LUSTRE_QUOTA_H + +#ifndef _LUSTRE_QUOTA_H +#error Do not #include this file directly. #include <lustre_quota.h> instead +#endif + +#include <linux/version.h> +#include <linux/fs.h> +#include <linux/quota.h> +#include <linux/quotaops.h> + +#endif /* _LUSTRE_QUOTA_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lustre_user.h b/drivers/staging/lustre/lustre/include/linux/lustre_user.h new file mode 100644 index 000000000000..ebaf92977f7f --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lustre_user.h @@ -0,0 +1,67 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/linux/lustre_user.h + * + * Lustre public user-space interface definitions. + */ + +#ifndef _LINUX_LUSTRE_USER_H +#define _LINUX_LUSTRE_USER_H + +# include <linux/version.h> +# include <linux/quota.h> + +/* + * asm-x86_64/processor.h on some SLES 9 distros seems to use + * kernel-only typedefs. fortunately skipping it altogether is ok + * (for now). + */ +#define __ASM_X86_64_PROCESSOR_H + +#include <linux/string.h> + +#if defined(__x86_64__) || defined(__ia64__) || defined(__ppc64__) || \ + defined(__craynv) || defined (__mips64__) || defined(__powerpc64__) +typedef struct stat lstat_t; +#define lstat_f lstat +#define HAVE_LOV_USER_MDS_DATA +#else +typedef struct stat64 lstat_t; +#define lstat_f lstat64 +#define HAVE_LOV_USER_MDS_DATA +#endif + +#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/lvfs.h b/drivers/staging/lustre/lustre/include/linux/lvfs.h new file mode 100644 index 000000000000..eb59ac7d5946 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lvfs.h @@ -0,0 +1,134 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/linux/lvfs.h + * + * lustre VFS/process permission interface + */ + +#ifndef __LINUX_LVFS_H__ +#define __LINUX_LVFS_H__ + +#ifndef __LVFS_H__ +#error Do not #include this file directly. 
#include <lvfs.h> instead +#endif + +#include <linux/lustre_compat25.h> +#include <linux/lustre_common.h> +#include <linux/lvfs_linux.h> + +#define LLOG_LVFS + +/* simple.c */ + +struct lvfs_ucred { + __u32 luc_uid; + __u32 luc_gid; + __u32 luc_fsuid; + __u32 luc_fsgid; + kernel_cap_t luc_cap; + __u32 luc_umask; + struct group_info *luc_ginfo; + struct md_identity *luc_identity; +}; + +struct lvfs_callback_ops { + struct dentry *(*l_fid2dentry)(__u64 id_ino, __u32 gen, __u64 gr, void *data); +}; + +#define OBD_RUN_CTXT_MAGIC 0xC0FFEEAA +#define OBD_CTXT_DEBUG /* development-only debugging */ +struct lvfs_run_ctxt { + struct vfsmount *pwdmnt; + struct dentry *pwd; + mm_segment_t fs; + struct lvfs_ucred luc; + int ngroups; + struct lvfs_callback_ops cb_ops; + struct group_info *group_info; + struct dt_device *dt; +#ifdef OBD_CTXT_DEBUG + __u32 magic; +#endif +}; + +#ifdef OBD_CTXT_DEBUG +#define OBD_SET_CTXT_MAGIC(ctxt) (ctxt)->magic = OBD_RUN_CTXT_MAGIC +#else +#define OBD_SET_CTXT_MAGIC(ctxt) do {} while(0) +#endif + + +int lustre_rename(struct dentry *dir, struct vfsmount *mnt, char *oldname, + char *newname); + +static inline void l_dput(struct dentry *de) +{ + if (!de || IS_ERR(de)) + return; + //shrink_dcache_parent(de); + LASSERT(d_count(de) > 0); + dput(de); +} + +/* We need to hold the inode semaphore over the dcache lookup itself, or we + * run the risk of entering the filesystem lookup path concurrently on SMP + * systems, and instantiating two inodes for the same entry. We still + * protect against concurrent addition/removal races with the DLM locking. 
+ */ +static inline struct dentry *ll_lookup_one_len(const char *fid_name, + struct dentry *dparent, + int fid_namelen) +{ + struct dentry *dchild; + + mutex_lock(&dparent->d_inode->i_mutex); + dchild = lookup_one_len(fid_name, dparent, fid_namelen); + mutex_unlock(&dparent->d_inode->i_mutex); + + if (IS_ERR(dchild) || dchild->d_inode == NULL) + return dchild; + + if (is_bad_inode(dchild->d_inode)) { + CERROR("bad inode returned %lu/%u\n", + dchild->d_inode->i_ino, dchild->d_inode->i_generation); + dput(dchild); + dchild = ERR_PTR(-ENOENT); + } + return dchild; +} + + +#endif diff --git a/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h b/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h new file mode 100644 index 000000000000..140a60f1f0c9 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/lvfs_linux.h @@ -0,0 +1,66 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 
#ifndef __LVFS_LINUX_H__
#define __LVFS_LINUX_H__

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/namei.h>
#include <linux/sched.h>

#include <lvfs.h>

/*
 * Portability aliases: on Linux "struct l_file" is "struct file",
 * "struct l_dentry" is "struct dentry" and l_filp_open() is filp_open().
 */
#define l_file file
#define l_dentry dentry

#define l_filp_open filp_open

struct lvfs_run_ctxt;
/* Open a dentry as a file; defined outside this header. */
struct l_file *l_dentry_open(struct lvfs_run_ctxt *, struct l_dentry *,
			     int flags);

/*
 * One directory entry kept on a list (linked through lld_list).
 * NOTE(review): lld_ino/lld_off/lld_name presumably mirror the inode number,
 * offset and name handed to a readdir callback -- confirm against the users.
 */
struct l_linux_dirent {
	struct list_head	lld_list;
	ino_t			lld_ino;
	unsigned long		lld_off;
	char			lld_name[LL_FID_NAMELEN];
};

/*
 * Pairs an entry with a list head.
 * NOTE(review): presumably the cursor passed to a readdir callback that
 * appends l_linux_dirent entries to lrc_list -- confirm against the users.
 */
struct l_readdir_callback {
	struct l_linux_dirent	*lrc_dirent;
	struct list_head	*lrc_list;
};

#endif /* __LVFS_LINUX_H__ */
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_OBD_H +#define __LINUX_OBD_H + +#ifndef __OBD_H +#error Do not #include this file directly. #include <obd.h> instead +#endif + +#include <obd_support.h> + +# include <linux/fs.h> +# include <linux/list.h> +# include <linux/sched.h> /* for struct task_struct, for current.h */ +# include <linux/proc_fs.h> +# include <linux/mount.h> +# include <linux/lustre_intent.h> + +struct ll_iattr { + struct iattr iattr; + unsigned int ia_attr_flags; +}; + +#define CLIENT_OBD_LIST_LOCK_DEBUG 1 + +typedef struct { + spinlock_t lock; + + unsigned long time; + struct task_struct *task; + const char *func; + int line; +} client_obd_lock_t; + +static inline void __client_obd_list_lock(client_obd_lock_t *lock, + const char *func, int line) +{ + unsigned long cur = jiffies; + while (1) { + if (spin_trylock(&lock->lock)) { + LASSERT(lock->task == NULL); + lock->task = current; + lock->func = func; + lock->line = line; + lock->time = jiffies; + break; + } + + if ((jiffies - cur > 5 * HZ) && + (jiffies - lock->time > 5 * HZ)) { + struct task_struct *task = lock->task; + + if (task == NULL) + continue; + + LCONSOLE_WARN("%s:%d: lock %p was acquired" + " by <%s:%d:%s:%d> for %lu seconds.\n", + current->comm, current->pid, + lock, task->comm, task->pid, + lock->func, lock->line, + (jiffies - 
lock->time) / HZ); + LCONSOLE_WARN("====== for process holding the " + "lock =====\n"); + libcfs_debug_dumpstack(task); + LCONSOLE_WARN("====== for current process =====\n"); + libcfs_debug_dumpstack(NULL); + LCONSOLE_WARN("====== end =======\n"); + cfs_pause(1000 * HZ); + } + cpu_relax(); + } +} + +#define client_obd_list_lock(lock) \ + __client_obd_list_lock(lock, __FUNCTION__, __LINE__) + +static inline void client_obd_list_unlock(client_obd_lock_t *lock) +{ + LASSERT(lock->task != NULL); + lock->task = NULL; + lock->time = jiffies; + spin_unlock(&lock->lock); +} + + +static inline void client_obd_list_lock_init(client_obd_lock_t *lock) +{ + spin_lock_init(&lock->lock); +} + +static inline void client_obd_list_lock_done(client_obd_lock_t *lock) +{} + +#endif /* __LINUX_OBD_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/obd_class.h b/drivers/staging/lustre/lustre/include/linux/obd_class.h new file mode 100644 index 000000000000..021ead6639fc --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/obd_class.h @@ -0,0 +1,58 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_CLASS_OBD_H +#define __LINUX_CLASS_OBD_H + +#ifndef __CLASS_OBD_H +#error Do not #include this file directly. #include <obd_class.h> instead +#endif + +#include <asm/uaccess.h> +#include <linux/types.h> +#include <linux/fs.h> +#include <linux/time.h> +#include <linux/timer.h> + +/* obdo.c */ +void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid); +void la_from_obdo(struct lu_attr *la, struct obdo *dst, obd_flag valid); +void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid); +void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid); +#define ll_inode_flags(inode) (inode->i_flags) + + +#endif /* __LINUX_OBD_CLASS_H */ diff --git a/drivers/staging/lustre/lustre/include/linux/obd_support.h b/drivers/staging/lustre/lustre/include/linux/obd_support.h new file mode 100644 index 000000000000..9166503408aa --- /dev/null +++ b/drivers/staging/lustre/lustre/include/linux/obd_support.h @@ -0,0 +1,63 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LINUX_OBD_SUPPORT +#define _LINUX_OBD_SUPPORT + +#ifndef _OBD_SUPPORT +#error Do not #include this file directly. 
#include <obd_support.h> instead +#endif + +#ifdef CONFIG_X86 +#include <asm/cpufeature.h> +#endif +#include <asm/processor.h> +#include <linux/seq_file.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/swap.h> +#include <linux/lustre_compat25.h> +#include <linux/lustre_common.h> +#include <linux/libcfs/libcfs.h> +#include <lustre/lustre_idl.h> + + +# include <linux/types.h> +# include <linux/blkdev.h> +# include <lvfs.h> + +#endif diff --git a/drivers/staging/lustre/lustre/include/lprocfs_status.h b/drivers/staging/lustre/lustre/include/lprocfs_status.h new file mode 100644 index 000000000000..55f182205d78 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lprocfs_status.h @@ -0,0 +1,1043 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
#ifndef _LPROCFS_SNMP_H
#define _LPROCFS_SNMP_H

#include <linux/lprocfs_status.h>
#include <lustre/lustre_idl.h>
#include <linux/libcfs/params_tree.h>

/* One /proc entry description: name, file operations, private data, mode. */
struct lprocfs_vars {
	const char		*name;
	struct file_operations	*fops;
	void			*data;
	/**
	 * /proc file mode.
	 */
	umode_t			proc_mode;
};

/* Pair of lprocfs_vars tables: module-wide entries and per-obd entries. */
struct lprocfs_static_vars {
	struct lprocfs_vars *module_vars;
	struct lprocfs_vars *obd_vars;
};

/* if we find more consumers this could be generalized */
#define OBD_HIST_MAX 32
/* Histogram of OBD_HIST_MAX buckets with its own spinlock. */
struct obd_histogram {
	spinlock_t	oh_lock;
	unsigned long	oh_buckets[OBD_HIST_MAX];
};

/* Indices into brw_stats::hist; BRW_LAST is the number of histograms. */
enum {
	BRW_R_PAGES = 0,
	BRW_W_PAGES,
	BRW_R_RPC_HIST,
	BRW_W_RPC_HIST,
	BRW_R_IO_TIME,
	BRW_W_IO_TIME,
	BRW_R_DISCONT_PAGES,
	BRW_W_DISCONT_PAGES,
	BRW_R_DISCONT_BLOCKS,
	BRW_W_DISCONT_BLOCKS,
	BRW_R_DISK_IOSIZE,
	BRW_W_DISK_IOSIZE,
	BRW_R_DIO_FRAGS,
	BRW_W_DIO_FRAGS,
	BRW_LAST,
};

struct brw_stats {
	struct obd_histogram hist[BRW_LAST];
};

/* Indices into rename_stats::hist; RENAME_LAST is the number of histograms. */
enum {
	RENAME_SAMEDIR_SIZE = 0,
	RENAME_CROSSDIR_SRC_SIZE,
	RENAME_CROSSDIR_TGT_SIZE,
	RENAME_LAST,
};

struct rename_stats {
	struct obd_histogram hist[RENAME_LAST];
};

/* An lprocfs counter can be configured using the enum bit masks below.
 *
 * LPROCFS_CNTR_EXTERNALLOCK indicates that an external lock already
 * protects this counter from concurrent updates. If not specified,
 * lprocfs uses an internal per-counter lock variable. External locks are
 * not used to protect counter increments, but are used to protect
 * counter readout and resets.
 *
 * LPROCFS_CNTR_AVGMINMAX indicates multi-valued counter samples,
 * (i.e. counter can be incremented by more than "1"). When specified,
 * the counter maintains min, max and sum in addition to a simple
 * invocation count. This allows averages to be computed.
 * If not specified, the counter is an increment-by-1 counter.
 * min, max, sum, etc. are not maintained.
 *
 * LPROCFS_CNTR_STDDEV indicates that the counter should track sum of
 * squares (for multi-valued counter samples only). This allows
 * external computation of standard deviation, but involves a 64-bit
 * multiply per counter increment.
 */

enum {
	LPROCFS_CNTR_EXTERNALLOCK = 0x0001,
	LPROCFS_CNTR_AVGMINMAX    = 0x0002,
	LPROCFS_CNTR_STDDEV       = 0x0004,

	/* counter data type */
	LPROCFS_TYPE_REGS	 = 0x0100,
	LPROCFS_TYPE_BYTES	= 0x0200,
	LPROCFS_TYPE_PAGES	= 0x0400,
	LPROCFS_TYPE_CYCLE	= 0x0800,
};

/*
 * All-ones 64-bit value shifted right by one, i.e. the largest positive
 * signed 64-bit number.
 * NOTE(review): presumably the reset value of lprocfs_counter::lc_min --
 * confirm in lprocfs_status.c.
 */
#define LC_MIN_INIT ((~(__u64)0) >> 1)

/* Per-counter static description: configuration bit mask, name and units. */
struct lprocfs_counter_header {
	unsigned int		lc_config;
	const char		*lc_name;   /* must be static */
	const char		*lc_units;  /* must be static */
};

/* Per-counter accumulated values; accessed through lc_sum / lc_sum_irq. */
struct lprocfs_counter {
	__s64	lc_count;
	__s64	lc_min;
	__s64	lc_max;
	__s64	lc_sumsquare;
	/*
	 * Every counter has lc_array_sum[0], while lc_array_sum[1] is only
	 * present for irq context counters, i.e. counters of stats created
	 * with the LPROCFS_STATS_FLAG_IRQ_SAFE flag.
	 */
	__s64	lc_array_sum[1];
};
stats with + * LPROCFS_STATS_FLAG_IRQ_SAFE flag, its counter need + * lc_array_sum[1] + */ + __s64 lc_array_sum[1]; +}; +#define lc_sum lc_array_sum[0] +#define lc_sum_irq lc_array_sum[1] + +struct lprocfs_percpu { +#ifndef __GNUC__ + __s64 pad; +#endif + struct lprocfs_counter lp_cntr[0]; +}; + +#define LPROCFS_GET_NUM_CPU 0x0001 +#define LPROCFS_GET_SMP_ID 0x0002 + +enum lprocfs_stats_flags { + LPROCFS_STATS_FLAG_NONE = 0x0000, /* per cpu counter */ + LPROCFS_STATS_FLAG_NOPERCPU = 0x0001, /* stats have no percpu + * area and need locking */ + LPROCFS_STATS_FLAG_IRQ_SAFE = 0x0002, /* alloc need irq safe */ +}; + +enum lprocfs_fields_flags { + LPROCFS_FIELDS_FLAGS_CONFIG = 0x0001, + LPROCFS_FIELDS_FLAGS_SUM = 0x0002, + LPROCFS_FIELDS_FLAGS_MIN = 0x0003, + LPROCFS_FIELDS_FLAGS_MAX = 0x0004, + LPROCFS_FIELDS_FLAGS_AVG = 0x0005, + LPROCFS_FIELDS_FLAGS_SUMSQUARE = 0x0006, + LPROCFS_FIELDS_FLAGS_COUNT = 0x0007, +}; + +struct lprocfs_stats { + /* # of counters */ + unsigned short ls_num; + /* 1 + the biggest cpu # whose ls_percpu slot has been allocated */ + unsigned short ls_biggest_alloc_num; + enum lprocfs_stats_flags ls_flags; + /* Lock used when there are no percpu stats areas; For percpu stats, + * it is used to protect ls_biggest_alloc_num change */ + spinlock_t ls_lock; + + /* has ls_num of counter headers */ + struct lprocfs_counter_header *ls_cnt_header; + struct lprocfs_percpu *ls_percpu[0]; +}; + +#define OPC_RANGE(seg) (seg ## _LAST_OPC - seg ## _FIRST_OPC) + +/* Pack all opcodes down into a single monotonically increasing index */ +static inline int opcode_offset(__u32 opc) { + if (opc < OST_LAST_OPC) { + /* OST opcode */ + return (opc - OST_FIRST_OPC); + } else if (opc < MDS_LAST_OPC) { + /* MDS opcode */ + return (opc - MDS_FIRST_OPC + + OPC_RANGE(OST)); + } else if (opc < LDLM_LAST_OPC) { + /* LDLM Opcode */ + return (opc - LDLM_FIRST_OPC + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < MGS_LAST_OPC) { + /* MGS Opcode */ + return (opc - 
MGS_FIRST_OPC + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < OBD_LAST_OPC) { + /* OBD Ping */ + return (opc - OBD_FIRST_OPC + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < LLOG_LAST_OPC) { + /* LLOG Opcode */ + return (opc - LLOG_FIRST_OPC + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < QUOTA_LAST_OPC) { + /* LQUOTA Opcode */ + return (opc - QUOTA_FIRST_OPC + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < SEQ_LAST_OPC) { + /* SEQ opcode */ + return (opc - SEQ_FIRST_OPC + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < SEC_LAST_OPC) { + /* SEC opcode */ + return (opc - SEC_FIRST_OPC + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < FLD_LAST_OPC) { + /* FLD opcode */ + return (opc - FLD_FIRST_OPC + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else if (opc < UPDATE_LAST_OPC) { + /* update opcode */ + return (opc - UPDATE_FIRST_OPC + + OPC_RANGE(FLD) + + OPC_RANGE(SEC) + + OPC_RANGE(SEQ) + + OPC_RANGE(QUOTA) + + OPC_RANGE(LLOG) + + OPC_RANGE(OBD) + + OPC_RANGE(MGS) + + OPC_RANGE(LDLM) + + OPC_RANGE(MDS) + + OPC_RANGE(OST)); + } else { + /* Unknown Opcode */ + return -1; + } +} + + +#define LUSTRE_MAX_OPCODES (OPC_RANGE(OST) + \ + OPC_RANGE(MDS) + \ + OPC_RANGE(LDLM) + \ + OPC_RANGE(MGS) + \ + OPC_RANGE(OBD) + \ + OPC_RANGE(LLOG) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(SEQ) + \ + OPC_RANGE(SEC) + \ + OPC_RANGE(FLD) + \ + OPC_RANGE(UPDATE)) + +#define EXTRA_MAX_OPCODES 
/* ptlrpc service counters; PTLRPC_LAST_CNTR is the count of entries. */
enum {
	PTLRPC_REQWAIT_CNTR = 0,
	PTLRPC_REQQDEPTH_CNTR,
	PTLRPC_REQACTIVE_CNTR,
	PTLRPC_TIMEOUT,
	PTLRPC_REQBUF_AVAIL_CNTR,
	PTLRPC_LAST_CNTR
};

#define PTLRPC_FIRST_CNTR PTLRPC_REQWAIT_CNTR

/*
 * "EXTRA" pseudo-opcode segment, consumed via OPC_RANGE(EXTRA);
 * EXTRA_LAST_OPC is the count of entries.
 */
enum {
	LDLM_GLIMPSE_ENQUEUE = 0,
	LDLM_PLAIN_ENQUEUE,
	LDLM_EXTENT_ENQUEUE,
	LDLM_FLOCK_ENQUEUE,
	LDLM_IBITS_ENQUEUE,
	MDS_REINT_SETATTR,
	MDS_REINT_CREATE,
	MDS_REINT_LINK,
	MDS_REINT_UNLINK,
	MDS_REINT_RENAME,
	MDS_REINT_OPEN,
	MDS_REINT_SETXATTR,
	BRW_READ_BYTES,
	BRW_WRITE_BYTES,
	EXTRA_LAST_OPC
};

#define EXTRA_FIRST_OPC LDLM_GLIMPSE_ENQUEUE
/* class_obd.c */
extern proc_dir_entry_t *proc_lustre_root;

struct obd_device;
struct obd_histogram;

/* Days / hours / mins / seconds format */
struct dhms {
	int d,h,m,s;
};

/*
 * Split a duration of \a secs seconds into whole days, hours, minutes and
 * remaining seconds, stored in \a ts.
 */
static inline void s2dhms(struct dhms *ts, time_t secs)
{
	ts->d = secs / 86400;
	secs = secs % 86400;
	ts->h = secs / 3600;
	secs = secs % 3600;
	ts->m = secs / 60;
	ts->s = secs % 60;
}
/* printf-style format and matching argument list for a struct dhms pointer */
#define DHMS_FMT "%dd%dh%02dm%02ds"
#define DHMS_VARS(x) (x)->d, (x)->h, (x)->m, (x)->s

#define JOBSTATS_JOBID_VAR_MAX_LEN	20
#define JOBSTATS_DISABLE		"disable"
#define JOBSTATS_PROCNAME_UID		"procname_uid"

/* Callback used to initialise the counters of a freshly created stats set. */
typedef void (*cntr_init_callback)(struct lprocfs_stats *stats);

struct obd_job_stats {
	cfs_hash_t	*ojs_hash;
	struct list_head	ojs_list;
	rwlock_t	   ojs_lock; /* protect the ojs_list */
	cntr_init_callback ojs_cntr_init_fn;
	int		ojs_cntr_num;
	int		ojs_cleanup_interval;
	time_t		 ojs_last_cleanup;
};

#ifdef LPROCFS

extern int lprocfs_stats_alloc_one(struct lprocfs_stats *stats,
				   unsigned int cpuid);
/*
 * Enter a stats critical section.  Must be paired with
 * lprocfs_stats_unlock() called with the same \a opc and \a flags.
 *
 * LPROCFS_GET_SMP_ID:
 *   - NOPERCPU stats: takes ls_lock (irqsave variant when IRQ_SAFE is set)
 *     and returns 0, the only slot.
 *   - per-cpu stats: pins the current cpu with get_cpu(), allocates that
 *     cpu's slot on first use and returns the cpu id.
 * LPROCFS_GET_NUM_CPU:
 *   - NOPERCPU stats: takes ls_lock as above and returns 1.
 *   - per-cpu stats: returns ls_biggest_alloc_num without taking any lock.
 *
 * Any other \a opc hits LBUG().
 *
 * \return value
 *      < 0     : on error (only possible for opc as LPROCFS_GET_SMP_ID)
 */
static inline int lprocfs_stats_lock(struct lprocfs_stats *stats, int opc,
				     unsigned long *flags)
{
	int		rc = 0;

	switch (opc) {
	default:
		LBUG();

	case LPROCFS_GET_SMP_ID:
		if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
			if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
				spin_lock_irqsave(&stats->ls_lock, *flags);
			else
				spin_lock(&stats->ls_lock);
			return 0;
		} else {
			unsigned int cpuid = get_cpu();

			if (unlikely(stats->ls_percpu[cpuid] == NULL)) {
				rc = lprocfs_stats_alloc_one(stats, cpuid);
				if (rc < 0) {
					/* undo get_cpu(): caller will not
					 * call lprocfs_stats_unlock() */
					put_cpu();
					return rc;
				}
			}
			return cpuid;
		}

	case LPROCFS_GET_NUM_CPU:
		if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
			if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
				spin_lock_irqsave(&stats->ls_lock, *flags);
			else
				spin_lock(&stats->ls_lock);
			return 1;
		} else {
			return stats->ls_biggest_alloc_num;
		}
	}
}

/*
 * Leave the critical section entered by lprocfs_stats_lock() with the same
 * \a opc: drops ls_lock for NOPERCPU stats, or calls put_cpu() for per-cpu
 * stats entered with LPROCFS_GET_SMP_ID.  Per-cpu LPROCFS_GET_NUM_CPU took
 * nothing, so nothing is released.
 */
static inline void lprocfs_stats_unlock(struct lprocfs_stats *stats, int opc,
					unsigned long *flags)
{
	switch (opc) {
	default:
		LBUG();

	case LPROCFS_GET_SMP_ID:
		if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
			if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) {
				spin_unlock_irqrestore(&stats->ls_lock,
							   *flags);
			} else {
				spin_unlock(&stats->ls_lock);
			}
		} else {
			put_cpu();
		}
		return;

	case LPROCFS_GET_NUM_CPU:
		if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) {
			if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) {
				spin_unlock_irqrestore(&stats->ls_lock,
							   *flags);
			} else {
				spin_unlock(&stats->ls_lock);
			}
		}
		return;
	}
}

/*
 * Size in bytes of one lprocfs_percpu area holding ls_num counters: one extra
 * __s64 per counter when IRQ_SAFE is set, rounded up to the L1 cache line
 * size unless the stats are NOPERCPU.
 */
static inline unsigned int
lprocfs_stats_counter_size(struct lprocfs_stats *stats)
{
	unsigned int percpusize;

	percpusize = offsetof(struct lprocfs_percpu, lp_cntr[stats->ls_num]);

	/* irq safe stats need lc_array_sum[1] */
	if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
		percpusize += stats->ls_num * sizeof(__s64);

	if ((stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU) == 0)
		percpusize = L1_CACHE_ALIGN(percpusize);

	return percpusize;
}

/*
 * Address of counter \a index in the area of \a cpuid.  For IRQ_SAFE stats
 * every counter is one __s64 larger than struct lprocfs_counter, so the
 * address computed with the nominal stride is advanced by
 * index * sizeof(__s64).
 */
static inline struct lprocfs_counter *
lprocfs_stats_counter_get(struct lprocfs_stats *stats, unsigned int cpuid,
			  int index)
{
	struct lprocfs_counter *cntr;

	cntr = &stats->ls_percpu[cpuid]->lp_cntr[index];

	if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
		cntr = (void *)cntr + index * sizeof(__s64);

	return cntr;
}
&stats->ls_percpu[cpuid]->lp_cntr[index]; + + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + cntr = (void *)cntr + index * sizeof(__s64); + + return cntr; +} + +/* Two optimized LPROCFS counter increment functions are provided: + * lprocfs_counter_incr(cntr, value) - optimized for by-one counters + * lprocfs_counter_add(cntr) - use for multi-valued counters + * Counter data layout allows config flag, counter lock and the + * count itself to reside within a single cache line. + */ + +extern void lprocfs_counter_add(struct lprocfs_stats *stats, int idx, + long amount); +extern void lprocfs_counter_sub(struct lprocfs_stats *stats, int idx, + long amount); + +#define lprocfs_counter_incr(stats, idx) \ + lprocfs_counter_add(stats, idx, 1) +#define lprocfs_counter_decr(stats, idx) \ + lprocfs_counter_sub(stats, idx, 1) + +extern __s64 lprocfs_read_helper(struct lprocfs_counter *lc, + struct lprocfs_counter_header *header, + enum lprocfs_stats_flags flags, + enum lprocfs_fields_flags field); +static inline __u64 lprocfs_stats_collector(struct lprocfs_stats *stats, + int idx, + enum lprocfs_fields_flags field) +{ + int i; + unsigned int num_cpu; + unsigned long flags = 0; + __u64 ret = 0; + + LASSERT(stats != NULL); + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + ret += lprocfs_read_helper( + lprocfs_stats_counter_get(stats, i, idx), + &stats->ls_cnt_header[idx], stats->ls_flags, + field); + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); + return ret; +} + +extern struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags); +extern void lprocfs_clear_stats(struct lprocfs_stats *stats); +extern void lprocfs_free_stats(struct lprocfs_stats **stats); +extern void lprocfs_init_ops_stats(int num_private_stats, + struct lprocfs_stats *stats); +extern void lprocfs_init_mps_stats(int num_private_stats, + struct 
/* Stats allocation and counter set-up for obd devices. */
extern void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats);
extern int lprocfs_alloc_obd_stats(struct obd_device *obddev,
				   unsigned int num_private_stats);
extern int lprocfs_alloc_md_stats(struct obd_device *obddev,
				  unsigned int num_private_stats);
extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
				 unsigned conf, const char *name,
				 const char *units);
extern void lprocfs_free_obd_stats(struct obd_device *obddev);
extern void lprocfs_free_md_stats(struct obd_device *obddev);

/* Per-export / per-NID entries. */
struct obd_export;
struct nid_stat;
extern int lprocfs_add_clear_entry(struct obd_device * obd,
				   proc_dir_entry_t *entry);
extern int lprocfs_exp_setup(struct obd_export *exp,
			     lnet_nid_t *peer_nid, int *newnid);
extern int lprocfs_exp_cleanup(struct obd_export *exp);
extern proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
					    char *name,
					    void *data,
					    struct file_operations *fops);
/* printf-style: the symlink target is built from format and its arguments */
extern struct proc_dir_entry *
lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent,
		    const char *format, ...);
extern void lprocfs_free_per_client_stats(struct obd_device *obd);
extern int
lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
			      unsigned long count, void *data);
extern int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data);

extern int lprocfs_register_stats(proc_dir_entry_t *root, const char *name,
				  struct lprocfs_stats *stats);

/* lprocfs_status.c */
extern int lprocfs_add_vars(proc_dir_entry_t *root,
			    struct lprocfs_vars *var,
			    void *data);

extern proc_dir_entry_t *lprocfs_register(const char *name,
					  proc_dir_entry_t *parent,
					  struct lprocfs_vars *list,
					  void *data);

extern void lprocfs_remove(proc_dir_entry_t **root);
extern void lprocfs_remove_proc_entry(const char *name,
				      struct proc_dir_entry *parent);

extern int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list);
extern int lprocfs_obd_cleanup(struct obd_device *obd);

extern int lprocfs_seq_create(proc_dir_entry_t *parent, const char *name,
			      umode_t mode,
			      const struct file_operations *seq_fops,
			      void *data);
extern int lprocfs_obd_seq_create(struct obd_device *dev, const char *name,
				  umode_t mode,
				  const struct file_operations *seq_fops,
				  void *data);

/* Generic callbacks */

/*
 * Readers take (struct seq_file *, void *data).  Writers come in two shapes:
 * (file, buffer, unsigned long count, void *data) and
 * (file, buffer, size_t count, loff_t *off).  The LPROC_SEQ_FOPS_RW_TYPE and
 * LPROC_SEQ_FOPS_WR_ONLY wrappers each expect one specific shape.
 */
extern int lprocfs_rd_u64(struct seq_file *m, void *data);
extern int lprocfs_rd_atomic(struct seq_file *m, void *data);
extern int lprocfs_wr_atomic(struct file *file, const char *buffer,
			     unsigned long count, void *data);
extern int lprocfs_rd_uint(struct seq_file *m, void *data);
extern int lprocfs_wr_uint(struct file *file, const char *buffer,
			   unsigned long count, void *data);
extern int lprocfs_rd_uuid(struct seq_file *m, void *data);
extern int lprocfs_rd_name(struct seq_file *m, void *data);
extern int lprocfs_rd_server_uuid(struct seq_file *m, void *data);
extern int lprocfs_rd_conn_uuid(struct seq_file *m, void *data);
extern int lprocfs_rd_import(struct seq_file *m, void *data);
extern int lprocfs_rd_state(struct seq_file *m, void *data);
extern int lprocfs_rd_connect_flags(struct seq_file *m, void *data);
extern int lprocfs_rd_num_exports(struct seq_file *m, void *data);
extern int lprocfs_rd_numrefs(struct seq_file *m, void *data);

struct adaptive_timeout;
extern int lprocfs_at_hist_helper(struct seq_file *m,
				  struct adaptive_timeout *at);
extern int lprocfs_rd_timeouts(struct seq_file *m, void *data);
extern int lprocfs_wr_timeouts(struct file *file, const char *buffer,
			       unsigned long count, void *data);
extern int lprocfs_wr_evict_client(struct file *file, const char *buffer,
				   size_t count, loff_t *off);
extern int lprocfs_wr_ping(struct file *file, const char *buffer,
			   size_t count, loff_t *off);
extern int lprocfs_wr_import(struct file *file, const char *buffer,
			     size_t count, loff_t *off);
extern int lprocfs_rd_pinger_recov(struct seq_file *m, void *n);
extern int lprocfs_wr_pinger_recov(struct file *file, const char *buffer,
				   size_t count, loff_t *off);
lprocfs_wr_pinger_recov(struct file *file, const char *buffer, + size_t count, loff_t *off); + +/* Statfs helpers */ +extern int lprocfs_rd_blksize(struct seq_file *m, void *data); +extern int lprocfs_rd_kbytestotal(struct seq_file *m, void *data); +extern int lprocfs_rd_kbytesfree(struct seq_file *m, void *data); +extern int lprocfs_rd_kbytesavail(struct seq_file *m, void *data); +extern int lprocfs_rd_filestotal(struct seq_file *m, void *data); +extern int lprocfs_rd_filesfree(struct seq_file *m, void *data); + +extern int lprocfs_write_helper(const char *buffer, unsigned long count, + int *val); +extern int lprocfs_write_frac_helper(const char *buffer, unsigned long count, + int *val, int mult); +extern int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult); +extern int lprocfs_read_frac_helper(char *buffer, unsigned long count, + long val, int mult); +extern int lprocfs_write_u64_helper(const char *buffer, unsigned long count, + __u64 *val); +extern int lprocfs_write_frac_u64_helper(const char *buffer, + unsigned long count, + __u64 *val, int mult); +char *lprocfs_find_named_value(const char *buffer, const char *name, + unsigned long *count); +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value); +void lprocfs_oh_clear(struct obd_histogram *oh); +unsigned long lprocfs_oh_sum(struct obd_histogram *oh); + +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt); + +extern int lprocfs_single_release(cfs_inode_t *, struct file *); +extern int lprocfs_seq_release(cfs_inode_t *, struct file *); + +/* You must use these macros when you want to refer to + * the import in a client obd_device for a lprocfs entry */ +#define LPROCFS_CLIMP_CHECK(obd) do { \ + typecheck(struct obd_device *, obd); \ + down_read(&(obd)->u.cli.cl_sem); \ + if ((obd)->u.cli.cl_import == NULL) { \ + up_read(&(obd)->u.cli.cl_sem); \ + return -ENODEV; 
\ + } \ +} while(0) +/* Releases the cl_sem read lock taken by LPROCFS_CLIMP_CHECK(); the expansion + * already ends with ';'. */ +#define LPROCFS_CLIMP_EXIT(obd) \ + up_read(&(obd)->u.cli.cl_sem); + + +/* write the name##_seq_show function, call LPROC_SEQ_FOPS_RO for read-only + proc entries; otherwise, you will define name##_seq_write function also for + a read-write proc entry, and then call LPROC_SEQ_FOPS instead. Finally, + call lprocfs_obd_seq_create(obd, filename, 0444, &name##_fops, data); */ +#define __LPROC_SEQ_FOPS(name, custom_seq_write) \ +static int name##_single_open(cfs_inode_t *inode, struct file *file) \ +{ \ + return single_open(file, name##_seq_show, PDE_DATA(inode)); \ +} \ +struct file_operations name##_fops = { \ + .owner = THIS_MODULE, \ + .open = name##_single_open, \ + .read = seq_read, \ + .write = custom_seq_write, \ + .llseek = seq_lseek, \ + .release = lprocfs_single_release, \ +} + +#define LPROC_SEQ_FOPS_RO(name) __LPROC_SEQ_FOPS(name, NULL) +#define LPROC_SEQ_FOPS(name) __LPROC_SEQ_FOPS(name, name##_seq_write) + +#define LPROC_SEQ_FOPS_RO_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_rd_##type(m, m->private); \ + } \ + LPROC_SEQ_FOPS_RO(name##_##type) + +#define LPROC_SEQ_FOPS_RW_TYPE(name, type) \ + static int name##_##type##_seq_show(struct seq_file *m, void *v)\ + { \ + return lprocfs_rd_##type(m, m->private); \ + } \ + static ssize_t name##_##type##_seq_write(struct file *file, \ + const char *buffer, size_t count, loff_t *off) \ + { \ + struct seq_file *seq = file->private_data; \ + return lprocfs_wr_##type(file, buffer, \ + count, seq->private); \ + } \ + LPROC_SEQ_FOPS(name##_##type); + +#define LPROC_SEQ_FOPS_WR_ONLY(name, type) \ + static ssize_t name##_##type##_write(struct file *file, \ + const char *buffer, size_t count, loff_t *off) \ + { \ + return lprocfs_wr_##type(file, buffer, count, off); \ + } \ + static int name##_##type##_open(cfs_inode_t *inode, struct file *file) \ + { \ + return single_open(file, NULL, PDE_DATA(inode)); \ + } \ + struct file_operations 
name##_##type##_fops = { \ + .open = name##_##type##_open, \ + .write = name##_##type##_write, \ + .release = lprocfs_single_release, \ + }; + +/* lprocfs_jobstats.c */ +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, + int event, long amount); +void lprocfs_job_stats_fini(struct obd_device *obd); +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback fn); +int lprocfs_rd_job_interval(struct seq_file *m, void *data); +int lprocfs_wr_job_interval(struct file *file, const char *buffer, + unsigned long count, void *data); + +/* lproc_ptlrpc.c */ +struct ptlrpc_request; +extern void target_print_req(void *seq_file, struct ptlrpc_request *req); + +/* lproc_status.c */ +int lprocfs_obd_rd_max_pages_per_rpc(struct seq_file *m, void *data); +int lprocfs_obd_wr_max_pages_per_rpc(struct file *file, const char *buffer, + size_t count, loff_t *off); + +/* all quota proc functions */ +extern int lprocfs_quota_rd_bunit(char *page, char **start, + loff_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_bunit(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_btune(char *page, char **start, + loff_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_btune(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_iunit(char *page, char **start, + loff_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_iunit(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_itune(char *page, char **start, + loff_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_itune(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_type(char *page, char **start, loff_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_type(struct file *file, const char *buffer, + 
unsigned long count, void *data); +extern int lprocfs_quota_rd_switch_seconds(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_switch_seconds(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_sync_blk(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_switch_qs(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_switch_qs(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_boundary_factor(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_boundary_factor(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_least_bunit(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_least_bunit(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_least_iunit(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_least_iunit(struct file *file, + const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_qs_factor(char *page, char **start, loff_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_qs_factor(struct file *file, + const char *buffer, + unsigned long count, void *data); + + + +#else +/* LPROCFS is not defined */ + +#define proc_lustre_root NULL + +static inline void lprocfs_counter_add(struct lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_incr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_sub(struct 
lprocfs_stats *stats, + int index, long amount) +{ return; } +static inline void lprocfs_counter_decr(struct lprocfs_stats *stats, + int index) +{ return; } +static inline void lprocfs_counter_init(struct lprocfs_stats *stats, + int index, unsigned conf, + const char *name, const char *units) +{ return; } + +static inline __u64 lc_read_helper(struct lprocfs_counter *lc, + enum lprocfs_fields_flags field) +{ return 0; } + +/* NB: we return !NULL to satisfy error checker */ +static inline struct lprocfs_stats * +lprocfs_alloc_stats(unsigned int num, enum lprocfs_stats_flags flags) +{ return (struct lprocfs_stats *)1; } +static inline void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_free_stats(struct lprocfs_stats **stats) +{ return; } +static inline int lprocfs_register_stats(proc_dir_entry_t *root, + const char *name, + struct lprocfs_stats *stats) +{ return 0; } +static inline void lprocfs_init_ops_stats(int num_private_stats, + struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_init_mps_stats(int num_private_stats, + struct lprocfs_stats *stats) +{ return; } +static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ return; } +static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev, + unsigned int num_private_stats) +{ return 0; } +static inline int lprocfs_alloc_md_stats(struct obd_device *obddev, + unsigned int num_private_stats) +{ return 0; } +static inline void lprocfs_free_obd_stats(struct obd_device *obddev) +{ return; } +static inline void lprocfs_free_md_stats(struct obd_device *obddev) +{ return; } + +struct obd_export; +static inline int lprocfs_add_clear_entry(struct obd_export *exp) +{ return 0; } +static inline int lprocfs_exp_setup(struct obd_export *exp,lnet_nid_t *peer_nid, + int *newnid) +{ return 0; } +static inline int lprocfs_exp_cleanup(struct obd_export *exp) +{ return 0; } +static inline proc_dir_entry_t * +lprocfs_add_simple(struct 
proc_dir_entry *root, char *name, + void *data, struct file_operations *fops) +{ return NULL; } /* pointer-returning stub: NULL like the sibling stubs, not 0 */ +static inline struct proc_dir_entry * +lprocfs_add_symlink(const char *name, struct proc_dir_entry *parent, + const char *format, ...) +{return NULL; } +static inline void lprocfs_free_per_client_stats(struct obd_device *obd) +{ return; } +static inline +int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{return count;} +static inline +int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data) +{ return 0; } + +static inline proc_dir_entry_t * +lprocfs_register(const char *name, proc_dir_entry_t *parent, + struct lprocfs_vars *list, void *data) +{ return NULL; } +static inline int lprocfs_add_vars(proc_dir_entry_t *root, + struct lprocfs_vars *var, + void *data) +{ return 0; } +static inline void lprocfs_remove(proc_dir_entry_t **root) +{ return; } +static inline void lprocfs_remove_proc_entry(const char *name, + struct proc_dir_entry *parent) +{ return; } +static inline int lprocfs_obd_setup(struct obd_device *dev, + struct lprocfs_vars *list) +{ return 0; } +static inline int lprocfs_obd_cleanup(struct obd_device *dev) +{ return 0; } +static inline int lprocfs_rd_u64(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_uuid(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_name(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_server_uuid(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_conn_uuid(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_import(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_pinger_recov(struct seq_file *m, void *n) +{ return 0; } +static inline int lprocfs_rd_state(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_rd_connect_flags(struct seq_file *m, void *data) +{ return 0; } +static inline int 
lprocfs_rd_num_exports(struct seq_file *m, void *data) +{ return 0; } +/* static, like every other stub in this branch: an "extern inline" body in a + * header either relies on an out-of-line definition that does not exist here + * (gnu89 rules) or emits one definition per including file (C99 rules). */ +static inline int lprocfs_rd_numrefs(struct seq_file *m, void *data) +{ return 0; } +struct adaptive_timeout; +static inline int lprocfs_at_hist_helper(struct seq_file *m, + struct adaptive_timeout *at) +{ return 0; } +static inline int lprocfs_rd_timeouts(struct seq_file *m, void *data) +{ return 0; } +static inline int lprocfs_wr_timeouts(struct file *file, + const char *buffer, + unsigned long count, void *data) +{ return 0; } +static inline int lprocfs_wr_evict_client(struct file *file, const char *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline int lprocfs_wr_ping(struct file *file, const char *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline int lprocfs_wr_import(struct file *file, const char *buffer, + size_t count, loff_t *off) +{ return 0; } +static inline int lprocfs_wr_pinger_recov(struct file *file, const char *buffer, + size_t count, loff_t *off) +{ return 0; } + +/* Statfs helpers */ +static inline +int lprocfs_rd_blksize(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_rd_kbytestotal(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_rd_kbytesfree(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_rd_kbytesavail(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_rd_filestotal(struct seq_file *m, void *data) +{ return 0; } +static inline +int lprocfs_rd_filesfree(struct seq_file *m, void *data) +{ return 0; } +static inline +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ return; } +static inline +void lprocfs_oh_clear(struct obd_histogram *oh) +{ return; } +static inline +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ return 0; } +static inline +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + 
struct lprocfs_counter *cnt) +{ return; } +static inline +__u64 lprocfs_stats_collector(struct lprocfs_stats *stats, int idx, + enum lprocfs_fields_flags field) +{ return (__u64)0; } + +#define LPROC_SEQ_FOPS_RO(name) +#define LPROC_SEQ_FOPS(name) +#define LPROC_SEQ_FOPS_RO_TYPE(name, type) +#define LPROC_SEQ_FOPS_RW_TYPE(name, type) +#define LPROC_SEQ_FOPS_WR_ONLY(name, type) + +/* lprocfs_jobstats.c */ +static inline +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, int event, + long amount) +{ return 0; } +static inline +void lprocfs_job_stats_fini(struct obd_device *obd) +{ return; } +static inline +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback fn) +{ return 0; } + + +/* lproc_ptlrpc.c */ +#define target_print_req NULL + +#endif /* LPROCFS */ + +#endif /* LPROCFS_SNMP_H */ diff --git a/drivers/staging/lustre/lustre/include/lu_object.h b/drivers/staging/lustre/lustre/include/lu_object.h new file mode 100644 index 000000000000..d40ad81b4eb2 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lu_object.h @@ -0,0 +1,1346 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_LU_OBJECT_H +#define __LUSTRE_LU_OBJECT_H + +#include <stdarg.h> +#include <linux/libcfs/libcfs.h> +#include <lustre/lustre_idl.h> +#include <lu_ref.h> + +struct seq_file; +struct proc_dir_entry; +struct lustre_cfg; +struct lprocfs_stats; + +/** \defgroup lu lu + * lu_* data-types represent server-side entities shared by data and meta-data + * stacks. + * + * Design goals: + * + * -# support for layering. + * + * Server side object is split into layers, one per device in the + * corresponding device stack. Individual layer is represented by struct + * lu_object. Compound layered object --- by struct lu_object_header. Most + * interface functions take lu_object as an argument and operate on the + * whole compound object. This decision was made due to the following + * reasons: + * + * - it's envisaged that lu_object will be used much more often than + * lu_object_header; + * + * - we want lower (non-top) layers to be able to initiate operations + * on the whole object. + * + * Generic code supports layering more complex than simple stacking, e.g., + * it is possible that at some layer object "spawns" multiple sub-objects + * on the lower layer. + * + * -# fid-based identification. + * + * Compound object is uniquely identified by its fid. 
Objects are indexed + * by their fids (hash table is used for index). + * + * -# caching and life-cycle management. + * + * Object's life-time is controlled by reference counting. When reference + * count drops to 0, object is returned to cache. Cached objects still + * retain their identity (i.e., fid), and can be recovered from cache. + * + * Objects are kept in the global LRU list, and lu_site_purge() function + * can be used to reclaim given number of unused objects from the tail of + * the LRU. + * + * -# avoiding recursion. + * + * Generic code tries to replace recursion through layers by iterations + * where possible. Additionally to the end of reducing stack consumption, + * data, when practically possible, are allocated through lu_context_key + * interface rather than on stack. + * @{ + */ + +struct lu_site; +struct lu_object; +struct lu_device; +struct lu_object_header; +struct lu_context; +struct lu_env; + +/** + * Operations common for data and meta-data devices. + */ +struct lu_device_operations { + /** + * Allocate object for the given device (without lower-layer + * parts). This is called by lu_object_operations::loo_object_init() + * from the parent layer, and should setup at least lu_object::lo_dev + * and lu_object::lo_ops fields of resulting lu_object. + * + * Object creation protocol. + * + * Due to design goal of avoiding recursion, object creation (see + * lu_object_alloc()) is somewhat involved: + * + * - first, lu_device_operations::ldo_object_alloc() method of the + * top-level device in the stack is called. It should allocate top + * level object (including lu_object_header), but without any + * lower-layer sub-object(s). + * + * - then lu_object_alloc() sets fid in the header of newly created + * object. + * + * - then lu_object_operations::loo_object_init() is called. It has + * to allocate lower-layer object(s). To do this, + * lu_object_operations::loo_object_init() calls ldo_object_alloc() + * of the lower-layer device(s). 
+ * + * - for all new objects allocated by + * lu_object_operations::loo_object_init() (and inserted into object + * stack), lu_object_operations::loo_object_init() is called again + * repeatedly, until no new objects are created. + * + * \post ergo(!IS_ERR(result), result->lo_dev == d && + * result->lo_ops != NULL); + */ + struct lu_object *(*ldo_object_alloc)(const struct lu_env *env, + const struct lu_object_header *h, + struct lu_device *d); + /** + * process config specific for device. + */ + int (*ldo_process_config)(const struct lu_env *env, + struct lu_device *, struct lustre_cfg *); + int (*ldo_recovery_complete)(const struct lu_env *, + struct lu_device *); + + /** + * initialize local objects for device. This method is called after the + * layer has been initialized (after LCFG_SETUP stage) and before it + * starts serving user requests. + */ + + int (*ldo_prepare)(const struct lu_env *, + struct lu_device *parent, + struct lu_device *dev); + +}; + +/** + * For lu_object_conf flags + */ +typedef enum { + /* This is a new object to be allocated, or the file + * corresponding to the object does not exist. */ + LOC_F_NEW = 0x00000001, +} loc_flags_t; + +/** + * Object configuration, describing particulars of object being created. On + * server this is not used, as server objects are fully identified by fid. On + * client configuration contains struct lustre_md. + */ +struct lu_object_conf { + /** + * Some hints for obj find and alloc. + */ + loc_flags_t loc_flags; +}; + +/** + * Type of "printer" function used by lu_object_operations::loo_object_print() + * method. + * + * Printer function is needed to provide some flexibility in (semi-)debugging + * output: possible implementations: printk, CDEBUG, sysfs/seq_file + */ +typedef int (*lu_printer_t)(const struct lu_env *env, + void *cookie, const char *format, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Operations specific for particular lu_object. 
+ */ +struct lu_object_operations { + + /** + * Allocate lower-layer parts of the object by calling + * lu_device_operations::ldo_object_alloc() of the corresponding + * underlying device. + * + * This method is called once for each object inserted into object + * stack. It is the responsibility of this method to insert the + * lower-layer object(s) it creates into appropriate places of the + * object stack. + */ + int (*loo_object_init)(const struct lu_env *env, + struct lu_object *o, + const struct lu_object_conf *conf); + /** + * Called (in top-to-bottom order) during object allocation after all + * layers were allocated and initialized. Can be used to perform + * initialization depending on lower layers. + */ + int (*loo_object_start)(const struct lu_env *env, + struct lu_object *o); + /** + * Called before lu_object_operations::loo_object_free() to signal + * that object is being destroyed. Dual to + * lu_object_operations::loo_object_init(). + */ + void (*loo_object_delete)(const struct lu_env *env, + struct lu_object *o); + /** + * Dual to lu_device_operations::ldo_object_alloc(). Called when + * object is removed from memory. + */ + void (*loo_object_free)(const struct lu_env *env, + struct lu_object *o); + /** + * Called when last active reference to the object is released (and + * object returns to the cache). This method is optional. + */ + void (*loo_object_release)(const struct lu_env *env, + struct lu_object *o); + /** + * Optional debugging helper. Print given object. + */ + int (*loo_object_print)(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o); + /** + * Optional debugging method. Returns true iff the object is + * internally consistent. + */ + int (*loo_object_invariant)(const struct lu_object *o); +}; + +/** + * Type of lu_device. + */ +struct lu_device_type; + +/** + * Device: a layer in the server side abstraction stacking. + */ +struct lu_device { + /** + * reference count. 
This is incremented, in particular, on each object + * created at this layer. + * + * \todo XXX which means that atomic_t is probably too small. + */ + atomic_t ld_ref; + /** + * Pointer to device type. Never modified once set. + */ + struct lu_device_type *ld_type; + /** + * Operation vector for this device. + */ + const struct lu_device_operations *ld_ops; + /** + * Stack this device belongs to. + */ + struct lu_site *ld_site; + struct proc_dir_entry *ld_proc_entry; + + /** \todo XXX: temporary back pointer into obd. */ + struct obd_device *ld_obd; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref ld_reference; + /** + * Link the device to the site. + **/ + struct list_head ld_linkage; +}; + +struct lu_device_type_operations; + +/** + * Tag bits for device type. They are used to distinguish certain groups of + * device types. + */ +enum lu_device_tag { + /** this is meta-data device */ + LU_DEVICE_MD = (1 << 0), + /** this is data device */ + LU_DEVICE_DT = (1 << 1), + /** data device in the client stack */ + LU_DEVICE_CL = (1 << 2) +}; + +/** + * Type of device. + */ +struct lu_device_type { + /** + * Tag bits. Taken from enum lu_device_tag. Never modified once set. + */ + __u32 ldt_tags; + /** + * Name of this class. Unique system-wide. Never modified once set. + */ + char *ldt_name; + /** + * Operations for this type. + */ + const struct lu_device_type_operations *ldt_ops; + /** + * \todo XXX: temporary pointer to associated obd_type. + */ + struct obd_type *ldt_obd_type; + /** + * \todo XXX: temporary: context tags used by obd_*() calls. + */ + __u32 ldt_ctx_tags; + /** + * Number of existing device type instances. + */ + unsigned ldt_device_nr; + /** + * Linkage into a global list of all device types. + * + * \see lu_device_types. + */ + struct list_head ldt_linkage; +}; + +/** + * Operations on a device type. + */ +struct lu_device_type_operations { + /** + * Allocate new device. 
+ */ + struct lu_device *(*ldto_device_alloc)(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *lcfg); + /** + * Free device. Dual to + * lu_device_type_operations::ldto_device_alloc(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_free)(const struct lu_env *, + struct lu_device *); + + /** + * Initialize the devices after allocation + */ + int (*ldto_device_init)(const struct lu_env *env, + struct lu_device *, const char *, + struct lu_device *); + /** + * Finalize device. Dual to + * lu_device_type_operations::ldto_device_init(). Returns pointer to + * the next device in the stack. + */ + struct lu_device *(*ldto_device_fini)(const struct lu_env *env, + struct lu_device *); + /** + * Initialize device type. This is called on module load. + */ + int (*ldto_init)(struct lu_device_type *t); + /** + * Finalize device type. Dual to + * lu_device_type_operations::ldto_init(). Called on module unload. + */ + void (*ldto_fini)(struct lu_device_type *t); + /** + * Called when the first device is created. + */ + void (*ldto_start)(struct lu_device_type *t); + /** + * Called when number of devices drops to 0. + */ + void (*ldto_stop)(struct lu_device_type *t); +}; + +/** + * Test the LU_DEVICE_MD tag bit of the type of device \a d. + * + * NOTE(review): ergo(a, b) is presumably logical implication (!a || b); if + * so, a NULL \a d yields true rather than false -- confirm this is intended. + */ +static inline int lu_device_is_md(const struct lu_device *d) +{ + return ergo(d != NULL, d->ld_type->ldt_tags & LU_DEVICE_MD); +} + +/** + * Flags for the object layers. + */ +enum lu_object_flags { + /** + * this flag is set if lu_object_operations::loo_object_init() has + * been called for this layer. Used by lu_object_alloc(). + */ + LU_OBJECT_ALLOCATED = (1 << 0) +}; + +/** + * Common object attributes. 
+ */ +struct lu_attr { + /** size in bytes */ + __u64 la_size; + /** modification time in seconds since Epoch */ + obd_time la_mtime; + /** access time in seconds since Epoch */ + obd_time la_atime; + /** change time in seconds since Epoch */ + obd_time la_ctime; + /** 512-byte blocks allocated to object */ + __u64 la_blocks; + /** permission bits and file type */ + __u32 la_mode; + /** owner id */ + __u32 la_uid; + /** group id */ + __u32 la_gid; + /** object flags */ + __u32 la_flags; + /** number of persistent references to this object */ + __u32 la_nlink; + /** blk bits of the object*/ + __u32 la_blkbits; + /** blk size of the object*/ + __u32 la_blksize; + /** real device */ + __u32 la_rdev; + /** + * valid bits + * + * \see enum la_valid + */ + __u64 la_valid; +}; + +/** Bit-mask of valid attributes */ +enum la_valid { + LA_ATIME = 1 << 0, + LA_MTIME = 1 << 1, + LA_CTIME = 1 << 2, + LA_SIZE = 1 << 3, + LA_MODE = 1 << 4, + LA_UID = 1 << 5, + LA_GID = 1 << 6, + LA_BLOCKS = 1 << 7, + LA_TYPE = 1 << 8, + LA_FLAGS = 1 << 9, + LA_NLINK = 1 << 10, + LA_RDEV = 1 << 11, + LA_BLKSIZE = 1 << 12, + LA_KILL_SUID = 1 << 13, + LA_KILL_SGID = 1 << 14, +}; + +/** + * Layer in the layered object. + */ +struct lu_object { + /** + * Header for this object. + */ + struct lu_object_header *lo_header; + /** + * Device for this layer. + */ + struct lu_device *lo_dev; + /** + * Operations for this object. + */ + const struct lu_object_operations *lo_ops; + /** + * Linkage into list of all layers. + */ + struct list_head lo_linkage; + /** + * Depth. Top level layer depth is 0. + */ + int lo_depth; + /** + * Flags from enum lu_object_flags. + */ + __u32 lo_flags; + /** + * Link to the device, for debugging. + */ + struct lu_ref_link *lo_dev_ref; +}; + +enum lu_object_header_flags { + /** + * Don't keep this object in cache. Object will be destroyed as soon + * as last reference to it is released. This flag cannot be cleared + * once set. 
+ */ + LU_OBJECT_HEARD_BANSHEE = 0, + /** + * Marks that this object has already been taken out of cache. + */ + LU_OBJECT_UNHASHED = 1 +}; + +enum lu_object_header_attr { + LOHA_EXISTS = 1 << 0, + LOHA_REMOTE = 1 << 1, + /** + * UNIX file type is stored in S_IFMT bits. + */ + LOHA_FT_START = 001 << 12, /**< S_IFIFO */ + LOHA_FT_END = 017 << 12, /**< S_IFMT */ +}; + +/** + * "Compound" object, consisting of multiple layers. + * + * Compound object with given fid is unique within given lu_site. + * + * Note that an object does *not* necessarily correspond to a real object in + * the persistent storage: object is an anchor for locking and method calling, + * so it is created for things like not-yet-existing child created by mkdir or + * create calls. lu_object_operations::loo_exists() can be used to check + * whether object is backed by persistent storage entity. + * + * NOTE(review): struct lu_object_operations declares no loo_exists() member + * in this header; the LOHA_EXISTS bit of loh_attr looks like the current + * mechanism -- confirm and update the sentence above. + */ +struct lu_object_header { + /** + * Object flags from enum lu_object_header_flags. Set and checked + * atomically. + */ + unsigned long loh_flags; + /** + * Object reference count. Protected by lu_site::ls_guard. + */ + atomic_t loh_ref; + /** + * Fid, uniquely identifying this object. + */ + struct lu_fid loh_fid; + /** + * Common object attributes, cached for efficiency. From enum + * lu_object_header_attr. + */ + __u32 loh_attr; + /** + * Linkage into per-site hash table. Protected by lu_site::ls_guard. + */ + struct hlist_node loh_hash; + /** + * Linkage into per-site LRU list. Protected by lu_site::ls_guard. + */ + struct list_head loh_lru; + /** + * Linkage into list of layers. Never modified once set (except lately + * during object destruction). No locking is necessary. + */ + struct list_head loh_layers; + /** + * A list of references to this object, for debugging. + */ + struct lu_ref loh_reference; +}; + +struct fld; + +struct lu_site_bkt_data { + /** + * number of busy objects on this bucket + */ + long lsb_busy; + /** + * LRU list, updated on each access to object. 
Protected by + * bucket lock of lu_site::ls_obj_hash. + * + * "Cold" end of LRU is lu_site::ls_lru.next. Accessed object are + * moved to the lu_site::ls_lru.prev (this is due to the non-existence + * of list_for_each_entry_safe_reverse()). + */ + struct list_head lsb_lru; + /** + * Wait-queue signaled when an object in this site is ultimately + * destroyed (lu_object_free()). It is used by lu_object_find() to + * wait before re-trying when object in the process of destruction is + * found in the hash table. + * + * \see htable_lookup(). + */ + wait_queue_head_t lsb_marche_funebre; +}; + +enum { + LU_SS_CREATED = 0, + LU_SS_CACHE_HIT, + LU_SS_CACHE_MISS, + LU_SS_CACHE_RACE, + LU_SS_CACHE_DEATH_RACE, + LU_SS_LRU_PURGED, + LU_SS_LAST_STAT +}; + +/** + * lu_site is a "compartment" within which objects are unique, and LRU + * discipline is maintained. + * + * lu_site exists so that multiple layered stacks can co-exist in the same + * address space. + * + * lu_site has the same relation to lu_device as lu_object_header to + * lu_object. + */ +struct lu_site { + /** + * objects hash table + */ + cfs_hash_t *ls_obj_hash; + /** + * index of bucket on hash table while purging + */ + int ls_purge_start; + /** + * Top-level device for this stack. + */ + struct lu_device *ls_top_dev; + /** + * Bottom-level device for this stack + */ + struct lu_device *ls_bottom_dev; + /** + * Linkage into global list of sites. + */ + struct list_head ls_linkage; + /** + * List for lu device for this site, protected + * by ls_ld_lock. + **/ + struct list_head ls_ld_linkage; + spinlock_t ls_ld_lock; + + /** + * lu_site stats + */ + struct lprocfs_stats *ls_stats; + /** + * XXX: a hack! 
fld has to find md_site via site, remove when possible + */ + struct seq_server_site *ld_seq_site; +}; + +static inline struct lu_site_bkt_data * +lu_site_bkt_from_fid(struct lu_site *site, struct lu_fid *fid) +{ + cfs_hash_bd_t bd; + + cfs_hash_bd_get(site->ls_obj_hash, fid, &bd); + return cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); +} + +/** \name ctors + * Constructors/destructors. + * @{ + */ + +int lu_site_init (struct lu_site *s, struct lu_device *d); +void lu_site_fini (struct lu_site *s); +int lu_site_init_finish (struct lu_site *s); +void lu_stack_fini (const struct lu_env *env, struct lu_device *top); +void lu_device_get (struct lu_device *d); +void lu_device_put (struct lu_device *d); +int lu_device_init (struct lu_device *d, struct lu_device_type *t); +void lu_device_fini (struct lu_device *d); +int lu_object_header_init(struct lu_object_header *h); +void lu_object_header_fini(struct lu_object_header *h); +int lu_object_init (struct lu_object *o, + struct lu_object_header *h, struct lu_device *d); +void lu_object_fini (struct lu_object *o); +void lu_object_add_top (struct lu_object_header *h, struct lu_object *o); +void lu_object_add (struct lu_object *before, struct lu_object *o); + +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d); +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d); + +/** + * Helpers to initialize and finalize device types. + */ + +int lu_device_type_init(struct lu_device_type *ldt); +void lu_device_type_fini(struct lu_device_type *ldt); +void lu_types_stop(void); + +/** @} ctors */ + +/** \name caching + * Caching and reference counting. + * @{ + */ + +/** + * Acquire additional reference to the given object. This function is used to + * attain additional reference. To acquire initial reference use + * lu_object_find(). 
+ */ +static inline void lu_object_get(struct lu_object *o) +{ + LASSERT(atomic_read(&o->lo_header->loh_ref) > 0); + atomic_inc(&o->lo_header->loh_ref); +} + +/** + * Return true if the object will not be cached after the last reference to + * it is released. + */ +static inline int lu_object_is_dying(const struct lu_object_header *h) +{ + return test_bit(LU_OBJECT_HEARD_BANSHEE, &h->loh_flags); +} + +void lu_object_put(const struct lu_env *env, struct lu_object *o); +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o); +void lu_object_unhash(const struct lu_env *env, struct lu_object *o); + +int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr); + +void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, + lu_printer_t printer); +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf); +/** @} caching */ + +/** \name helpers + * Helpers. + * @{ + */ + +/** + * First (topmost) sub-object of the given compound object. + */ +static inline struct lu_object *lu_object_top(struct lu_object_header *h) +{ + LASSERT(!list_empty(&h->loh_layers)); + return container_of0(h->loh_layers.next, struct lu_object, lo_linkage); +} + +/** + * Next sub-object in the layering. + */ +static inline struct lu_object *lu_object_next(const struct lu_object *o) +{ + return container_of0(o->lo_linkage.next, struct lu_object, lo_linkage); +} + +/** + * Pointer to the fid of this object. 
+ */ +static inline const struct lu_fid *lu_object_fid(const struct lu_object *o) +{ + return &o->lo_header->loh_fid; +} + +/** + * Return the device operations vector (lu_device::ld_ops) of the device this + * object belongs to. + */ +static inline const struct lu_device_operations * +lu_object_ops(const struct lu_object *o) +{ + return o->lo_dev->ld_ops; +} + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype); + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...); + +/** + * Print object description followed by a user-supplied message. + */ +#define LU_OBJECT_DEBUG(mask, env, object, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + lu_object_print(env, &msgdata, lu_cdebug_printer, object);\ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +/** + * Print short object description followed by a user-supplied message. + */ +#define LU_OBJECT_HEADER(mask, env, object, format, ...) \ +do { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, NULL); \ + \ + if (cfs_cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ + lu_object_header_print(env, &msgdata, lu_cdebug_printer,\ + (object)->lo_header); \ + lu_cdebug_printer(env, &msgdata, "\n"); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ + } \ +} while (0) + +void lu_object_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o); +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr); + +/** + * Check object consistency. + */ +int lu_object_invariant(const struct lu_object *o); + + +/** + * Check whether object exists, no matter on local or remote storage. 
+ * Note: LOHA_EXISTS will be set once someone has created the object, + * and it does not need to be committed to storage. + */ +#define lu_object_exists(o) ((o)->lo_header->loh_attr & LOHA_EXISTS) + +/** + * Check whether the object is on remote storage. + */ +#define lu_object_remote(o) unlikely((o)->lo_header->loh_attr & LOHA_REMOTE) + +static inline int lu_object_assert_exists(const struct lu_object *o) +{ + return lu_object_exists(o); +} + +static inline int lu_object_assert_not_exists(const struct lu_object *o) +{ + return !lu_object_exists(o); +} + +/** + * Attributes (loh_attr) of this object; asserts that the object exists. + */ +static inline __u32 lu_object_attr(const struct lu_object *o) +{ + LASSERT(lu_object_exists(o) != 0); + return o->lo_header->loh_attr; +} + +static inline struct lu_ref_link *lu_object_ref_add(struct lu_object *o, + const char *scope, + const void *source) +{ + return lu_ref_add(&o->lo_header->loh_reference, scope, source); +} + +static inline void lu_object_ref_del(struct lu_object *o, + const char *scope, const void *source) +{ + lu_ref_del(&o->lo_header->loh_reference, scope, source); +} + +static inline void lu_object_ref_del_at(struct lu_object *o, + struct lu_ref_link *link, + const char *scope, const void *source) +{ + lu_ref_del_at(&o->lo_header->loh_reference, link, scope, source); +} + +/** input params, should be filled out by mdt */ +struct lu_rdpg { + /** hash */ + __u64 rp_hash; + /** count in bytes */ + unsigned int rp_count; + /** number of pages */ + unsigned int rp_npages; + /** requested attr */ + __u32 rp_attrs; + /** pointers to pages */ + struct page **rp_pages; +}; + +enum lu_xattr_flags { + LU_XATTR_REPLACE = (1 << 0), + LU_XATTR_CREATE = (1 << 1) +}; + +/** @} helpers */ + +/** \name lu_context + * @{ */ + +/** For lu_context health-checks */ +enum lu_context_state { + LCS_INITIALIZED = 1, + LCS_ENTERED, + LCS_LEFT, + LCS_FINALIZED +}; + +/** + * lu_context. Execution context for lu_object methods. Currently associated + * with thread. 
+ * + * All lu_object methods, except device and device type methods (called during + * system initialization and shutdown) are executed "within" some + * lu_context. This means, that pointer to some "current" lu_context is passed + * as an argument to all methods. + * + * All service ptlrpc threads create lu_context as part of their + * initialization. It is possible to create "stand-alone" context for other + * execution environments (like system calls). + * + * lu_object methods mainly use lu_context through lu_context_key interface + * that allows each layer to associate arbitrary pieces of data with each + * context (see pthread_key_create(3) for similar interface). + * + * On a client, lu_context is bound to a thread, see cl_env_get(). + * + * \see lu_context_key + */ +struct lu_context { + /** + * lu_context is used on the client side too. Yet we don't want to + * allocate values of server-side keys for the client contexts and + * vice versa. + * + * To achieve this, a set of tags is introduced. Contexts and keys are + * marked with tags. Key values are created only for contexts whose set + * of tags has a non-empty intersection with that of the key. Tags are + * taken from enum lu_context_tag. + */ + __u32 lc_tags; + enum lu_context_state lc_state; + /** + * Pointer to the home service thread. NULL for other execution + * contexts. + */ + struct ptlrpc_thread *lc_thread; + /** + * Pointer to an array with key values. Internal implementation + * detail. + */ + void **lc_value; + /** + * Linkage into a list of all remembered contexts. Only + * `non-transient' contexts, i.e., ones created for service threads + * are placed here. + */ + struct list_head lc_remember; + /** + * Version counter used to skip calls to lu_context_refill() when no + * keys were registered. + */ + unsigned lc_version; + /** + * Debugging cookie. + */ + unsigned lc_cookie; +}; + +/** + * lu_context_key interface. Similar to pthread_key. 
+ */ + +enum lu_context_tag { + /** + * Thread on md server + */ + LCT_MD_THREAD = 1 << 0, + /** + * Thread on dt server + */ + LCT_DT_THREAD = 1 << 1, + /** + * Context for transaction handle + */ + LCT_TX_HANDLE = 1 << 2, + /** + * Thread on client + */ + LCT_CL_THREAD = 1 << 3, + /** + * A per-request session on a server, and a per-system-call session on + * a client. + */ + LCT_SESSION = 1 << 4, + /** + * A per-request data on OSP device + */ + LCT_OSP_THREAD = 1 << 5, + /** + * MGS device thread + */ + LCT_MG_THREAD = 1 << 6, + /** + * Context for local operations + */ + LCT_LOCAL = 1 << 7, + /** + * Set when at least one of keys, having values in this context has + * non-NULL lu_context_key::lct_exit() method. This is used to + * optimize lu_context_exit() call. + */ + LCT_HAS_EXIT = 1 << 28, + /** + * Don't add references for modules creating key values in that context. + * This is only for contexts used internally by lu_object framework. + */ + LCT_NOREF = 1 << 29, + /** + * Key is being prepared for retiring, don't create new values for it. + */ + LCT_QUIESCENT = 1 << 30, + /** + * Context should be remembered. + */ + LCT_REMEMBER = 1 << 31, + /** + * Contexts usable in cache shrinker thread. + */ + LCT_SHRINKER = LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD|LCT_NOREF +}; + +/** + * Key. Represents per-context value slot. + * + * Keys are usually registered when module owning the key is initialized, and + * de-registered when module is unloaded. Once key is registered, all new + * contexts with matching tags, will get key value. "Old" contexts, already + * initialized at the time of key registration, can be forced to get key value + * by calling lu_context_refill(). + * + * Every key value is counted in lu_context_key::lct_used and acquires a + * reference on an owning module. This means, that all key values have to be + * destroyed before module can be unloaded. 
This is usually achieved by + * stopping threads started by the module, that created contexts in their + * entry functions. Situation is complicated by the threads shared by multiple + * modules, like ptlrpcd daemon on a client. To work around this problem, + * contexts, created in such threads, are `remembered' (see + * LCT_REMEMBER)---i.e., added into a global list. When module is preparing + * for unloading it does the following: + * + * - marks its keys as `quiescent' (lu_context_tag::LCT_QUIESCENT) + * preventing new key values from being allocated in the new contexts, + * and + * + * - scans a list of remembered contexts, destroying values of module + * keys, thus releasing references to the module. + * + * This is done by lu_context_key_quiesce(). If module is re-activated + * before key has been de-registered, lu_context_key_revive() call clears + * `quiescent' marker. + * + * lu_context code doesn't provide any internal synchronization for these + * activities---it's assumed that startup (including threads start-up) and + * shutdown are serialized by some external means. + * + * \see lu_context + */ +struct lu_context_key { + /** + * Set of tags for which values of this key are to be instantiated. + */ + __u32 lct_tags; + /** + * Value constructor. This is called when a new value is created for a + * context. Returns a pointer to the new value or an error pointer. + */ + void *(*lct_init)(const struct lu_context *ctx, + struct lu_context_key *key); + /** + * Value destructor. Called when a context with a previously allocated + * value of this slot is destroyed. \a data is a value that was returned + * by a matching call to lu_context_key::lct_init(). + */ + void (*lct_fini)(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + /** + * Optional method called on lu_context_exit() for all allocated + * keys. Can be used by debugging code checking that locks are + * released, etc. 
+ */ + void (*lct_exit)(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + /** + * Internal implementation detail: index within lu_context::lc_value[] + * reserved for this key. + */ + int lct_index; + /** + * Internal implementation detail: number of values created for this + * key. + */ + atomic_t lct_used; + /** + * Internal implementation detail: module for this key. + */ + module_t *lct_owner; + /** + * References to this key. For debugging. + */ + struct lu_ref lct_reference; +}; + +#define LU_KEY_INIT(mod, type) \ + static void* mod##_key_init(const struct lu_context *ctx, \ + struct lu_context_key *key) \ + { \ + type *value; \ + \ + CLASSERT(PAGE_CACHE_SIZE >= sizeof (*value)); \ + \ + OBD_ALLOC_PTR(value); \ + if (value == NULL) \ + value = ERR_PTR(-ENOMEM); \ + \ + return value; \ + } \ + struct __##mod##__dummy_init {;} /* semicolon catcher */ + +#define LU_KEY_FINI(mod, type) \ + static void mod##_key_fini(const struct lu_context *ctx, \ + struct lu_context_key *key, void* data) \ + { \ + type *info = data; \ + \ + OBD_FREE_PTR(info); \ + } \ + struct __##mod##__dummy_fini {;} /* semicolon catcher */ + +#define LU_KEY_INIT_FINI(mod, type) \ + LU_KEY_INIT(mod,type); \ + LU_KEY_FINI(mod,type) + +#define LU_CONTEXT_KEY_DEFINE(mod, tags) \ + struct lu_context_key mod##_thread_key = { \ + .lct_tags = tags, \ + .lct_init = mod##_key_init, \ + .lct_fini = mod##_key_fini \ + } + +#define LU_CONTEXT_KEY_INIT(key) \ +do { \ + (key)->lct_owner = THIS_MODULE; \ +} while (0) + +int lu_context_key_register(struct lu_context_key *key); +void lu_context_key_degister(struct lu_context_key *key); +void *lu_context_key_get (const struct lu_context *ctx, + const struct lu_context_key *key); +void lu_context_key_quiesce (struct lu_context_key *key); +void lu_context_key_revive (struct lu_context_key *key); + + +/* + * LU_KEY_INIT_GENERIC() has to be a macro to correctly determine an + * owning module. 
+ */ + +#define LU_KEY_INIT_GENERIC(mod) \ + static void mod##_key_init_generic(struct lu_context_key *k, ...) \ + { \ + struct lu_context_key *key = k; \ + va_list args; \ + \ + va_start(args, k); \ + do { \ + LU_CONTEXT_KEY_INIT(key); \ + key = va_arg(args, struct lu_context_key *); \ + } while (key != NULL); \ + va_end(args); \ + } + +#define LU_TYPE_INIT(mod, ...) \ + LU_KEY_INIT_GENERIC(mod) \ + static int mod##_type_init(struct lu_device_type *t) \ + { \ + mod##_key_init_generic(__VA_ARGS__, NULL); \ + return lu_context_key_register_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_init {;} + +#define LU_TYPE_FINI(mod, ...) \ + static void mod##_type_fini(struct lu_device_type *t) \ + { \ + lu_context_key_degister_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_fini {;} + +#define LU_TYPE_START(mod, ...) \ + static void mod##_type_start(struct lu_device_type *t) \ + { \ + lu_context_key_revive_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_start {;} + +#define LU_TYPE_STOP(mod, ...) \ + static void mod##_type_stop(struct lu_device_type *t) \ + { \ + lu_context_key_quiesce_many(__VA_ARGS__, NULL); \ + } \ + struct __##mod##_dummy_type_stop {;} + + + +#define LU_TYPE_INIT_FINI(mod, ...) \ + LU_TYPE_INIT(mod, __VA_ARGS__); \ + LU_TYPE_FINI(mod, __VA_ARGS__); \ + LU_TYPE_START(mod, __VA_ARGS__); \ + LU_TYPE_STOP(mod, __VA_ARGS__) + +int lu_context_init (struct lu_context *ctx, __u32 tags); +void lu_context_fini (struct lu_context *ctx); +void lu_context_enter (struct lu_context *ctx); +void lu_context_exit (struct lu_context *ctx); +int lu_context_refill(struct lu_context *ctx); + +/* + * Helper functions to operate on multiple keys. These are used by the default + * device type operations, defined by LU_TYPE_INIT_FINI(). 
+ */ + +int lu_context_key_register_many(struct lu_context_key *k, ...); +void lu_context_key_degister_many(struct lu_context_key *k, ...); +void lu_context_key_revive_many (struct lu_context_key *k, ...); +void lu_context_key_quiesce_many (struct lu_context_key *k, ...); + +/* + * update/clear ctx/ses tags. + */ +void lu_context_tags_update(__u32 tags); +void lu_context_tags_clear(__u32 tags); +void lu_session_tags_update(__u32 tags); +void lu_session_tags_clear(__u32 tags); + +/** + * Environment. + */ +struct lu_env { + /** + * "Local" context, used to store data instead of stack. + */ + struct lu_context le_ctx; + /** + * "Session" context for per-request data. + */ + struct lu_context *le_ses; +}; + +int lu_env_init (struct lu_env *env, __u32 tags); +void lu_env_fini (struct lu_env *env); +int lu_env_refill(struct lu_env *env); +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, __u32 stags); + +/** @} lu_context */ + +/** + * Output site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int lu_site_stats_print(const struct lu_site *s, struct seq_file *m); + +/** + * Common name structure to be passed around for various name related methods. + */ +struct lu_name { + const char *ln_name; + int ln_namelen; +}; + +/** + * Common buffer structure to be passed around for various xattr_{s,g}et() + * methods. + */ +struct lu_buf { + void *lb_buf; + ssize_t lb_len; +}; + +/** + * printf-style format (DLUBUF) and matching argument list (PLUBUF) for a + * struct lu_buf. lu_buf::lb_len is a signed ssize_t, so it is printed with + * %zd rather than %zu. + */ +#define DLUBUF "(%p %zd)" +#define PLUBUF(buf) (buf)->lb_buf, (buf)->lb_len +/** + * One-time initializers, called at obdclass module initialization, not + * exported. + */ + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void); + +/** + * Dual to lu_global_init(). 
+ */ +void lu_global_fini(void); + +struct lu_kmem_descr { + struct kmem_cache **ckd_cache; + const char *ckd_name; + const size_t ckd_size; +}; + +int lu_kmem_init(struct lu_kmem_descr *caches); +void lu_kmem_fini(struct lu_kmem_descr *caches); + +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid); +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf); + +/** null buffer */ +extern struct lu_buf LU_BUF_NULL; + +void lu_buf_free(struct lu_buf *buf); +void lu_buf_alloc(struct lu_buf *buf, int size); +void lu_buf_realloc(struct lu_buf *buf, int size); + +int lu_buf_check_and_grow(struct lu_buf *buf, int len); +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len); + +/** @} lu */ +#endif /* __LUSTRE_LU_OBJECT_H */ diff --git a/drivers/staging/lustre/lustre/include/lu_ref.h b/drivers/staging/lustre/lustre/include/lu_ref.h new file mode 100644 index 000000000000..624c19be1524 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lu_ref.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + * + * Author: Nikita Danilov <nikita.danilov@sun.com> + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + */ + +#ifndef __LUSTRE_LU_REF_H +#define __LUSTRE_LU_REF_H + +#include <linux/list.h> + +/** \defgroup lu_ref lu_ref + * + * An interface to track references between objects. Mostly for debugging. + * + * Suppose there is a reference counted data-structure struct foo. To track + * who acquired references to an instance of struct foo, add a lu_ref field + * to it: + * + * \code + * struct foo { + * atomic_t foo_refcount; + * struct lu_ref foo_reference; + * ... + * }; + * \endcode + * + * foo::foo_reference has to be initialized by calling + * lu_ref_init(). Typically there will be functions or macros to increment and + * decrement foo::foo_refcount, let's say they are foo_get(struct foo *foo) + * and foo_put(struct foo *foo), respectively. + * + * Whenever foo_get() is called to acquire a reference on a foo, lu_ref_add() + * has to be called to insert into foo::foo_reference a record, describing + * acquired reference. Dually, lu_ref_del() removes matching record. Typical + * usages are: + * + * \code + * struct bar *bar; + * + * // bar owns a reference to foo. + * bar->bar_foo = foo_get(foo); + * lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del(&foo->foo_reference, "bar", bar); + * foo_put(bar->bar_foo); + * + * + * // current thread acquired a temporary reference to foo. + * foo_get(foo); + * lu_ref_add(&foo->foo_reference, __FUNCTION__, current); + * + * ... + * + * // temporary reference is released. + * lu_ref_del(&foo->foo_reference, __FUNCTION__, current); + * foo_put(foo); + * \endcode + * + * \e Et \e cetera. Often it makes sense to include lu_ref_add() and + * lu_ref_del() calls into foo_get() and foo_put(). 
When an instance of struct + * foo is destroyed, lu_ref_fini() has to be called that checks that no + * pending references remain. lu_ref_print() can be used to dump a list of + * pending references, while hunting down a leak. + * + * For objects to which a large number of references can be acquired, + * lu_ref_del() can become cpu consuming, as it has to scan the list of + * references. To work around this, remember result of lu_ref_add() (usually + * in the same place where pointer to struct foo is stored), and use + * lu_ref_del_at(): + * + * \code + * // There is a large number of bar's for a single foo. + * bar->bar_foo = foo_get(foo); + * bar->bar_foo_ref = lu_ref_add(&foo->foo_reference, "bar", bar); + * + * ... + * + * // reference from bar to foo is released. + * lu_ref_del_at(&foo->foo_reference, bar->bar_foo_ref, "bar", bar); + * foo_put(bar->bar_foo); + * \endcode + * + * lu_ref interface degrades gracefully in case of memory shortages. + * + * @{ + */ + + +struct lu_ref {}; + +static inline void lu_ref_init(struct lu_ref *ref) +{ +} + +static inline void lu_ref_fini(struct lu_ref *ref) +{ +} + +static inline struct lu_ref_link *lu_ref_add(struct lu_ref *ref, + const char *scope, + const void *source) +{ + return NULL; +} + +static inline struct lu_ref_link *lu_ref_add_atomic(struct lu_ref *ref, + const char *scope, + const void *source) +{ + return NULL; +} + +static inline void lu_ref_del(struct lu_ref *ref, const char *scope, + const void *source) +{ +} + +static inline void lu_ref_set_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source0, + const void *source1) +{ +} + +static inline void lu_ref_del_at(struct lu_ref *ref, struct lu_ref_link *link, + const char *scope, const void *source) +{ +} + +static inline int lu_ref_global_init(void) +{ + return 0; +} + +static inline void lu_ref_global_fini(void) +{ +} + +static inline void lu_ref_print(const struct lu_ref *ref) +{ +} + +static inline void 
lu_ref_print_all(void) +{ +} + +/** @} lu */ + +#endif /* __LUSTRE_LU_REF_H */ diff --git a/drivers/staging/lustre/lustre/include/lu_target.h b/drivers/staging/lustre/lustre/include/lu_target.h new file mode 100644 index 000000000000..8d48cf4e27ee --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lu_target.h @@ -0,0 +1,91 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _LUSTRE_LU_TARGET_H +#define _LUSTRE_LU_TARGET_H + +#include <dt_object.h> +#include <lustre_disk.h> + +struct lu_target { + struct obd_device *lut_obd; + struct dt_device *lut_bottom; + /** last_rcvd file */ + struct dt_object *lut_last_rcvd; + /* transaction callbacks */ + struct dt_txn_callback lut_txn_cb; + /** server data in last_rcvd file */ + struct lr_server_data lut_lsd; + /** Server last transaction number */ + __u64 lut_last_transno; + /** Lock protecting last transaction number */ + spinlock_t lut_translock; + /** Lock protecting client bitmap */ + spinlock_t lut_client_bitmap_lock; + /** Bitmap of known clients */ + unsigned long *lut_client_bitmap; +}; + +typedef void (*tgt_cb_t)(struct lu_target *lut, __u64 transno, + void *data, int err); +struct tgt_commit_cb { + tgt_cb_t tgt_cb_func; + void *tgt_cb_data; +}; + +void tgt_boot_epoch_update(struct lu_target *lut); +int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *lut, + struct obd_export *exp, __u64 transno); +int tgt_new_client_cb_add(struct thandle *th, struct obd_export *exp); +int tgt_init(const struct lu_env *env, struct lu_target *lut, + struct obd_device *obd, struct dt_device *dt); +void tgt_fini(const struct lu_env *env, struct lu_target *lut); +int tgt_client_alloc(struct obd_export *exp); +void tgt_client_free(struct obd_export *exp); +int tgt_client_del(const struct lu_env *env, struct obd_export *exp); +int tgt_client_add(const struct lu_env *env, struct obd_export *exp, int); +int tgt_client_new(const struct lu_env *env, struct obd_export *exp); +int tgt_client_data_read(const struct lu_env *env, struct lu_target *tg, + struct lsd_client_data *lcd, loff_t *off, int index); +int tgt_client_data_write(const struct lu_env *env, struct lu_target *tg, + struct lsd_client_data *lcd, loff_t *off, struct thandle *th); +int tgt_server_data_read(const struct lu_env *env, struct lu_target *tg); +int tgt_server_data_write(const struct lu_env *env, struct lu_target 
*tg, + struct thandle *th); +int tgt_server_data_update(const struct lu_env *env, struct lu_target *tg, int sync); +int tgt_truncate_last_rcvd(const struct lu_env *env, struct lu_target *tg, loff_t off); + +#endif /* __LUSTRE_LU_TARGET_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre/libiam.h b/drivers/staging/lustre/lustre/include/lustre/libiam.h new file mode 100644 index 000000000000..e8e0b084a6bc --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/libiam.h @@ -0,0 +1,145 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/include/lustre/libiam.h + * + * iam user level library + * + * Author: Wang Di <wangdi@clusterfs.com> + * Author: Nikita Danilov <nikita@clusterfs.com> + * Author: Fan Yong <fanyong@clusterfs.com> + */ + +/* + * lustre/libiam.h + */ + +#ifndef __IAM_ULIB_H__ +#define __IAM_ULIB_H__ + +/** \defgroup libiam libiam + * + * @{ + */ + + +#define DX_FMT_NAME_LEN 16 + +enum iam_fmt_t { + FMT_LFIX, + FMT_LVAR +}; + +struct iam_uapi_info { + __u16 iui_keysize; + __u16 iui_recsize; + __u16 iui_ptrsize; + __u16 iui_height; + char iui_fmt_name[DX_FMT_NAME_LEN]; +}; + +/* + * Create an iam file, but do NOT open it. + * Return 0 on success, else -1. + */ +int iam_creat(char *filename, enum iam_fmt_t fmt, + int blocksize, int keysize, int recsize, int ptrsize); + +/* + * Open an iam file, but do NOT create it if the file doesn't exist. + * Use iam_creat to create the file before calling iam_open. + * Return file id (fd) on success, else -1. + */ +int iam_open(char *filename, struct iam_uapi_info *ua); + +/* + * Close file opened by iam_open. + */ +int iam_close(int fd); + +/* + * Please use iam_open before use this function. + */ +int iam_insert(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Please use iam_open before use this function. + */ +int iam_lookup(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_delete(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Please use iam_open before use this function. 
+ */ +int iam_it_start(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_it_next(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *key_buf, + int *keysize, char *save_key, + int rec_need_convert, char *rec_buf, + int *recsize, char *save_rec); + +/* + * Please use iam_open before use this function. + */ +int iam_it_stop(int fd, struct iam_uapi_info *ua, + int key_need_convert, char *keybuf, + int rec_need_convert, char *recbuf); + +/* + * Change iam file mode. + */ +int iam_polymorph(char *filename, unsigned long mode); + +/** @} libiam */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h b/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h new file mode 100644 index 000000000000..707eb74fdf68 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/liblustreapi.h @@ -0,0 +1,43 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/* + * NOTE: This file is DEPRECATED! Please include lustreapi.h directly + * instead of this file. This file will be removed from a future version + * of lustre! + */ + +#ifndef _LIBLUSTREAPI_H_ +#define _LIBLUSTREAPI_H_ + +#include <lustre/lustreapi.h> +#warning "Including liblustreapi.h is deprecated. Include lustreapi.h directly." + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h b/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h new file mode 100644 index 000000000000..ad253c6deadd --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/ll_fiemap.h @@ -0,0 +1,121 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/ll_fiemap.h + * + * FIEMAP data structures and flags. This header file will be used until + * fiemap.h is available in the upstream kernel. + * + * Author: Kalpak Shah <kalpak.shah@sun.com> + * Author: Andreas Dilger <adilger@sun.com> + */ + +#ifndef _LUSTRE_FIEMAP_H +#define _LUSTRE_FIEMAP_H + + + +struct ll_fiemap_extent { + __u64 fe_logical; /* logical offset in bytes for the start of + * the extent from the beginning of the file */ + __u64 fe_physical; /* physical offset in bytes for the start + * of the extent from the beginning of the disk */ + __u64 fe_length; /* length in bytes for this extent */ + __u64 fe_reserved64[2]; + __u32 fe_flags; /* FIEMAP_EXTENT_* flags for this extent */ + __u32 fe_device; /* device number for this extent */ + __u32 fe_reserved[2]; +}; + +struct ll_user_fiemap { + __u64 fm_start; /* logical offset (inclusive) at + * which to start mapping (in) */ + __u64 fm_length; /* logical length of mapping which + * userspace wants (in) */ + __u32 fm_flags; /* FIEMAP_FLAG_* flags for request (in/out) */ + __u32 fm_mapped_extents;/* number of extents that were mapped (out) */ + __u32 fm_extent_count; /* size of fm_extents array (in) */ + __u32 fm_reserved; + struct ll_fiemap_extent fm_extents[0]; /* array of mapped extents (out) */ +}; + +#define FIEMAP_MAX_OFFSET (~0ULL) + +#define 
FIEMAP_FLAG_SYNC 0x00000001 /* sync file data before map */ +#define FIEMAP_FLAG_XATTR 0x00000002 /* map extended attribute tree */ + +#define FIEMAP_EXTENT_LAST 0x00000001 /* Last extent in file. */ +#define FIEMAP_EXTENT_UNKNOWN 0x00000002 /* Data location unknown. */ +#define FIEMAP_EXTENT_DELALLOC 0x00000004 /* Location still pending. + * Sets EXTENT_UNKNOWN. */ +#define FIEMAP_EXTENT_ENCODED 0x00000008 /* Data can not be read + * while fs is unmounted */ +#define FIEMAP_EXTENT_DATA_ENCRYPTED 0x00000080 /* Data is encrypted by fs. + * Sets EXTENT_NO_DIRECT. */ +#define FIEMAP_EXTENT_NOT_ALIGNED 0x00000100 /* Extent offsets may not be + * block aligned. */ +#define FIEMAP_EXTENT_DATA_INLINE 0x00000200 /* Data mixed with metadata. + * Sets EXTENT_NOT_ALIGNED.*/ +#define FIEMAP_EXTENT_DATA_TAIL 0x00000400 /* Multiple files in block. + * Sets EXTENT_NOT_ALIGNED.*/ +#define FIEMAP_EXTENT_UNWRITTEN 0x00000800 /* Space allocated, but + * no data (i.e. zero). */ +#define FIEMAP_EXTENT_MERGED 0x00001000 /* File does not natively + * support extents. Result + * merged for efficiency. */ + + +static inline size_t fiemap_count_to_size(size_t extent_count) +{ + return (sizeof(struct ll_user_fiemap) + extent_count * + sizeof(struct ll_fiemap_extent)); +} + +static inline unsigned fiemap_size_to_count(size_t array_size) +{ + return ((array_size - sizeof(struct ll_user_fiemap)) / + sizeof(struct ll_fiemap_extent)); +} + +#define FIEMAP_FLAG_DEVICE_ORDER 0x40000000 /* return device ordered mapping */ + +#ifdef FIEMAP_FLAGS_COMPAT +#undef FIEMAP_FLAGS_COMPAT +#endif + +/* Lustre specific flags - use a high bit, don't conflict with upstream flag */ +#define FIEMAP_EXTENT_NO_DIRECT 0x40000000 /* Data mapping undefined */ +#define FIEMAP_EXTENT_NET 0x80000000 /* Data stored remotely. 
+ * Sets NO_DIRECT flag */ + +#endif /* _LUSTRE_FIEMAP_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h b/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h new file mode 100644 index 000000000000..93a3d7db3010 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/lustre_build_version.h @@ -0,0 +1,2 @@ +#define BUILD_VERSION "v2_3_64_0-g6e62c21-CHANGED-3.9.0" +#define LUSTRE_RELEASE 3.9.0_g6e62c21 diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h new file mode 100644 index 000000000000..8825460f12ac --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/lustre_idl.h @@ -0,0 +1,3653 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/lustre_idl.h + * + * Lustre wire protocol definitions. + */ + +/** \defgroup lustreidl lustreidl + * + * Lustre wire protocol definitions. + * + * ALL structs passing over the wire should be declared here. Structs + * that are used in interfaces with userspace should go in lustre_user.h. + * + * All structs being declared here should be built from simple fixed-size + * types (__u8, __u16, __u32, __u64) or be built from other types or + * structs also declared in this file. Similarly, all flags and magic + * values in those structs should also be declared here. This ensures + * that the Lustre wire protocol is not influenced by external dependencies. + * + * The only other acceptable items in this file are VERY SIMPLE accessor + * functions to avoid callers grubbing inside the structures, and the + * prototypes of the swabber functions for each struct. Nothing that + * depends on external functions or definitions should be in here. + * + * Structs must be properly aligned to put 64-bit values on an 8-byte + * boundary. Any structs being added here must also be added to + * utils/wirecheck.c and "make newwiretest" run to regenerate the + * utils/wiretest.c sources. This allows us to verify that wire structs + * have the proper alignment/size on all architectures. + * + * DO NOT CHANGE any of the structs, flags, values declared here and used + * in released Lustre versions. Some structs may have padding fields that + * can be used. Some structs might allow addition at the end (verify this + * in the code to ensure that new/old clients that see this larger struct + * do not fail, otherwise you need to implement protocol compatibility). + * + * We assume all nodes are either little-endian or big-endian, and we + * always send messages in the sender's native format. 
The receiver + * detects the message format by checking the 'magic' field of the message + * (see lustre_msg_swabbed() below). + * + * Each wire type has corresponding 'lustre_swab_xxxtypexxx()' routines, + * implemented either here, inline (trivial implementations) or in + * ptlrpc/pack_generic.c. These 'swabbers' convert the type from "other" + * endian, in-place in the message buffer. + * + * A swabber takes a single pointer argument. The caller must already have + * verified that the length of the message buffer >= sizeof (type). + * + * For variable length types, a second 'lustre_swab_v_xxxtypexxx()' routine + * may be defined that swabs just the variable part, after the caller has + * verified that the message buffer is large enough. + * + * @{ + */ + +#ifndef _LUSTRE_IDL_H_ +#define _LUSTRE_IDL_H_ + +#if !defined(LASSERT) && !defined(LPU64) +#include <linux/libcfs/libcfs.h> /* for LASSERT, LPUX64, etc */ +#endif + +/* Defn's shared with user-space. */ +#include <lustre/lustre_user.h> + +/* + * GENERAL STUFF + */ +/* FOO_REQUEST_PORTAL is for incoming requests on the FOO + * FOO_REPLY_PORTAL is for incoming replies on the FOO + * FOO_BULK_PORTAL is for incoming bulk on the FOO + */ + +#define CONNMGR_REQUEST_PORTAL 1 +#define CONNMGR_REPLY_PORTAL 2 +//#define OSC_REQUEST_PORTAL 3 +#define OSC_REPLY_PORTAL 4 +//#define OSC_BULK_PORTAL 5 +#define OST_IO_PORTAL 6 +#define OST_CREATE_PORTAL 7 +#define OST_BULK_PORTAL 8 +//#define MDC_REQUEST_PORTAL 9 +#define MDC_REPLY_PORTAL 10 +//#define MDC_BULK_PORTAL 11 +#define MDS_REQUEST_PORTAL 12 +//#define MDS_REPLY_PORTAL 13 +#define MDS_BULK_PORTAL 14 +#define LDLM_CB_REQUEST_PORTAL 15 +#define LDLM_CB_REPLY_PORTAL 16 +#define LDLM_CANCEL_REQUEST_PORTAL 17 +#define LDLM_CANCEL_REPLY_PORTAL 18 +//#define PTLBD_REQUEST_PORTAL 19 +//#define PTLBD_REPLY_PORTAL 20 +//#define PTLBD_BULK_PORTAL 21 +#define MDS_SETATTR_PORTAL 22 +#define MDS_READPAGE_PORTAL 23 +#define MDS_MDS_PORTAL 24 + +#define MGC_REPLY_PORTAL 25 +#define 
MGS_REQUEST_PORTAL 26 +#define MGS_REPLY_PORTAL 27 +#define OST_REQUEST_PORTAL 28 +#define FLD_REQUEST_PORTAL 29 +#define SEQ_METADATA_PORTAL 30 +#define SEQ_DATA_PORTAL 31 +#define SEQ_CONTROLLER_PORTAL 32 +#define MGS_BULK_PORTAL 33 + +/* Portal 63 is reserved for the Cray Inc DVS - nic@cray.com, roe@cray.com, n8851@cray.com */ + +/* packet types */ +#define PTL_RPC_MSG_REQUEST 4711 +#define PTL_RPC_MSG_ERR 4712 +#define PTL_RPC_MSG_REPLY 4713 + +/* DON'T use swabbed values of MAGIC as magic! */ +#define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0 +#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3 + +#define LUSTRE_MSG_MAGIC_V1_SWABBED 0xD00BD00B +#define LUSTRE_MSG_MAGIC_V2_SWABBED 0xD30BD00B + +#define LUSTRE_MSG_MAGIC LUSTRE_MSG_MAGIC_V2 + +#define PTLRPC_MSG_VERSION 0x00000003 +#define LUSTRE_VERSION_MASK 0xffff0000 +#define LUSTRE_OBD_VERSION 0x00010000 +#define LUSTRE_MDS_VERSION 0x00020000 +#define LUSTRE_OST_VERSION 0x00030000 +#define LUSTRE_DLM_VERSION 0x00040000 +#define LUSTRE_LOG_VERSION 0x00050000 +#define LUSTRE_MGS_VERSION 0x00060000 + +typedef __u32 mdsno_t; +typedef __u64 seqno_t; +typedef __u64 obd_id; +typedef __u64 obd_seq; +typedef __s64 obd_time; +typedef __u64 obd_size; +typedef __u64 obd_off; +typedef __u64 obd_blocks; +typedef __u64 obd_valid; +typedef __u32 obd_blksize; +typedef __u32 obd_mode; +typedef __u32 obd_uid; +typedef __u32 obd_gid; +typedef __u32 obd_flag; +typedef __u32 obd_count; + +/** + * Describes a range of sequence, lsr_start is included but lsr_end is + * not in the range. + * Same structure is used in fld module where lsr_index field holds mdt id + * of the home mdt. 
+ */
+struct lu_seq_range {
+	__u64 lsr_start;	/* first sequence of the range (inclusive) */
+	__u64 lsr_end;		/* end of the range (exclusive) */
+	__u32 lsr_index;	/* index of the target the range belongs to */
+	__u32 lsr_flags;	/* LU_SEQ_RANGE_* type, see LU_SEQ_RANGE_MASK */
+};
+
+/* Range types, stored in the LU_SEQ_RANGE_MASK bits of lsr_flags. */
+#define LU_SEQ_RANGE_MDT	0x0
+#define LU_SEQ_RANGE_OST	0x1
+#define LU_SEQ_RANGE_ANY	0x3
+
+#define LU_SEQ_RANGE_MASK	0x3
+
+/** returns the LU_SEQ_RANGE_* type bits of \a range */
+static inline unsigned fld_range_type(const struct lu_seq_range *range)
+{
+	return range->lsr_flags & LU_SEQ_RANGE_MASK;
+}
+
+/** returns non-zero if the type of \a range is LU_SEQ_RANGE_OST */
+static inline int fld_range_is_ost(const struct lu_seq_range *range)
+{
+	return fld_range_type(range) == LU_SEQ_RANGE_OST;
+}
+
+/** returns non-zero if the type of \a range is LU_SEQ_RANGE_MDT */
+static inline int fld_range_is_mdt(const struct lu_seq_range *range)
+{
+	return fld_range_type(range) == LU_SEQ_RANGE_MDT;
+}
+
+/**
+ * This all range is only being used when fld client sends fld query request,
+ * but it does not know whether the seq is MDT or OST, so it will send req
+ * with ALL type, which means either seq type gotten from lookup can be
+ * expected.
+ */
+static inline unsigned fld_range_is_any(const struct lu_seq_range *range)
+{
+	return fld_range_type(range) == LU_SEQ_RANGE_ANY;
+}
+
+/**
+ * OR the type bits \a flags into \a range.
+ *
+ * \a flags must not have bits outside LU_SEQ_RANGE_MASK (asserted).
+ *
+ * NOTE(review): the previous type bits are not cleared first, so this only
+ * gives the requested type when the type bits of \a range are still zero;
+ * e.g. setting LU_SEQ_RANGE_MDT (0x0) never changes an existing type.
+ */
+static inline void fld_range_set_type(struct lu_seq_range *range,
+				      unsigned flags)
+{
+	LASSERT(!(flags & ~LU_SEQ_RANGE_MASK));
+	range->lsr_flags |= flags;
+}
+
+static inline void fld_range_set_mdt(struct lu_seq_range *range)
+{
+	fld_range_set_type(range, LU_SEQ_RANGE_MDT);
+}
+
+static inline void fld_range_set_ost(struct lu_seq_range *range)
+{
+	fld_range_set_type(range, LU_SEQ_RANGE_OST);
+}
+
+static inline void fld_range_set_any(struct lu_seq_range *range)
+{
+	fld_range_set_type(range, LU_SEQ_RANGE_ANY);
+}
+
+/**
+ * returns width of given range \a range
+ */
+
+static inline __u64 range_space(const struct lu_seq_range *range)
+{
+	return range->lsr_end - range->lsr_start;
+}
+
+/**
+ * initialize range to zero
+ *
+ * All four fields are cleared, including lsr_flags, so that a stale type
+ * left in the structure cannot survive re-initialization.
+ */
+
+static inline void range_init(struct lu_seq_range *range)
+{
+	range->lsr_start = 0;
+	range->lsr_end = 0;
+	range->lsr_index = 0;
+	range->lsr_flags = 0;
+}
+
+/**
+ * check if given seq id \a s is within given range \a range
+ */
+
+static inline int range_within(const struct lu_seq_range
*range, + __u64 s) +{ + return s >= range->lsr_start && s < range->lsr_end; +} + +static inline int range_is_sane(const struct lu_seq_range *range) +{ + return (range->lsr_end >= range->lsr_start); +} + +static inline int range_is_zero(const struct lu_seq_range *range) +{ + return (range->lsr_start == 0 && range->lsr_end == 0); +} + +static inline int range_is_exhausted(const struct lu_seq_range *range) + +{ + return range_space(range) == 0; +} + +/* return 0 if two range have the same location */ +static inline int range_compare_loc(const struct lu_seq_range *r1, + const struct lu_seq_range *r2) +{ + return r1->lsr_index != r2->lsr_index || + r1->lsr_flags != r2->lsr_flags; +} + +#define DRANGE "[%#16.16"LPF64"x-%#16.16"LPF64"x):%x:%s" + +#define PRANGE(range) \ + (range)->lsr_start, \ + (range)->lsr_end, \ + (range)->lsr_index, \ + fld_range_is_mdt(range) ? "mdt" : "ost" + + +/** \defgroup lu_fid lu_fid + * @{ */ + +/** + * Flags for lustre_mdt_attrs::lma_compat and lustre_mdt_attrs::lma_incompat. + * Deprecated since HSM and SOM attributes are now stored in separate on-disk + * xattr. + */ +enum lma_compat { + LMAC_HSM = 0x00000001, + LMAC_SOM = 0x00000002, +}; + +/** + * Masks for all features that should be supported by a Lustre version to + * access a specific file. + * This information is stored in lustre_mdt_attrs::lma_incompat. + */ +enum lma_incompat { + LMAI_RELEASED = 0x0000001, /* file is released */ + LMAI_AGENT = 0x00000002, /* agent inode */ + LMAI_REMOTE_PARENT = 0x00000004, /* the parent of the object + is on the remote MDT */ +}; +#define LMA_INCOMPAT_SUPP (LMAI_AGENT | LMAI_REMOTE_PARENT) + +extern void lustre_lma_swab(struct lustre_mdt_attrs *lma); +extern void lustre_lma_init(struct lustre_mdt_attrs *lma, + const struct lu_fid *fid, __u32 incompat); +/** + * SOM on-disk attributes stored in a separate xattr. + */ +struct som_attrs { + /** Bitfield for supported data in this structure. For future use. 
*/ + __u32 som_compat; + + /** Incompat feature list. The supported feature mask is availabe in + * SOM_INCOMPAT_SUPP */ + __u32 som_incompat; + + /** IO Epoch SOM attributes belongs to */ + __u64 som_ioepoch; + /** total file size in objects */ + __u64 som_size; + /** total fs blocks in objects */ + __u64 som_blocks; + /** mds mount id the size is valid for */ + __u64 som_mountid; +}; +extern void lustre_som_swab(struct som_attrs *attrs); + +#define SOM_INCOMPAT_SUPP 0x0 + +/** + * HSM on-disk attributes stored in a separate xattr. + */ +struct hsm_attrs { + /** Bitfield for supported data in this structure. For future use. */ + __u32 hsm_compat; + + /** HSM flags, see hsm_flags enum below */ + __u32 hsm_flags; + /** backend archive id associated with the file */ + __u64 hsm_arch_id; + /** version associated with the last archiving, if any */ + __u64 hsm_arch_ver; +}; +extern void lustre_hsm_swab(struct hsm_attrs *attrs); + +/** + * fid constants + */ +enum { + /** LASTID file has zero OID */ + LUSTRE_FID_LASTID_OID = 0UL, + /** initial fid id value */ + LUSTRE_FID_INIT_OID = 1UL +}; + +/** returns fid object sequence */ +static inline __u64 fid_seq(const struct lu_fid *fid) +{ + return fid->f_seq; +} + +/** returns fid object id */ +static inline __u32 fid_oid(const struct lu_fid *fid) +{ + return fid->f_oid; +} + +/** returns fid object version */ +static inline __u32 fid_ver(const struct lu_fid *fid) +{ + return fid->f_ver; +} + +static inline void fid_zero(struct lu_fid *fid) +{ + memset(fid, 0, sizeof(*fid)); +} + +static inline obd_id fid_ver_oid(const struct lu_fid *fid) +{ + return ((__u64)fid_ver(fid) << 32 | fid_oid(fid)); +} + +/** + * Note that reserved SEQ numbers below 12 will conflict with ldiskfs + * inodes in the IGIF namespace, so these reserved SEQ numbers can be + * used for other purposes and not risk collisions with existing inodes. 
+ * + * Different FID Format + * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs#NEW.0 + */ +enum fid_seq { + FID_SEQ_OST_MDT0 = 0, + FID_SEQ_LLOG = 1, /* unnamed llogs */ + FID_SEQ_ECHO = 2, + FID_SEQ_OST_MDT1 = 3, + FID_SEQ_OST_MAX = 9, /* Max MDT count before OST_on_FID */ + FID_SEQ_LLOG_NAME = 10, /* named llogs */ + FID_SEQ_RSVD = 11, + FID_SEQ_IGIF = 12, + FID_SEQ_IGIF_MAX = 0x0ffffffffULL, + FID_SEQ_IDIF = 0x100000000ULL, + FID_SEQ_IDIF_MAX = 0x1ffffffffULL, + /* Normal FID sequence starts from this value, i.e. 1<<33 */ + FID_SEQ_START = 0x200000000ULL, + /* sequence for local pre-defined FIDs listed in local_oid */ + FID_SEQ_LOCAL_FILE = 0x200000001ULL, + FID_SEQ_DOT_LUSTRE = 0x200000002ULL, + /* sequence is used for local named objects FIDs generated + * by local_object_storage library */ + FID_SEQ_LOCAL_NAME = 0x200000003ULL, + /* Because current FLD will only cache the fid sequence, instead + * of oid on the client side, if the FID needs to be exposed to + * clients sides, it needs to make sure all of fids under one + * sequence will be located in one MDT. 
*/
+	FID_SEQ_SPECIAL		= 0x200000004ULL,
+	FID_SEQ_QUOTA		= 0x200000005ULL,
+	FID_SEQ_QUOTA_GLB	= 0x200000006ULL,
+	FID_SEQ_ROOT		= 0x200000007ULL,  /* Located on MDT0 */
+	FID_SEQ_NORMAL		= 0x200000400ULL,
+	FID_SEQ_LOV_DEFAULT	= 0xffffffffffffffffULL
+};
+
+/* Width, exclusive upper bound and mask of the object id in OBIF and IDIF. */
+#define OBIF_OID_MAX_BITS	32
+#define OBIF_MAX_OID		(1ULL << OBIF_OID_MAX_BITS)
+#define OBIF_OID_MASK		((1ULL << OBIF_OID_MAX_BITS) - 1)
+#define IDIF_OID_MAX_BITS	48
+#define IDIF_MAX_OID		(1ULL << IDIF_OID_MAX_BITS)
+#define IDIF_OID_MASK		((1ULL << IDIF_OID_MAX_BITS) - 1)
+
+/** OID for FID_SEQ_SPECIAL */
+enum special_oid {
+	/* Big Filesystem Lock to serialize rename operations */
+	FID_OID_SPECIAL_BFL	= 1UL,
+};
+
+/** OID for FID_SEQ_DOT_LUSTRE */
+enum dot_lustre_oid {
+	FID_OID_DOT_LUSTRE	= 1UL,
+	FID_OID_DOT_LUSTRE_OBF	= 2UL,
+};
+
+/** returns non-zero if \a seq is FID_SEQ_OST_MDT0 */
+static inline int fid_seq_is_mdt0(obd_seq seq)
+{
+	return (seq == FID_SEQ_OST_MDT0);
+}
+
+/** returns non-zero if \a seq is FID_SEQ_OST_MDT0 or at least FID_SEQ_NORMAL */
+static inline int fid_seq_is_mdt(const __u64 seq)
+{
+	return seq == FID_SEQ_OST_MDT0 || seq >= FID_SEQ_NORMAL;
+}
+
+/** returns non-zero if \a seq is FID_SEQ_ECHO */
+static inline int fid_seq_is_echo(obd_seq seq)
+{
+	return (seq == FID_SEQ_ECHO);
+}
+
+static inline int fid_is_echo(const struct lu_fid *fid)
+{
+	return fid_seq_is_echo(fid_seq(fid));
+}
+
+/** returns non-zero if \a seq is FID_SEQ_LLOG */
+static inline int fid_seq_is_llog(obd_seq seq)
+{
+	return (seq == FID_SEQ_LLOG);
+}
+
+static inline int fid_is_llog(const struct lu_fid *fid)
+{
+	/* file with OID == 0 is not llog but contains last oid */
+	return fid_seq_is_llog(fid_seq(fid)) && fid_oid(fid) > 0;
+}
+
+/** returns non-zero if \a seq is in (FID_SEQ_OST_MDT0, FID_SEQ_RSVD] */
+static inline int fid_seq_is_rsvd(const __u64 seq)
+{
+	return (seq > FID_SEQ_OST_MDT0 && seq <= FID_SEQ_RSVD);
+}
+
+/** returns non-zero if \a seq is FID_SEQ_SPECIAL */
+static inline int fid_seq_is_special(const __u64 seq)
+{
+	return seq == FID_SEQ_SPECIAL;
+}
+
+/** returns non-zero if \a seq is FID_SEQ_LOCAL_FILE or FID_SEQ_LOCAL_NAME */
+static inline int fid_seq_is_local_file(const __u64 seq)
+{
+	return seq == FID_SEQ_LOCAL_FILE ||
+	       seq == FID_SEQ_LOCAL_NAME;
+}
+
+/** returns non-zero if \a seq is FID_SEQ_ROOT */
+static inline int fid_seq_is_root(const __u64 seq)
+{
+	return seq == FID_SEQ_ROOT;
+}
+
+/** returns non-zero if \a seq is FID_SEQ_DOT_LUSTRE */
+static inline int fid_seq_is_dot(const __u64 seq)
+{
+	return seq ==
FID_SEQ_DOT_LUSTRE;
+}
+
+/** returns non-zero if \a seq is FID_SEQ_LOV_DEFAULT */
+static inline int fid_seq_is_default(const __u64 seq)
+{
+	return seq == FID_SEQ_LOV_DEFAULT;
+}
+
+static inline int fid_is_mdt0(const struct lu_fid *fid)
+{
+	return fid_seq_is_mdt0(fid_seq(fid));
+}
+
+/** fill \a fid with the root FID: sequence FID_SEQ_ROOT, oid 1, version 0 */
+static inline void lu_root_fid(struct lu_fid *fid)
+{
+	fid->f_seq = FID_SEQ_ROOT;
+	fid->f_oid = 1;
+	fid->f_ver = 0;
+}
+
+/**
+ * Check if a sequence is in the igif range or not.
+ * \param seq the sequence to be tested.
+ * \return true if FID_SEQ_IGIF <= \a seq <= FID_SEQ_IGIF_MAX; otherwise false.
+ */
+static inline int fid_seq_is_igif(const __u64 seq)
+{
+	return seq >= FID_SEQ_IGIF && seq <= FID_SEQ_IGIF_MAX;
+}
+
+/**
+ * Check if a fid is igif or not.
+ * \param fid the fid to be tested.
+ * \return true if the fid is a igif; otherwise false.
+ */
+static inline int fid_is_igif(const struct lu_fid *fid)
+{
+	return fid_seq_is_igif(fid_seq(fid));
+}
+
+/**
+ * Check if a sequence is in the idif range or not.
+ * \param seq the sequence to be tested.
+ * \return true if FID_SEQ_IDIF <= \a seq <= FID_SEQ_IDIF_MAX; otherwise false.
+ */
+static inline int fid_seq_is_idif(const __u64 seq)
+{
+	return seq >= FID_SEQ_IDIF && seq <= FID_SEQ_IDIF_MAX;
+}
+
+/**
+ * Check if a fid is idif or not.
+ * \param fid the fid to be tested.
+ * \return true if the fid is a idif; otherwise false.
+ */
+static inline int fid_is_idif(const struct lu_fid *fid)
+{
+	return fid_seq_is_idif(fid_seq(fid));
+}
+
+static inline int fid_is_local_file(const struct lu_fid *fid)
+{
+	return fid_seq_is_local_file(fid_seq(fid));
+}
+
+/** returns non-zero if \a seq is at least FID_SEQ_NORMAL */
+static inline int fid_seq_is_norm(const __u64 seq)
+{
+	return (seq >= FID_SEQ_NORMAL);
+}
+
+static inline int fid_is_norm(const struct lu_fid *fid)
+{
+	return fid_seq_is_norm(fid_seq(fid));
+}
+
+/* convert an OST objid into an IDIF FID SEQ number
+ *
+ * Layout of the result: FID_SEQ_IDIF, with ost_idx in bits 16..31 and
+ * bits 32..47 of the object id in bits 0..15.  ost_idx is shifted as a
+ * 32-bit value, so only its low 16 bits survive; callers are expected to
+ * pass ost_idx <= 0xffff. */
+static inline obd_seq fid_idif_seq(obd_id id, __u32 ost_idx)
+{
+	return FID_SEQ_IDIF | (ost_idx << 16) | ((id >> 32) & 0xffff);
+}
+
+/* convert a packed IDIF FID into an OST objid
+ *
+ * Inverse packing of fid_idif_seq(): ver supplies bits 48..63, the low
+ * 16 bits of seq supply bits 32..47 and oid supplies bits 0..31. */
+static inline obd_id fid_idif_id(obd_seq seq, __u32 oid, __u32 ver)
+{
+	return ((__u64)ver << 48) | ((seq & 0xffff) << 32) | oid;
+}
+
+/* extract ost index from IDIF FID (bits 16..31 of the sequence);
+ * asserts that fid really is an IDIF */
+static inline __u32 fid_idif_ost_idx(const struct lu_fid *fid)
+{
+	LASSERT(fid_is_idif(fid));
+	return (fid_seq(fid) >> 16) & 0xffff;
+}
+
+/* extract OST sequence (group) from a wire
ost_id (id/seq) pair */ +static inline obd_seq ostid_seq(const struct ost_id *ostid) +{ + if (fid_seq_is_mdt0(ostid->oi.oi_seq)) + return FID_SEQ_OST_MDT0; + + if (fid_seq_is_default(ostid->oi.oi_seq)) + return FID_SEQ_LOV_DEFAULT; + + if (fid_is_idif(&ostid->oi_fid)) + return FID_SEQ_OST_MDT0; + + return fid_seq(&ostid->oi_fid); +} + +/* extract OST objid from a wire ost_id (id/seq) pair */ +static inline obd_id ostid_id(const struct ost_id *ostid) +{ + if (fid_seq_is_mdt0(ostid_seq(ostid))) + return ostid->oi.oi_id & IDIF_OID_MASK; + + if (fid_is_idif(&ostid->oi_fid)) + return fid_idif_id(fid_seq(&ostid->oi_fid), + fid_oid(&ostid->oi_fid), 0); + + return fid_oid(&ostid->oi_fid); +} + +static inline void ostid_set_seq(struct ost_id *oi, __u64 seq) +{ + if (fid_seq_is_mdt0(seq) || fid_seq_is_default(seq)) { + oi->oi.oi_seq = seq; + } else { + oi->oi_fid.f_seq = seq; + /* Note: if f_oid + f_ver is zero, we need init it + * to be 1, otherwise, ostid_seq will treat this + * as old ostid (oi_seq == 0) */ + if (oi->oi_fid.f_oid == 0 && oi->oi_fid.f_ver == 0) + oi->oi_fid.f_oid = LUSTRE_FID_INIT_OID; + } +} + +static inline void ostid_set_seq_mdt0(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_OST_MDT0); +} + +static inline void ostid_set_seq_echo(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_ECHO); +} + +static inline void ostid_set_seq_llog(struct ost_id *oi) +{ + ostid_set_seq(oi, FID_SEQ_LLOG); +} + +/** + * Note: we need check oi_seq to decide where to set oi_id, + * so oi_seq should always be set ahead of oi_id. 
+ */ +static inline void ostid_set_id(struct ost_id *oi, __u64 oid) +{ + if (fid_seq_is_mdt0(ostid_seq(oi))) { + if (oid >= IDIF_MAX_OID) { + CERROR("Bad "LPU64" to set "DOSTID"\n", + oid, POSTID(oi)); + return; + } + oi->oi.oi_id = oid; + } else { + if (oid > OBIF_MAX_OID) { + CERROR("Bad "LPU64" to set "DOSTID"\n", + oid, POSTID(oi)); + return; + } + oi->oi_fid.f_oid = oid; + } +} + +static inline void ostid_inc_id(struct ost_id *oi) +{ + if (fid_seq_is_mdt0(ostid_seq(oi))) { + if (unlikely(ostid_id(oi) + 1 > IDIF_MAX_OID)) { + CERROR("Bad inc "DOSTID"\n", POSTID(oi)); + return; + } + oi->oi.oi_id++; + } else { + oi->oi_fid.f_oid++; + } +} + +static inline void ostid_dec_id(struct ost_id *oi) +{ + if (fid_seq_is_mdt0(ostid_seq(oi))) + oi->oi.oi_id--; + else + oi->oi_fid.f_oid--; +} + +/** + * Unpack an OST object id/seq (group) into a FID. This is needed for + * converting all obdo, lmm, lsm, etc. 64-bit id/seq pairs into proper + * FIDs. Note that if an id/seq is already in FID/IDIF format it will + * be passed through unchanged. Only legacy OST objects in "group 0" + * will be mapped into the IDIF namespace so that they can fit into the + * struct lu_fid fields without loss. For reference see: + * http://arch.lustre.org/index.php?title=Interoperability_fids_zfs + */ +static inline int ostid_to_fid(struct lu_fid *fid, struct ost_id *ostid, + __u32 ost_idx) +{ + if (ost_idx > 0xffff) { + CERROR("bad ost_idx, "DOSTID" ost_idx:%u\n", POSTID(ostid), + ost_idx); + return -EBADF; + } + + if (fid_seq_is_mdt0(ostid_seq(ostid))) { + /* This is a "legacy" (old 1.x/2.early) OST object in "group 0" + * that we map into the IDIF namespace. It allows up to 2^48 + * objects per OST, as this is the object namespace that has + * been in production for years. This can handle create rates + * of 1M objects/s/OST for 9 years, or combinations thereof. 
*/ + if (ostid_id(ostid) >= IDIF_MAX_OID) { + CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n", + POSTID(ostid), ost_idx); + return -EBADF; + } + fid->f_seq = fid_idif_seq(ostid_id(ostid), ost_idx); + /* truncate to 32 bits by assignment */ + fid->f_oid = ostid_id(ostid); + /* in theory, not currently used */ + fid->f_ver = ostid_id(ostid) >> 48; + } else /* if (fid_seq_is_idif(seq) || fid_seq_is_norm(seq)) */ { + /* This is either an IDIF object, which identifies objects across + * all OSTs, or a regular FID. The IDIF namespace maps legacy + * OST objects into the FID namespace. In both cases, we just + * pass the FID through, no conversion needed. */ + if (ostid->oi_fid.f_ver != 0) { + CERROR("bad MDT0 id, "DOSTID" ost_idx:%u\n", + POSTID(ostid), ost_idx); + return -EBADF; + } + *fid = ostid->oi_fid; + } + + return 0; +} + +/* pack any OST FID into an ostid (id/seq) for the wire/disk */ +static inline int fid_to_ostid(const struct lu_fid *fid, struct ost_id *ostid) +{ + if (unlikely(fid_seq_is_igif(fid->f_seq))) { + CERROR("bad IGIF, "DFID"\n", PFID(fid)); + return -EBADF; + } + + if (fid_is_idif(fid)) { + ostid_set_seq_mdt0(ostid); + ostid_set_id(ostid, fid_idif_id(fid_seq(fid), fid_oid(fid), + fid_ver(fid))); + } else { + ostid->oi_fid = *fid; + } + + return 0; +} + +/* Check whether the fid is for LAST_ID */ +static inline int fid_is_last_id(const struct lu_fid *fid) +{ + return (fid_oid(fid) == 0); +} + +/** + * Get inode number from a igif. + * \param fid a igif to get inode number from. + * \return inode number for the igif. + */ +static inline ino_t lu_igif_ino(const struct lu_fid *fid) +{ + return fid_seq(fid); +} + +extern void lustre_swab_ost_id(struct ost_id *oid); + +/** + * Get inode generation from a igif. + * \param fid a igif to get inode generation from. + * \return inode generation for the igif. + */ +static inline __u32 lu_igif_gen(const struct lu_fid *fid) +{ + return fid_oid(fid); +} + +/** + * Build igif from the inode number/generation. 
+ */ +static inline void lu_igif_build(struct lu_fid *fid, __u32 ino, __u32 gen) +{ + fid->f_seq = ino; + fid->f_oid = gen; + fid->f_ver = 0; +} + +/* + * Fids are transmitted across network (in the sender byte-ordering), + * and stored on disk in big-endian order. + */ +static inline void fid_cpu_to_le(struct lu_fid *dst, const struct lu_fid *src) +{ + /* check that all fields are converted */ + CLASSERT(sizeof *src == + sizeof fid_seq(src) + + sizeof fid_oid(src) + sizeof fid_ver(src)); + dst->f_seq = cpu_to_le64(fid_seq(src)); + dst->f_oid = cpu_to_le32(fid_oid(src)); + dst->f_ver = cpu_to_le32(fid_ver(src)); +} + +static inline void fid_le_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + /* check that all fields are converted */ + CLASSERT(sizeof *src == + sizeof fid_seq(src) + + sizeof fid_oid(src) + sizeof fid_ver(src)); + dst->f_seq = le64_to_cpu(fid_seq(src)); + dst->f_oid = le32_to_cpu(fid_oid(src)); + dst->f_ver = le32_to_cpu(fid_ver(src)); +} + +static inline void fid_cpu_to_be(struct lu_fid *dst, const struct lu_fid *src) +{ + /* check that all fields are converted */ + CLASSERT(sizeof *src == + sizeof fid_seq(src) + + sizeof fid_oid(src) + sizeof fid_ver(src)); + dst->f_seq = cpu_to_be64(fid_seq(src)); + dst->f_oid = cpu_to_be32(fid_oid(src)); + dst->f_ver = cpu_to_be32(fid_ver(src)); +} + +static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src) +{ + /* check that all fields are converted */ + CLASSERT(sizeof *src == + sizeof fid_seq(src) + + sizeof fid_oid(src) + sizeof fid_ver(src)); + dst->f_seq = be64_to_cpu(fid_seq(src)); + dst->f_oid = be32_to_cpu(fid_oid(src)); + dst->f_ver = be32_to_cpu(fid_ver(src)); +} + +static inline int fid_is_sane(const struct lu_fid *fid) +{ + return fid != NULL && + ((fid_seq(fid) >= FID_SEQ_START && fid_ver(fid) == 0) || + fid_is_igif(fid) || fid_is_idif(fid) || + fid_seq_is_rsvd(fid_seq(fid))); +} + +static inline int fid_is_zero(const struct lu_fid *fid) +{ + return fid_seq(fid) == 0 
&& fid_oid(fid) == 0; +} + +extern void lustre_swab_lu_fid(struct lu_fid *fid); +extern void lustre_swab_lu_seq_range(struct lu_seq_range *range); + +static inline int lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1) +{ + /* Check that there is no alignment padding. */ + CLASSERT(sizeof *f0 == + sizeof f0->f_seq + sizeof f0->f_oid + sizeof f0->f_ver); + return memcmp(f0, f1, sizeof *f0) == 0; +} + +#define __diff_normalize(val0, val1) \ +({ \ + typeof(val0) __val0 = (val0); \ + typeof(val1) __val1 = (val1); \ + \ + (__val0 == __val1 ? 0 : __val0 > __val1 ? +1 : -1); \ +}) + +static inline int lu_fid_cmp(const struct lu_fid *f0, + const struct lu_fid *f1) +{ + return + __diff_normalize(fid_seq(f0), fid_seq(f1)) ?: + __diff_normalize(fid_oid(f0), fid_oid(f1)) ?: + __diff_normalize(fid_ver(f0), fid_ver(f1)); +} + +static inline void ostid_cpu_to_le(struct ost_id *src_oi, + struct ost_id *dst_oi) +{ + if (fid_seq_is_mdt0(ostid_seq(src_oi))) { + dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq); + } else { + fid_cpu_to_le(&dst_oi->oi_fid, &src_oi->oi_fid); + } +} + +static inline void ostid_le_to_cpu(struct ost_id *src_oi, + struct ost_id *dst_oi) +{ + if (fid_seq_is_mdt0(ostid_seq(src_oi))) { + dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq); + } else { + fid_le_to_cpu(&dst_oi->oi_fid, &src_oi->oi_fid); + } +} + +/** @} lu_fid */ + +/** \defgroup lu_dir lu_dir + * @{ */ + +/** + * Enumeration of possible directory entry attributes. + * + * Attributes follow directory entry header in the order they appear in this + * enumeration. 
+ */
+enum lu_dirent_attrs {
+	LUDA_FID		= 0x0001,
+	LUDA_TYPE		= 0x0002,
+	LUDA_64BITHASH		= 0x0004,
+
+	/* The following attrs are used for MDT internal only,
+	 * not visible to client */
+
+	/* Verify the dirent consistency */
+	LUDA_VERIFY		= 0x8000,
+	/* Only check but not repair the dirent inconsistency */
+	LUDA_VERIFY_DRYRUN	= 0x4000,
+	/* The dirent has been repaired, or to be repaired (dryrun). */
+	LUDA_REPAIR		= 0x2000,
+	/* The system is upgraded, has been or to be repaired (dryrun). */
+	LUDA_UPGRADE		= 0x1000,
+	/* Ignore this record, go to next directly. */
+	LUDA_IGNORE		= 0x0800,
+};
+
+/* All MDT-internal attribute bits of enum lu_dirent_attrs, i.e.
+ * LUDA_VERIFY | LUDA_VERIFY_DRYRUN | LUDA_REPAIR | LUDA_UPGRADE |
+ * LUDA_IGNORE. */
+#define LU_DIRENT_ATTRS_MASK	0xf800
+
+/**
+ * Layout of readdir pages, as transmitted on wire.
+ */
+struct lu_dirent {
+	/** valid if LUDA_FID is set. */
+	struct lu_fid lde_fid;
+	/** a unique entry identifier: a hash or an offset. */
+	__u64	 lde_hash;
+	/** total record length, including all attributes. */
+	__u16	 lde_reclen;
+	/** name length */
+	__u16	 lde_namelen;
+	/** optional variable size attributes following this entry.
+	 *  taken from enum lu_dirent_attrs.
+	 */
+	__u32	 lde_attrs;
+	/** name is followed by the attributes indicated in ->lde_attrs, in
+	 *  their natural order. After the last attribute, padding bytes are
+	 *  added to make ->lde_reclen a multiple of 8.
+	 */
+	char	  lde_name[0];
+};
+
+/*
+ * Definitions of optional directory entry attributes formats.
+ *
+ * Individual attributes do not have their length encoded in a generic way. It
+ * is assumed that the consumer of an attribute knows its format. This means
+ * that it is impossible to skip over an unknown attribute, except by skipping
+ * over all remaining attributes (by using ->lde_reclen), which is not too
+ * constraining, because new server versions will append new attributes at
+ * the end of an entry.
+ */
+
+/**
+ * Fid directory attribute: a fid of an object referenced by the entry. This
+ * will be almost always requested by the client and supplied by the server.
+ *
+ * Aligned to 8 bytes.
+ */ +/* To have compatibility with 1.8, lets have fid in lu_dirent struct. */ + +/** + * File type. + * + * Aligned to 2 bytes. + */ +struct luda_type { + __u16 lt_type; +}; + +struct lu_dirpage { + __u64 ldp_hash_start; + __u64 ldp_hash_end; + __u32 ldp_flags; + __u32 ldp_pad0; + struct lu_dirent ldp_entries[0]; +}; + +enum lu_dirpage_flags { + /** + * dirpage contains no entry. + */ + LDF_EMPTY = 1 << 0, + /** + * last entry's lde_hash equals ldp_hash_end. + */ + LDF_COLLIDE = 1 << 1 +}; + +static inline struct lu_dirent *lu_dirent_start(struct lu_dirpage *dp) +{ + if (le32_to_cpu(dp->ldp_flags) & LDF_EMPTY) + return NULL; + else + return dp->ldp_entries; +} + +static inline struct lu_dirent *lu_dirent_next(struct lu_dirent *ent) +{ + struct lu_dirent *next; + + if (le16_to_cpu(ent->lde_reclen) != 0) + next = ((void *)ent) + le16_to_cpu(ent->lde_reclen); + else + next = NULL; + + return next; +} + +static inline int lu_dirent_calc_size(int namelen, __u16 attr) +{ + int size; + + if (attr & LUDA_TYPE) { + const unsigned align = sizeof(struct luda_type) - 1; + size = (sizeof(struct lu_dirent) + namelen + align) & ~align; + size += sizeof(struct luda_type); + } else + size = sizeof(struct lu_dirent) + namelen; + + return (size + 7) & ~7; +} + +static inline int lu_dirent_size(struct lu_dirent *ent) +{ + if (le16_to_cpu(ent->lde_reclen) == 0) { + return lu_dirent_calc_size(le16_to_cpu(ent->lde_namelen), + le32_to_cpu(ent->lde_attrs)); + } + return le16_to_cpu(ent->lde_reclen); +} + +#define MDS_DIR_END_OFF 0xfffffffffffffffeULL + +/** + * MDS_READPAGE page size + * + * This is the directory page size packed in MDS_READPAGE RPC. + * It's different than PAGE_CACHE_SIZE because the client needs to + * access the struct lu_dirpage header packed at the beginning of + * the "page" and without this there isn't any way to know find the + * lu_dirpage header is if client and server PAGE_CACHE_SIZE differ. 
+ */ +#define LU_PAGE_SHIFT 12 +#define LU_PAGE_SIZE (1UL << LU_PAGE_SHIFT) +#define LU_PAGE_MASK (~(LU_PAGE_SIZE - 1)) + +#define LU_PAGE_COUNT (1 << (PAGE_CACHE_SHIFT - LU_PAGE_SHIFT)) + +/** @} lu_dir */ + +struct lustre_handle { + __u64 cookie; +}; +#define DEAD_HANDLE_MAGIC 0xdeadbeefcafebabeULL + +static inline int lustre_handle_is_used(struct lustre_handle *lh) +{ + return lh->cookie != 0ull; +} + +static inline int lustre_handle_equal(const struct lustre_handle *lh1, + const struct lustre_handle *lh2) +{ + return lh1->cookie == lh2->cookie; +} + +static inline void lustre_handle_copy(struct lustre_handle *tgt, + struct lustre_handle *src) +{ + tgt->cookie = src->cookie; +} + +/* flags for lm_flags */ +#define MSGHDR_AT_SUPPORT 0x1 +#define MSGHDR_CKSUM_INCOMPAT18 0x2 + +#define lustre_msg lustre_msg_v2 +/* we depend on this structure to be 8-byte aligned */ +/* this type is only endian-adjusted in lustre_unpack_msg() */ +struct lustre_msg_v2 { + __u32 lm_bufcount; + __u32 lm_secflvr; + __u32 lm_magic; + __u32 lm_repsize; + __u32 lm_cksum; + __u32 lm_flags; + __u32 lm_padding_2; + __u32 lm_padding_3; + __u32 lm_buflens[0]; +}; + +/* without gss, ptlrpc_body is put at the first buffer. 
*/ +#define PTLRPC_NUM_VERSIONS 4 +#define JOBSTATS_JOBID_SIZE 32 /* 32 bytes string */ +struct ptlrpc_body_v3 { + struct lustre_handle pb_handle; + __u32 pb_type; + __u32 pb_version; + __u32 pb_opc; + __u32 pb_status; + __u64 pb_last_xid; + __u64 pb_last_seen; + __u64 pb_last_committed; + __u64 pb_transno; + __u32 pb_flags; + __u32 pb_op_flags; + __u32 pb_conn_cnt; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time */ + __u32 pb_limit; + __u64 pb_slv; + /* VBR: pre-versions */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + /* padding for future needs */ + __u64 pb_padding[4]; + char pb_jobid[JOBSTATS_JOBID_SIZE]; +}; +#define ptlrpc_body ptlrpc_body_v3 + +struct ptlrpc_body_v2 { + struct lustre_handle pb_handle; + __u32 pb_type; + __u32 pb_version; + __u32 pb_opc; + __u32 pb_status; + __u64 pb_last_xid; + __u64 pb_last_seen; + __u64 pb_last_committed; + __u64 pb_transno; + __u32 pb_flags; + __u32 pb_op_flags; + __u32 pb_conn_cnt; + __u32 pb_timeout; /* for req, the deadline, for rep, the service est */ + __u32 pb_service_time; /* for rep, actual service time, also used for + net_latency of req */ + __u32 pb_limit; + __u64 pb_slv; + /* VBR: pre-versions */ + __u64 pb_pre_versions[PTLRPC_NUM_VERSIONS]; + /* padding for future needs */ + __u64 pb_padding[4]; +}; + +extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); + +/* message body offset for lustre_msg_v2 */ +/* ptlrpc body offset in all request/reply messages */ +#define MSG_PTLRPC_BODY_OFF 0 + +/* normal request/reply message record offset */ +#define REQ_REC_OFF 1 +#define REPLY_REC_OFF 1 + +/* ldlm request message body offset */ +#define DLM_LOCKREQ_OFF 1 /* lockreq offset */ +#define DLM_REQ_REC_OFF 2 /* normal dlm request record offset */ + +/* ldlm intent lock message body offset */ +#define DLM_INTENT_IT_OFF 2 /* intent lock it offset */ +#define DLM_INTENT_REC_OFF 3 /* intent lock record offset */ + +/* ldlm reply 
message body offset */ +#define DLM_LOCKREPLY_OFF 1 /* lockrep offset */ +#define DLM_REPLY_REC_OFF 2 /* reply record offset */ + +/** only use in req->rq_{req,rep}_swab_mask */ +#define MSG_PTLRPC_HEADER_OFF 31 + +/* Flags that are operation-specific go in the top 16 bits. */ +#define MSG_OP_FLAG_MASK 0xffff0000 +#define MSG_OP_FLAG_SHIFT 16 + +/* Flags that apply to all requests are in the bottom 16 bits */ +#define MSG_GEN_FLAG_MASK 0x0000ffff +#define MSG_LAST_REPLAY 0x0001 +#define MSG_RESENT 0x0002 +#define MSG_REPLAY 0x0004 +/* #define MSG_AT_SUPPORT 0x0008 + * This was used in early prototypes of adaptive timeouts, and while there + * shouldn't be any users of that code there also isn't a need for using this + * bits. Defer usage until at least 1.10 to avoid potential conflict. */ +#define MSG_DELAY_REPLAY 0x0010 +#define MSG_VERSION_REPLAY 0x0020 +#define MSG_REQ_REPLAY_DONE 0x0040 +#define MSG_LOCK_REPLAY_DONE 0x0080 + +/* + * Flags for all connect opcodes (MDS_CONNECT, OST_CONNECT) + */ + +#define MSG_CONNECT_RECOVERING 0x00000001 +#define MSG_CONNECT_RECONNECT 0x00000002 +#define MSG_CONNECT_REPLAYABLE 0x00000004 +//#define MSG_CONNECT_PEER 0x8 +#define MSG_CONNECT_LIBCLIENT 0x00000010 +#define MSG_CONNECT_INITIAL 0x00000020 +#define MSG_CONNECT_ASYNC 0x00000040 +#define MSG_CONNECT_NEXT_VER 0x00000080 /* use next version of lustre_msg */ +#define MSG_CONNECT_TRANSNO 0x00000100 /* report transno */ + +/* Connect flags */ +#define OBD_CONNECT_RDONLY 0x1ULL /*client has read-only access*/ +#define OBD_CONNECT_INDEX 0x2ULL /*connect specific LOV idx */ +#define OBD_CONNECT_MDS 0x4ULL /*connect from MDT to OST */ +#define OBD_CONNECT_GRANT 0x8ULL /*OSC gets grant at connect */ +#define OBD_CONNECT_SRVLOCK 0x10ULL /*server takes locks for cli */ +#define OBD_CONNECT_VERSION 0x20ULL /*Lustre versions in ocd */ +#define OBD_CONNECT_REQPORTAL 0x40ULL /*Separate non-IO req portal */ +#define OBD_CONNECT_ACL 0x80ULL /*access control lists */ +#define 
OBD_CONNECT_XATTR 0x100ULL /*client use extended attr */ +#define OBD_CONNECT_CROW 0x200ULL /*MDS+OST create obj on write*/ +#define OBD_CONNECT_TRUNCLOCK 0x400ULL /*locks on server for punch */ +#define OBD_CONNECT_TRANSNO 0x800ULL /*replay sends init transno */ +#define OBD_CONNECT_IBITS 0x1000ULL /*support for inodebits locks*/ +#define OBD_CONNECT_JOIN 0x2000ULL /*files can be concatenated. + *We do not support JOIN FILE + *anymore, reserve this flags + *just for preventing such bit + *to be reused.*/ +#define OBD_CONNECT_ATTRFID 0x4000ULL /*Server can GetAttr By Fid*/ +#define OBD_CONNECT_NODEVOH 0x8000ULL /*No open hndl on specl nodes*/ +#define OBD_CONNECT_RMT_CLIENT 0x10000ULL /*Remote client */ +#define OBD_CONNECT_RMT_CLIENT_FORCE 0x20000ULL /*Remote client by force */ +#define OBD_CONNECT_BRW_SIZE 0x40000ULL /*Max bytes per rpc */ +#define OBD_CONNECT_QUOTA64 0x80000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_MDS_CAPA 0x100000ULL /*MDS capability */ +#define OBD_CONNECT_OSS_CAPA 0x200000ULL /*OSS capability */ +#define OBD_CONNECT_CANCELSET 0x400000ULL /*Early batched cancels. */ +#define OBD_CONNECT_SOM 0x800000ULL /*Size on MDS */ +#define OBD_CONNECT_AT 0x1000000ULL /*client uses AT */ +#define OBD_CONNECT_LRU_RESIZE 0x2000000ULL /*LRU resize feature. 
*/ +#define OBD_CONNECT_MDS_MDS 0x4000000ULL /*MDS-MDS connection */ +#define OBD_CONNECT_REAL 0x8000000ULL /*real connection */ +#define OBD_CONNECT_CHANGE_QS 0x10000000ULL /*Not used since 2.4 */ +#define OBD_CONNECT_CKSUM 0x20000000ULL /*support several cksum algos*/ +#define OBD_CONNECT_FID 0x40000000ULL /*FID is supported by server */ +#define OBD_CONNECT_VBR 0x80000000ULL /*version based recovery */ +#define OBD_CONNECT_LOV_V3 0x100000000ULL /*client supports LOV v3 EA */ +#define OBD_CONNECT_GRANT_SHRINK 0x200000000ULL /* support grant shrink */ +#define OBD_CONNECT_SKIP_ORPHAN 0x400000000ULL /* don't reuse orphan objids */ +#define OBD_CONNECT_MAX_EASIZE 0x800000000ULL /* preserved for large EA */ +#define OBD_CONNECT_FULL20 0x1000000000ULL /* it is 2.0 client */ +#define OBD_CONNECT_LAYOUTLOCK 0x2000000000ULL /* client uses layout lock */ +#define OBD_CONNECT_64BITHASH 0x4000000000ULL /* client supports 64-bits + * directory hash */ +#define OBD_CONNECT_MAXBYTES 0x8000000000ULL /* max stripe size */ +#define OBD_CONNECT_IMP_RECOV 0x10000000000ULL /* imp recovery support */ +#define OBD_CONNECT_JOBSTATS 0x20000000000ULL /* jobid in ptlrpc_body */ +#define OBD_CONNECT_UMASK 0x40000000000ULL /* create uses client umask */ +#define OBD_CONNECT_EINPROGRESS 0x80000000000ULL /* client handles -EINPROGRESS + * RPC error properly */ +#define OBD_CONNECT_GRANT_PARAM 0x100000000000ULL/* extra grant params used for + * finer space reservation */ +#define OBD_CONNECT_FLOCK_OWNER 0x200000000000ULL /* for the fixed 1.8 + * policy and 2.x server */ +#define OBD_CONNECT_LVB_TYPE 0x400000000000ULL /* variable type of LVB */ +#define OBD_CONNECT_NANOSEC_TIME 0x800000000000ULL /* nanosecond timestamps */ +#define OBD_CONNECT_LIGHTWEIGHT 0x1000000000000ULL/* lightweight connection */ +#define OBD_CONNECT_SHORTIO 0x2000000000000ULL/* short io */ +#define OBD_CONNECT_PINGLESS 0x4000000000000ULL/* pings not required */ +/* XXX README XXX: + * Please DO NOT add flag values here 
before first ensuring that this same + * flag value is not in use on some other branch. Please clear any such + * changes with senior engineers before starting to use a new flag. Then, + * submit a small patch against EVERY branch that ONLY adds the new flag, + * updates obd_connect_names[] for lprocfs_rd_connect_flags(), adds the + * flag to check_obd_connect_data(), and updates wiretests accordingly, so it + * can be approved and landed easily to reserve the flag for future use. */ + +/* The MNE_SWAB flag is overloading the MDS_MDS bit only for the MGS + * connection. It is a temporary bug fix for Imperative Recovery interop + * between 2.2 and 2.3 x86/ppc nodes, and can be removed when interop for + * 2.2 clients/servers is no longer needed. LU-1252/LU-1644. */ +#define OBD_CONNECT_MNE_SWAB OBD_CONNECT_MDS_MDS + +#define OCD_HAS_FLAG(ocd, flg) \ + (!!((ocd)->ocd_connect_flags & OBD_CONNECT_##flg)) + + +#define LRU_RESIZE_CONNECT_FLAG OBD_CONNECT_LRU_RESIZE + +#define MDT_CONNECT_SUPPORTED (OBD_CONNECT_RDONLY | OBD_CONNECT_VERSION | \ + OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \ + OBD_CONNECT_IBITS | \ + OBD_CONNECT_NODEVOH | OBD_CONNECT_ATTRFID | \ + OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \ + OBD_CONNECT_RMT_CLIENT | \ + OBD_CONNECT_RMT_CLIENT_FORCE | \ + OBD_CONNECT_BRW_SIZE | OBD_CONNECT_MDS_CAPA | \ + OBD_CONNECT_OSS_CAPA | OBD_CONNECT_MDS_MDS | \ + OBD_CONNECT_FID | LRU_RESIZE_CONNECT_FLAG | \ + OBD_CONNECT_VBR | OBD_CONNECT_LOV_V3 | \ + OBD_CONNECT_SOM | OBD_CONNECT_FULL20 | \ + OBD_CONNECT_64BITHASH | OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_UMASK | \ + OBD_CONNECT_LVB_TYPE | OBD_CONNECT_LAYOUTLOCK |\ + OBD_CONNECT_PINGLESS) +#define OST_CONNECT_SUPPORTED (OBD_CONNECT_SRVLOCK | OBD_CONNECT_GRANT | \ + OBD_CONNECT_REQPORTAL | OBD_CONNECT_VERSION | \ + OBD_CONNECT_TRUNCLOCK | OBD_CONNECT_INDEX | \ + OBD_CONNECT_BRW_SIZE | OBD_CONNECT_OSS_CAPA | \ + OBD_CONNECT_CANCELSET | OBD_CONNECT_AT | \ + 
LRU_RESIZE_CONNECT_FLAG | OBD_CONNECT_CKSUM | \ + OBD_CONNECT_RMT_CLIENT | \ + OBD_CONNECT_RMT_CLIENT_FORCE | OBD_CONNECT_VBR | \ + OBD_CONNECT_MDS | OBD_CONNECT_SKIP_ORPHAN | \ + OBD_CONNECT_GRANT_SHRINK | OBD_CONNECT_FULL20 | \ + OBD_CONNECT_64BITHASH | OBD_CONNECT_MAXBYTES | \ + OBD_CONNECT_MAX_EASIZE | \ + OBD_CONNECT_EINPROGRESS | \ + OBD_CONNECT_JOBSTATS | \ + OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\ + OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \ + OBD_CONNECT_PINGLESS) +#define ECHO_CONNECT_SUPPORTED (0) +#define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \ + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \ + OBD_CONNECT_MNE_SWAB | OBD_CONNECT_PINGLESS) + +/* Features required for this version of the client to work with server */ +#define CLIENT_CONNECT_MDT_REQD (OBD_CONNECT_IBITS | OBD_CONNECT_FID | \ + OBD_CONNECT_FULL20) + +#define OBD_OCD_VERSION(major,minor,patch,fix) (((major)<<24) + ((minor)<<16) +\ + ((patch)<<8) + (fix)) +#define OBD_OCD_VERSION_MAJOR(version) ((int)((version)>>24)&255) +#define OBD_OCD_VERSION_MINOR(version) ((int)((version)>>16)&255) +#define OBD_OCD_VERSION_PATCH(version) ((int)((version)>>8)&255) +#define OBD_OCD_VERSION_FIX(version) ((int)(version)&255) + +/* This structure is used for both request and reply. + * + * If we eventually have separate connect data for different types, which we + * almost certainly will, then perhaps we stick a union in here. 
*/ +struct obd_connect_data_v1 { + __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */ + __u32 ocd_version; /* lustre release version number */ + __u32 ocd_grant; /* initial cache grant amount (bytes) */ + __u32 ocd_index; /* LOV index to connect to */ + __u32 ocd_brw_size; /* Maximum BRW size in bytes, must be 2^n */ + __u64 ocd_ibits_known; /* inode bits this client understands */ + __u8 ocd_blocksize; /* log2 of the backend filesystem blocksize */ + __u8 ocd_inodespace; /* log2 of the per-inode space consumption */ + __u16 ocd_grant_extent; /* per-extent grant overhead, in 1K blocks */ + __u32 ocd_unused; /* also fix lustre_swab_connect */ + __u64 ocd_transno; /* first transno from client to be replayed */ + __u32 ocd_group; /* MDS group on OST */ + __u32 ocd_cksum_types; /* supported checksum algorithms */ + __u32 ocd_max_easize; /* How big LOV EA can be on MDS */ + __u32 ocd_instance; /* also fix lustre_swab_connect */ + __u64 ocd_maxbytes; /* Maximum stripe size in bytes */ +}; + +struct obd_connect_data { + __u64 ocd_connect_flags; /* OBD_CONNECT_* per above */ + __u32 ocd_version; /* lustre release version number */ + __u32 ocd_grant; /* initial cache grant amount (bytes) */ + __u32 ocd_index; /* LOV index to connect to */ + __u32 ocd_brw_size; /* Maximum BRW size in bytes */ + __u64 ocd_ibits_known; /* inode bits this client understands */ + __u8 ocd_blocksize; /* log2 of the backend filesystem blocksize */ + __u8 ocd_inodespace; /* log2 of the per-inode space consumption */ + __u16 ocd_grant_extent; /* per-extent grant overhead, in 1K blocks */ + __u32 ocd_unused; /* also fix lustre_swab_connect */ + __u64 ocd_transno; /* first transno from client to be replayed */ + __u32 ocd_group; /* MDS group on OST */ + __u32 ocd_cksum_types; /* supported checksum algorithms */ + __u32 ocd_max_easize; /* How big LOV EA can be on MDS */ + __u32 ocd_instance; /* instance # of this target */ + __u64 ocd_maxbytes; /* Maximum stripe size in bytes */ + /* Fields after 
ocd_maxbytes are only accessible by the receiver + * if the corresponding flag in ocd_connect_flags is set. Accessing + * any field after ocd_maxbytes on the receiver without a valid flag + * may result in out-of-bound memory access and kernel oops. */ + __u64 padding1; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding2; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding3; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding4; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding5; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding6; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding7; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding8; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 padding9; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingA; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingB; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingC; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingD; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingE; /* added 2.1.0. also fix lustre_swab_connect */ + __u64 paddingF; /* added 2.1.0. also fix lustre_swab_connect */ +}; +/* XXX README XXX: + * Please DO NOT use any fields here before first ensuring that this same + * field is not in use on some other branch. Please clear any such changes + * with senior engineers before starting to use a new field. Then, submit + * a small patch against EVERY branch that ONLY adds the new field along with + * the matching OBD_CONNECT flag, so that can be approved and landed easily to + * reserve the flag for future use. */ + + +extern void lustre_swab_connect(struct obd_connect_data *ocd); + +/* + * Supported checksum algorithms. Up to 32 checksum types are supported. 
+ * (32-bit mask stored in obd_connect_data::ocd_cksum_types) + * Please update DECLARE_CKSUM_NAME/OBD_CKSUM_ALL in obd.h when adding a new + * algorithm and also the OBD_FL_CKSUM* flags. + */ +typedef enum { + OBD_CKSUM_CRC32 = 0x00000001, + OBD_CKSUM_ADLER = 0x00000002, + OBD_CKSUM_CRC32C= 0x00000004, +} cksum_type_t; + +/* + * OST requests: OBDO & OBD request records + */ + +/* opcodes */ +typedef enum { + OST_REPLY = 0, /* reply ? */ + OST_GETATTR = 1, + OST_SETATTR = 2, + OST_READ = 3, + OST_WRITE = 4, + OST_CREATE = 5, + OST_DESTROY = 6, + OST_GET_INFO = 7, + OST_CONNECT = 8, + OST_DISCONNECT = 9, + OST_PUNCH = 10, + OST_OPEN = 11, + OST_CLOSE = 12, + OST_STATFS = 13, + OST_SYNC = 16, + OST_SET_INFO = 17, + OST_QUOTACHECK = 18, + OST_QUOTACTL = 19, + OST_QUOTA_ADJUST_QUNIT = 20, /* not used since 2.4 */ + OST_LAST_OPC +} ost_cmd_t; +#define OST_FIRST_OPC OST_REPLY + +enum obdo_flags { + OBD_FL_INLINEDATA = 0x00000001, + OBD_FL_OBDMDEXISTS = 0x00000002, + OBD_FL_DELORPHAN = 0x00000004, /* if set in o_flags delete orphans */ + OBD_FL_NORPC = 0x00000008, /* set in o_flags do in OSC not OST */ + OBD_FL_IDONLY = 0x00000010, /* set in o_flags only adjust obj id*/ + OBD_FL_RECREATE_OBJS= 0x00000020, /* recreate missing obj */ + OBD_FL_DEBUG_CHECK = 0x00000040, /* echo client/server debug check */ + OBD_FL_NO_USRQUOTA = 0x00000100, /* the object's owner is over quota */ + OBD_FL_NO_GRPQUOTA = 0x00000200, /* the object's group is over quota */ + OBD_FL_CREATE_CROW = 0x00000400, /* object should be create on write */ + OBD_FL_SRVLOCK = 0x00000800, /* delegate DLM locking to server */ + OBD_FL_CKSUM_CRC32 = 0x00001000, /* CRC32 checksum type */ + OBD_FL_CKSUM_ADLER = 0x00002000, /* ADLER checksum type */ + OBD_FL_CKSUM_CRC32C = 0x00004000, /* CRC32C checksum type */ + OBD_FL_CKSUM_RSVD2 = 0x00008000, /* for future cksum types */ + OBD_FL_CKSUM_RSVD3 = 0x00010000, /* for future cksum types */ + OBD_FL_SHRINK_GRANT = 0x00020000, /* object shrink the grant */ + OBD_FL_MMAP 
= 0x00040000, /* object is mmapped on the client. + * XXX: obsoleted - reserved for old + * clients prior than 2.2 */ + OBD_FL_RECOV_RESEND = 0x00080000, /* recoverable resent */ + OBD_FL_NOSPC_BLK = 0x00100000, /* no more block space on OST */ + + /* Note that while these checksum values are currently separate bits, + * in 2.x we can actually allow all values from 1-31 if we wanted. */ + OBD_FL_CKSUM_ALL = OBD_FL_CKSUM_CRC32 | OBD_FL_CKSUM_ADLER | + OBD_FL_CKSUM_CRC32C, + + /* mask for local-only flag, which won't be sent over network */ + OBD_FL_LOCAL_MASK = 0xF0000000, +}; + +#define LOV_MAGIC_V1 0x0BD10BD0 +#define LOV_MAGIC LOV_MAGIC_V1 +#define LOV_MAGIC_JOIN_V1 0x0BD20BD0 +#define LOV_MAGIC_V3 0x0BD30BD0 + +/* + * magic for fully defined striping + * the idea is that we should have different magics for striping "hints" + * (struct lov_user_md_v[13]) and defined ready-to-use striping (struct + * lov_mds_md_v[13]). at the moment the magics are used in wire protocol, + * we can't just change it w/o long way preparation, but we still need a + * mechanism to allow LOD to differentiate hint versus ready striping. 
+ * so, at the moment we do a trick: MDT knows what to expect from request + * depending on the case (replay uses ready striping, non-replay req uses + * hints), so MDT replaces magic with appropriate one and now LOD can + * easily understand what's inside -bzzz + */ +#define LOV_MAGIC_V1_DEF 0x0CD10BD0 +#define LOV_MAGIC_V3_DEF 0x0CD30BD0 + +#define LOV_PATTERN_RAID0 0x001 /* stripes are used round-robin */ +#define LOV_PATTERN_RAID1 0x002 /* stripes are mirrors of each other */ +#define LOV_PATTERN_FIRST 0x100 /* first stripe is not in round-robin */ +#define LOV_PATTERN_CMOBD 0x200 + +#define lov_ost_data lov_ost_data_v1 +struct lov_ost_data_v1 { /* per-stripe data structure (little-endian)*/ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this l_ost_idx */ + __u32 l_ost_idx; /* OST index in LOV (lov_tgt_desc->tgts) */ +}; + +#define lov_mds_md lov_mds_md_v1 +struct lov_mds_md_v1 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + /* lmm_stripe_count used to be __u32 */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_layout_gen; /* layout generation number */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + +/** + * Sigh, because pre-2.4 uses + * struct lov_mds_md_v1 { + * ........ + * __u64 lmm_object_id; + * __u64 lmm_object_seq; + * ...... + * } + * to identify the LOV(MDT) object, and lmm_object_seq will + * be normal_fid, which make it hard to combine these conversion + * to ostid_to FID. 
so we will do lmm_oi/fid conversion separately + * + * We can tell the lmm_oi by this way, + * 1.8: lmm_object_id = {inode}, lmm_object_gr = 0 + * 2.1: lmm_object_id = {oid < 128k}, lmm_object_seq = FID_SEQ_NORMAL + * 2.4: lmm_oi.f_seq = FID_SEQ_NORMAL, lmm_oi.f_oid = {oid < 128k}, + * lmm_oi.f_ver = 0 + * + * But currently lmm_oi/lsm_oi does not have any "real" usages, + * except for printing some information, and the user can always + * get the real FID from LMA, besides this multiple case check might + * make swab more complicate. So we will keep using id/seq for lmm_oi. + */ + +static inline void fid_to_lmm_oi(const struct lu_fid *fid, + struct ost_id *oi) +{ + oi->oi.oi_id = fid_oid(fid); + oi->oi.oi_seq = fid_seq(fid); +} + +static inline void lmm_oi_set_seq(struct ost_id *oi, __u64 seq) +{ + oi->oi.oi_seq = seq; +} + +static inline __u64 lmm_oi_id(struct ost_id *oi) +{ + return oi->oi.oi_id; +} + +static inline __u64 lmm_oi_seq(struct ost_id *oi) +{ + return oi->oi.oi_seq; +} + +static inline void lmm_oi_le_to_cpu(struct ost_id *dst_oi, + struct ost_id *src_oi) +{ + dst_oi->oi.oi_id = le64_to_cpu(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = le64_to_cpu(src_oi->oi.oi_seq); +} + +static inline void lmm_oi_cpu_to_le(struct ost_id *dst_oi, + struct ost_id *src_oi) +{ + dst_oi->oi.oi_id = cpu_to_le64(src_oi->oi.oi_id); + dst_oi->oi.oi_seq = cpu_to_le64(src_oi->oi.oi_seq); +} + +/* extern void lustre_swab_lov_mds_md(struct lov_mds_md *llm); */ + +#define MAX_MD_SIZE (sizeof(struct lov_mds_md) + 4 * sizeof(struct lov_ost_data)) +#define MIN_MD_SIZE (sizeof(struct lov_mds_md) + 1 * sizeof(struct lov_ost_data)) + +#define XATTR_NAME_ACL_ACCESS "system.posix_acl_access" +#define XATTR_NAME_ACL_DEFAULT "system.posix_acl_default" +#define XATTR_USER_PREFIX "user." +#define XATTR_TRUSTED_PREFIX "trusted." +#define XATTR_SECURITY_PREFIX "security." +#define XATTR_LUSTRE_PREFIX "lustre." 
+ +#define XATTR_NAME_LOV "trusted.lov" +#define XATTR_NAME_LMA "trusted.lma" +#define XATTR_NAME_LMV "trusted.lmv" +#define XATTR_NAME_LINK "trusted.link" +#define XATTR_NAME_FID "trusted.fid" +#define XATTR_NAME_VERSION "trusted.version" +#define XATTR_NAME_SOM "trusted.som" +#define XATTR_NAME_HSM "trusted.hsm" +#define XATTR_NAME_LFSCK_NAMESPACE "trusted.lfsck_namespace" + +struct lov_mds_md_v3 { /* LOV EA mds/wire data (little-endian) */ + __u32 lmm_magic; /* magic number = LOV_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + /* lmm_stripe_count used to be __u32 */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + __u16 lmm_layout_gen; /* layout generation number */ + char lmm_pool_name[LOV_MAXPOOLNAME]; /* must be 32bit aligned */ + struct lov_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +}; + +#define OBD_MD_FLID (0x00000001ULL) /* object ID */ +#define OBD_MD_FLATIME (0x00000002ULL) /* access time */ +#define OBD_MD_FLMTIME (0x00000004ULL) /* data modification time */ +#define OBD_MD_FLCTIME (0x00000008ULL) /* change time */ +#define OBD_MD_FLSIZE (0x00000010ULL) /* size */ +#define OBD_MD_FLBLOCKS (0x00000020ULL) /* allocated blocks count */ +#define OBD_MD_FLBLKSZ (0x00000040ULL) /* block size */ +#define OBD_MD_FLMODE (0x00000080ULL) /* access bits (mode & ~S_IFMT) */ +#define OBD_MD_FLTYPE (0x00000100ULL) /* object type (mode & S_IFMT) */ +#define OBD_MD_FLUID (0x00000200ULL) /* user ID */ +#define OBD_MD_FLGID (0x00000400ULL) /* group ID */ +#define OBD_MD_FLFLAGS (0x00000800ULL) /* flags word */ +#define OBD_MD_FLNLINK (0x00002000ULL) /* link count */ +#define OBD_MD_FLGENER (0x00004000ULL) /* generation number */ +/*#define OBD_MD_FLINLINE (0x00008000ULL) inline data. 
used until 1.6.5 */ +#define OBD_MD_FLRDEV (0x00010000ULL) /* device number */ +#define OBD_MD_FLEASIZE (0x00020000ULL) /* extended attribute data */ +#define OBD_MD_LINKNAME (0x00040000ULL) /* symbolic link target */ +#define OBD_MD_FLHANDLE (0x00080000ULL) /* file/lock handle */ +#define OBD_MD_FLCKSUM (0x00100000ULL) /* bulk data checksum */ +#define OBD_MD_FLQOS (0x00200000ULL) /* quality of service stats */ +/*#define OBD_MD_FLOSCOPQ (0x00400000ULL) osc opaque data, never used */ +#define OBD_MD_FLCOOKIE (0x00800000ULL) /* log cancellation cookie */ +#define OBD_MD_FLGROUP (0x01000000ULL) /* group */ +#define OBD_MD_FLFID (0x02000000ULL) /* ->ost write inline fid */ +#define OBD_MD_FLEPOCH (0x04000000ULL) /* ->ost write with ioepoch */ + /* ->mds if epoch opens or closes */ +#define OBD_MD_FLGRANT (0x08000000ULL) /* ost preallocation space grant */ +#define OBD_MD_FLDIREA (0x10000000ULL) /* dir's extended attribute data */ +#define OBD_MD_FLUSRQUOTA (0x20000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLGRPQUOTA (0x40000000ULL) /* over quota flags sent from ost */ +#define OBD_MD_FLMODEASIZE (0x80000000ULL) /* EA size will be changed */ + +#define OBD_MD_MDS (0x0000000100000000ULL) /* where an inode lives on */ +#define OBD_MD_REINT (0x0000000200000000ULL) /* reintegrate oa */ +#define OBD_MD_MEA (0x0000000400000000ULL) /* CMD split EA */ + +/* OBD_MD_MDTIDX is used to get MDT index, but it is never been used overwire, + * and it is already obsolete since 2.3 */ +/* #define OBD_MD_MDTIDX (0x0000000800000000ULL) */ + +#define OBD_MD_FLXATTR (0x0000001000000000ULL) /* xattr */ +#define OBD_MD_FLXATTRLS (0x0000002000000000ULL) /* xattr list */ +#define OBD_MD_FLXATTRRM (0x0000004000000000ULL) /* xattr remove */ +#define OBD_MD_FLACL (0x0000008000000000ULL) /* ACL */ +#define OBD_MD_FLRMTPERM (0x0000010000000000ULL) /* remote permission */ +#define OBD_MD_FLMDSCAPA (0x0000020000000000ULL) /* MDS capability */ +#define OBD_MD_FLOSSCAPA 
(0x0000040000000000ULL) /* OSS capability */ +#define OBD_MD_FLCKSPLIT (0x0000080000000000ULL) /* Check split on server */ +#define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */ +#define OBD_MD_FLGETATTRLOCK (0x0000200000000000ULL) /* Get IOEpoch attributes + * under lock */ +#define OBD_MD_FLOBJCOUNT (0x0000400000000000ULL) /* for multiple destroy */ + +#define OBD_MD_FLRMTLSETFACL (0x0001000000000000ULL) /* lfs lsetfacl case */ +#define OBD_MD_FLRMTLGETFACL (0x0002000000000000ULL) /* lfs lgetfacl case */ +#define OBD_MD_FLRMTRSETFACL (0x0004000000000000ULL) /* lfs rsetfacl case */ +#define OBD_MD_FLRMTRGETFACL (0x0008000000000000ULL) /* lfs rgetfacl case */ + +#define OBD_MD_FLDATAVERSION (0x0010000000000000ULL) /* iversion sum */ + +#define OBD_MD_FLGETATTR (OBD_MD_FLID | OBD_MD_FLATIME | OBD_MD_FLMTIME | \ + OBD_MD_FLCTIME | OBD_MD_FLSIZE | OBD_MD_FLBLKSZ | \ + OBD_MD_FLMODE | OBD_MD_FLTYPE | OBD_MD_FLUID | \ + OBD_MD_FLGID | OBD_MD_FLFLAGS | OBD_MD_FLNLINK | \ + OBD_MD_FLGENER | OBD_MD_FLRDEV | OBD_MD_FLGROUP) + +/* don't forget obdo_fid which is way down at the bottom so it can + * come after the definition of llog_cookie */ + +enum hss_valid { + HSS_SETMASK = 0x01, + HSS_CLEARMASK = 0x02, + HSS_ARCHIVE_ID = 0x04, +}; + +struct hsm_state_set { + __u32 hss_valid; + __u32 hss_archive_id; + __u64 hss_setmask; + __u64 hss_clearmask; +}; + +extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +extern void lustre_swab_hsm_state_set(struct hsm_state_set *hss); + +extern void lustre_swab_obd_statfs (struct obd_statfs *os); + +/* ost_body.data values for OST_BRW */ + +#define OBD_BRW_READ 0x01 +#define OBD_BRW_WRITE 0x02 +#define OBD_BRW_RWMASK (OBD_BRW_READ | OBD_BRW_WRITE) +#define OBD_BRW_SYNC 0x08 /* this page is a part of synchronous + * transfer and is not accounted in + * the grant. 
*/ +#define OBD_BRW_CHECK 0x10 +#define OBD_BRW_FROM_GRANT 0x20 /* the osc manages this under llite */ +#define OBD_BRW_GRANTED 0x40 /* the ost manages this */ +#define OBD_BRW_NOCACHE 0x80 /* this page is a part of non-cached IO */ +#define OBD_BRW_NOQUOTA 0x100 +#define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */ +#define OBD_BRW_ASYNC 0x400 /* Server may delay commit to disk */ +#define OBD_BRW_MEMALLOC 0x800 /* Client runs in the "kswapd" context */ +#define OBD_BRW_OVER_USRQUOTA 0x1000 /* Running out of user quota */ +#define OBD_BRW_OVER_GRPQUOTA 0x2000 /* Running out of group quota */ + +#define OBD_OBJECT_EOF 0xffffffffffffffffULL + +#define OST_MIN_PRECREATE 32 +#define OST_MAX_PRECREATE 20000 + +struct obd_ioobj { + struct ost_id ioo_oid; /* object ID, if multi-obj BRW */ + __u32 ioo_max_brw; /* low 16 bits were o_mode before 2.4, + * now (PTLRPC_BULK_OPS_COUNT - 1) in + * high 16 bits in 2.4 and later */ + __u32 ioo_bufcnt; /* number of niobufs for this object */ +}; + +#define IOOBJ_MAX_BRW_BITS 16 +#define IOOBJ_TYPE_MASK ((1U << IOOBJ_MAX_BRW_BITS) - 1) +#define ioobj_max_brw_get(ioo) (((ioo)->ioo_max_brw >> IOOBJ_MAX_BRW_BITS) + 1) +#define ioobj_max_brw_set(ioo, num) \ +do { (ioo)->ioo_max_brw = ((num) - 1) << IOOBJ_MAX_BRW_BITS; } while (0) + +extern void lustre_swab_obd_ioobj (struct obd_ioobj *ioo); + +/* multiple of 8 bytes => can array */ +struct niobuf_remote { + __u64 offset; + __u32 len; + __u32 flags; +}; + +extern void lustre_swab_niobuf_remote (struct niobuf_remote *nbr); + +/* lock value block communicated between the filter and llite */ + +/* OST_LVB_ERR_INIT is needed because the return code in rc is + * negative, i.e. because ((MASK + rc) & MASK) != MASK. 
*/ +#define OST_LVB_ERR_INIT 0xffbadbad80000000ULL +#define OST_LVB_ERR_MASK 0xffbadbad00000000ULL +#define OST_LVB_IS_ERR(blocks) \ + ((blocks & OST_LVB_ERR_MASK) == OST_LVB_ERR_MASK) +#define OST_LVB_SET_ERR(blocks, rc) \ + do { blocks = OST_LVB_ERR_INIT + rc; } while (0) +#define OST_LVB_GET_ERR(blocks) (int)(blocks - OST_LVB_ERR_INIT) + +struct ost_lvb_v1 { + __u64 lvb_size; + obd_time lvb_mtime; + obd_time lvb_atime; + obd_time lvb_ctime; + __u64 lvb_blocks; +}; + +extern void lustre_swab_ost_lvb_v1(struct ost_lvb_v1 *lvb); + +struct ost_lvb { + __u64 lvb_size; + obd_time lvb_mtime; + obd_time lvb_atime; + obd_time lvb_ctime; + __u64 lvb_blocks; + __u32 lvb_mtime_ns; + __u32 lvb_atime_ns; + __u32 lvb_ctime_ns; + __u32 lvb_padding; +}; + +extern void lustre_swab_ost_lvb(struct ost_lvb *lvb); + +/* + * lquota data structures + */ + +#ifndef QUOTABLOCK_BITS +#define QUOTABLOCK_BITS 10 +#endif + +#ifndef QUOTABLOCK_SIZE +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) +#endif + +#ifndef toqb +#define toqb(x) (((x) + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS) +#endif + +/* The lquota_id structure is an union of all the possible identifier types that + * can be used with quota, this includes: + * - 64-bit user ID + * - 64-bit group ID + * - a FID which can be used for per-directory quota in the future */ +union lquota_id { + struct lu_fid qid_fid; /* FID for per-directory quota */ + __u64 qid_uid; /* user identifier */ + __u64 qid_gid; /* group identifier */ +}; + +/* quotactl management */ +struct obd_quotactl { + __u32 qc_cmd; + __u32 qc_type; /* see Q_* flag below */ + __u32 qc_id; + __u32 qc_stat; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; +}; + +extern void lustre_swab_obd_quotactl(struct obd_quotactl *q); + +#define Q_QUOTACHECK 0x800100 /* deprecated as of 2.4 */ +#define Q_INITQUOTA 0x800101 /* deprecated as of 2.4 */ +#define Q_GETOINFO 0x800102 /* get obd quota info */ +#define Q_GETOQUOTA 0x800103 /* get obd quotas */ +#define Q_FINVALIDATE 
0x800104 /* deprecated as of 2.4 */ + +#define Q_COPY(out, in, member) (out)->member = (in)->member + +#define QCTL_COPY(out, in) \ +do { \ + Q_COPY(out, in, qc_cmd); \ + Q_COPY(out, in, qc_type); \ + Q_COPY(out, in, qc_id); \ + Q_COPY(out, in, qc_stat); \ + Q_COPY(out, in, qc_dqinfo); \ + Q_COPY(out, in, qc_dqblk); \ +} while (0) + +/* Body of quota request used for quota acquire/release RPCs between quota + * master (aka QMT) and slaves (ak QSD). */ +struct quota_body { + struct lu_fid qb_fid; /* FID of global index packing the pool ID + * and type (data or metadata) as well as + * the quota type (user or group). */ + union lquota_id qb_id; /* uid or gid or directory FID */ + __u32 qb_flags; /* see below */ + __u32 qb_padding; + __u64 qb_count; /* acquire/release count (kbytes/inodes) */ + __u64 qb_usage; /* current slave usage (kbytes/inodes) */ + __u64 qb_slv_ver; /* slave index file version */ + struct lustre_handle qb_lockh; /* per-ID lock handle */ + struct lustre_handle qb_glb_lockh; /* global lock handle */ + __u64 qb_padding1[4]; +}; + +/* When the quota_body is used in the reply of quota global intent + * lock (IT_QUOTA_CONN) reply, qb_fid contains slave index file FID. 
*/ +#define qb_slv_fid qb_fid +/* qb_usage is the current qunit (in kbytes/inodes) when quota_body is used in + * quota reply */ +#define qb_qunit qb_usage + +#define QUOTA_DQACQ_FL_ACQ 0x1 /* acquire quota */ +#define QUOTA_DQACQ_FL_PREACQ 0x2 /* pre-acquire */ +#define QUOTA_DQACQ_FL_REL 0x4 /* release quota */ +#define QUOTA_DQACQ_FL_REPORT 0x8 /* report usage */ + +extern void lustre_swab_quota_body(struct quota_body *b); + +/* Quota types currently supported */ +enum { + LQUOTA_TYPE_USR = 0x00, /* maps to USRQUOTA */ + LQUOTA_TYPE_GRP = 0x01, /* maps to GRPQUOTA */ + LQUOTA_TYPE_MAX +}; + +/* There are 2 different resource types on which a quota limit can be enforced: + * - inodes on the MDTs + * - blocks on the OSTs */ +enum { + LQUOTA_RES_MD = 0x01, /* skip 0 to avoid null oid in FID */ + LQUOTA_RES_DT = 0x02, + LQUOTA_LAST_RES, + LQUOTA_FIRST_RES = LQUOTA_RES_MD +}; +#define LQUOTA_NR_RES (LQUOTA_LAST_RES - LQUOTA_FIRST_RES + 1) + +/* + * Space accounting support + * Format of an accounting record, providing disk usage information for a given + * user or group + */ +struct lquota_acct_rec { /* 16 bytes */ + __u64 bspace; /* current space in use */ + __u64 ispace; /* current # inodes in use */ +}; + +/* + * Global quota index support + * Format of a global record, providing global quota settings for a given quota + * identifier + */ +struct lquota_glb_rec { /* 32 bytes */ + __u64 qbr_hardlimit; /* quota hard limit, in #inodes or kbytes */ + __u64 qbr_softlimit; /* quota soft limit, in #inodes or kbytes */ + __u64 qbr_time; /* grace time, in seconds */ + __u64 qbr_granted; /* how much is granted to slaves, in #inodes or + * kbytes */ +}; + +/* + * Slave index support + * Format of a slave record, recording how much space is granted to a given + * slave + */ +struct lquota_slv_rec { /* 8 bytes */ + __u64 qsr_granted; /* space granted to the slave for the key=ID, + * in #inodes or kbytes */ +}; + +/* Data structures associated with the quota locks */ + +/* 
Glimpse descriptor used for the index & per-ID quota locks */ +struct ldlm_gl_lquota_desc { + union lquota_id gl_id; /* quota ID subject to the glimpse */ + __u64 gl_flags; /* see LQUOTA_FL* below */ + __u64 gl_ver; /* new index version */ + __u64 gl_hardlimit; /* new hardlimit or qunit value */ + __u64 gl_softlimit; /* new softlimit */ + __u64 gl_time; + __u64 gl_pad2; +}; +#define gl_qunit gl_hardlimit /* current qunit value used when + * glimpsing per-ID quota locks */ + +/* quota glimpse flags */ +#define LQUOTA_FL_EDQUOT 0x1 /* user/group out of quota space on QMT */ + +/* LVB used with quota (global and per-ID) locks */ +struct lquota_lvb { + __u64 lvb_flags; /* see LQUOTA_FL* above */ + __u64 lvb_id_may_rel; /* space that might be released later */ + __u64 lvb_id_rel; /* space released by the slave for this ID */ + __u64 lvb_id_qunit; /* current qunit value */ + __u64 lvb_pad1; +}; + +extern void lustre_swab_lquota_lvb(struct lquota_lvb *lvb); + +/* LVB used with global quota lock */ +#define lvb_glb_ver lvb_id_may_rel /* current version of the global index */ + +/* op codes */ +typedef enum { + QUOTA_DQACQ = 601, + QUOTA_DQREL = 602, + QUOTA_LAST_OPC +} quota_cmd_t; +#define QUOTA_FIRST_OPC QUOTA_DQACQ + +/* + * MDS REQ RECORDS + */ + +/* opcodes */ +typedef enum { + MDS_GETATTR = 33, + MDS_GETATTR_NAME = 34, + MDS_CLOSE = 35, + MDS_REINT = 36, + MDS_READPAGE = 37, + MDS_CONNECT = 38, + MDS_DISCONNECT = 39, + MDS_GETSTATUS = 40, + MDS_STATFS = 41, + MDS_PIN = 42, + MDS_UNPIN = 43, + MDS_SYNC = 44, + MDS_DONE_WRITING = 45, + MDS_SET_INFO = 46, + MDS_QUOTACHECK = 47, + MDS_QUOTACTL = 48, + MDS_GETXATTR = 49, + MDS_SETXATTR = 50, /* obsolete, now it's MDS_REINT op */ + MDS_WRITEPAGE = 51, + MDS_IS_SUBDIR = 52, + MDS_GET_INFO = 53, + MDS_HSM_STATE_GET = 54, + MDS_HSM_STATE_SET = 55, + MDS_HSM_ACTION = 56, + MDS_HSM_PROGRESS = 57, + MDS_HSM_REQUEST = 58, + MDS_HSM_CT_REGISTER = 59, + MDS_HSM_CT_UNREGISTER = 60, + MDS_SWAP_LAYOUTS = 61, + MDS_LAST_OPC +} 
mds_cmd_t; + +#define MDS_FIRST_OPC MDS_GETATTR + + +/* opcodes for object update */ +typedef enum { + UPDATE_OBJ = 1000, + UPDATE_LAST_OPC +} update_cmd_t; + +#define UPDATE_FIRST_OPC UPDATE_OBJ + +/* + * Do not exceed 63 + */ + +typedef enum { + REINT_SETATTR = 1, + REINT_CREATE = 2, + REINT_LINK = 3, + REINT_UNLINK = 4, + REINT_RENAME = 5, + REINT_OPEN = 6, + REINT_SETXATTR = 7, + REINT_RMENTRY = 8, +// REINT_WRITE = 9, + REINT_MAX +} mds_reint_t, mdt_reint_t; + +extern void lustre_swab_generic_32s (__u32 *val); + +/* the disposition of the intent outlines what was executed */ +#define DISP_IT_EXECD 0x00000001 +#define DISP_LOOKUP_EXECD 0x00000002 +#define DISP_LOOKUP_NEG 0x00000004 +#define DISP_LOOKUP_POS 0x00000008 +#define DISP_OPEN_CREATE 0x00000010 +#define DISP_OPEN_OPEN 0x00000020 +#define DISP_ENQ_COMPLETE 0x00400000 +#define DISP_ENQ_OPEN_REF 0x00800000 +#define DISP_ENQ_CREATE_REF 0x01000000 +#define DISP_OPEN_LOCK 0x02000000 + +/* INODE LOCK PARTS */ +#define MDS_INODELOCK_LOOKUP 0x000001 /* dentry, mode, owner, group */ +#define MDS_INODELOCK_UPDATE 0x000002 /* size, links, timestamps */ +#define MDS_INODELOCK_OPEN 0x000004 /* For opened files */ +#define MDS_INODELOCK_LAYOUT 0x000008 /* for layout */ +#define MDS_INODELOCK_PERM 0x000010 /* for permission */ + +#define MDS_INODELOCK_MAXSHIFT 4 +/* This FULL lock is useful to take on unlink sort of operations */ +#define MDS_INODELOCK_FULL ((1<<(MDS_INODELOCK_MAXSHIFT+1))-1) + +extern void lustre_swab_ll_fid (struct ll_fid *fid); + +/* NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], + * but was moved into name[1] along with the OID to avoid consuming the + * name[2,3] fields that need to be used for the quota id (also a FID). 
*/ +enum { + LUSTRE_RES_ID_SEQ_OFF = 0, + LUSTRE_RES_ID_VER_OID_OFF = 1, + LUSTRE_RES_ID_WAS_VER_OFF = 2, /* see note above */ + LUSTRE_RES_ID_QUOTA_SEQ_OFF = 2, + LUSTRE_RES_ID_QUOTA_VER_OID_OFF = 3, + LUSTRE_RES_ID_HSH_OFF = 3 +}; + +#define MDS_STATUS_CONN 1 +#define MDS_STATUS_LOV 2 + +/* mdt_thread_info.mti_flags. */ +enum md_op_flags { + /* The flag indicates Size-on-MDS attributes are changed. */ + MF_SOM_CHANGE = (1 << 0), + /* Flags indicates an epoch opens or closes. */ + MF_EPOCH_OPEN = (1 << 1), + MF_EPOCH_CLOSE = (1 << 2), + MF_MDC_CANCEL_FID1 = (1 << 3), + MF_MDC_CANCEL_FID2 = (1 << 4), + MF_MDC_CANCEL_FID3 = (1 << 5), + MF_MDC_CANCEL_FID4 = (1 << 6), + /* There is a pending attribute update. */ + MF_SOM_AU = (1 << 7), + /* Cancel OST locks while getattr OST attributes. */ + MF_GETATTR_LOCK = (1 << 8), + MF_GET_MDT_IDX = (1 << 9), +}; + +#define MF_SOM_LOCAL_FLAGS (MF_SOM_CHANGE | MF_EPOCH_OPEN | MF_EPOCH_CLOSE) + +#define LUSTRE_BFLAG_UNCOMMITTED_WRITES 0x1 + +/* these should be identical to their EXT4_*_FL counterparts, they are + * redefined here only to avoid dragging in fs/ext4/ext4.h */ +#define LUSTRE_SYNC_FL 0x00000008 /* Synchronous updates */ +#define LUSTRE_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define LUSTRE_APPEND_FL 0x00000020 /* writes to file may only append */ +#define LUSTRE_NOATIME_FL 0x00000080 /* do not update atime */ +#define LUSTRE_DIRSYNC_FL 0x00010000 /* dirsync behaviour (dir only) */ + +/* Convert wire LUSTRE_*_FL to corresponding client local VFS S_* values + * for the client inode i_flags. The LUSTRE_*_FL are the Lustre wire + * protocol equivalents of LDISKFS_*_FL values stored on disk, while + * the S_* flags are kernel-internal values that change between kernel + * versions. These flags are set/cleared via FSFILT_IOC_{GET,SET}_FLAGS. + * See b=16526 for a full history. */ +static inline int ll_ext_to_inode_flags(int flags) +{ + return (((flags & LUSTRE_SYNC_FL) ? S_SYNC : 0) | + ((flags & LUSTRE_NOATIME_FL) ? 
S_NOATIME : 0) | + ((flags & LUSTRE_APPEND_FL) ? S_APPEND : 0) | +#if defined(S_DIRSYNC) + ((flags & LUSTRE_DIRSYNC_FL) ? S_DIRSYNC : 0) | +#endif + ((flags & LUSTRE_IMMUTABLE_FL) ? S_IMMUTABLE : 0)); +} + +static inline int ll_inode_to_ext_flags(int iflags) +{ + return (((iflags & S_SYNC) ? LUSTRE_SYNC_FL : 0) | + ((iflags & S_NOATIME) ? LUSTRE_NOATIME_FL : 0) | + ((iflags & S_APPEND) ? LUSTRE_APPEND_FL : 0) | +#if defined(S_DIRSYNC) + ((iflags & S_DIRSYNC) ? LUSTRE_DIRSYNC_FL : 0) | +#endif + ((iflags & S_IMMUTABLE) ? LUSTRE_IMMUTABLE_FL : 0)); +} + +struct mdt_body { + struct lu_fid fid1; + struct lu_fid fid2; + struct lustre_handle handle; + __u64 valid; + __u64 size; /* Offset, in the case of MDS_READPAGE */ + obd_time mtime; + obd_time atime; + obd_time ctime; + __u64 blocks; /* XID, in the case of MDS_READPAGE */ + __u64 ioepoch; + __u64 unused1; /* was "ino" until 2.4.0 */ + __u32 fsuid; + __u32 fsgid; + __u32 capability; + __u32 mode; + __u32 uid; + __u32 gid; + __u32 flags; /* from vfs for pin/unpin, LUSTRE_BFLAG close */ + __u32 rdev; + __u32 nlink; /* #bytes to read in the case of MDS_READPAGE */ + __u32 unused2; /* was "generation" until 2.4.0 */ + __u32 suppgid; + __u32 eadatasize; + __u32 aclsize; + __u32 max_mdsize; + __u32 max_cookiesize; + __u32 uid_h; /* high 32-bits of uid, for FUID */ + __u32 gid_h; /* high 32-bits of gid, for FUID */ + __u32 padding_5; /* also fix lustre_swab_mdt_body */ + __u64 padding_6; + __u64 padding_7; + __u64 padding_8; + __u64 padding_9; + __u64 padding_10; +}; /* 216 */ + +extern void lustre_swab_mdt_body (struct mdt_body *b); + +struct mdt_ioepoch { + struct lustre_handle handle; + __u64 ioepoch; + __u32 flags; + __u32 padding; +}; + +extern void lustre_swab_mdt_ioepoch (struct mdt_ioepoch *b); + +/* permissions for md_perm.mp_perm */ +enum { + CFS_SETUID_PERM = 0x01, + CFS_SETGID_PERM = 0x02, + CFS_SETGRP_PERM = 0x04, + CFS_RMTACL_PERM = 0x08, + CFS_RMTOWN_PERM = 0x10 +}; + +/* inode access permission for remote 
user, the inode info are omitted, + * for client knows them. */ +struct mdt_remote_perm { + __u32 rp_uid; + __u32 rp_gid; + __u32 rp_fsuid; + __u32 rp_fsuid_h; + __u32 rp_fsgid; + __u32 rp_fsgid_h; + __u32 rp_access_perm; /* MAY_READ/WRITE/EXEC */ + __u32 rp_padding; +}; + +extern void lustre_swab_mdt_remote_perm(struct mdt_remote_perm *p); + +struct mdt_rec_setattr { + __u32 sa_opcode; + __u32 sa_cap; + __u32 sa_fsuid; + __u32 sa_fsuid_h; + __u32 sa_fsgid; + __u32 sa_fsgid_h; + __u32 sa_suppgid; + __u32 sa_suppgid_h; + __u32 sa_padding_1; + __u32 sa_padding_1_h; + struct lu_fid sa_fid; + __u64 sa_valid; + __u32 sa_uid; + __u32 sa_gid; + __u64 sa_size; + __u64 sa_blocks; + obd_time sa_mtime; + obd_time sa_atime; + obd_time sa_ctime; + __u32 sa_attr_flags; + __u32 sa_mode; + __u32 sa_bias; /* some operation flags */ + __u32 sa_padding_3; + __u32 sa_padding_4; + __u32 sa_padding_5; +}; + +extern void lustre_swab_mdt_rec_setattr (struct mdt_rec_setattr *sa); + +/* + * Attribute flags used in mdt_rec_setattr::sa_valid. + * The kernel's #defines for ATTR_* should not be used over the network + * since the client and MDS may run different kernels (see bug 13828) + * Therefore, we should only use MDS_ATTR_* attributes for sa_valid. 
+ */ +#define MDS_ATTR_MODE 0x1ULL /* = 1 */ +#define MDS_ATTR_UID 0x2ULL /* = 2 */ +#define MDS_ATTR_GID 0x4ULL /* = 4 */ +#define MDS_ATTR_SIZE 0x8ULL /* = 8 */ +#define MDS_ATTR_ATIME 0x10ULL /* = 16 */ +#define MDS_ATTR_MTIME 0x20ULL /* = 32 */ +#define MDS_ATTR_CTIME 0x40ULL /* = 64 */ +#define MDS_ATTR_ATIME_SET 0x80ULL /* = 128 */ +#define MDS_ATTR_MTIME_SET 0x100ULL /* = 256 */ +#define MDS_ATTR_FORCE 0x200ULL /* = 512, Not a change, but a change it */ +#define MDS_ATTR_ATTR_FLAG 0x400ULL /* = 1024 */ +#define MDS_ATTR_KILL_SUID 0x800ULL /* = 2048 */ +#define MDS_ATTR_KILL_SGID 0x1000ULL /* = 4096 */ +#define MDS_ATTR_CTIME_SET 0x2000ULL /* = 8192 */ +#define MDS_ATTR_FROM_OPEN 0x4000ULL /* = 16384, called from open path, ie O_TRUNC */ +#define MDS_ATTR_BLOCKS 0x8000ULL /* = 32768 */ + +#ifndef FMODE_READ +#define FMODE_READ 00000001 +#define FMODE_WRITE 00000002 +#endif + +#define MDS_FMODE_CLOSED 00000000 +#define MDS_FMODE_EXEC 00000004 +/* IO Epoch is opened on a closed file. */ +#define MDS_FMODE_EPOCH 01000000 +/* IO Epoch is opened on a file truncate. */ +#define MDS_FMODE_TRUNC 02000000 +/* Size-on-MDS Attribute Update is pending. */ +#define MDS_FMODE_SOM 04000000 + +#define MDS_OPEN_CREATED 00000010 +#define MDS_OPEN_CROSS 00000020 + +#define MDS_OPEN_CREAT 00000100 +#define MDS_OPEN_EXCL 00000200 +#define MDS_OPEN_TRUNC 00001000 +#define MDS_OPEN_APPEND 00002000 +#define MDS_OPEN_SYNC 00010000 +#define MDS_OPEN_DIRECTORY 00200000 + +#define MDS_OPEN_BY_FID 040000000 /* open_by_fid for known object */ +#define MDS_OPEN_DELAY_CREATE 0100000000 /* delay initial object create */ +#define MDS_OPEN_OWNEROVERRIDE 0200000000 /* NFSD rw-reopen ro file for owner */ +#define MDS_OPEN_JOIN_FILE 0400000000 /* open for join file. + * We do not support JOIN FILE + * anymore, reserve this flags + * just for preventing such bit + * to be reused. 
*/ + +#define MDS_OPEN_LOCK 04000000000 /* This open requires open lock */ +#define MDS_OPEN_HAS_EA 010000000000 /* specify object create pattern */ +#define MDS_OPEN_HAS_OBJS 020000000000 /* Just set the EA the obj exist */ +#define MDS_OPEN_NORESTORE 0100000000000ULL /* Do not restore file at open */ +#define MDS_OPEN_NEWSTRIPE 0200000000000ULL /* New stripe needed (restripe or + * hsm restore) */ +#define MDS_OPEN_VOLATILE 0400000000000ULL /* File is volatile = created + unlinked */ + +/* permission for create non-directory file */ +#define MAY_CREATE (1 << 7) +/* permission for create directory file */ +#define MAY_LINK (1 << 8) +/* permission for delete from the directory */ +#define MAY_UNLINK (1 << 9) +/* source's permission for rename */ +#define MAY_RENAME_SRC (1 << 10) +/* target's permission for rename */ +#define MAY_RENAME_TAR (1 << 11) +/* part (parent's) VTX permission check */ +#define MAY_VTX_PART (1 << 12) +/* full VTX permission check */ +#define MAY_VTX_FULL (1 << 13) +/* lfs rgetfacl permission check */ +#define MAY_RGETFACL (1 << 14) + +enum { + MDS_CHECK_SPLIT = 1 << 0, + MDS_CROSS_REF = 1 << 1, + MDS_VTX_BYPASS = 1 << 2, + MDS_PERM_BYPASS = 1 << 3, + MDS_SOM = 1 << 4, + MDS_QUOTA_IGNORE = 1 << 5, + MDS_CLOSE_CLEANUP = 1 << 6, + MDS_KEEP_ORPHAN = 1 << 7, + MDS_RECOV_OPEN = 1 << 8, + MDS_DATA_MODIFIED = 1 << 9, + MDS_CREATE_VOLATILE = 1 << 10, + MDS_OWNEROVERRIDE = 1 << 11, +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_create { + __u32 cr_opcode; + __u32 cr_cap; + __u32 cr_fsuid; + __u32 cr_fsuid_h; + __u32 cr_fsgid; + __u32 cr_fsgid_h; + __u32 cr_suppgid1; + __u32 cr_suppgid1_h; + __u32 cr_suppgid2; + __u32 cr_suppgid2_h; + struct lu_fid cr_fid1; + struct lu_fid cr_fid2; + struct lustre_handle cr_old_handle; /* handle in case of open replay */ + obd_time cr_time; + __u64 cr_rdev; + __u64 cr_ioepoch; + __u64 cr_padding_1; /* rr_blocks */ + __u32 cr_mode; + __u32 cr_bias; + /* use of helpers set/get_mrc_cr_flags() is needed to access + 
* 64 bits cr_flags [cr_flags_l, cr_flags_h], this is done to + * extend cr_flags size without breaking 1.8 compat */ + __u32 cr_flags_l; /* for use with open, low 32 bits */ + __u32 cr_flags_h; /* for use with open, high 32 bits */ + __u32 cr_umask; /* umask for create */ + __u32 cr_padding_4; /* rr_padding_4 */ +}; + +static inline void set_mrc_cr_flags(struct mdt_rec_create *mrc, __u64 flags) +{ + mrc->cr_flags_l = (__u32)(flags & 0xFFFFFFFFUll); + mrc->cr_flags_h = (__u32)(flags >> 32); +} + +static inline __u64 get_mrc_cr_flags(struct mdt_rec_create *mrc) +{ + return ((__u64)(mrc->cr_flags_l) | ((__u64)mrc->cr_flags_h << 32)); +} + +/* instance of mdt_reint_rec */ +struct mdt_rec_link { + __u32 lk_opcode; + __u32 lk_cap; + __u32 lk_fsuid; + __u32 lk_fsuid_h; + __u32 lk_fsgid; + __u32 lk_fsgid_h; + __u32 lk_suppgid1; + __u32 lk_suppgid1_h; + __u32 lk_suppgid2; + __u32 lk_suppgid2_h; + struct lu_fid lk_fid1; + struct lu_fid lk_fid2; + obd_time lk_time; + __u64 lk_padding_1; /* rr_atime */ + __u64 lk_padding_2; /* rr_ctime */ + __u64 lk_padding_3; /* rr_size */ + __u64 lk_padding_4; /* rr_blocks */ + __u32 lk_bias; + __u32 lk_padding_5; /* rr_mode */ + __u32 lk_padding_6; /* rr_flags */ + __u32 lk_padding_7; /* rr_padding_2 */ + __u32 lk_padding_8; /* rr_padding_3 */ + __u32 lk_padding_9; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_unlink { + __u32 ul_opcode; + __u32 ul_cap; + __u32 ul_fsuid; + __u32 ul_fsuid_h; + __u32 ul_fsgid; + __u32 ul_fsgid_h; + __u32 ul_suppgid1; + __u32 ul_suppgid1_h; + __u32 ul_suppgid2; + __u32 ul_suppgid2_h; + struct lu_fid ul_fid1; + struct lu_fid ul_fid2; + obd_time ul_time; + __u64 ul_padding_2; /* rr_atime */ + __u64 ul_padding_3; /* rr_ctime */ + __u64 ul_padding_4; /* rr_size */ + __u64 ul_padding_5; /* rr_blocks */ + __u32 ul_bias; + __u32 ul_mode; + __u32 ul_padding_6; /* rr_flags */ + __u32 ul_padding_7; /* rr_padding_2 */ + __u32 ul_padding_8; /* rr_padding_3 */ + __u32 ul_padding_9; /* 
rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_rename { + __u32 rn_opcode; + __u32 rn_cap; + __u32 rn_fsuid; + __u32 rn_fsuid_h; + __u32 rn_fsgid; + __u32 rn_fsgid_h; + __u32 rn_suppgid1; + __u32 rn_suppgid1_h; + __u32 rn_suppgid2; + __u32 rn_suppgid2_h; + struct lu_fid rn_fid1; + struct lu_fid rn_fid2; + obd_time rn_time; + __u64 rn_padding_1; /* rr_atime */ + __u64 rn_padding_2; /* rr_ctime */ + __u64 rn_padding_3; /* rr_size */ + __u64 rn_padding_4; /* rr_blocks */ + __u32 rn_bias; /* some operation flags */ + __u32 rn_mode; /* cross-ref rename has mode */ + __u32 rn_padding_5; /* rr_flags */ + __u32 rn_padding_6; /* rr_padding_2 */ + __u32 rn_padding_7; /* rr_padding_3 */ + __u32 rn_padding_8; /* rr_padding_4 */ +}; + +/* instance of mdt_reint_rec */ +struct mdt_rec_setxattr { + __u32 sx_opcode; + __u32 sx_cap; + __u32 sx_fsuid; + __u32 sx_fsuid_h; + __u32 sx_fsgid; + __u32 sx_fsgid_h; + __u32 sx_suppgid1; + __u32 sx_suppgid1_h; + __u32 sx_suppgid2; + __u32 sx_suppgid2_h; + struct lu_fid sx_fid; + __u64 sx_padding_1; /* These three are rr_fid2 */ + __u32 sx_padding_2; + __u32 sx_padding_3; + __u64 sx_valid; + obd_time sx_time; + __u64 sx_padding_5; /* rr_ctime */ + __u64 sx_padding_6; /* rr_size */ + __u64 sx_padding_7; /* rr_blocks */ + __u32 sx_size; + __u32 sx_flags; + __u32 sx_padding_8; /* rr_flags */ + __u32 sx_padding_9; /* rr_padding_2 */ + __u32 sx_padding_10; /* rr_padding_3 */ + __u32 sx_padding_11; /* rr_padding_4 */ +}; + +/* + * mdt_rec_reint is the template for all mdt_reint_xxx structures. + * Do NOT change the size of various members, otherwise the value + * will be broken in lustre_swab_mdt_rec_reint(). + * + * If you add new members in other mdt_reint_xxx structres and need to use the + * rr_padding_x fields, then update lustre_swab_mdt_rec_reint() also. 
+ */ +struct mdt_rec_reint { + __u32 rr_opcode; + __u32 rr_cap; + __u32 rr_fsuid; + __u32 rr_fsuid_h; + __u32 rr_fsgid; + __u32 rr_fsgid_h; + __u32 rr_suppgid1; + __u32 rr_suppgid1_h; + __u32 rr_suppgid2; + __u32 rr_suppgid2_h; + struct lu_fid rr_fid1; + struct lu_fid rr_fid2; + obd_time rr_mtime; + obd_time rr_atime; + obd_time rr_ctime; + __u64 rr_size; + __u64 rr_blocks; + __u32 rr_bias; + __u32 rr_mode; + __u32 rr_flags; + __u32 rr_flags_h; + __u32 rr_umask; + __u32 rr_padding_4; /* also fix lustre_swab_mdt_rec_reint */ +}; + +extern void lustre_swab_mdt_rec_reint(struct mdt_rec_reint *rr); + +struct lmv_desc { + __u32 ld_tgt_count; /* how many MDS's */ + __u32 ld_active_tgt_count; /* how many active */ + __u32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default MEA_MAGIC_* */ + __u64 ld_default_hash_size; + __u64 ld_padding_1; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lmv_desc */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_3; /* also fix lustre_swab_lmv_desc */ + __u32 ld_padding_4; /* also fix lustre_swab_lmv_desc */ + struct obd_uuid ld_uuid; +}; + +extern void lustre_swab_lmv_desc (struct lmv_desc *ld); + +/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */ +struct lmv_stripe_md { + __u32 mea_magic; + __u32 mea_count; + __u32 mea_master; + __u32 mea_padding; + char mea_pool_name[LOV_MAXPOOLNAME]; + struct lu_fid mea_ids[0]; +}; + +extern void lustre_swab_lmv_stripe_md(struct lmv_stripe_md *mea); + +/* lmv structures */ +#define MEA_MAGIC_LAST_CHAR 0xb2221ca1 +#define MEA_MAGIC_ALL_CHARS 0xb222a11c +#define MEA_MAGIC_HASH_SEGMENT 0xb222a11b + +#define MAX_HASH_SIZE_32 0x7fffffffUL +#define MAX_HASH_SIZE 0x7fffffffffffffffULL +#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL + +enum fld_rpc_opc { + FLD_QUERY = 900, + FLD_LAST_OPC, + FLD_FIRST_OPC = FLD_QUERY +}; + +enum seq_rpc_opc { + SEQ_QUERY = 700, + SEQ_LAST_OPC, + SEQ_FIRST_OPC = 
SEQ_QUERY +}; + +enum seq_op { + SEQ_ALLOC_SUPER = 0, + SEQ_ALLOC_META = 1 +}; + +/* + * LOV data structures + */ + +#define LOV_MAX_UUID_BUFFER_SIZE 8192 +/* The size of the buffer the lov/mdc reserves for the + * array of UUIDs returned by the MDS. With the current + * protocol, this will limit the max number of OSTs per LOV */ + +#define LOV_DESC_MAGIC 0xB0CCDE5C + +/* LOV settings descriptor (should only contain static info) */ +struct lov_desc { + __u32 ld_tgt_count; /* how many OBD's */ + __u32 ld_active_tgt_count; /* how many active */ + __u32 ld_default_stripe_count; /* how many objects are used */ + __u32 ld_pattern; /* default PATTERN_RAID0 */ + __u64 ld_default_stripe_size; /* in bytes */ + __u64 ld_default_stripe_offset; /* in bytes */ + __u32 ld_padding_0; /* unused */ + __u32 ld_qos_maxage; /* in second */ + __u32 ld_padding_1; /* also fix lustre_swab_lov_desc */ + __u32 ld_padding_2; /* also fix lustre_swab_lov_desc */ + struct obd_uuid ld_uuid; +}; + +#define ld_magic ld_active_tgt_count /* for swabbing from llogs */ + +extern void lustre_swab_lov_desc (struct lov_desc *ld); + +/* + * LDLM requests: + */ +/* opcodes -- MUST be distinct from OST/MDS opcodes */ +typedef enum { + LDLM_ENQUEUE = 101, + LDLM_CONVERT = 102, + LDLM_CANCEL = 103, + LDLM_BL_CALLBACK = 104, + LDLM_CP_CALLBACK = 105, + LDLM_GL_CALLBACK = 106, + LDLM_SET_INFO = 107, + LDLM_LAST_OPC +} ldlm_cmd_t; +#define LDLM_FIRST_OPC LDLM_ENQUEUE + +#define RES_NAME_SIZE 4 +struct ldlm_res_id { + __u64 name[RES_NAME_SIZE]; +}; + +extern void lustre_swab_ldlm_res_id (struct ldlm_res_id *id); + +static inline int ldlm_res_eq(const struct ldlm_res_id *res0, + const struct ldlm_res_id *res1) +{ + return !memcmp(res0, res1, sizeof(*res0)); +} + +/* lock types */ +typedef enum { + LCK_MINMODE = 0, + LCK_EX = 1, + LCK_PW = 2, + LCK_PR = 4, + LCK_CW = 8, + LCK_CR = 16, + LCK_NL = 32, + LCK_GROUP = 64, + LCK_COS = 128, + LCK_MAXMODE +} ldlm_mode_t; + +#define LCK_MODE_NUM 8 + +typedef enum { + 
LDLM_PLAIN = 10, + LDLM_EXTENT = 11, + LDLM_FLOCK = 12, + LDLM_IBITS = 13, + LDLM_MAX_TYPE +} ldlm_type_t; + +#define LDLM_MIN_TYPE LDLM_PLAIN + +struct ldlm_extent { + __u64 start; + __u64 end; + __u64 gid; +}; + +static inline int ldlm_extent_overlap(struct ldlm_extent *ex1, + struct ldlm_extent *ex2) +{ + return (ex1->start <= ex2->end) && (ex2->start <= ex1->end); +} + +/* check if @ex1 contains @ex2 */ +static inline int ldlm_extent_contain(struct ldlm_extent *ex1, + struct ldlm_extent *ex2) +{ + return (ex1->start <= ex2->start) && (ex1->end >= ex2->end); +} + +struct ldlm_inodebits { + __u64 bits; +}; + +struct ldlm_flock_wire { + __u64 lfw_start; + __u64 lfw_end; + __u64 lfw_owner; + __u32 lfw_padding; + __u32 lfw_pid; +}; + +/* it's important that the fields of the ldlm_extent structure match + * the first fields of the ldlm_flock structure because there is only + * one ldlm_swab routine to process the ldlm_policy_data_t union. if + * this ever changes we will need to swab the union differently based + * on the resource type. 
*/ + +typedef union { + struct ldlm_extent l_extent; + struct ldlm_flock_wire l_flock; + struct ldlm_inodebits l_inodebits; +} ldlm_wire_policy_data_t; + +extern void lustre_swab_ldlm_policy_data (ldlm_wire_policy_data_t *d); + +union ldlm_gl_desc { + struct ldlm_gl_lquota_desc lquota_desc; +}; + +extern void lustre_swab_gl_desc(union ldlm_gl_desc *); + +struct ldlm_intent { + __u64 opc; +}; + +extern void lustre_swab_ldlm_intent (struct ldlm_intent *i); + +struct ldlm_resource_desc { + ldlm_type_t lr_type; + __u32 lr_padding; /* also fix lustre_swab_ldlm_resource_desc */ + struct ldlm_res_id lr_name; +}; + +extern void lustre_swab_ldlm_resource_desc (struct ldlm_resource_desc *r); + +struct ldlm_lock_desc { + struct ldlm_resource_desc l_resource; + ldlm_mode_t l_req_mode; + ldlm_mode_t l_granted_mode; + ldlm_wire_policy_data_t l_policy_data; +}; + +extern void lustre_swab_ldlm_lock_desc (struct ldlm_lock_desc *l); + +#define LDLM_LOCKREQ_HANDLES 2 +#define LDLM_ENQUEUE_CANCEL_OFF 1 + +struct ldlm_request { + __u32 lock_flags; + __u32 lock_count; + struct ldlm_lock_desc lock_desc; + struct lustre_handle lock_handle[LDLM_LOCKREQ_HANDLES]; +}; + +extern void lustre_swab_ldlm_request (struct ldlm_request *rq); + +/* If LDLM_ENQUEUE, 1 slot is already occupied, 1 is available. + * Otherwise, 2 are available. */ +#define ldlm_request_bufsize(count,type) \ +({ \ + int _avail = LDLM_LOCKREQ_HANDLES; \ + _avail -= (type == LDLM_ENQUEUE ? LDLM_ENQUEUE_CANCEL_OFF : 0); \ + sizeof(struct ldlm_request) + \ + (count > _avail ? 
count - _avail : 0) * \ + sizeof(struct lustre_handle); \ +}) + +struct ldlm_reply { + __u32 lock_flags; + __u32 lock_padding; /* also fix lustre_swab_ldlm_reply */ + struct ldlm_lock_desc lock_desc; + struct lustre_handle lock_handle; + __u64 lock_policy_res1; + __u64 lock_policy_res2; +}; + +extern void lustre_swab_ldlm_reply (struct ldlm_reply *r); + +#define ldlm_flags_to_wire(flags) ((__u32)(flags)) +#define ldlm_flags_from_wire(flags) ((__u64)(flags)) + +/* + * Opcodes for mountconf (mgs and mgc) + */ +typedef enum { + MGS_CONNECT = 250, + MGS_DISCONNECT, + MGS_EXCEPTION, /* node died, etc. */ + MGS_TARGET_REG, /* whenever target starts up */ + MGS_TARGET_DEL, + MGS_SET_INFO, + MGS_CONFIG_READ, + MGS_LAST_OPC +} mgs_cmd_t; +#define MGS_FIRST_OPC MGS_CONNECT + +#define MGS_PARAM_MAXLEN 1024 +#define KEY_SET_INFO "set_info" + +struct mgs_send_param { + char mgs_param[MGS_PARAM_MAXLEN]; +}; + +/* We pass this info to the MGS so it can write config logs */ +#define MTI_NAME_MAXLEN 64 +#define MTI_PARAM_MAXLEN 4096 +#define MTI_NIDS_MAX 32 +struct mgs_target_info { + __u32 mti_lustre_ver; + __u32 mti_stripe_index; + __u32 mti_config_ver; + __u32 mti_flags; + __u32 mti_nid_count; + __u32 mti_instance; /* Running instance of target */ + char mti_fsname[MTI_NAME_MAXLEN]; + char mti_svname[MTI_NAME_MAXLEN]; + char mti_uuid[sizeof(struct obd_uuid)]; + __u64 mti_nids[MTI_NIDS_MAX]; /* host nids (lnet_nid_t)*/ + char mti_params[MTI_PARAM_MAXLEN]; +}; +extern void lustre_swab_mgs_target_info(struct mgs_target_info *oinfo); + +struct mgs_nidtbl_entry { + __u64 mne_version; /* table version of this entry */ + __u32 mne_instance; /* target instance # */ + __u32 mne_index; /* target index */ + __u32 mne_length; /* length of this entry - by bytes */ + __u8 mne_type; /* target type LDD_F_SV_TYPE_OST/MDT */ + __u8 mne_nid_type; /* type of nid(mbz). for ipv6. 
*/ + __u8 mne_nid_size; /* size of each NID, by bytes */ + __u8 mne_nid_count; /* # of NIDs in buffer */ + union { + lnet_nid_t nids[0]; /* variable size buffer for NIDs. */ + } u; +}; +extern void lustre_swab_mgs_nidtbl_entry(struct mgs_nidtbl_entry *oinfo); + +struct mgs_config_body { + char mcb_name[MTI_NAME_MAXLEN]; /* logname */ + __u64 mcb_offset; /* next index of config log to request */ + __u16 mcb_type; /* type of log: CONFIG_T_[CONFIG|RECOVER] */ + __u8 mcb_reserved; + __u8 mcb_bits; /* bits unit size of config log */ + __u32 mcb_units; /* # of units for bulk transfer */ +}; +extern void lustre_swab_mgs_config_body(struct mgs_config_body *body); + +struct mgs_config_res { + __u64 mcr_offset; /* index of last config log */ + __u64 mcr_size; /* size of the log */ +}; +extern void lustre_swab_mgs_config_res(struct mgs_config_res *body); + +/* Config marker flags (in config log) */ +#define CM_START 0x01 +#define CM_END 0x02 +#define CM_SKIP 0x04 +#define CM_UPGRADE146 0x08 +#define CM_EXCLUDE 0x10 +#define CM_START_SKIP (CM_START | CM_SKIP) + +struct cfg_marker { + __u32 cm_step; /* aka config version */ + __u32 cm_flags; + __u32 cm_vers; /* lustre release version number */ + __u32 cm_padding; /* 64 bit align */ + obd_time cm_createtime; /*when this record was first created */ + obd_time cm_canceltime; /*when this record is no longer valid*/ + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +extern void lustre_swab_cfg_marker(struct cfg_marker *marker, + int swab, int size); + +/* + * Opcodes for multiple servers. 
+ */ + +typedef enum { + OBD_PING = 400, + OBD_LOG_CANCEL, + OBD_QC_CALLBACK, + OBD_IDX_READ, + OBD_LAST_OPC +} obd_cmd_t; +#define OBD_FIRST_OPC OBD_PING + +/* catalog of log objects */ + +/** Identifier for a single log object */ +struct llog_logid { + struct ost_id lgl_oi; + __u32 lgl_ogen; +} __attribute__((packed)); + +/** Records written to the CATALOGS list */ +#define CATLIST "CATALOGS" +struct llog_catid { + struct llog_logid lci_logid; + __u32 lci_padding1; + __u32 lci_padding2; + __u32 lci_padding3; +} __attribute__((packed)); + +/* Log data record types - there is no specific reason that these need to + * be related to the RPC opcodes, but no reason not to (may be handy later?) + */ +#define LLOG_OP_MAGIC 0x10600000 +#define LLOG_OP_MASK 0xfff00000 + +typedef enum { + LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0x00000, + OST_SZ_REC = LLOG_OP_MAGIC | 0x00f00, + /* OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, never used */ + MDS_UNLINK_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | + REINT_UNLINK, /* obsolete after 2.5.0 */ + MDS_UNLINK64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_UNLINK, + /* MDS_SETATTR_REC = LLOG_OP_MAGIC | 0x12401, obsolete 1.8.0 */ + MDS_SETATTR64_REC = LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | + REINT_SETATTR, + OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000, + /* PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000, obsolete 1.4.0 */ + LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000, + /* LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000, obsolete 1.8.0 */ + CHANGELOG_REC = LLOG_OP_MAGIC | 0x60000, + CHANGELOG_USER_REC = LLOG_OP_MAGIC | 0x70000, + LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539, + LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b, +} llog_op_type; + +#define LLOG_REC_HDR_NEEDS_SWABBING(r) \ + (((r)->lrh_type & __swab32(LLOG_OP_MASK)) == __swab32(LLOG_OP_MAGIC)) + +/** Log record header - stored in little endian order. + * Each record must start with this struct, end with a llog_rec_tail, + * and be a multiple of 256 bits in size. 
+ */ +struct llog_rec_hdr { + __u32 lrh_len; + __u32 lrh_index; + __u32 lrh_type; + __u32 lrh_id; +}; + +struct llog_rec_tail { + __u32 lrt_len; + __u32 lrt_index; +}; + +/* Where data follow just after header. + * The macro argument is parenthesized so that any pointer expression + * (e.g. "p + 1") expands with the intended precedence. */ +#define REC_DATA(ptr) \ + ((void *)((char *)(ptr) + sizeof(struct llog_rec_hdr))) + +/* lrh_len minus the sizes of the llog_rec_hdr and llog_rec_tail */ +#define REC_DATA_LEN(rec) \ + ((rec)->lrh_len - sizeof(struct llog_rec_hdr) - \ + sizeof(struct llog_rec_tail)) + +struct llog_logid_rec { + struct llog_rec_hdr lid_hdr; + struct llog_logid lid_id; + __u32 lid_padding1; + __u64 lid_padding2; + __u64 lid_padding3; + struct llog_rec_tail lid_tail; +} __attribute__((packed)); + +struct llog_unlink_rec { + struct llog_rec_hdr lur_hdr; + obd_id lur_oid; + obd_count lur_oseq; + obd_count lur_count; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_unlink64_rec { + struct llog_rec_hdr lur_hdr; + struct lu_fid lur_fid; + obd_count lur_count; /* to destroy the lost precreated */ + __u32 lur_padding1; + __u64 lur_padding2; + __u64 lur_padding3; + struct llog_rec_tail lur_tail; +} __attribute__((packed)); + +struct llog_setattr64_rec { + struct llog_rec_hdr lsr_hdr; + struct ost_id lsr_oi; + __u32 lsr_uid; + __u32 lsr_uid_h; + __u32 lsr_gid; + __u32 lsr_gid_h; + __u64 lsr_padding; + struct llog_rec_tail lsr_tail; +} __attribute__((packed)); + +struct llog_size_change_rec { + struct llog_rec_hdr lsc_hdr; + struct ll_fid lsc_fid; + __u32 lsc_ioepoch; + __u32 lsc_padding1; + __u64 lsc_padding2; + __u64 lsc_padding3; + struct llog_rec_tail lsc_tail; +} __attribute__((packed)); + +#define CHANGELOG_MAGIC 0xca103000 + +/** \a changelog_rec_type's that can't be masked */ +#define CHANGELOG_MINMASK (1 << CL_MARK) +/** bits covering all \a changelog_rec_type's */ +#define CHANGELOG_ALLMASK 0XFFFFFFFF +/** default \a changelog_rec_type mask: everything except CL_ATIME and + * CL_CLOSE; parenthesized so it composes safely inside larger expressions */ +#define CHANGELOG_DEFMASK (CHANGELOG_ALLMASK & ~(1 << CL_ATIME | 1 << CL_CLOSE)) + +/* changelog llog name, needed by client replicators */ +#define CHANGELOG_CATALOG
"changelog_catalog" + +struct changelog_setinfo { + __u64 cs_recno; + __u32 cs_id; +} __attribute__((packed)); + +/** changelog record */ +struct llog_changelog_rec { + struct llog_rec_hdr cr_hdr; + struct changelog_rec cr; + struct llog_rec_tail cr_tail; /**< for_sizeof_only */ +} __attribute__((packed)); + +struct llog_changelog_ext_rec { + struct llog_rec_hdr cr_hdr; + struct changelog_ext_rec cr; + struct llog_rec_tail cr_tail; /**< for_sizeof_only */ +} __attribute__((packed)); + +#define CHANGELOG_USER_PREFIX "cl" + +struct llog_changelog_user_rec { + struct llog_rec_hdr cur_hdr; + __u32 cur_id; + __u32 cur_padding; + __u64 cur_endrec; + struct llog_rec_tail cur_tail; +} __attribute__((packed)); + +/* Old llog gen for compatibility */ +struct llog_gen { + __u64 mnt_cnt; + __u64 conn_cnt; +} __attribute__((packed)); + +struct llog_gen_rec { + struct llog_rec_hdr lgr_hdr; + struct llog_gen lgr_gen; + __u64 padding1; + __u64 padding2; + __u64 padding3; + struct llog_rec_tail lgr_tail; +}; + +/* On-disk header structure of each log object, stored in little endian order */ +#define LLOG_CHUNK_SIZE 8192 +#define LLOG_HEADER_SIZE (96) +#define LLOG_BITMAP_BYTES (LLOG_CHUNK_SIZE - LLOG_HEADER_SIZE) + +#define LLOG_MIN_REC_SIZE (24) /* round(llog_rec_hdr + llog_rec_tail) */ + +/* flags for the logs */ +enum llog_flag { + LLOG_F_ZAP_WHEN_EMPTY = 0x1, + LLOG_F_IS_CAT = 0x2, + LLOG_F_IS_PLAIN = 0x4, +}; + +struct llog_log_hdr { + struct llog_rec_hdr llh_hdr; + obd_time llh_timestamp; + __u32 llh_count; + __u32 llh_bitmap_offset; + __u32 llh_size; + __u32 llh_flags; + __u32 llh_cat_idx; + /* for a catalog the first plain slot is next to it */ + struct obd_uuid llh_tgtuuid; + __u32 llh_reserved[LLOG_HEADER_SIZE/sizeof(__u32) - 23]; + __u32 llh_bitmap[LLOG_BITMAP_BYTES/sizeof(__u32)]; + struct llog_rec_tail llh_tail; +} __attribute__((packed)); + +/* Number of bitmap bits: (lrh_len - llh_bitmap_offset - sizeof(llh_tail)) + * bytes, times 8. The argument and the whole expansion are parenthesized. */ +#define LLOG_BITMAP_SIZE(llh) ((__u32)(((llh)->llh_hdr.lrh_len - \ + (llh)->llh_bitmap_offset - \ + sizeof((llh)->llh_tail)) * 8)) +
+/** log cookies are used to reference a specific log file and a record therein */ +struct llog_cookie { + struct llog_logid lgc_lgl; + __u32 lgc_subsys; + __u32 lgc_index; + __u32 lgc_padding; +} __attribute__((packed)); + +/** llog protocol */ +enum llogd_rpc_ops { + LLOG_ORIGIN_HANDLE_CREATE = 501, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502, + LLOG_ORIGIN_HANDLE_READ_HEADER = 503, + LLOG_ORIGIN_HANDLE_WRITE_REC = 504, + LLOG_ORIGIN_HANDLE_CLOSE = 505, + LLOG_ORIGIN_CONNECT = 506, + LLOG_CATINFO = 507, /* deprecated */ + LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, + LLOG_ORIGIN_HANDLE_DESTROY = 509, /* for destroy llog object*/ + LLOG_LAST_OPC, + LLOG_FIRST_OPC = LLOG_ORIGIN_HANDLE_CREATE +}; + +struct llogd_body { + struct llog_logid lgd_logid; + __u32 lgd_ctxt_idx; + __u32 lgd_llh_flags; + __u32 lgd_index; + __u32 lgd_saved_index; + __u32 lgd_len; + __u64 lgd_cur_offset; +} __attribute__((packed)); + +struct llogd_conn_body { + struct llog_gen lgdc_gen; + struct llog_logid lgdc_logid; + __u32 lgdc_ctxt_idx; +} __attribute__((packed)); + +/* Note: 64-bit types are 64-bit aligned in structure */ +struct obdo { + obd_valid o_valid; /* hot fields in this obdo */ + struct ost_id o_oi; + obd_id o_parent_seq; + obd_size o_size; /* o_size-o_blocks == ost_lvb */ + obd_time o_mtime; + obd_time o_atime; + obd_time o_ctime; + obd_blocks o_blocks; /* brw: cli sent cached bytes */ + obd_size o_grant; + + /* 32-bit fields start here: keep an even number of them via padding */ + obd_blksize o_blksize; /* optimal IO blocksize */ + obd_mode o_mode; /* brw: cli sent cache remain */ + obd_uid o_uid; + obd_gid o_gid; + obd_flag o_flags; + obd_count o_nlink; /* brw: checksum */ + obd_count o_parent_oid; + obd_count o_misc; /* brw: o_dropped */ + + __u64 o_ioepoch; /* epoch in ost writes */ + __u32 o_stripe_idx; /* holds stripe idx */ + __u32 o_parent_ver; + struct lustre_handle o_handle; /* brw: lock handle to prolong + * locks */ + struct llog_cookie o_lcookie; /* destroy: unlink cookie from + 
* MDS */ + __u32 o_uid_h; + __u32 o_gid_h; + + __u64 o_data_version; /* getattr: sum of iversion for + * each stripe. + * brw: grant space consumed on + * the client for the write */ + __u64 o_padding_4; + __u64 o_padding_5; + __u64 o_padding_6; +}; + +#define o_dirty o_blocks +#define o_undirty o_mode +#define o_dropped o_misc +#define o_cksum o_nlink +#define o_grant_used o_data_version + +static inline void lustre_set_wire_obdo(struct obd_connect_data *ocd, + struct obdo *wobdo, struct obdo *lobdo) +{ + memcpy(wobdo, lobdo, sizeof(*lobdo)); + wobdo->o_flags &= ~OBD_FL_LOCAL_MASK; + if (ocd == NULL) + return; + + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && + fid_seq_is_echo(ostid_seq(&lobdo->o_oi))) { + /* Currently OBD_FL_OSTID will only be used when 2.4 echo + * client communicate with pre-2.4 server */ + wobdo->o_oi.oi.oi_id = fid_oid(&lobdo->o_oi.oi_fid); + wobdo->o_oi.oi.oi_seq = fid_seq(&lobdo->o_oi.oi_fid); + } +} + +static inline void lustre_get_wire_obdo(struct obd_connect_data *ocd, + struct obdo *lobdo, struct obdo *wobdo) +{ + obd_flag local_flags = 0; + + if (lobdo->o_valid & OBD_MD_FLFLAGS) + local_flags = lobdo->o_flags & OBD_FL_LOCAL_MASK; + + LASSERT(!(wobdo->o_flags & OBD_FL_LOCAL_MASK)); + + memcpy(lobdo, wobdo, sizeof(*lobdo)); + if (local_flags != 0) { + lobdo->o_valid |= OBD_MD_FLFLAGS; + lobdo->o_flags &= ~OBD_FL_LOCAL_MASK; + lobdo->o_flags |= local_flags; + } + if (ocd == NULL) + return; + + if (unlikely(!(ocd->ocd_connect_flags & OBD_CONNECT_FID)) && + fid_seq_is_echo(wobdo->o_oi.oi.oi_seq)) { + /* see above */ + lobdo->o_oi.oi_fid.f_seq = wobdo->o_oi.oi.oi_seq; + lobdo->o_oi.oi_fid.f_oid = wobdo->o_oi.oi.oi_id; + lobdo->o_oi.oi_fid.f_ver = 0; + } +} + +extern void lustre_swab_obdo (struct obdo *o); + +/* request structure for OST's */ +struct ost_body { + struct obdo oa; +}; + +/* Key for FIEMAP to be used in get_info calls */ +struct ll_fiemap_info_key { + char name[8]; + struct obdo oa; + struct ll_user_fiemap fiemap; 
+}; + +extern void lustre_swab_ost_body (struct ost_body *b); +extern void lustre_swab_ost_last_id(obd_id *id); +extern void lustre_swab_fiemap(struct ll_user_fiemap *fiemap); + +extern void lustre_swab_lov_user_md_v1(struct lov_user_md_v1 *lum); +extern void lustre_swab_lov_user_md_v3(struct lov_user_md_v3 *lum); +extern void lustre_swab_lov_user_md_objects(struct lov_user_ost_data *lod, + int stripe_count); +extern void lustre_swab_lov_mds_md(struct lov_mds_md *lmm); + +/* llog_swab.c */ +extern void lustre_swab_llogd_body (struct llogd_body *d); +extern void lustre_swab_llog_hdr (struct llog_log_hdr *h); +extern void lustre_swab_llogd_conn_body (struct llogd_conn_body *d); +extern void lustre_swab_llog_rec(struct llog_rec_hdr *rec); +extern void lustre_swab_llog_id(struct llog_logid *lid); + +struct lustre_cfg; +extern void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg); + +/* Functions for dumping PTLRPC fields */ +void dump_rniobuf(struct niobuf_remote *rnb); +void dump_ioo(struct obd_ioobj *nb); +void dump_obdo(struct obdo *oa); +void dump_ost_body(struct ost_body *ob); +void dump_rcs(__u32 *rc); + +#define IDX_INFO_MAGIC 0x3D37CC37 + +/* Index file transfer through the network. The server serializes the index into + * a byte stream which is sent to the client via a bulk transfer */ +struct idx_info { + __u32 ii_magic; + + /* reply: see idx_info_flags below */ + __u32 ii_flags; + + /* request & reply: number of lu_idxpage (to be) transferred */ + __u16 ii_count; + __u16 ii_pad0; + + /* request: requested attributes passed down to the iterator API */ + __u32 ii_attrs; + + /* request & reply: index file identifier (FID) */ + struct lu_fid ii_fid; + + /* reply: version of the index file before starting to walk the index. 
+ * Please note that the version can be modified at any time during the + * transfer */ + __u64 ii_version; + + /* request: hash to start with: + * reply: hash of the first entry of the first lu_idxpage and hash + * of the entry to read next if any */ + __u64 ii_hash_start; + __u64 ii_hash_end; + + /* reply: size of keys in lu_idxpages, minimal one if II_FL_VARKEY is + * set */ + __u16 ii_keysize; + + /* reply: size of records in lu_idxpages, minimal one if II_FL_VARREC + * is set */ + __u16 ii_recsize; + + __u32 ii_pad1; + __u64 ii_pad2; + __u64 ii_pad3; +}; +extern void lustre_swab_idx_info(struct idx_info *ii); + +#define II_END_OFF MDS_DIR_END_OFF /* all entries have been read */ + +/* List of flags used in idx_info::ii_flags */ +enum idx_info_flags { + II_FL_NOHASH = 1 << 0, /* client doesn't care about hash value */ + II_FL_VARKEY = 1 << 1, /* keys can be of variable size */ + II_FL_VARREC = 1 << 2, /* records can be of variable size */ + II_FL_NONUNQ = 1 << 3, /* index supports non-unique keys */ +}; + +#define LIP_MAGIC 0x8A6D6B6C + +/* 4KB (= LU_PAGE_SIZE) container gathering key/record pairs */ +struct lu_idxpage { + /* 16-byte header */ + __u32 lip_magic; + __u16 lip_flags; + __u16 lip_nr; /* number of entries in the container */ + __u64 lip_pad0; /* additional padding for future use */ + + /* key/record pairs are stored in the remaining 4080 bytes. + * depending upon the flags in idx_info::ii_flags, each key/record + * pair might be preceded by: + * - a hash value + * - the key size (II_FL_VARKEY is set) + * - the record size (II_FL_VARREC is set) + * + * For the time being, we only support fixed-size key & record. 
*/ + char lip_entries[0]; +}; +extern void lustre_swab_lip_header(struct lu_idxpage *lip); + +#define LIP_HDR_SIZE (offsetof(struct lu_idxpage, lip_entries)) + +/* Gather all possible type associated with a 4KB container */ +union lu_page { + struct lu_dirpage lp_dir; /* for MDS_READPAGE */ + struct lu_idxpage lp_idx; /* for OBD_IDX_READ */ + char lp_array[LU_PAGE_SIZE]; +}; + +/* security opcodes */ +typedef enum { + SEC_CTX_INIT = 801, + SEC_CTX_INIT_CONT = 802, + SEC_CTX_FINI = 803, + SEC_LAST_OPC, + SEC_FIRST_OPC = SEC_CTX_INIT +} sec_cmd_t; + +/* + * capa related definitions + */ +#define CAPA_HMAC_MAX_LEN 64 +#define CAPA_HMAC_KEY_MAX_LEN 56 + +/* NB take care when changing the sequence of elements this struct, + * because the offset info is used in find_capa() */ +struct lustre_capa { + struct lu_fid lc_fid; /** fid */ + __u64 lc_opc; /** operations allowed */ + __u64 lc_uid; /** file owner */ + __u64 lc_gid; /** file group */ + __u32 lc_flags; /** HMAC algorithm & flags */ + __u32 lc_keyid; /** key# used for the capability */ + __u32 lc_timeout; /** capa timeout value (sec) */ + __u32 lc_expiry; /** expiry time (sec) */ + __u8 lc_hmac[CAPA_HMAC_MAX_LEN]; /** HMAC */ +} __attribute__((packed)); + +extern void lustre_swab_lustre_capa(struct lustre_capa *c); + +/** lustre_capa::lc_opc */ +enum { + CAPA_OPC_BODY_WRITE = 1<<0, /**< write object data */ + CAPA_OPC_BODY_READ = 1<<1, /**< read object data */ + CAPA_OPC_INDEX_LOOKUP = 1<<2, /**< lookup object fid */ + CAPA_OPC_INDEX_INSERT = 1<<3, /**< insert object fid */ + CAPA_OPC_INDEX_DELETE = 1<<4, /**< delete object fid */ + CAPA_OPC_OSS_WRITE = 1<<5, /**< write oss object data */ + CAPA_OPC_OSS_READ = 1<<6, /**< read oss object data */ + CAPA_OPC_OSS_TRUNC = 1<<7, /**< truncate oss object */ + CAPA_OPC_OSS_DESTROY = 1<<8, /**< destroy oss object */ + CAPA_OPC_META_WRITE = 1<<9, /**< write object meta data */ + CAPA_OPC_META_READ = 1<<10, /**< read object meta data */ +}; + +#define CAPA_OPC_OSS_RW 
(CAPA_OPC_OSS_READ | CAPA_OPC_OSS_WRITE) +#define CAPA_OPC_MDS_ONLY \ + (CAPA_OPC_BODY_WRITE | CAPA_OPC_BODY_READ | CAPA_OPC_INDEX_LOOKUP | \ + CAPA_OPC_INDEX_INSERT | CAPA_OPC_INDEX_DELETE) +#define CAPA_OPC_OSS_ONLY \ + (CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ | CAPA_OPC_OSS_TRUNC | \ + CAPA_OPC_OSS_DESTROY) +#define CAPA_OPC_MDS_DEFAULT ~CAPA_OPC_OSS_ONLY +#define CAPA_OPC_OSS_DEFAULT ~(CAPA_OPC_MDS_ONLY | CAPA_OPC_OSS_ONLY) + +/* MDS capability covers object capability for operations of body r/w + * (dir readpage/sendpage), index lookup/insert/delete and meta data r/w, + * while OSS capability only covers object capability for operations of + * oss data(file content) r/w/truncate. + */ +static inline int capa_for_mds(struct lustre_capa *c) +{ + return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) != 0; +} + +static inline int capa_for_oss(struct lustre_capa *c) +{ + return (c->lc_opc & CAPA_OPC_INDEX_LOOKUP) == 0; +} + +/* lustre_capa::lc_hmac_alg */ +enum { + CAPA_HMAC_ALG_SHA1 = 1, /**< sha1 algorithm */ + CAPA_HMAC_ALG_MAX, +}; + +#define CAPA_FL_MASK 0x00ffffff +#define CAPA_HMAC_ALG_MASK 0xff000000 + +struct lustre_capa_key { + __u64 lk_seq; /**< mds# */ + __u32 lk_keyid; /**< key# */ + __u32 lk_padding; + __u8 lk_key[CAPA_HMAC_KEY_MAX_LEN]; /**< key */ +} __attribute__((packed)); + +extern void lustre_swab_lustre_capa_key(struct lustre_capa_key *k); + +/** The link ea holds 1 \a link_ea_entry for each hardlink */ +#define LINK_EA_MAGIC 0x11EAF1DFUL +struct link_ea_header { + __u32 leh_magic; + __u32 leh_reccount; + __u64 leh_len; /* total size */ + /* future use */ + __u32 padding1; + __u32 padding2; +}; + +/** Hardlink data is name and parent fid. 
+ * Stored in this crazy struct for maximum packing and endian-neutrality + */ +struct link_ea_entry { + /** __u16 stored big-endian, unaligned */ + unsigned char lee_reclen[2]; + unsigned char lee_parent_fid[sizeof(struct lu_fid)]; + char lee_name[0]; +}__attribute__((packed)); + +/** fid2path request/reply structure */ +struct getinfo_fid2path { + struct lu_fid gf_fid; + __u64 gf_recno; + __u32 gf_linkno; + __u32 gf_pathlen; + char gf_path[0]; +} __attribute__((packed)); + +void lustre_swab_fid2path (struct getinfo_fid2path *gf); + +enum { + LAYOUT_INTENT_ACCESS = 0, + LAYOUT_INTENT_READ = 1, + LAYOUT_INTENT_WRITE = 2, + LAYOUT_INTENT_GLIMPSE = 3, + LAYOUT_INTENT_TRUNC = 4, + LAYOUT_INTENT_RELEASE = 5, + LAYOUT_INTENT_RESTORE = 6 +}; + +/* enqueue layout lock with intent */ +struct layout_intent { + __u32 li_opc; /* intent operation for enqueue, read, write etc */ + __u32 li_flags; + __u64 li_start; + __u64 li_end; +}; + +void lustre_swab_layout_intent(struct layout_intent *li); + +/** + * On the wire version of hsm_progress structure. + * + * Contains the userspace hsm_progress and some internal fields. 
+ */ +struct hsm_progress_kernel { + /* Field taken from struct hsm_progress */ + lustre_fid hpk_fid; + __u64 hpk_cookie; + struct hsm_extent hpk_extent; + __u16 hpk_flags; + __u16 hpk_errval; /* positive val */ + __u32 hpk_padding1; + /* Additional fields */ + __u64 hpk_data_version; + __u64 hpk_padding2; +} __attribute__((packed)); + +extern void lustre_swab_hsm_user_state(struct hsm_user_state *hus); +extern void lustre_swab_hsm_current_action(struct hsm_current_action *action); +extern void lustre_swab_hsm_progress_kernel(struct hsm_progress_kernel *hpk); +extern void lustre_swab_hsm_user_item(struct hsm_user_item *hui); +extern void lustre_swab_hsm_request(struct hsm_request *hr); + +/** + * These are the object update opcodes under UPDATE_OBJ, which is currently + * being used by cross-ref operations between MDTs. + * + * During the cross-ref operation, the Master MDT, to which the client sends + * the request, will disassemble the operation into object updates, then OSP + * will send these updates to the remote MDT to be executed. + * + * Update request format + * magic: UPDATE_BUFFER_MAGIC_V1 + * Count: How many updates in the req. + * bufs[0] : following are packets of object. + * update[0]: + * type: object_update_op, the op code of update + * fid: The object fid of the update. + * lens/bufs: other parameters of the update. + * update[1]: + * type: object_update_op, the op code of update + * fid: The object fid of the update. + * lens/bufs: other parameters of the update. + * .......... + * update[7]: type: object_update_op, the op code of update + * fid: The object fid of the update. + * lens/bufs: other parameters of the update. + * Current 8 maxim updates per object update request.
+ * + ******************************************************************* + * update reply format: + * + * ur_version: UPDATE_REPLY_V1 + * ur_count: The count of the reply, which is usually equal + * to the number of updates in the request. + * ur_lens: The reply lengths of each object update. + * + * replies: 1st update reply [4bytes_ret: other body] + * 2nd update reply [4bytes_ret: other body] + * ..... + * nth update reply [4bytes_ret: other body] + * + * For each reply of the update, the format would be + * result(4 bytes):Other stuff + */ + +#define UPDATE_MAX_OPS 10 +#define UPDATE_BUFFER_MAGIC_V1 0xBDDE0001 +#define UPDATE_BUFFER_MAGIC UPDATE_BUFFER_MAGIC_V1 +#define UPDATE_BUF_COUNT 8 +enum object_update_op { + OBJ_CREATE = 1, + OBJ_DESTROY = 2, + OBJ_REF_ADD = 3, + OBJ_REF_DEL = 4, + OBJ_ATTR_SET = 5, + OBJ_ATTR_GET = 6, + OBJ_XATTR_SET = 7, + OBJ_XATTR_GET = 8, + OBJ_INDEX_LOOKUP = 9, + OBJ_INDEX_INSERT = 10, + OBJ_INDEX_DELETE = 11, + OBJ_LAST +}; + +struct update { + __u32 u_type; + __u32 u_batchid; + struct lu_fid u_fid; + __u32 u_lens[UPDATE_BUF_COUNT]; + __u32 u_bufs[0]; +}; + +struct update_buf { + __u32 ub_magic; + __u32 ub_count; + __u32 ub_bufs[0]; +}; + +#define UPDATE_REPLY_V1 0x00BD0001 +struct update_reply { + __u32 ur_version; + __u32 ur_count; + __u32 ur_lens[0]; +}; + +void lustre_swab_update_buf(struct update_buf *ub); +void lustre_swab_update_reply_buf(struct update_reply *ur); + +/** layout swap request structure + * fid1 and fid2 are in mdt_body + */ +struct mdc_swap_layouts { + __u64 msl_flags; +} __packed; + +void lustre_swab_swap_layouts(struct mdc_swap_layouts *msl); + +#endif +/** @} lustreidl */ diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h new file mode 100644 index 000000000000..1c87a61a7fc1 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/lustre_lfsck_user.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO 
NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, Intel Corporation. + */ +/* + * lustre/include/lustre/lustre_lfsck_user.h + * + * Lustre LFSCK userspace interfaces. + * + * Author: Fan Yong <yong.fan@whamcloud.com> + */ + +#ifndef _LUSTRE_LFSCK_USER_H +# define _LUSTRE_LFSCK_USER_H + +enum lfsck_param_flags { + /* Reset LFSCK iterator position to the device beginning. */ + LPF_RESET = 0x0001, + + /* Exit when fail. */ + LPF_FAILOUT = 0x0002, + + /* Dryrun mode, only check without modification */ + LPF_DRYRUN = 0x0004, +}; + +enum lfsck_type { + /* For MDT-OST consistency check/repair. */ + LT_LAYOUT = 0x0001, + + /* For MDT-MDT consistency check/repair. */ + LT_DNE = 0x0002, + + /* For FID-in-dirent and linkEA consistency check/repair. 
*/ + LT_NAMESPACE = 0x0004, +}; + +#define LFSCK_VERSION_V1 1 +#define LFSCK_VERSION_V2 2 + +#define LFSCK_TYPES_ALL ((__u16)(~0)) +#define LFSCK_TYPES_DEF ((__u16)0) +#define LFSCK_TYPES_SUPPORTED LT_NAMESPACE + +#define LFSCK_SPEED_NO_LIMIT 0 +#define LFSCK_SPEED_LIMIT_DEF LFSCK_SPEED_NO_LIMIT + +enum lfsck_start_valid { + LSV_SPEED_LIMIT = 0x00000001, + LSV_ERROR_HANDLE = 0x00000002, + LSV_DRYRUN = 0x00000004, +}; + +/* Arguments for starting lfsck. */ +struct lfsck_start { + /* Which arguments are valid, see 'enum lfsck_start_valid'. */ + __u32 ls_valid; + + /* How many items can be scanned at most per second. */ + __u32 ls_speed_limit; + + /* For compatibility between user space tools and kernel service. */ + __u16 ls_version; + + /* Which LFSCK components to be (have been) started. */ + __u16 ls_active; + + /* Flags for the LFSCK, see 'enum lfsck_param_flags'. */ + __u16 ls_flags; + + /* For 64-bits aligned. */ + __u16 ls_padding; +}; + +#endif /* _LUSTRE_LFSCK_USER_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre/lustre_user.h b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h new file mode 100644 index 000000000000..7e9f57507f04 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/lustre_user.h @@ -0,0 +1,1145 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre/lustre_user.h + * + * Lustre public user-space interface definitions. + */ + +#ifndef _LUSTRE_USER_H +#define _LUSTRE_USER_H + +/** \defgroup lustreuser lustreuser + * + * @{ + */ + +#include <lustre/ll_fiemap.h> +#include <linux/lustre_user.h> + +/* for statfs() */ +#define LL_SUPER_MAGIC 0x0BD00BD0 + +#ifndef FSFILT_IOC_GETFLAGS +#define FSFILT_IOC_GETFLAGS _IOR('f', 1, long) +#define FSFILT_IOC_SETFLAGS _IOW('f', 2, long) +#define FSFILT_IOC_GETVERSION _IOR('f', 3, long) +#define FSFILT_IOC_SETVERSION _IOW('f', 4, long) +#define FSFILT_IOC_GETVERSION_OLD _IOR('v', 1, long) +#define FSFILT_IOC_SETVERSION_OLD _IOW('v', 2, long) +#define FSFILT_IOC_FIEMAP _IOWR('f', 11, struct ll_user_fiemap) +#endif + +/* FIEMAP flags supported by Lustre */ +#define LUSTRE_FIEMAP_FLAGS_COMPAT (FIEMAP_FLAG_SYNC | FIEMAP_FLAG_DEVICE_ORDER) + +enum obd_statfs_state { + OS_STATE_DEGRADED = 0x00000001, /**< RAID degraded/rebuilding */ + OS_STATE_READONLY = 0x00000002, /**< filesystem is read-only */ + OS_STATE_RDONLY_1 = 0x00000004, /**< obsolete 1.6, was EROFS=30 */ + OS_STATE_RDONLY_2 = 0x00000008, /**< obsolete 1.6, was EROFS=30 */ + OS_STATE_RDONLY_3 = 0x00000010, /**< obsolete 1.6, was EROFS=30 */ +}; + +struct obd_statfs { + __u64 os_type; + __u64 os_blocks; + __u64 
os_bfree; + __u64 os_bavail; + __u64 os_files; + __u64 os_ffree; + __u8 os_fsid[40]; + __u32 os_bsize; + __u32 os_namelen; + __u64 os_maxbytes; + __u32 os_state; /**< obd_statfs_state OS_STATE_* flag */ + __u32 os_fprecreated; /* objs available now to the caller */ + /* used in QoS code to find preferred + * OSTs */ + __u32 os_spare2; + __u32 os_spare3; + __u32 os_spare4; + __u32 os_spare5; + __u32 os_spare6; + __u32 os_spare7; + __u32 os_spare8; + __u32 os_spare9; +}; + +/** + * File IDentifier. + * + * FID is a cluster-wide unique identifier of a file or an object (stripe). + * FIDs are never reused. + **/ +struct lu_fid { + /** + * FID sequence. Sequence is a unit of migration: all files (objects) + * with FIDs from a given sequence are stored on the same server. + * Lustre should support 2^64 objects, so even if each sequence + * has only a single object we can still enumerate 2^64 objects. + **/ + __u64 f_seq; + /* FID number within sequence. */ + __u32 f_oid; + /** + * FID version, used to distinguish different versions (in the sense + * of snapshots, etc.) of the same file system object. Not currently + * used. + **/ + __u32 f_ver; +}; + +struct filter_fid { + struct lu_fid ff_parent; /* ff_parent.f_ver == file stripe number */ +}; + +/* keep this one for compatibility */ +struct filter_fid_old { + struct lu_fid ff_parent; + __u64 ff_objid; + __u64 ff_seq; +}; + +/* Userspace should treat lu_fid as opaque, and only use the following methods + * to print or parse them. Other functions (e.g. compare, swab) could be moved + * here from lustre_idl.h if needed. */ +typedef struct lu_fid lustre_fid; + +/** + * Following struct for object attributes, that will be kept inode's EA. + * Introduced in 2.0 release (please see b15993, for details) + * Added to all objects since Lustre 2.4 as contains self FID + */ +struct lustre_mdt_attrs { + /** + * Bitfield for supported data in this structure. From enum lma_compat. + * lma_self_fid and lma_flags are always available. 
+ */ + __u32 lma_compat; + /** + * Per-file incompat feature list. Lustre version should support all + * flags set in this field. The supported feature mask is available in + * LMA_INCOMPAT_SUPP. + */ + __u32 lma_incompat; + /** FID of this inode */ + struct lu_fid lma_self_fid; +}; + +/** + * Prior to 2.4, the LMA structure also included SOM attributes which has since + * been moved to a dedicated xattr + * lma_flags was also removed because of lma_compat/incompat fields. + */ +#define LMA_OLD_SIZE (sizeof(struct lustre_mdt_attrs) + 5 * sizeof(__u64)) + +/** + * OST object IDentifier. + */ +struct ost_id { + union { + struct ostid { + __u64 oi_id; + __u64 oi_seq; + } oi; + struct lu_fid oi_fid; + }; +}; + +#define DOSTID LPX64":"LPU64 +#define POSTID(oi) ostid_seq(oi), ostid_id(oi) + +/* + * The ioctl naming rules: + * LL_* - works on the currently opened filehandle instead of parent dir + * *_OBD_* - gets data for both OSC or MDC (LOV, LMV indirectly) + * *_MDC_* - gets/sets data related to MDC + * *_LOV_* - gets/sets data related to OSC/LOV + * *FILE* - called on parent dir and passes in a filename + * *STRIPE* - set/get lov_user_md + * *INFO - set/get lov_user_mds_data + */ +/* see <lustre_lib.h> for ioctl numberss 101-150 */ +#define LL_IOC_GETFLAGS _IOR ('f', 151, long) +#define LL_IOC_SETFLAGS _IOW ('f', 152, long) +#define LL_IOC_CLRFLAGS _IOW ('f', 153, long) +/* LL_IOC_LOV_SETSTRIPE: See also OBD_IOC_LOV_SETSTRIPE */ +#define LL_IOC_LOV_SETSTRIPE _IOW ('f', 154, long) +/* LL_IOC_LOV_GETSTRIPE: See also OBD_IOC_LOV_GETSTRIPE */ +#define LL_IOC_LOV_GETSTRIPE _IOW ('f', 155, long) +/* LL_IOC_LOV_SETEA: See also OBD_IOC_LOV_SETEA */ +#define LL_IOC_LOV_SETEA _IOW ('f', 156, long) +#define LL_IOC_RECREATE_OBJ _IOW ('f', 157, long) +#define LL_IOC_RECREATE_FID _IOW ('f', 157, struct lu_fid) +#define LL_IOC_GROUP_LOCK _IOW ('f', 158, long) +#define LL_IOC_GROUP_UNLOCK _IOW ('f', 159, long) +/* LL_IOC_QUOTACHECK: See also OBD_IOC_QUOTACHECK */ +#define 
LL_IOC_QUOTACHECK _IOW ('f', 160, int) +/* LL_IOC_POLL_QUOTACHECK: See also OBD_IOC_POLL_QUOTACHECK */ +#define LL_IOC_POLL_QUOTACHECK _IOR ('f', 161, struct if_quotacheck *) +/* LL_IOC_QUOTACTL: See also OBD_IOC_QUOTACTL */ +#define LL_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) +#define IOC_OBD_STATFS _IOWR('f', 164, struct obd_statfs *) +#define IOC_LOV_GETINFO _IOWR('f', 165, struct lov_user_mds_data *) +#define LL_IOC_FLUSHCTX _IOW ('f', 166, long) +#define LL_IOC_RMTACL _IOW ('f', 167, long) +#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long) +#define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) +#define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) +#define LL_IOC_LLOOP_INFO _IOWR('f', 171, struct lu_fid) +#define LL_IOC_LLOOP_DETACH_BYDEV _IOWR('f', 172, long) +#define LL_IOC_PATH2FID _IOR ('f', 173, long) +#define LL_IOC_GET_CONNECT_FLAGS _IOWR('f', 174, __u64 *) +#define LL_IOC_GET_MDTIDX _IOR ('f', 175, int) + +/* see <lustre_lib.h> for ioctl numbers 177-210 */ + +#define LL_IOC_HSM_STATE_GET _IOR('f', 211, struct hsm_user_state) +#define LL_IOC_HSM_STATE_SET _IOW('f', 212, struct hsm_state_set) +#define LL_IOC_HSM_CT_START _IOW('f', 213, struct lustre_kernelcomm) +#define LL_IOC_HSM_COPY_START _IOW('f', 214, struct hsm_copy *) +#define LL_IOC_HSM_COPY_END _IOW('f', 215, struct hsm_copy *) +#define LL_IOC_HSM_PROGRESS _IOW('f', 216, struct hsm_user_request) +#define LL_IOC_HSM_REQUEST _IOW('f', 217, struct hsm_user_request) +#define LL_IOC_DATA_VERSION _IOR('f', 218, struct ioc_data_version) +#define LL_IOC_LOV_SWAP_LAYOUTS _IOW('f', 219, \ + struct lustre_swap_layouts) +#define LL_IOC_HSM_ACTION _IOR('f', 220, \ + struct hsm_current_action) +/* see <lustre_lib.h> for ioctl numbers 221-232 */ + +#define LL_IOC_LMV_SETSTRIPE _IOWR('f', 240, struct lmv_user_md) +#define LL_IOC_LMV_GETSTRIPE _IOWR('f', 241, struct lmv_user_md) +#define LL_IOC_REMOVE_ENTRY _IOWR('f', 242, __u64) + +#define LL_STATFS_LMV 1 +#define LL_STATFS_LOV 2 +#define LL_STATFS_NODELAY 4 + 
+#define IOC_MDC_TYPE 'i' +#define IOC_MDC_LOOKUP _IOWR(IOC_MDC_TYPE, 20, struct obd_device *) +#define IOC_MDC_GETFILESTRIPE _IOWR(IOC_MDC_TYPE, 21, struct lov_user_md *) +#define IOC_MDC_GETFILEINFO _IOWR(IOC_MDC_TYPE, 22, struct lov_user_mds_data *) +#define LL_IOC_MDC_GETINFO _IOWR(IOC_MDC_TYPE, 23, struct lov_user_mds_data *) + +/* Keep these for backward compatibility. */ +#define LL_IOC_OBD_STATFS IOC_OBD_STATFS +#define IOC_MDC_GETSTRIPE IOC_MDC_GETFILESTRIPE + + +#define MAX_OBD_NAME 128 /* If this changes, a NEW ioctl must be added */ + +/* Hopefully O_LOV_DELAY_CREATE does not conflict with standard O_xxx flags. + * Previously it was defined as 0100000000 and conflicts with FMODE_NONOTIFY + * which was added since kernel 2.6.36, so we redefine it as 020000000. + * To be compatible with old version's statically linked binary, finally we + * define it as (020000000 | 0100000000). + * */ +#define O_LOV_DELAY_CREATE 0120000000 + +#define LL_FILE_IGNORE_LOCK 0x00000001 +#define LL_FILE_GROUP_LOCKED 0x00000002 +#define LL_FILE_READAHEA 0x00000004 +#define LL_FILE_LOCKED_DIRECTIO 0x00000008 /* client-side locks with dio */ +#define LL_FILE_LOCKLESS_IO 0x00000010 /* server-side locks with cio */ +#define LL_FILE_RMTACL 0x00000020 + +#define LOV_USER_MAGIC_V1 0x0BD10BD0 +#define LOV_USER_MAGIC LOV_USER_MAGIC_V1 +#define LOV_USER_MAGIC_JOIN_V1 0x0BD20BD0 +#define LOV_USER_MAGIC_V3 0x0BD30BD0 + +#define LMV_MAGIC_V1 0x0CD10CD0 /*normal stripe lmv magic */ +#define LMV_USER_MAGIC 0x0CD20CD0 /*default lmv magic*/ + +#define LOV_PATTERN_RAID0 0x001 +#define LOV_PATTERN_RAID1 0x002 +#define LOV_PATTERN_FIRST 0x100 + +#define LOV_MAXPOOLNAME 16 +#define LOV_POOLNAMEF "%.16s" + +#define LOV_MIN_STRIPE_BITS 16 /* maximum PAGE_SIZE (ia64), power of 2 */ +#define LOV_MIN_STRIPE_SIZE (1 << LOV_MIN_STRIPE_BITS) +#define LOV_MAX_STRIPE_COUNT_OLD 160 +/* This calculation is crafted so that input of 4096 will result in 160 + * which in turn is equal to old maximal stripe count.
+ * XXX: In fact this is too simpified for now, what it also need is to get + * ea_type argument to clearly know how much space each stripe consumes. + * + * The limit of 12 pages is somewhat arbitrary, but is a reasonably large + * allocation that is sufficient for the current generation of systems. + * + * (max buffer size - lov+rpc header) / sizeof(struct lov_ost_data_v1) */ +#define LOV_MAX_STRIPE_COUNT 2000 /* ((12 * 4096 - 256) / 24) */ +#define LOV_ALL_STRIPES 0xffff /* only valid for directories */ +#define LOV_V1_INSANE_STRIPE_COUNT 65532 /* maximum stripe count bz13933 */ + +#define lov_user_ost_data lov_user_ost_data_v1 +struct lov_user_ost_data_v1 { /* per-stripe data structure */ + struct ost_id l_ost_oi; /* OST object ID */ + __u32 l_ost_gen; /* generation of this OST index */ + __u32 l_ost_idx; /* OST index in LOV */ +} __attribute__((packed)); + +#define lov_user_md lov_user_md_v1 +struct lov_user_md_v1 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V1 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed, __may_alias__)); + +struct lov_user_md_v3 { /* LOV EA user data (host-endian) */ + __u32 lmm_magic; /* magic number = LOV_USER_MAGIC_V3 */ + __u32 lmm_pattern; /* LOV_PATTERN_RAID0, LOV_PATTERN_RAID1 */ + struct ost_id lmm_oi; /* LOV object ID */ + __u32 lmm_stripe_size; /* size of stripe in bytes */ + __u16 lmm_stripe_count; /* num stripes in use for this object */ + union { + __u16 lmm_stripe_offset; /* starting stripe offset in + * lmm_objects, use when 
writing */ + __u16 lmm_layout_gen; /* layout generation number + * used when reading */ + }; + char lmm_pool_name[LOV_MAXPOOLNAME]; /* pool name */ + struct lov_user_ost_data_v1 lmm_objects[0]; /* per-stripe data */ +} __attribute__((packed)); + +/* Compile with -D_LARGEFILE64_SOURCE or -D_GNU_SOURCE (or #define) to + * use this. It is unsafe to #define those values in this header as it + * is possible the application has already #included <sys/stat.h>. */ +#ifdef HAVE_LOV_USER_MDS_DATA +#define lov_user_mds_data lov_user_mds_data_v1 +struct lov_user_mds_data_v1 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v1 lmd_lmm; /* LOV EA V1 user data */ +} __attribute__((packed)); + +struct lov_user_mds_data_v3 { + lstat_t lmd_st; /* MDS stat struct */ + struct lov_user_md_v3 lmd_lmm; /* LOV EA V3 user data */ +} __attribute__((packed)); +#endif + +/* keep this to be the same size as lov_user_ost_data_v1 */ +struct lmv_user_mds_data { + struct lu_fid lum_fid; + __u32 lum_padding; + __u32 lum_mds; +}; + +/* lum_type */ +enum { + LMV_STRIPE_TYPE = 0, + LMV_DEFAULT_TYPE = 1, +}; + +#define lmv_user_md lmv_user_md_v1 +struct lmv_user_md_v1 { + __u32 lum_magic; /* must be the first field */ + __u32 lum_stripe_count; /* dirstripe count */ + __u32 lum_stripe_offset; /* MDT idx for default dirstripe */ + __u32 lum_hash_type; /* Dir stripe policy */ + __u32 lum_type; /* LMV type: default or normal */ + __u32 lum_padding1; + __u32 lum_padding2; + __u32 lum_padding3; + char lum_pool_name[LOV_MAXPOOLNAME]; + struct lmv_user_mds_data lum_objects[0]; +}; + +static inline int lmv_user_md_size(int stripes, int lmm_magic) +{ + return sizeof(struct lmv_user_md) + + stripes * sizeof(struct lmv_user_mds_data); +} + +extern void lustre_swab_lmv_user_md(struct lmv_user_md *lum); + +struct ll_recreate_obj { + __u64 lrc_id; + __u32 lrc_ost_idx; +}; + +struct ll_fid { + __u64 id; /* holds object id */ + __u32 generation; /* holds object generation */ + __u32 f_type; /* holds object 
type or stripe idx when passing it to + * OST for saving into EA. */ +}; + +#define UUID_MAX 40 +struct obd_uuid { + char uuid[UUID_MAX]; +}; + /* Return 1 when the two uuid strings compare equal with strcmp(), 0 otherwise. */ +static inline int obd_uuid_equals(const struct obd_uuid *u1, + const struct obd_uuid *u2) +{ + return strcmp((char *)u1->uuid, (char *)u2->uuid) == 0; +} + /* Return non-zero when the uuid string is empty (first byte is NUL). */ +static inline int obd_uuid_empty(struct obd_uuid *uuid) +{ + return uuid->uuid[0] == '\0'; +} + /* Copy \a tmp into \a uuid, truncating to sizeof(*uuid) - 1 characters; the last byte is always set to NUL. */ +static inline void obd_str2uuid(struct obd_uuid *uuid, const char *tmp) +{ + strncpy((char *)uuid->uuid, tmp, sizeof(*uuid)); + uuid->uuid[sizeof(*uuid) - 1] = '\0'; +} + +/* For printf's only, make sure uuid is terminated. When the last byte of the uuid is not NUL, a NUL-terminated copy held in a function-local static buffer is returned instead of the uuid itself. */ +static inline char *obd_uuid2str(struct obd_uuid *uuid) +{ + if (uuid->uuid[sizeof(*uuid) - 1] != '\0') { + /* Obviously not safe, but for printfs, no real harm done... + we're always null-terminated, even in a race. */ + static char temp[sizeof(*uuid)]; + memcpy(temp, uuid->uuid, sizeof(*uuid) - 1); + temp[sizeof(*uuid) - 1] = '\0'; + return temp; + } + return (char *)(uuid->uuid); +} + +/* Extract fsname from uuid (or target name) of a target + e.g. (myfs-OST0007_UUID -> myfs) + see also deuuidify. The copy is cut at the last '-' found in it. buflen must be at least 1 because buf[buflen - 1] is written unconditionally. */ +static inline void obd_uuid2fsname(char *buf, char *uuid, int buflen) +{ + char *p; + + strncpy(buf, uuid, buflen - 1); + buf[buflen - 1] = '\0'; + p = strrchr(buf, '-'); + if (p) + *p = '\0'; +} + +/* printf display format + e.g. printf("file FID is "DFID"\n", PFID(fid)); */ +#define DFID_NOBRACE LPX64":0x%x:0x%x" +#define DFID "["DFID_NOBRACE"]" +#define PFID(fid) \ + (fid)->f_seq, \ + (fid)->f_oid, \ + (fid)->f_ver + +/* scanf input parse format -- strip '[' first. + e.g. 
sscanf(fidstr, SFID, RFID(&fid)); */ +/* #define SFID "0x"LPX64i":0x"LPSZX":0x"LPSZX"" +liblustreapi.c:2893: warning: format '%lx' expects type 'long unsigned int *', but argument 4 has type 'unsigned int *' +liblustreapi.c:2893: warning: format '%lx' expects type 'long unsigned int *', but argument 5 has type 'unsigned int *' +*/ +#define SFID "0x"LPX64i":0x%x:0x%x" +#define RFID(fid) \ + &((fid)->f_seq), \ + &((fid)->f_oid), \ + &((fid)->f_ver) + + +/********* Quotas **********/ + +/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */ +#define LUSTRE_Q_QUOTAON 0x800002 /* turn quotas on */ +#define LUSTRE_Q_QUOTAOFF 0x800003 /* turn quotas off */ +#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ +#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */ +#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ +#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ +/* lustre-specific control commands */ +#define LUSTRE_Q_INVALIDATE 0x80000b /* invalidate quota data */ +#define LUSTRE_Q_FINVALIDATE 0x80000c /* invalidate filter quota data */ + +#define UGQUOTA 2 /* set both USRQUOTA and GRPQUOTA */ + +struct if_quotacheck { + char obd_type[16]; + struct obd_uuid obd_uuid; +}; + +#define IDENTITY_DOWNCALL_MAGIC 0x6d6dd629 + +/* permission: number of entries in identity_downcall_data::idd_perms */ +#define N_PERMS_MAX 64 + +struct perm_downcall_data { + __u64 pdd_nid; + __u32 pdd_perm; + __u32 pdd_padding; +}; + +struct identity_downcall_data { + __u32 idd_magic; + __u32 idd_err; + __u32 idd_uid; + __u32 idd_gid; + __u32 idd_nperms; + __u32 idd_ngroups; + struct perm_downcall_data idd_perms[N_PERMS_MAX]; + __u32 idd_groups[0]; /* zero-length trailing array; presumably idd_ngroups entries follow -- TODO confirm */ +}; + +/* for non-mapped uid/gid */ +#define NOBODY_UID 99 +#define NOBODY_GID 99 + +#define INVALID_ID (-1) + +enum { + RMT_LSETFACL = 1, + RMT_LGETFACL = 2, + RMT_RSETFACL = 3, + RMT_RGETFACL = 4 +}; + /* Fallback definitions of the QIF_* flags, only provided when NEED_QUOTA_DEFS is defined and QIF_BLIMITS is not already defined. */ +#ifdef NEED_QUOTA_DEFS +#ifndef QIF_BLIMITS +#define QIF_BLIMITS 1 +#define QIF_SPACE 2 +#define QIF_ILIMITS 4 
+#define QIF_INODES 8 +#define QIF_BTIME 16 +#define QIF_ITIME 32 +#define QIF_LIMITS (QIF_BLIMITS | QIF_ILIMITS) +#define QIF_USAGE (QIF_SPACE | QIF_INODES) +#define QIF_TIMES (QIF_BTIME | QIF_ITIME) +#define QIF_ALL (QIF_LIMITS | QIF_USAGE | QIF_TIMES) +#endif + +#endif /* NEED_QUOTA_DEFS */ + +/* lustre volatile file support + * file name header: ".^L^S^T^R:VOLATILE" (14 characters, see LUSTRE_VOLATILE_HDR_LEN) + */ +#define LUSTRE_VOLATILE_HDR ".\x0c\x13\x14\x12:VOLATILE" +#define LUSTRE_VOLATILE_HDR_LEN 14 +/* hdr + MDT index, printed as at least four upper-case hex digits between colons */ +#define LUSTRE_VOLATILE_IDX LUSTRE_VOLATILE_HDR":%.4X:" + +typedef enum lustre_quota_version { + LUSTRE_QUOTA_V2 = 1 +} lustre_quota_version_t; + +/* XXX: same as if_dqinfo struct in kernel */ +struct obd_dqinfo { + __u64 dqi_bgrace; + __u64 dqi_igrace; + __u32 dqi_flags; + __u32 dqi_valid; +}; + +/* XXX: same as if_dqblk struct in kernel, plus one padding */ +struct obd_dqblk { + __u64 dqb_bhardlimit; + __u64 dqb_bsoftlimit; + __u64 dqb_curspace; + __u64 dqb_ihardlimit; + __u64 dqb_isoftlimit; + __u64 dqb_curinodes; + __u64 dqb_btime; + __u64 dqb_itime; + __u32 dqb_valid; + __u32 dqb_padding; +}; + +enum { + QC_GENERAL = 0, + QC_MDTIDX = 1, + QC_OSTIDX = 2, + QC_UUID = 3 +}; + +struct if_quotactl { + __u32 qc_cmd; + __u32 qc_type; + __u32 qc_id; + __u32 qc_stat; + __u32 qc_valid; + __u32 qc_idx; + struct obd_dqinfo qc_dqinfo; + struct obd_dqblk qc_dqblk; + char obd_type[16]; + struct obd_uuid obd_uuid; +}; + +/* swap layout flags */ +#define SWAP_LAYOUTS_CHECK_DV1 (1 << 0) +#define SWAP_LAYOUTS_CHECK_DV2 (1 << 1) +#define SWAP_LAYOUTS_KEEP_MTIME (1 << 2) +#define SWAP_LAYOUTS_KEEP_ATIME (1 << 3) +struct lustre_swap_layouts { + __u64 sl_flags; + __u32 sl_fd; + __u32 sl_gid; + __u64 sl_dv1; + __u64 sl_dv2; +}; + + +/********* Changelogs **********/ +/** Changelog record types */ +enum changelog_rec_type { + CL_MARK = 0, + CL_CREATE = 1, /* namespace */ + CL_MKDIR = 2, /* namespace */ + CL_HARDLINK = 3, /* namespace */ + CL_SOFTLINK = 4, /* namespace */ + CL_MKNOD = 5, /* namespace */ 
+ CL_UNLINK = 6, /* namespace */ + CL_RMDIR = 7, /* namespace */ + CL_RENAME = 8, /* namespace */ + CL_EXT = 9, /* namespace extended record (2nd half of rename) */ + CL_OPEN = 10, /* not currently used */ + CL_CLOSE = 11, /* may be written to log only with mtime change */ + CL_LAYOUT = 12, /* file layout/striping modified */ + CL_TRUNC = 13, + CL_SETATTR = 14, + CL_XATTR = 15, + CL_HSM = 16, /* HSM specific events, see flags */ + CL_MTIME = 17, /* Precedence: setattr > mtime > ctime > atime */ + CL_CTIME = 18, + CL_ATIME = 19, + CL_LAST /* one past the last valid type; used as the bound in changelog_type2str() */ +}; + /* Map a changelog_rec_type value to its short printable name. Returns NULL when \a type is outside [0, CL_LAST). The table below has one entry per enum value, in enum order. */ +static inline const char *changelog_type2str(int type) { + static const char *changelog_str[] = { + "MARK", "CREAT", "MKDIR", "HLINK", "SLINK", "MKNOD", "UNLNK", + "RMDIR", "RENME", "RNMTO", "OPEN", "CLOSE", "LYOUT", "TRUNC", + "SATTR", "XATTR", "HSM", "MTIME", "CTIME", "ATIME", + }; + + if (type >= 0 && type < CL_LAST) + return changelog_str[type]; + return NULL; +} + +/* per-record flags: the low CLF_FLAGSHIFT bits hold flags, the bits above hold the record version */ +#define CLF_VERSION 0x1000 +#define CLF_EXT_VERSION 0x2000 +#define CLF_FLAGSHIFT 12 +#define CLF_FLAGMASK ((1U << CLF_FLAGSHIFT) - 1) +#define CLF_VERMASK (~CLF_FLAGMASK) +/* Anything under the flagmask may be per-type (if desired) */ +/* Flags for unlink */ +#define CLF_UNLINK_LAST 0x0001 /* Unlink of last hardlink */ +#define CLF_UNLINK_HSM_EXISTS 0x0002 /* File has something in HSM */ + /* HSM cleaning needed */ +/* Flags for rename */ +#define CLF_RENAME_LAST 0x0001 /* rename unlink last hardlink of target */ + +/* Flags for HSM */ +/* 12b used (from high weight to low weight): + * 2b for flags + * 3b for event + * 7b for error code + */ +#define CLF_HSM_ERR_L 0 /* HSM return code, 7 bits */ +#define CLF_HSM_ERR_H 6 +#define CLF_HSM_EVENT_L 7 /* HSM event, 3 bits, see enum hsm_event */ +#define CLF_HSM_EVENT_H 9 +#define CLF_HSM_FLAG_L 10 /* HSM flags, 2 bits, 1 used, 1 spare */ +#define CLF_HSM_FLAG_H 11 +#define CLF_HSM_SPARE_L 12 /* 4 spare bits */ +#define CLF_HSM_SPARE_H 15 +#define CLF_HSM_LAST 15 + +/* Remove bits higher 
than _h, then extract the value + * between _h and _l by shifting lower weigth to bit 0. */ +#define CLF_GET_BITS(_b, _h, _l) (((_b << (CLF_HSM_LAST - _h)) & 0xFFFF) \ + >> (CLF_HSM_LAST - _h + _l)) + +#define CLF_HSM_SUCCESS 0x00 +#define CLF_HSM_MAXERROR 0x7E +#define CLF_HSM_ERROVERFLOW 0x7F + +#define CLF_HSM_DIRTY 1 /* file is dirty after HSM request end */ + +/* 3 bits field => 8 values allowed */ +enum hsm_event { + HE_ARCHIVE = 0, + HE_RESTORE = 1, + HE_CANCEL = 2, + HE_RELEASE = 3, + HE_REMOVE = 4, + HE_STATE = 5, + HE_SPARE1 = 6, + HE_SPARE2 = 7, +}; + +static inline enum hsm_event hsm_get_cl_event(__u16 flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_EVENT_H, CLF_HSM_EVENT_L); +} + +static inline void hsm_set_cl_event(int *flags, enum hsm_event he) +{ + *flags |= (he << CLF_HSM_EVENT_L); +} + +static inline __u16 hsm_get_cl_flags(int flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_FLAG_H, CLF_HSM_FLAG_L); +} + +static inline void hsm_set_cl_flags(int *flags, int bits) +{ + *flags |= (bits << CLF_HSM_FLAG_L); +} + +static inline int hsm_get_cl_error(int flags) +{ + return CLF_GET_BITS(flags, CLF_HSM_ERR_H, CLF_HSM_ERR_L); +} + +static inline void hsm_set_cl_error(int *flags, int error) +{ + *flags |= (error << CLF_HSM_ERR_L); +} + +#define CR_MAXSIZE cfs_size_round(2*NAME_MAX + 1 + sizeof(struct changelog_rec)) + +struct changelog_rec { + __u16 cr_namelen; + __u16 cr_flags; /**< (flags&CLF_FLAGMASK)|CLF_VERSION */ + __u32 cr_type; /**< \a changelog_rec_type */ + __u64 cr_index; /**< changelog record number */ + __u64 cr_prev; /**< last index for this target fid */ + __u64 cr_time; + union { + lustre_fid cr_tfid; /**< target fid */ + __u32 cr_markerflags; /**< CL_MARK flags */ + }; + lustre_fid cr_pfid; /**< parent fid */ + char cr_name[0]; /**< last element */ +} __attribute__((packed)); + +/* changelog_ext_rec is 2*sizeof(lu_fid) bigger than changelog_rec, to save + * space, only rename uses changelog_ext_rec, while others use changelog_rec to + * store 
records. + */ +struct changelog_ext_rec { + __u16 cr_namelen; + __u16 cr_flags; /**< (flags & CLF_FLAGMASK) | + CLF_EXT_VERSION */ + __u32 cr_type; /**< \a changelog_rec_type */ + __u64 cr_index; /**< changelog record number */ + __u64 cr_prev; /**< last index for this target fid */ + __u64 cr_time; + union { + lustre_fid cr_tfid; /**< target fid */ + __u32 cr_markerflags; /**< CL_MARK flags */ + }; + lustre_fid cr_pfid; /**< target parent fid */ + lustre_fid cr_sfid; /**< source fid, or zero */ + lustre_fid cr_spfid; /**< source parent fid, or zero */ + char cr_name[0]; /**< last element */ +} __attribute__((packed)); + /* True when the version bits of (rec)->cr_flags equal CLF_EXT_VERSION. */ +#define CHANGELOG_REC_EXTENDED(rec) \ + (((rec)->cr_flags & CLF_VERMASK) == CLF_EXT_VERSION) + /* Size of the fixed part of \a rec (the name is not counted): sizeof(struct changelog_ext_rec) for an extended record, sizeof(struct changelog_rec) otherwise. */ +static inline int changelog_rec_size(struct changelog_rec *rec) +{ + return CHANGELOG_REC_EXTENDED(rec) ? sizeof(struct changelog_ext_rec): + sizeof(*rec); +} + /* Return the cr_name member matching the actual layout of \a rec (extended or not). */ +static inline char *changelog_rec_name(struct changelog_rec *rec) +{ + return CHANGELOG_REC_EXTENDED(rec) ? + ((struct changelog_ext_rec *)rec)->cr_name: rec->cr_name; +} + /* Length of what follows the first NUL-terminated string in cr_name: cr_namelen minus that string and its NUL. */ +static inline int changelog_rec_snamelen(struct changelog_ext_rec *rec) +{ + return rec->cr_namelen - strlen(rec->cr_name) - 1; +} + /* Pointer to the byte just after the NUL that ends the first string in cr_name. */ +static inline char *changelog_rec_sname(struct changelog_ext_rec *rec) +{ + return rec->cr_name + strlen(rec->cr_name) + 1; +} + +struct ioc_changelog { + __u64 icc_recno; + __u32 icc_mdtindex; + __u32 icc_id; + __u32 icc_flags; +}; + +enum changelog_message_type { + CL_RECORD = 10, /* message is a changelog_rec */ + CL_EOF = 11, /* at end of current changelog */ +}; + +/********* Misc **********/ + +struct ioc_data_version { + __u64 idv_version; + __u64 idv_flags; /* See LL_DV_xxx */ +}; +#define LL_DV_NOFLUSH 0x01 /* Do not take READ EXTENT LOCK before sampling + version. Dirty caches are left unchanged. 
*/ + /* Fallback offsetof(), only defined when the including code has not provided one. */ +#ifndef offsetof +# define offsetof(typ,memb) ((unsigned long)((char *)&(((typ *)0)->memb))) +#endif + +#define dot_lustre_name ".lustre" + + +/********* HSM **********/ + +/** HSM per-file state + * See HSM_FLAGS_MASK below. + */ +enum hsm_states { + HS_EXISTS = 0x00000001, + HS_DIRTY = 0x00000002, + HS_RELEASED = 0x00000004, + HS_ARCHIVED = 0x00000008, + HS_NORELEASE = 0x00000010, + HS_NOARCHIVE = 0x00000020, + HS_LOST = 0x00000040, +}; + +/* HSM user-settable flags. */ +#define HSM_USER_MASK (HS_NORELEASE | HS_NOARCHIVE | HS_DIRTY) + +/* Other HSM flags. */ +#define HSM_STATUS_MASK (HS_EXISTS | HS_LOST | HS_RELEASED | HS_ARCHIVED) + +/* + * All HSM-related possible flags that could be applied to a file. + * This should be kept in sync with hsm_states. + */ +#define HSM_FLAGS_MASK (HSM_USER_MASK | HSM_STATUS_MASK) + +/** + * HSM request progress state + */ +enum hsm_progress_states { + HPS_WAITING = 1, + HPS_RUNNING = 2, + HPS_DONE = 3, +}; +#define HPS_NONE 0 + /* Return a lower-case name for \a s, or "unknown" for any value without its own case (HPS_NONE included). */ +static inline char *hsm_progress_state2name(enum hsm_progress_states s) +{ + switch (s) { + case HPS_WAITING: return "waiting"; + case HPS_RUNNING: return "running"; + case HPS_DONE: return "done"; + default: return "unknown"; + } +} + +struct hsm_extent { + __u64 offset; + __u64 length; +} __attribute__((packed)); + +/** + * Current HSM states of a Lustre file. + * + * The purpose of this structure is mainly to be sent to user space. It describes the + * current HSM flags and in-progress action. + */ +struct hsm_user_state { + /** Current HSM states, from enum hsm_states. */ + __u32 hus_states; + __u32 hus_archive_id; + /** The current undergoing action, if there is one */ + __u32 hus_in_progress_state; + __u32 hus_in_progress_action; + struct hsm_extent hus_in_progress_location; + char hus_extended_info[]; +}; + +struct hsm_state_set_ioc { + struct lu_fid hssi_fid; + __u64 hssi_setmask; + __u64 hssi_clearmask; +}; + +/* + * This structure describes the current in-progress action for a file. 
+ * It is returned to user space and sent over the wire. + */ +struct hsm_current_action { + /** The current undergoing action, if there is one */ + /* state is one of hsm_progress_states */ + __u32 hca_state; + /* action is one of hsm_user_action */ + __u32 hca_action; + struct hsm_extent hca_location; +}; + +/***** HSM user requests ******/ +/* User-generated (lfs/ioctl) request types */ +enum hsm_user_action { + HUA_NONE = 1, /* no action (noop) */ + HUA_ARCHIVE = 10, /* copy to hsm */ + HUA_RESTORE = 11, /* prestage */ + HUA_RELEASE = 12, /* drop ost objects */ + HUA_REMOVE = 13, /* remove from archive */ + HUA_CANCEL = 14 /* cancel a request */ +}; + /* Return an upper-case name for \a a, or "UNKNOWN" for any value that is not an hsm_user_action enumerator. */ +static inline char *hsm_user_action2name(enum hsm_user_action a) +{ + switch (a) { + case HUA_NONE: return "NOOP"; + case HUA_ARCHIVE: return "ARCHIVE"; + case HUA_RESTORE: return "RESTORE"; + case HUA_RELEASE: return "RELEASE"; + case HUA_REMOVE: return "REMOVE"; + case HUA_CANCEL: return "CANCEL"; + default: return "UNKNOWN"; + } +} + +/* + * List of hr_flags (bit field) + */ +#define HSM_FORCE_ACTION 0x0001 +/* used by CT, cannot be set by user */ +#define HSM_GHOST_COPY 0x0002 + +/** + * Contains all the fixed part of struct hsm_user_request. 
+ * + */ +struct hsm_request { + __u32 hr_action; /* enum hsm_user_action */ + __u32 hr_archive_id; /* archive id, used only with HUA_ARCHIVE */ + __u64 hr_flags; /* request flags */ + __u32 hr_itemcount; /* item count in hur_user_item vector */ + __u32 hr_data_len; /* added by hur_len() on top of the item vector; the data itself is located by hur_data() */ +}; + +struct hsm_user_item { + lustre_fid hui_fid; + struct hsm_extent hui_extent; +} __attribute__((packed)); + +struct hsm_user_request { + struct hsm_request hur_request; + struct hsm_user_item hur_user_item[0]; + /* extra data blob at end of struct (after all + * hur_user_items), only use helpers to access it + */ +} __attribute__((packed)); + +/** Return pointer to data field in a hsm user request, i.e. the address just past the hr_itemcount entries of hur_user_item */ +static inline void *hur_data(struct hsm_user_request *hur) +{ + return &(hur->hur_user_item[hur->hur_request.hr_itemcount]); +} + +/** Compute the current length of the provided hsm_user_request: offset of the end of the item vector plus hr_data_len. */ +static inline int hur_len(struct hsm_user_request *hur) +{ + return offsetof(struct hsm_user_request, + hur_user_item[hur->hur_request.hr_itemcount]) + + hur->hur_request.hr_data_len; +} + +/****** HSM RPCs to copytool *****/ +/* Message types the copytool may receive */ +enum hsm_message_type { + HMT_ACTION_LIST = 100, /* message is a hsm_action_list */ +}; + +/* Actions the copytool may be instructed to take for a given action_item */ +enum hsm_copytool_action { + HSMA_NONE = 10, /* no action */ + HSMA_ARCHIVE = 20, /* arbitrary offset */ + HSMA_RESTORE = 21, + HSMA_REMOVE = 22, + HSMA_CANCEL = 23 +}; + /* Return an upper-case name for \a a, or "UNKNOWN" for any value that is not an hsm_copytool_action enumerator. */ +static inline char *hsm_copytool_action2name(enum hsm_copytool_action a) +{ + switch (a) { + case HSMA_NONE: return "NOOP"; + case HSMA_ARCHIVE: return "ARCHIVE"; + case HSMA_RESTORE: return "RESTORE"; + case HSMA_REMOVE: return "REMOVE"; + case HSMA_CANCEL: return "CANCEL"; + default: return "UNKNOWN"; + } +} + +/* Copytool item action description */ +struct hsm_action_item { + __u32 hai_len; /* valid size of this struct */ + __u32 hai_action; /* hsm_copytool_action, but use known size */ + lustre_fid 
hai_fid; /* Lustre FID to operated on */ + lustre_fid hai_dfid; /* fid used for data access */ + struct hsm_extent hai_extent; /* byte range to operate on */ + __u64 hai_cookie; /* action cookie from coordinator */ + __u64 hai_gid; /* grouplock id */ + char hai_data[0]; /* variable length */ +} __attribute__((packed)); + +/* + * helper function which print in hexa the first bytes of + * hai opaque field + * \param hai [IN] record to print + * \param buffer [OUT] output buffer + * \param len [IN] max buffer len + * \retval buffer + */ +static inline char *hai_dump_data_field(struct hsm_action_item *hai, + char *buffer, int len) +{ + int i, sz, data_len; + char *ptr; + + ptr = buffer; + sz = len; + data_len = hai->hai_len - sizeof(*hai); + for (i = 0 ; (i < data_len) && (sz > 0) ; i++) + { + int cnt; + + cnt = snprintf(ptr, sz, "%.2X", + (unsigned char)hai->hai_data[i]); + ptr += cnt; + sz -= cnt; + } + *ptr = '\0'; + return buffer; +} + +/* Copytool action list */ +#define HAL_VERSION 1 +#define HAL_MAXSIZE LNET_MTU /* bytes, used in userspace only */ +struct hsm_action_list { + __u32 hal_version; + __u32 hal_count; /* number of hai's to follow */ + __u64 hal_compound_id; /* returned by coordinator */ + __u64 hal_flags; + __u32 hal_archive_id; /* which archive backend */ + __u32 padding1; + char hal_fsname[0]; /* null-terminated */ + /* struct hsm_action_item[hal_count] follows, aligned on 8-byte + boundaries. 
See hai_zero */ +} __attribute__((packed)); + /* Round \a val up to the next multiple of 8; only defined here when HAVE_CFS_SIZE_ROUND is not already set. */ +#ifndef HAVE_CFS_SIZE_ROUND +static inline int cfs_size_round (int val) +{ + return (val + 7) & (~0x7); +} +#define HAVE_CFS_SIZE_ROUND +#endif + +/* Return pointer to first hai in action list. NOTE(review): the offset is cfs_size_round(strlen(hal_fsname)), which does not count the terminating NUL; when the name length is a multiple of 8 no byte is left for the NUL before the first item -- confirm against the code that builds the list before changing, since hal_size() uses the same rounding. */ +static inline struct hsm_action_item * hai_zero(struct hsm_action_list *hal) +{ + return (struct hsm_action_item *)(hal->hal_fsname + + cfs_size_round(strlen(hal-> \ + hal_fsname))); +} +/* Return pointer to next hai: \a hai advanced by its hai_len rounded up to a multiple of 8 */ +static inline struct hsm_action_item * hai_next(struct hsm_action_item *hai) +{ + return (struct hsm_action_item *)((char *)hai + + cfs_size_round(hai->hai_len)); +} + +/* Return size of an hsm_action_list: the fixed header, the rounded fsname length and the rounded hai_len of each of the hal_count items, which are walked with hai_zero()/hai_next() */ +static inline int hal_size(struct hsm_action_list *hal) +{ + int i, sz; + struct hsm_action_item *hai; + + sz = sizeof(*hal) + cfs_size_round(strlen(hal->hal_fsname)); + hai = hai_zero(hal); + for (i = 0 ; i < hal->hal_count ; i++) { + sz += cfs_size_round(hai->hai_len); + hai = hai_next(hai); + } + return(sz); +} + +/* Copytool progress reporting */ +#define HP_FLAG_COMPLETED 0x01 +#define HP_FLAG_RETRY 0x02 + +struct hsm_progress { + lustre_fid hp_fid; + __u64 hp_cookie; + struct hsm_extent hp_extent; + __u16 hp_flags; + __u16 hp_errval; /* positive val */ + __u32 padding; +}; + +/** + * Used by copytools during any hsm request they handle. + * This structure is initialized by llapi_hsm_copy_start() + * which is a helper over the ioctl() interface + * Store Lustre, internal use only, data. 
+ */ +struct hsm_copy { + __u64 hc_data_version; + __u16 hc_flags; + __u16 hc_errval; /* positive val */ + __u32 padding; + struct hsm_action_item hc_hai; /* struct hsm_action_item ends with the zero-length hai_data[] array, so this has to stay the last member */ +}; + +/** @} lustreuser */ + +#endif /* _LUSTRE_USER_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre/lustreapi.h b/drivers/staging/lustre/lustre/include/lustre/lustreapi.h new file mode 100644 index 000000000000..63da66506639 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre/lustreapi.h @@ -0,0 +1,310 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Whamcloud, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _LUSTREAPI_H_ +#define _LUSTREAPI_H_ + +/** \defgroup llapi llapi + * + * @{ + */ + +#include <lustre/lustre_user.h> + /* Callback type taken by llapi_target_iterate() and matched by llapi_ping_target(). */ +typedef void (*llapi_cb_t)(char *obd_type_name, char *obd_name, char *obd_uuid, void *args); + +/* lustreapi message severity level */ +enum llapi_message_level { + LLAPI_MSG_OFF = 0, + LLAPI_MSG_FATAL = 1, + LLAPI_MSG_ERROR = 2, + LLAPI_MSG_WARN = 3, + LLAPI_MSG_NORMAL = 4, + LLAPI_MSG_INFO = 5, + LLAPI_MSG_DEBUG = 6, + LLAPI_MSG_MAX +}; + +/* the bottom three bits reserved for llapi_message_level */ +#define LLAPI_MSG_MASK 0x00000007 +#define LLAPI_MSG_NO_ERRNO 0x00000010 + +extern void llapi_msg_set_level(int level); +extern void llapi_error(int level, int rc, char *fmt, ...); /* Same as llapi_error() with LLAPI_MSG_NO_ERRNO or'ed into the level and rc passed as 0. */ +#define llapi_err_noerrno(level, fmt, a...) \ + llapi_error((level) | LLAPI_MSG_NO_ERRNO, 0, fmt, ## a) +extern void llapi_printf(int level, char *fmt, ...); +extern int llapi_file_create(const char *name, unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern); +extern int llapi_file_open(const char *name, int flags, int mode, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern); +extern int llapi_file_create_pool(const char *name, + unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern, char *pool_name); +extern int llapi_file_open_pool(const char *name, int flags, int mode, + unsigned long long stripe_size, + int stripe_offset, int stripe_count, + int stripe_pattern, char *pool_name); +extern int llapi_poollist(const char *name); +extern int llapi_get_poollist(const char *name, char **poollist, int list_size, + char *buffer, int buffer_size); +extern int llapi_get_poolmembers(const char *poolname, char **members, + int list_size, char *buffer, int buffer_size); +extern int llapi_file_get_stripe(const char *path, struct lov_user_md *lum); +#define HAVE_LLAPI_FILE_LOOKUP +extern int llapi_file_lookup(int dirfd, const char *name); + +#define 
VERBOSE_COUNT 0x1 +#define VERBOSE_SIZE 0x2 +#define VERBOSE_OFFSET 0x4 +#define VERBOSE_POOL 0x8 +#define VERBOSE_DETAIL 0x10 +#define VERBOSE_OBJID 0x20 +#define VERBOSE_GENERATION 0x40 +#define VERBOSE_MDTINDEX 0x80 +#define VERBOSE_ALL (VERBOSE_COUNT | VERBOSE_SIZE | VERBOSE_OFFSET | \ + VERBOSE_POOL | VERBOSE_OBJID | VERBOSE_GENERATION) + /* Note that VERBOSE_DETAIL and VERBOSE_MDTINDEX are not part of VERBOSE_ALL. */ +struct find_param { + unsigned int maxdepth; + time_t atime; + time_t mtime; + time_t ctime; + int asign; /* cannot be bitfields due to using pointers to */ + int csign; /* access them during argument parsing. */ + int msign; + int type; + int size_sign:2, /* these need to be signed values */ + stripesize_sign:2, + stripecount_sign:2; + unsigned long long size; + unsigned long long size_units; + uid_t uid; + gid_t gid; + + unsigned long zeroend:1, + recursive:1, + exclude_pattern:1, + exclude_type:1, + exclude_obd:1, + exclude_mdt:1, + exclude_gid:1, + exclude_uid:1, + check_gid:1, /* group ID */ + check_uid:1, /* user ID */ + check_pool:1, /* LOV pool name */ + check_size:1, /* file size */ + exclude_pool:1, + exclude_size:1, + exclude_atime:1, + exclude_mtime:1, + exclude_ctime:1, + get_lmv:1, /* get MDT list from LMV */ + raw:1, /* do not fill in defaults */ + check_stripesize:1, /* LOV stripe size */ + exclude_stripesize:1, + check_stripecount:1, /* LOV stripe count */ + exclude_stripecount:1; + + int verbose; + int quiet; + + /* regular expression */ + char *pattern; + + char *print_fmt; + + struct obd_uuid *obduuid; + int num_obds; + int num_alloc_obds; + int obdindex; + int *obdindexes; + + struct obd_uuid *mdtuuid; + int num_mdts; + int num_alloc_mdts; + int mdtindex; + int *mdtindexes; + int file_mdtindex; + + int lumlen; + struct lov_user_mds_data *lmd; + + char poolname[LOV_MAXPOOLNAME + 1]; /* one byte more than LOV_MAXPOOLNAME */ + + int fp_lmv_count; + struct lmv_user_md *fp_lmv_md; + + unsigned long long stripesize; + unsigned long long stripesize_units; + unsigned long long stripecount; + + /* In-process parameters. 
*/ + unsigned long got_uuids:1, + obds_printed:1, + have_fileinfo:1; /* file attrs and LOV xattr */ + unsigned int depth; + dev_t st_dev; +}; + +extern int llapi_ostlist(char *path, struct find_param *param); +extern int llapi_uuid_match(char *real_uuid, char *search_uuid); +extern int llapi_getstripe(char *path, struct find_param *param); +extern int llapi_find(char *path, struct find_param *param); + +extern int llapi_file_fget_mdtidx(int fd, int *mdtidx); +extern int llapi_dir_create_pool(const char *name, int flags, int stripe_offset, + int stripe_count, int stripe_pattern, + char *poolname); /* declared without the 'extern' keyword used by the neighbouring prototypes; for a function declaration the linkage is the same */ +int llapi_direntry_remove(char *dname); +extern int llapi_obd_statfs(char *path, __u32 type, __u32 index, + struct obd_statfs *stat_buf, + struct obd_uuid *uuid_buf); +extern int llapi_ping(char *obd_type, char *obd_name); +extern int llapi_target_check(int num_types, char **obd_types, char *dir); +extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid); +extern int llapi_file_get_lmv_uuid(const char *path, struct obd_uuid *lmv_uuid); +extern int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid); +extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); +extern int llapi_lmv_get_uuids(int fd, struct obd_uuid *uuidp, int *mdt_count); +extern int llapi_is_lustre_mnttype(const char *type); +extern int llapi_search_ost(char *fsname, char *poolname, char *ostname); +extern int llapi_get_obd_count(char *mnt, int *count, int is_mdt); +extern int parse_size(char *optarg, unsigned long long *size, + unsigned long long *size_units, int bytes_spec); +extern int llapi_search_mounts(const char *pathname, int index, + char *mntdir, char *fsname); +extern int llapi_search_fsname(const char *pathname, char *fsname); +extern int llapi_getname(const char *path, char *buf, size_t size); + /* Has the same parameter list as llapi_cb_t. */ +extern void llapi_ping_target(char *obd_type, char *obd_name, + char *obd_uuid, void *args); + +extern int llapi_search_rootpath(char *pathname, 
extern int llapi_create_volatile_idx(char *directory, int idx, int mode);

/*
 * Shorthand for llapi_create_volatile_idx() that always passes -1 as the
 * index argument; the result of that call is returned unchanged.
 */
static inline int llapi_create_volatile(char *directory, int mode)
{
	int rc;

	rc = llapi_create_volatile_idx(directory, -1, mode);
	return rc;
}
priv is private state, managed internally + by these functions */ +#define CHANGELOG_FLAG_FOLLOW 0x01 /* Not yet implemented */ +#define CHANGELOG_FLAG_BLOCK 0x02 /* Blocking IO makes sense in case of + slow user parsing of the records, but it also prevents us from cleaning + up if the records are not consumed. */ + +/* Records received are in extentded format now, though most of them are still + * written in disk in changelog_rec format (to save space and time), it's + * converted to extented format in the lustre api to ease changelog analysis. */ +#define HAVE_CHANGELOG_EXTEND_REC 1 + +extern int llapi_changelog_start(void **priv, int flags, const char *mdtname, + long long startrec); +extern int llapi_changelog_fini(void **priv); +extern int llapi_changelog_recv(void *priv, struct changelog_ext_rec **rech); +extern int llapi_changelog_free(struct changelog_ext_rec **rech); +/* Allow records up to endrec to be destroyed; requires registered id. */ +extern int llapi_changelog_clear(const char *mdtname, const char *idstr, + long long endrec); + +/* HSM copytool interface. 
+ * priv is private state, managed internally by these functions + */ +struct hsm_copytool_private; +extern int llapi_hsm_copytool_start(struct hsm_copytool_private **priv, + char *fsname, int flags, + int archive_count, int *archives); +extern int llapi_hsm_copytool_fini(struct hsm_copytool_private **priv); +extern int llapi_hsm_copytool_recv(struct hsm_copytool_private *priv, + struct hsm_action_list **hal, int *msgsize); +extern int llapi_hsm_copytool_free(struct hsm_action_list **hal); +extern int llapi_hsm_copy_start(char *mnt, struct hsm_copy *copy, + const struct hsm_action_item *hai); +extern int llapi_hsm_copy_end(char *mnt, struct hsm_copy *copy, + const struct hsm_progress *hp); +extern int llapi_hsm_progress(char *mnt, struct hsm_progress *hp); +extern int llapi_hsm_import(const char *dst, int archive, struct stat *st, + unsigned long long stripe_size, int stripe_offset, + int stripe_count, int stripe_pattern, + char *pool_name, lustre_fid *newfid); + +/* HSM user interface */ +extern struct hsm_user_request *llapi_hsm_user_request_alloc(int itemcount, + int data_len); +extern int llapi_hsm_request(char *mnt, struct hsm_user_request *request); +extern int llapi_hsm_current_action(const char *path, + struct hsm_current_action *hca); +/** @} llapi */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_acl.h b/drivers/staging/lustre/lustre/include/lustre_acl.h new file mode 100644 index 000000000000..5cfb87b180c3 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_acl.h @@ -0,0 +1,42 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_acl.h + */ + +#ifndef _LUSTRE_ACL_H +#define _LUSTRE_ACL_H + +#include <linux/lustre_acl.h> + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_capa.h b/drivers/staging/lustre/lustre/include/lustre_capa.h new file mode 100644 index 000000000000..d77bffc0b59d --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_capa.h @@ -0,0 +1,305 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_capa.h + * + * Author: Lai Siyao <lsy@clusterfs.com> + */ + +#ifndef __LINUX_CAPA_H_ +#define __LINUX_CAPA_H_ + +/** \defgroup capa capa + * + * @{ + */ + +/* + * capability + */ +#include <linux/crypto.h> +#include <lustre/lustre_idl.h> + +#define CAPA_TIMEOUT 1800 /* sec, == 30 min */ +#define CAPA_KEY_TIMEOUT (24 * 60 * 60) /* sec, == 1 days */ + +struct capa_hmac_alg { + const char *ha_name; + int ha_len; + int ha_keylen; +}; + +#define DEF_CAPA_HMAC_ALG(name, type, len, keylen) \ +[CAPA_HMAC_ALG_ ## type] = { \ + .ha_name = name, \ + .ha_len = len, \ + .ha_keylen = keylen, \ +} + +struct client_capa { + struct inode *inode; + struct list_head lli_list; /* link to lli_oss_capas */ +}; + +struct target_capa { + struct hlist_node c_hash; /* link to capa hash */ +}; + +struct obd_capa { + struct list_head c_list; /* link to capa_list */ + + struct lustre_capa c_capa; /* capa */ + atomic_t c_refc; /* ref count */ + cfs_time_t c_expiry; /* jiffies */ + spinlock_t c_lock; /* protect capa content */ + int c_site; + + union { + struct client_capa cli; + struct target_capa tgt; + } u; +}; + +enum { + CAPA_SITE_CLIENT = 0, + CAPA_SITE_SERVER, + CAPA_SITE_MAX +}; + +static inline struct lu_fid *capa_fid(struct lustre_capa *capa) +{ + return &capa->lc_fid; +} + +static 
inline __u64 capa_opc(struct lustre_capa *capa) +{ + return capa->lc_opc; +} + +static inline __u64 capa_uid(struct lustre_capa *capa) +{ + return capa->lc_uid; +} + +static inline __u64 capa_gid(struct lustre_capa *capa) +{ + return capa->lc_gid; +} + +static inline __u32 capa_flags(struct lustre_capa *capa) +{ + return capa->lc_flags & 0xffffff; +} + +static inline __u32 capa_alg(struct lustre_capa *capa) +{ + return (capa->lc_flags >> 24); +} + +static inline __u32 capa_keyid(struct lustre_capa *capa) +{ + return capa->lc_keyid; +} + +static inline __u64 capa_key_seq(struct lustre_capa_key *key) +{ + return key->lk_seq; +} + +static inline __u32 capa_key_keyid(struct lustre_capa_key *key) +{ + return key->lk_keyid; +} + +static inline __u32 capa_timeout(struct lustre_capa *capa) +{ + return capa->lc_timeout; +} + +static inline __u32 capa_expiry(struct lustre_capa *capa) +{ + return capa->lc_expiry; +} + +void _debug_capa(struct lustre_capa *, struct libcfs_debug_msg_data *, + const char *fmt, ... ); +#define DEBUG_CAPA(level, capa, fmt, args...) \ +do { \ + if (((level) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (level)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + _debug_capa((capa), &msgdata, fmt, ##args); \ + } \ +} while (0) + +#define DEBUG_CAPA_KEY(level, k, fmt, args...) 
\ +do { \ +CDEBUG(level, fmt " capability key@%p seq "LPU64" keyid %u\n", \ + ##args, k, capa_key_seq(k), capa_key_keyid(k)); \ +} while (0) + +typedef int (* renew_capa_cb_t)(struct obd_capa *, struct lustre_capa *); + +/* obdclass/capa.c */ +extern struct list_head capa_list[]; +extern spinlock_t capa_lock; +extern int capa_count[]; +extern struct kmem_cache *capa_cachep; + +struct hlist_head *init_capa_hash(void); +void cleanup_capa_hash(struct hlist_head *hash); + +struct obd_capa *capa_add(struct hlist_head *hash, + struct lustre_capa *capa); +struct obd_capa *capa_lookup(struct hlist_head *hash, + struct lustre_capa *capa, int alive); + +int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key); +int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen); +int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen); +void capa_cpy(void *dst, struct obd_capa *ocapa); +static inline struct obd_capa *alloc_capa(int site) +{ + struct obd_capa *ocapa; + + if (unlikely(site != CAPA_SITE_CLIENT && site != CAPA_SITE_SERVER)) + return ERR_PTR(-EINVAL); + + OBD_SLAB_ALLOC_PTR(ocapa, capa_cachep); + if (unlikely(!ocapa)) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&ocapa->c_list); + atomic_set(&ocapa->c_refc, 1); + spin_lock_init(&ocapa->c_lock); + ocapa->c_site = site; + if (ocapa->c_site == CAPA_SITE_CLIENT) + INIT_LIST_HEAD(&ocapa->u.cli.lli_list); + else + INIT_HLIST_NODE(&ocapa->u.tgt.c_hash); + + return ocapa; +} + +static inline struct obd_capa *capa_get(struct obd_capa *ocapa) +{ + if (!ocapa) + return NULL; + + atomic_inc(&ocapa->c_refc); + return ocapa; +} + +static inline void capa_put(struct obd_capa *ocapa) +{ + if (!ocapa) + return; + + if (atomic_read(&ocapa->c_refc) == 0) { + DEBUG_CAPA(D_ERROR, &ocapa->c_capa, "refc is 0 for"); + LBUG(); + } + + if (atomic_dec_and_test(&ocapa->c_refc)) { + LASSERT(list_empty(&ocapa->c_list)); + if (ocapa->c_site == CAPA_SITE_CLIENT) { + LASSERT(list_empty(&ocapa->u.cli.lli_list)); + } else { + struct 
hlist_node *hnode; + + hnode = &ocapa->u.tgt.c_hash; + LASSERT(!hnode->next && !hnode->pprev); + } + OBD_SLAB_FREE(ocapa, capa_cachep, sizeof(*ocapa)); + } +} + +static inline int open_flags_to_accmode(int flags) +{ + int mode = flags; + + if ((mode + 1) & O_ACCMODE) + mode++; + if (mode & O_TRUNC) + mode |= 2; + + return mode; +} + +static inline __u64 capa_open_opc(int mode) +{ + return mode & FMODE_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_READ; +} + +static inline void set_capa_expiry(struct obd_capa *ocapa) +{ + cfs_time_t expiry = cfs_time_sub((cfs_time_t)ocapa->c_capa.lc_expiry, + cfs_time_current_sec()); + ocapa->c_expiry = cfs_time_add(cfs_time_current(), + cfs_time_seconds(expiry)); +} + +static inline int capa_is_expired_sec(struct lustre_capa *capa) +{ + return (capa->lc_expiry - cfs_time_current_sec() <= 0); +} + +static inline int capa_is_expired(struct obd_capa *ocapa) +{ + return cfs_time_beforeq(ocapa->c_expiry, cfs_time_current()); +} + +static inline int capa_opc_supported(struct lustre_capa *capa, __u64 opc) +{ + return (capa_opc(capa) & opc) == opc; +} + +struct filter_capa_key { + struct list_head k_list; + struct lustre_capa_key k_key; +}; + +enum { + LC_ID_NONE = 0, + LC_ID_PLAIN = 1, + LC_ID_CONVERT = 2 +}; + +#define BYPASS_CAPA (struct lustre_capa *)ERR_PTR(-ENOENT) + +/** @} capa */ + +#endif /* __LINUX_CAPA_H_ */ diff --git a/drivers/staging/lustre/lustre/include/lustre_cfg.h b/drivers/staging/lustre/lustre/include/lustre_cfg.h new file mode 100644 index 000000000000..f12429f38215 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_cfg.h @@ -0,0 +1,299 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_CFG_H +#define _LUSTRE_CFG_H + +/** \defgroup cfg cfg + * + * @{ + */ + +/* + * 1cf6 + * lcfG + */ +#define LUSTRE_CFG_VERSION 0x1cf60001 +#define LUSTRE_CFG_MAX_BUFCOUNT 8 + +#define LCFG_HDR_SIZE(count) \ + cfs_size_round(offsetof (struct lustre_cfg, lcfg_buflens[(count)])) + +/** If the LCFG_REQUIRED bit is set in a configuration command, + * then the client is required to understand this parameter + * in order to mount the filesystem. If it does not understand + * a REQUIRED command the client mount will fail. 
*/ +#define LCFG_REQUIRED 0x0001000 + +enum lcfg_command_type { + LCFG_ATTACH = 0x00cf001, /**< create a new obd instance */ + LCFG_DETACH = 0x00cf002, /**< destroy obd instance */ + LCFG_SETUP = 0x00cf003, /**< call type-specific setup */ + LCFG_CLEANUP = 0x00cf004, /**< call type-specific cleanup */ + LCFG_ADD_UUID = 0x00cf005, /**< add a nid to a niduuid */ + LCFG_DEL_UUID = 0x00cf006, /**< remove a nid from a niduuid */ + LCFG_MOUNTOPT = 0x00cf007, /**< create a profile (mdc, osc) */ + LCFG_DEL_MOUNTOPT = 0x00cf008, /**< destroy a profile */ + LCFG_SET_TIMEOUT = 0x00cf009, /**< set obd_timeout */ + LCFG_SET_UPCALL = 0x00cf00a, /**< deprecated */ + LCFG_ADD_CONN = 0x00cf00b, /**< add a failover niduuid to an obd */ + LCFG_DEL_CONN = 0x00cf00c, /**< remove a failover niduuid */ + LCFG_LOV_ADD_OBD = 0x00cf00d, /**< add an osc to a lov */ + LCFG_LOV_DEL_OBD = 0x00cf00e, /**< remove an osc from a lov */ + LCFG_PARAM = 0x00cf00f, /**< set a proc parameter */ + LCFG_MARKER = 0x00cf010, /**< metadata about next cfg rec */ + LCFG_LOG_START = 0x00ce011, /**< mgc only, process a cfg log */ + LCFG_LOG_END = 0x00ce012, /**< stop processing updates */ + LCFG_LOV_ADD_INA = 0x00ce013, /**< like LOV_ADD_OBD, inactive */ + LCFG_ADD_MDC = 0x00cf014, /**< add an mdc to a lmv */ + LCFG_DEL_MDC = 0x00cf015, /**< remove an mdc from a lmv */ + LCFG_SPTLRPC_CONF = 0x00ce016, /**< security */ + LCFG_POOL_NEW = 0x00ce020, /**< create an ost pool name */ + LCFG_POOL_ADD = 0x00ce021, /**< add an ost to a pool */ + LCFG_POOL_REM = 0x00ce022, /**< remove an ost from a pool */ + LCFG_POOL_DEL = 0x00ce023, /**< destroy an ost pool name */ + LCFG_SET_LDLM_TIMEOUT = 0x00ce030, /**< set ldlm_timeout */ + LCFG_PRE_CLEANUP = 0x00cf031, /**< call type-specific pre + * cleanup cleanup */ +}; + +struct lustre_cfg_bufs { + void *lcfg_buf[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_buflen[LUSTRE_CFG_MAX_BUFCOUNT]; + __u32 lcfg_bufcount; +}; + +struct lustre_cfg { + __u32 lcfg_version; + __u32 lcfg_command; + 
+ __u32 lcfg_num; + __u32 lcfg_flags; + __u64 lcfg_nid; + __u32 lcfg_nal; /* not used any more */ + + __u32 lcfg_bufcount; + __u32 lcfg_buflens[0]; +}; + +enum cfg_record_type { + PORTALS_CFG_TYPE = 1, + LUSTRE_CFG_TYPE = 123, +}; + +#define LUSTRE_CFG_BUFLEN(lcfg, idx) \ + ((lcfg)->lcfg_bufcount <= (idx) \ + ? 0 \ + : (lcfg)->lcfg_buflens[(idx)]) + +static inline void lustre_cfg_bufs_set(struct lustre_cfg_bufs *bufs, + __u32 index, + void *buf, + __u32 buflen) +{ + if (index >= LUSTRE_CFG_MAX_BUFCOUNT) + return; + if (bufs == NULL) + return; + + if (bufs->lcfg_bufcount <= index) + bufs->lcfg_bufcount = index + 1; + + bufs->lcfg_buf[index] = buf; + bufs->lcfg_buflen[index] = buflen; +} + +static inline void lustre_cfg_bufs_set_string(struct lustre_cfg_bufs *bufs, + __u32 index, + char *str) +{ + lustre_cfg_bufs_set(bufs, index, str, str ? strlen(str) + 1 : 0); +} + +static inline void lustre_cfg_bufs_reset(struct lustre_cfg_bufs *bufs, char *name) +{ + memset((bufs), 0, sizeof(*bufs)); + if (name) + lustre_cfg_bufs_set_string(bufs, 0, name); +} + +static inline void *lustre_cfg_buf(struct lustre_cfg *lcfg, int index) +{ + int i; + int offset; + int bufcount; + LASSERT (lcfg != NULL); + LASSERT (index >= 0); + + bufcount = lcfg->lcfg_bufcount; + if (index >= bufcount) + return NULL; + + offset = LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < index; i++) + offset += cfs_size_round(lcfg->lcfg_buflens[i]); + return (char *)lcfg + offset; +} + +static inline void lustre_cfg_bufs_init(struct lustre_cfg_bufs *bufs, + struct lustre_cfg *lcfg) +{ + int i; + bufs->lcfg_bufcount = lcfg->lcfg_bufcount; + for (i = 0; i < bufs->lcfg_bufcount; i++) { + bufs->lcfg_buflen[i] = lcfg->lcfg_buflens[i]; + bufs->lcfg_buf[i] = lustre_cfg_buf(lcfg, i); + } +} + +static inline char *lustre_cfg_string(struct lustre_cfg *lcfg, int index) +{ + char *s; + + if (lcfg->lcfg_buflens[index] == 0) + return NULL; + + s = lustre_cfg_buf(lcfg, index); + if (s == NULL) + return NULL; + + /* + * 
make sure it's NULL terminated, even if this kills a char + * of data. Try to use the padding first though. + */ + if (s[lcfg->lcfg_buflens[index] - 1] != '\0') { + int last = min((int)lcfg->lcfg_buflens[index], + cfs_size_round(lcfg->lcfg_buflens[index]) - 1); + char lost = s[last]; + s[last] = '\0'; + if (lost != '\0') { + CWARN("Truncated buf %d to '%s' (lost '%c'...)\n", + index, s, lost); + } + } + return s; +} + +static inline int lustre_cfg_len(__u32 bufcount, __u32 *buflens) +{ + int i; + int len; + ENTRY; + + len = LCFG_HDR_SIZE(bufcount); + for (i = 0; i < bufcount; i++) + len += cfs_size_round(buflens[i]); + + RETURN(cfs_size_round(len)); +} + + +#include <obd_support.h> + +static inline struct lustre_cfg *lustre_cfg_new(int cmd, + struct lustre_cfg_bufs *bufs) +{ + struct lustre_cfg *lcfg; + char *ptr; + int i; + + ENTRY; + + OBD_ALLOC(lcfg, lustre_cfg_len(bufs->lcfg_bufcount, + bufs->lcfg_buflen)); + if (!lcfg) + RETURN(ERR_PTR(-ENOMEM)); + + lcfg->lcfg_version = LUSTRE_CFG_VERSION; + lcfg->lcfg_command = cmd; + lcfg->lcfg_bufcount = bufs->lcfg_bufcount; + + ptr = (char *)lcfg + LCFG_HDR_SIZE(lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + lcfg->lcfg_buflens[i] = bufs->lcfg_buflen[i]; + LOGL((char *)bufs->lcfg_buf[i], bufs->lcfg_buflen[i], ptr); + } + RETURN(lcfg); +} + +static inline void lustre_cfg_free(struct lustre_cfg *lcfg) +{ + int len; + + len = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens); + + OBD_FREE(lcfg, len); + EXIT; + return; +} + +static inline int lustre_cfg_sanity_check(void *buf, int len) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)buf; + ENTRY; + if (!lcfg) + RETURN(-EINVAL); + + /* check that the first bits of the struct are valid */ + if (len < LCFG_HDR_SIZE(0)) + RETURN(-EINVAL); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) + RETURN(-EINVAL); + + if (lcfg->lcfg_bufcount >= LUSTRE_CFG_MAX_BUFCOUNT) + RETURN(-EINVAL); + + /* check that the buflens are valid */ + if (len < 
LCFG_HDR_SIZE(lcfg->lcfg_bufcount)) + RETURN(-EINVAL); + + /* make sure all the pointers point inside the data */ + if (len < lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens)) + RETURN(-EINVAL); + + RETURN(0); +} + +#include <lustre/lustre_user.h> + +#ifndef INVALID_UID +#define INVALID_UID (-1) +#endif + +/** @} cfg */ + +#endif // _LUSTRE_CFG_H diff --git a/drivers/staging/lustre/lustre/include/lustre_debug.h b/drivers/staging/lustre/lustre/include/lustre_debug.h new file mode 100644 index 000000000000..3d9e4462af43 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_debug.h @@ -0,0 +1,76 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
#define ASSERT_MAX_SIZE_MB 60000ULL

/*
 * If the page index @index exceeds ASSERT_MAX_SIZE_MB expressed in pages,
 * log an error, turn on every libcfs debug bit and execute statement @OP.
 *
 * @index is parenthesized so that an argument containing an operator of
 * lower precedence than '>' (for example "a & b") is compared as a whole.
 * It is still evaluated twice when the limit is exceeded, so it must not
 * have side effects.
 */
#define ASSERT_PAGE_INDEX(index, OP)					     \
do {									     \
	if ((index) > (ASSERT_MAX_SIZE_MB << (20 - PAGE_CACHE_SHIFT))) {     \
		CERROR("bad page index %lu > %llu\n", (index),		     \
		       ASSERT_MAX_SIZE_MB << (20 - PAGE_CACHE_SHIFT));	     \
		libcfs_debug = ~0UL;					     \
		OP;							     \
	}								     \
} while (0)

/*
 * Same check for a byte offset: @offset is compared against
 * ASSERT_MAX_SIZE_MB expressed in bytes.  @offset is parenthesized and is
 * evaluated twice when the limit is exceeded.
 */
#define ASSERT_FILE_OFFSET(offset, OP)					     \
do {									     \
	if ((offset) > (ASSERT_MAX_SIZE_MB << 20)) {			     \
		CERROR("bad file offset %llu > %llu\n", (offset),	     \
		       ASSERT_MAX_SIZE_MB << 20);			     \
		libcfs_debug = ~0UL;					     \
		OP;							     \
	}								     \
} while (0)
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_disk.h + * + * Lustre disk format definitions. + * + * Author: Nathan Rutman <nathan@clusterfs.com> + */ + +#ifndef _LUSTRE_DISK_H +#define _LUSTRE_DISK_H + +/** \defgroup disk disk + * + * @{ + */ + +#include <linux/libcfs/libcfs.h> +#include <linux/lnet/types.h> + +/****************** on-disk files *********************/ + +#define MDT_LOGS_DIR "LOGS" /* COMPAT_146 */ +#define MOUNT_CONFIGS_DIR "CONFIGS" +#define CONFIGS_FILE "mountdata" +/** Persistent mount data are stored on the disk in this file. 
*/ +#define MOUNT_DATA_FILE MOUNT_CONFIGS_DIR"/"CONFIGS_FILE +#define LAST_RCVD "last_rcvd" +#define LOV_OBJID "lov_objid" +#define LOV_OBJSEQ "lov_objseq" +#define HEALTH_CHECK "health_check" +#define CAPA_KEYS "capa_keys" +#define CHANGELOG_USERS "changelog_users" +#define MGS_NIDTBL_DIR "NIDTBL_VERSIONS" +#define QMT_DIR "quota_master" +#define QSD_DIR "quota_slave" + +/****************** persistent mount data *********************/ + +#define LDD_F_SV_TYPE_MDT 0x0001 +#define LDD_F_SV_TYPE_OST 0x0002 +#define LDD_F_SV_TYPE_MGS 0x0004 +#define LDD_F_SV_TYPE_MASK (LDD_F_SV_TYPE_MDT | \ + LDD_F_SV_TYPE_OST | \ + LDD_F_SV_TYPE_MGS) +#define LDD_F_SV_ALL 0x0008 +/** need an index assignment */ +#define LDD_F_NEED_INDEX 0x0010 +/** never registered */ +#define LDD_F_VIRGIN 0x0020 +/** update the config logs for this server */ +#define LDD_F_UPDATE 0x0040 +/** rewrite the LDD */ +#define LDD_F_REWRITE_LDD 0x0080 +/** regenerate config logs for this fs or server */ +#define LDD_F_WRITECONF 0x0100 +/** COMPAT_14 */ +#define LDD_F_UPGRADE14 0x0200 +/** process as lctl conf_param */ +#define LDD_F_PARAM 0x0400 +/** all nodes are specified as service nodes */ +#define LDD_F_NO_PRIMNODE 0x1000 +/** IR enable flag */ +#define LDD_F_IR_CAPABLE 0x2000 +/** the MGS refused to register the target. 
/* Backing file system types; LDD_MT_LAST is the number of valid types. */
enum ldd_mount_type {
	LDD_MT_EXT3 = 0,
	LDD_MT_LDISKFS,
	LDD_MT_SMFS,
	LDD_MT_REISERFS,
	LDD_MT_LDISKFS2,
	LDD_MT_ZFS,
	LDD_MT_LAST
};

/*
 * Return the file system name for mount type @mt.
 *
 * MT_STR() passes in the ldd_mount_type field of the on-disk data, so the
 * value is range-checked before it is used as a table index; anything
 * outside [0, LDD_MT_LAST) yields "unknown" instead of an out-of-bounds
 * read.
 */
static inline char *mt_str(enum ldd_mount_type mt)
{
	static const char *const mount_type_string[LDD_MT_LAST] = {
		[LDD_MT_EXT3]		= "ext3",
		[LDD_MT_LDISKFS]	= "ldiskfs",
		[LDD_MT_SMFS]		= "smfs",
		[LDD_MT_REISERFS]	= "reiserfs",
		[LDD_MT_LDISKFS2]	= "ldiskfs2",
		[LDD_MT_ZFS]		= "zfs",
	};

	/* the unsigned cast also rejects negative values */
	if ((unsigned int)mt >= LDD_MT_LAST)
		return (char *)"unknown";

	/* return type kept as char * for existing callers */
	return (char *)mount_type_string[mt];
}

/*
 * Return the "osd-*" name associated with mount type @mt.  The three
 * ext3/ldiskfs variants all map to "osd-ldiskfs".  Out-of-range values
 * yield "unknown", as in mt_str().
 */
static inline char *mt_type(enum ldd_mount_type mt)
{
	static const char *const mount_type_string[LDD_MT_LAST] = {
		[LDD_MT_EXT3]		= "osd-ldiskfs",
		[LDD_MT_LDISKFS]	= "osd-ldiskfs",
		[LDD_MT_SMFS]		= "osd-smfs",
		[LDD_MT_REISERFS]	= "osd-reiserfs",
		[LDD_MT_LDISKFS2]	= "osd-ldiskfs",
		[LDD_MT_ZFS]		= "osd-zfs",
	};

	if ((unsigned int)mt >= LDD_MT_LAST)
		return (char *)"unknown";

	return (char *)mount_type_string[mt];
}
*/ +struct lustre_disk_data { + __u32 ldd_magic; + __u32 ldd_feature_compat; /* compatible feature flags */ + __u32 ldd_feature_rocompat;/* read-only compatible feature flags */ + __u32 ldd_feature_incompat;/* incompatible feature flags */ + + __u32 ldd_config_ver; /* config rewrite count - not used */ + __u32 ldd_flags; /* LDD_SV_TYPE */ + __u32 ldd_svindex; /* server index (0001), must match + svname */ + __u32 ldd_mount_type; /* target fs type LDD_MT_* */ + char ldd_fsname[64]; /* filesystem this server is part of, + MTI_NAME_MAXLEN */ + char ldd_svname[64]; /* this server's name (lustre-mdt0001)*/ + __u8 ldd_uuid[40]; /* server UUID (COMPAT_146) */ + +/*200*/ char ldd_userdata[1024 - 200]; /* arbitrary user string */ +/*1024*/__u8 ldd_padding[4096 - 1024]; +/*4096*/char ldd_mount_opts[4096]; /* target fs mount opts */ +/*8192*/char ldd_params[4096]; /* key=value pairs */ +}; + + +#define IS_MDT(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MDT) +#define IS_OST(data) ((data)->lsi_flags & LDD_F_SV_TYPE_OST) +#define IS_MGS(data) ((data)->lsi_flags & LDD_F_SV_TYPE_MGS) +#define IS_SERVER(data) ((data)->lsi_flags & (LDD_F_SV_TYPE_MGS | \ + LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) +#define MT_STR(data) mt_str((data)->ldd_mount_type) + +/* Make the mdt/ost server obd name based on the filesystem name */ +static inline int server_make_name(__u32 flags, __u16 index, char *fs, + char *name) +{ + if (flags & (LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) { + if (!(flags & LDD_F_SV_ALL)) + sprintf(name, "%.8s%c%s%04x", fs, + (flags & LDD_F_VIRGIN) ? ':' : + ((flags & LDD_F_WRITECONF) ? '=' : '-'), + (flags & LDD_F_SV_TYPE_MDT) ? 
"MDT" : "OST", + index); + } else if (flags & LDD_F_SV_TYPE_MGS) { + sprintf(name, "MGS"); + } else { + CERROR("unknown server type %#x\n", flags); + return 1; + } + return 0; +} + +/****************** mount command *********************/ + +/* The lmd is only used internally by Lustre; mount simply passes + everything as string options */ + +#define LMD_MAGIC 0xbdacbd03 + +/* gleaned from the mount command - no persistent info here */ +struct lustre_mount_data { + __u32 lmd_magic; + __u32 lmd_flags; /* lustre mount flags */ + int lmd_mgs_failnodes; /* mgs failover node count */ + int lmd_exclude_count; + int lmd_recovery_time_soft; + int lmd_recovery_time_hard; + char *lmd_dev; /* device name */ + char *lmd_profile; /* client only */ + char *lmd_mgssec; /* sptlrpc flavor to mgs */ + char *lmd_opts; /* lustre mount options (as opposed to + _device_ mount options) */ + char *lmd_params; /* lustre params */ + __u32 *lmd_exclude; /* array of OSTs to ignore */ + char *lmd_mgs; /* MGS nid */ + char *lmd_osd_type; /* OSD type */ +}; + +#define LMD_FLG_SERVER 0x0001 /* Mounting a server */ +#define LMD_FLG_CLIENT 0x0002 /* Mounting a client */ +#define LMD_FLG_ABORT_RECOV 0x0008 /* Abort recovery */ +#define LMD_FLG_NOSVC 0x0010 /* Only start MGS/MGC for servers, + no other services */ +#define LMD_FLG_NOMGS 0x0020 /* Only start target for servers, reusing + existing MGS services */ +#define LMD_FLG_WRITECONF 0x0040 /* Rewrite config log */ +#define LMD_FLG_NOIR 0x0080 /* NO imperative recovery */ +#define LMD_FLG_NOSCRUB 0x0100 /* Do not trigger scrub automatically */ +#define LMD_FLG_MGS 0x0200 /* Also start MGS along with server */ +#define LMD_FLG_IAM 0x0400 /* IAM dir */ +#define LMD_FLG_NO_PRIMNODE 0x0800 /* all nodes are service nodes */ +#define LMD_FLG_VIRGIN 0x1000 /* the service registers first time */ +#define LMD_FLG_UPDATE 0x2000 /* update parameters */ + +#define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT) + + +/****************** last_rcvd file 
*********************/ +/** version recovery epoch */ +#define LR_EPOCH_BITS 32 +#define lr_epoch(a) ((a) >> LR_EPOCH_BITS) +#define LR_EXPIRE_INTERVALS 16 /**< number of intervals to track transno */ +#define ENOENT_VERSION 1 /** 'virtual' version of non-existent object */ + +#define LR_SERVER_SIZE 512 +#define LR_CLIENT_START 8192 +#define LR_CLIENT_SIZE 128 +#if LR_CLIENT_START < LR_SERVER_SIZE +#error "Can't have LR_CLIENT_START < LR_SERVER_SIZE" +#endif + +/* + * This limit is arbitrary (131072 clients on x86), but it is convenient to use + * 2^n * PAGE_CACHE_SIZE * 8 for the number of bits that fit an order-n allocation. + * If we need more than 131072 clients (order-2 allocation on x86) then this + * should become an array of single-page pointers that are allocated on demand. + */ +#if (128 * 1024UL) > (PAGE_CACHE_SIZE * 8) +#define LR_MAX_CLIENTS (128 * 1024UL) +#else +#define LR_MAX_CLIENTS (PAGE_CACHE_SIZE * 8) +#endif + +/** COMPAT_146: this is an OST (temporary) */ +#define OBD_COMPAT_OST 0x00000002 +/** COMPAT_146: this is an MDT (temporary) */ +#define OBD_COMPAT_MDT 0x00000004 +/** 2.0 server, interop flag to show server version is changed */ +#define OBD_COMPAT_20 0x00000008 + +/** MDS handles LOV_OBJID file */ +#define OBD_ROCOMPAT_LOVOBJID 0x00000001 + +/** OST handles group subdirs */ +#define OBD_INCOMPAT_GROUPS 0x00000001 +/** this is an OST */ +#define OBD_INCOMPAT_OST 0x00000002 +/** this is an MDT */ +#define OBD_INCOMPAT_MDT 0x00000004 +/** common last_rcvd format */ +#define OBD_INCOMPAT_COMMON_LR 0x00000008 +/** FID is enabled */ +#define OBD_INCOMPAT_FID 0x00000010 +/** Size-on-MDS is enabled */ +#define OBD_INCOMPAT_SOM 0x00000020 +/** filesystem using iam format to store directory entries */ +#define OBD_INCOMPAT_IAM_DIR 0x00000040 +/** LMA attribute contains per-inode incompatible flags */ +#define OBD_INCOMPAT_LMA 0x00000080 +/** lmm_stripe_count has been shrunk from __u32 to __u16 and the remaining 16 + * bits are now used to
store a generation. Once we start changing the layout + * and bumping the generation, old versions expecting a 32-bit lmm_stripe_count + * will be confused by interpreting stripe_count | gen << 16 as the actual + * stripe count */ +#define OBD_INCOMPAT_LMM_VER 0x00000100 +/** multiple OI files for MDT */ +#define OBD_INCOMPAT_MULTI_OI 0x00000200 + +/* Data stored per server at the head of the last_rcvd file. In le32 order. + This should be common to filter_internal.h, lustre_mds.h */ +struct lr_server_data { + __u8 lsd_uuid[40]; /* server UUID */ + __u64 lsd_last_transno; /* last completed transaction ID */ + __u64 lsd_compat14; /* reserved - compat with old last_rcvd */ + __u64 lsd_mount_count; /* incarnation number */ + __u32 lsd_feature_compat; /* compatible feature flags */ + __u32 lsd_feature_rocompat;/* read-only compatible feature flags */ + __u32 lsd_feature_incompat;/* incompatible feature flags */ + __u32 lsd_server_size; /* size of server data area */ + __u32 lsd_client_start; /* start of per-client data area */ + __u16 lsd_client_size; /* size of per-client data area */ + __u16 lsd_subdir_count; /* number of subdirectories for objects */ + __u64 lsd_catalog_oid; /* recovery catalog object id */ + __u32 lsd_catalog_ogen; /* recovery catalog inode generation */ + __u8 lsd_peeruuid[40]; /* UUID of MDS associated with this OST */ + __u32 lsd_osd_index; /* index number of OST in LOV */ + __u32 lsd_padding1; /* was lsd_mdt_index, unused in 2.4.0 */ + __u32 lsd_start_epoch; /* VBR: start epoch from last boot */ + /** transaction values since lsd_trans_table_time */ + __u64 lsd_trans_table[LR_EXPIRE_INTERVALS]; + /** start point of transno table below */ + __u32 lsd_trans_table_time; /* time of first slot in table above */ + __u32 lsd_expire_intervals; /* LR_EXPIRE_INTERVALS */ + __u8 lsd_padding[LR_SERVER_SIZE - 288]; +}; + +/* Data stored per client in the last_rcvd file. In le32 order. 
*/ +struct lsd_client_data { + __u8 lcd_uuid[40]; /* client UUID */ + __u64 lcd_last_transno; /* last completed transaction ID */ + __u64 lcd_last_xid; /* xid for the last transaction */ + __u32 lcd_last_result; /* result from last RPC */ + __u32 lcd_last_data; /* per-op data (disposition for open &c.) */ + /* for MDS_CLOSE requests */ + __u64 lcd_last_close_transno; /* last completed transaction ID */ + __u64 lcd_last_close_xid; /* xid for the last transaction */ + __u32 lcd_last_close_result; /* result from last RPC */ + __u32 lcd_last_close_data; /* per-op data */ + /* VBR: last versions */ + __u64 lcd_pre_versions[4]; + __u32 lcd_last_epoch; + /** orphans handling for delayed export rely on that */ + __u32 lcd_first_epoch; + __u8 lcd_padding[LR_CLIENT_SIZE - 128]; +}; + +/* bug20354: the lcd_uuid for export of clients may be wrong */ +static inline void check_lcd(char *obd_name, int index, + struct lsd_client_data *lcd) +{ + int length = sizeof(lcd->lcd_uuid); + if (strnlen((char*)lcd->lcd_uuid, length) == length) { + lcd->lcd_uuid[length - 1] = '\0'; + + LCONSOLE_ERROR("the client UUID (%s) on %s for exports" + "stored in last_rcvd(index = %d) is bad!\n", + lcd->lcd_uuid, obd_name, index); + } +} + +/* last_rcvd handling */ +static inline void lsd_le_to_cpu(struct lr_server_data *buf, + struct lr_server_data *lsd) +{ + int i; + memcpy(lsd->lsd_uuid, buf->lsd_uuid, sizeof(lsd->lsd_uuid)); + lsd->lsd_last_transno = le64_to_cpu(buf->lsd_last_transno); + lsd->lsd_compat14 = le64_to_cpu(buf->lsd_compat14); + lsd->lsd_mount_count = le64_to_cpu(buf->lsd_mount_count); + lsd->lsd_feature_compat = le32_to_cpu(buf->lsd_feature_compat); + lsd->lsd_feature_rocompat = le32_to_cpu(buf->lsd_feature_rocompat); + lsd->lsd_feature_incompat = le32_to_cpu(buf->lsd_feature_incompat); + lsd->lsd_server_size = le32_to_cpu(buf->lsd_server_size); + lsd->lsd_client_start = le32_to_cpu(buf->lsd_client_start); + lsd->lsd_client_size = le16_to_cpu(buf->lsd_client_size); + 
lsd->lsd_subdir_count = le16_to_cpu(buf->lsd_subdir_count); + lsd->lsd_catalog_oid = le64_to_cpu(buf->lsd_catalog_oid); + lsd->lsd_catalog_ogen = le32_to_cpu(buf->lsd_catalog_ogen); + memcpy(lsd->lsd_peeruuid, buf->lsd_peeruuid, sizeof(lsd->lsd_peeruuid)); + lsd->lsd_osd_index = le32_to_cpu(buf->lsd_osd_index); + lsd->lsd_padding1 = le32_to_cpu(buf->lsd_padding1); + lsd->lsd_start_epoch = le32_to_cpu(buf->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; i++) + lsd->lsd_trans_table[i] = le64_to_cpu(buf->lsd_trans_table[i]); + lsd->lsd_trans_table_time = le32_to_cpu(buf->lsd_trans_table_time); + lsd->lsd_expire_intervals = le32_to_cpu(buf->lsd_expire_intervals); +} + +static inline void lsd_cpu_to_le(struct lr_server_data *lsd, + struct lr_server_data *buf) +{ + int i; + memcpy(buf->lsd_uuid, lsd->lsd_uuid, sizeof(buf->lsd_uuid)); + buf->lsd_last_transno = cpu_to_le64(lsd->lsd_last_transno); + buf->lsd_compat14 = cpu_to_le64(lsd->lsd_compat14); + buf->lsd_mount_count = cpu_to_le64(lsd->lsd_mount_count); + buf->lsd_feature_compat = cpu_to_le32(lsd->lsd_feature_compat); + buf->lsd_feature_rocompat = cpu_to_le32(lsd->lsd_feature_rocompat); + buf->lsd_feature_incompat = cpu_to_le32(lsd->lsd_feature_incompat); + buf->lsd_server_size = cpu_to_le32(lsd->lsd_server_size); + buf->lsd_client_start = cpu_to_le32(lsd->lsd_client_start); + buf->lsd_client_size = cpu_to_le16(lsd->lsd_client_size); + buf->lsd_subdir_count = cpu_to_le16(lsd->lsd_subdir_count); + buf->lsd_catalog_oid = cpu_to_le64(lsd->lsd_catalog_oid); + buf->lsd_catalog_ogen = cpu_to_le32(lsd->lsd_catalog_ogen); + memcpy(buf->lsd_peeruuid, lsd->lsd_peeruuid, sizeof(buf->lsd_peeruuid)); + buf->lsd_osd_index = cpu_to_le32(lsd->lsd_osd_index); + buf->lsd_padding1 = cpu_to_le32(lsd->lsd_padding1); + buf->lsd_start_epoch = cpu_to_le32(lsd->lsd_start_epoch); + for (i = 0; i < LR_EXPIRE_INTERVALS; i++) + buf->lsd_trans_table[i] = cpu_to_le64(lsd->lsd_trans_table[i]); + buf->lsd_trans_table_time = 
cpu_to_le32(lsd->lsd_trans_table_time); + buf->lsd_expire_intervals = cpu_to_le32(lsd->lsd_expire_intervals); +} + +static inline void lcd_le_to_cpu(struct lsd_client_data *buf, + struct lsd_client_data *lcd) +{ + memcpy(lcd->lcd_uuid, buf->lcd_uuid, sizeof (lcd->lcd_uuid)); + lcd->lcd_last_transno = le64_to_cpu(buf->lcd_last_transno); + lcd->lcd_last_xid = le64_to_cpu(buf->lcd_last_xid); + lcd->lcd_last_result = le32_to_cpu(buf->lcd_last_result); + lcd->lcd_last_data = le32_to_cpu(buf->lcd_last_data); + lcd->lcd_last_close_transno = le64_to_cpu(buf->lcd_last_close_transno); + lcd->lcd_last_close_xid = le64_to_cpu(buf->lcd_last_close_xid); + lcd->lcd_last_close_result = le32_to_cpu(buf->lcd_last_close_result); + lcd->lcd_last_close_data = le32_to_cpu(buf->lcd_last_close_data); + lcd->lcd_pre_versions[0] = le64_to_cpu(buf->lcd_pre_versions[0]); + lcd->lcd_pre_versions[1] = le64_to_cpu(buf->lcd_pre_versions[1]); + lcd->lcd_pre_versions[2] = le64_to_cpu(buf->lcd_pre_versions[2]); + lcd->lcd_pre_versions[3] = le64_to_cpu(buf->lcd_pre_versions[3]); + lcd->lcd_last_epoch = le32_to_cpu(buf->lcd_last_epoch); + lcd->lcd_first_epoch = le32_to_cpu(buf->lcd_first_epoch); +} + +static inline void lcd_cpu_to_le(struct lsd_client_data *lcd, + struct lsd_client_data *buf) +{ + memcpy(buf->lcd_uuid, lcd->lcd_uuid, sizeof (lcd->lcd_uuid)); + buf->lcd_last_transno = cpu_to_le64(lcd->lcd_last_transno); + buf->lcd_last_xid = cpu_to_le64(lcd->lcd_last_xid); + buf->lcd_last_result = cpu_to_le32(lcd->lcd_last_result); + buf->lcd_last_data = cpu_to_le32(lcd->lcd_last_data); + buf->lcd_last_close_transno = cpu_to_le64(lcd->lcd_last_close_transno); + buf->lcd_last_close_xid = cpu_to_le64(lcd->lcd_last_close_xid); + buf->lcd_last_close_result = cpu_to_le32(lcd->lcd_last_close_result); + buf->lcd_last_close_data = cpu_to_le32(lcd->lcd_last_close_data); + buf->lcd_pre_versions[0] = cpu_to_le64(lcd->lcd_pre_versions[0]); + buf->lcd_pre_versions[1] = cpu_to_le64(lcd->lcd_pre_versions[1]); + 
buf->lcd_pre_versions[2] = cpu_to_le64(lcd->lcd_pre_versions[2]); + buf->lcd_pre_versions[3] = cpu_to_le64(lcd->lcd_pre_versions[3]); + buf->lcd_last_epoch = cpu_to_le32(lcd->lcd_last_epoch); + buf->lcd_first_epoch = cpu_to_le32(lcd->lcd_first_epoch); +} + +static inline __u64 lcd_last_transno(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_transno > lcd->lcd_last_close_transno ? + lcd->lcd_last_transno : lcd->lcd_last_close_transno); +} + +static inline __u64 lcd_last_xid(struct lsd_client_data *lcd) +{ + return (lcd->lcd_last_xid > lcd->lcd_last_close_xid ? + lcd->lcd_last_xid : lcd->lcd_last_close_xid); +} + +/****************** superblock additional info *********************/ + +struct ll_sb_info; + +struct lustre_sb_info { + int lsi_flags; + struct obd_device *lsi_mgc; /* mgc obd */ + struct lustre_mount_data *lsi_lmd; /* mount command info */ + struct ll_sb_info *lsi_llsbi; /* add'l client sbi info */ + struct dt_device *lsi_dt_dev; /* dt device to access disk fs*/ + struct vfsmount *lsi_srv_mnt; /* the one server mount */ + atomic_t lsi_mounts; /* references to the srv_mnt */ + char lsi_svname[MTI_NAME_MAXLEN]; + char lsi_osd_obdname[64]; + char lsi_osd_uuid[64]; + struct obd_export *lsi_osd_exp; + char lsi_osd_type[16]; + char lsi_fstype[16]; + struct backing_dev_info lsi_bdi; /* each client mountpoint needs + own backing_dev_info */ +}; + +#define LSI_UMOUNT_FAILOVER 0x00200000 +#define LSI_BDI_INITIALIZED 0x00400000 + +#define s2lsi(sb) ((struct lustre_sb_info *)((sb)->s_fs_info)) +#define s2lsi_nocast(sb) ((sb)->s_fs_info) + +#define get_profile_name(sb) (s2lsi(sb)->lsi_lmd->lmd_profile) +#define get_mount_flags(sb) (s2lsi(sb)->lsi_lmd->lmd_flags) +#define get_mntdev_name(sb) (s2lsi(sb)->lsi_lmd->lmd_dev) + + +/****************** mount lookup info *********************/ + +struct lustre_mount_info { + char *lmi_name; + struct super_block *lmi_sb; + struct vfsmount *lmi_mnt; + struct list_head lmi_list_chain; +}; + +/****************** prototypes 
*********************/ + +/* obd_mount.c */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr); +int server_name2index(const char *svname, __u32 *idx, const char **endptr); +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize); + +int lustre_put_lsi(struct super_block *sb); +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4); +int lustre_start_mgc(struct super_block *sb); +void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb, + struct vfsmount *mnt)); +void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb)); +int lustre_common_put_super(struct super_block *sb); + + +int mgc_fsname2resid(char *fsname, struct ldlm_res_id *res_id, int type); + +/** @} disk */ + +#endif // _LUSTRE_DISK_H diff --git a/drivers/staging/lustre/lustre/include/lustre_dlm.h b/drivers/staging/lustre/lustre/include/lustre_dlm.h new file mode 100644 index 000000000000..317f928fc151 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_dlm.h @@ -0,0 +1,1671 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +/** \defgroup LDLM Lustre Distributed Lock Manager + * + * Lustre DLM is based on VAX DLM. + * Its two main roles are: + * - To provide locking assuring consistency of data on all Lustre nodes. + * - To allow clients to cache state protected by a lock by holding the + * lock until a conflicting lock is requested or it is expired by the LRU. + * + * @{ + */ + +#ifndef _LUSTRE_DLM_H__ +#define _LUSTRE_DLM_H__ + +#include <linux/lustre_dlm.h> + +#include <lustre_lib.h> +#include <lustre_net.h> +#include <lustre_import.h> +#include <lustre_handles.h> +#include <interval_tree.h> /* for interval_node{}, ldlm_extent */ +#include <lu_ref.h> + +struct obd_ops; +struct obd_device; + +#define OBD_LDLM_DEVICENAME "ldlm" + +#define LDLM_DEFAULT_LRU_SIZE (100 * num_online_cpus()) +#define LDLM_DEFAULT_MAX_ALIVE (cfs_time_seconds(36000)) +#define LDLM_CTIME_AGE_LIMIT (10) +#define LDLM_DEFAULT_PARALLEL_AST_LIMIT 1024 + +/** + * LDLM non-error return states + */ +typedef enum { + ELDLM_OK = 0, + + ELDLM_LOCK_CHANGED = 300, + ELDLM_LOCK_ABORTED = 301, + ELDLM_LOCK_REPLACED = 302, + ELDLM_NO_LOCK_DATA = 303, + ELDLM_LOCK_WOULDBLOCK = 304, + + ELDLM_NAMESPACE_EXISTS = 400, + ELDLM_BAD_NAMESPACE = 401 +} ldlm_error_t; + +/** + * LDLM namespace type. 
+ * The "client" type is actually an indication that this is a narrow local view + * into complete namespace on the server. Such namespaces cannot make any + * decisions about lack of conflicts or do any autonomous lock granting without + * first speaking to a server. + */ +typedef enum { + LDLM_NAMESPACE_SERVER = 1 << 0, + LDLM_NAMESPACE_CLIENT = 1 << 1 +} ldlm_side_t; + +/** + * Declaration of flags sent through the wire. + **/ +#define LDLM_FL_LOCK_CHANGED 0x000001 /* extent, mode, or resource changed */ + +/** + * If the server returns one of these flags, then the lock was put on that list. + * If the client sends one of these flags (during recovery ONLY!), it wants the + * lock added to the specified list, no questions asked. + */ +#define LDLM_FL_BLOCK_GRANTED 0x000002 +#define LDLM_FL_BLOCK_CONV 0x000004 +#define LDLM_FL_BLOCK_WAIT 0x000008 + +/* Used to be LDLM_FL_CBPENDING 0x000010 moved to non-wire flags */ + +#define LDLM_FL_AST_SENT 0x000020 /* blocking or cancel packet was + * queued for sending. */ +/* Used to be LDLM_FL_WAIT_NOREPROC 0x000040 moved to non-wire flags */ +/* Used to be LDLM_FL_CANCEL 0x000080 moved to non-wire flags */ + +/** + * Lock is being replayed. This could probably be implied by the fact that one + * of BLOCK_{GRANTED,CONV,WAIT} is set, but that is pretty dangerous. + */ +#define LDLM_FL_REPLAY 0x000100 + +#define LDLM_FL_INTENT_ONLY 0x000200 /* Don't grant lock, just do intent. 
*/ + +/* Used to be LDLM_FL_LOCAL_ONLY 0x000400 moved to non-wire flags */ +/* Used to be LDLM_FL_FAILED 0x000800 moved to non-wire flags */ + +#define LDLM_FL_HAS_INTENT 0x001000 /* lock request has intent */ + +/* Used to be LDLM_FL_CANCELING 0x002000 moved to non-wire flags */ +/* Used to be LDLM_FL_LOCAL 0x004000 moved to non-wire flags */ + +#define LDLM_FL_DISCARD_DATA 0x010000 /* discard (no writeback) on cancel */ + +#define LDLM_FL_NO_TIMEOUT 0x020000 /* Blocked by group lock - wait + * indefinitely */ + +/** file & record locking */ +#define LDLM_FL_BLOCK_NOWAIT 0x040000 /* Server told not to wait if blocked. + * For AGL, OST will not send glimpse + * callback. */ +#define LDLM_FL_TEST_LOCK 0x080000 // return blocking lock + +/* Used to be LDLM_FL_LVB_READY 0x100000 moved to non-wire flags */ +/* Used to be LDLM_FL_KMS_IGNORE 0x200000 moved to non-wire flags */ +/* Used to be LDLM_FL_NO_LRU 0x400000 moved to non-wire flags */ + +/* Immediately cancel such locks when they block some other locks. Send + * cancel notification to original lock holder, but expect no reply. This is + * for clients (like liblustre) that cannot be expected to reliably respond + * to blocking AST. */ +#define LDLM_FL_CANCEL_ON_BLOCK 0x800000 + +/* Flags inherited from parent lock when doing intents.
*/ +#define LDLM_INHERIT_FLAGS (LDLM_FL_CANCEL_ON_BLOCK) + +/* Used to be LDLM_FL_CP_REQD 0x1000000 moved to non-wire flags */ +/* Used to be LDLM_FL_CLEANED 0x2000000 moved to non-wire flags */ +/* Used to be LDLM_FL_ATOMIC_CB 0x4000000 moved to non-wire flags */ +/* Used to be LDLM_FL_BL_AST 0x10000000 moved to non-wire flags */ +/* Used to be LDLM_FL_BL_DONE 0x20000000 moved to non-wire flags */ + +/* measure lock contention and return -EUSERS if locking contention is high */ +#define LDLM_FL_DENY_ON_CONTENTION 0x40000000 + +/* These are flags that are mapped into the flags and ASTs of blocking locks */ +#define LDLM_AST_DISCARD_DATA 0x80000000 /* Add FL_DISCARD to blocking ASTs */ + +/* Flags sent in AST lock_flags to be mapped into the receiving lock. */ +#define LDLM_AST_FLAGS (LDLM_FL_DISCARD_DATA) + +/* + * -------------------------------------------------------------------------- + * NOTE! Starting from this point, that is, LDLM_FL_* flags with values above + * 0x80000000 will not be sent over the wire. + * -------------------------------------------------------------------------- + */ + +/** + * Declaration of flags not sent through the wire. + **/ + +/** + * Used for marking lock as a target for -EINTR while cp_ast sleep + * emulation + race with upcoming bl_ast. + */ +#define LDLM_FL_FAIL_LOC 0x100000000ULL + +/** + * Used while processing the unused list to know that we have already + * handled this lock and decided to skip it. 
+ */ +#define LDLM_FL_SKIPPED 0x200000000ULL +/* this lock is being destroyed */ +#define LDLM_FL_CBPENDING 0x400000000ULL +/* not a real flag, not saved in lock */ +#define LDLM_FL_WAIT_NOREPROC 0x800000000ULL +/* cancellation callback already run */ +#define LDLM_FL_CANCEL 0x1000000000ULL +#define LDLM_FL_LOCAL_ONLY 0x2000000000ULL +/* don't run the cancel callback under ldlm_cli_cancel_unused */ +#define LDLM_FL_FAILED 0x4000000000ULL +/* lock cancel has already been sent */ +#define LDLM_FL_CANCELING 0x8000000000ULL +/* local lock (ie, no srv/cli split) */ +#define LDLM_FL_LOCAL 0x10000000000ULL +/* XXX FIXME: This is being added to b_size as a low-risk fix to the fact that + * the LVB filling happens _after_ the lock has been granted, so another thread + * can match it before the LVB has been updated. As a dirty hack, we set + * LDLM_FL_LVB_READY only after we've done the LVB poop. + * this is only needed on LOV/OSC now, where LVB is actually used and callers + * must set it in input flags. + * + * The proper fix is to do the granting inside of the completion AST, which can + * be replaced with a LVB-aware wrapping function for OSC locks. That change is + * pretty high-risk, though, and would need a lot more testing. */ +#define LDLM_FL_LVB_READY 0x20000000000ULL +/* A lock contributes to the known minimum size (KMS) calculation until it has + * finished the part of its cancelation that performs write back on its dirty + * pages. It can remain on the granted list during this whole time. Threads + * racing to update the KMS after performing their writeback need to know to + * exclude each other's locks from the calculation as they walk the granted + * list. 
*/ +#define LDLM_FL_KMS_IGNORE 0x40000000000ULL +/* completion AST to be executed */ +#define LDLM_FL_CP_REQD 0x80000000000ULL +/* cleanup_resource has already handled the lock */ +#define LDLM_FL_CLEANED 0x100000000000ULL +/* optimization hint: LDLM can run blocking callback from current context + * w/o involving separate thread. in order to decrease cs rate */ +#define LDLM_FL_ATOMIC_CB 0x200000000000ULL + +/* It may happen that a client initiates two operations, e.g. unlink and + * mkdir, such that the server sends a blocking AST for conflicting + * locks to this client for the first operation, whereas the second + * operation has canceled this lock and is waiting for rpc_lock which is + * taken by the first operation. LDLM_FL_BL_AST is set by + * ldlm_callback_handler() in the lock to prevent the Early Lock Cancel + * (ELC) code from cancelling it. + * + * LDLM_FL_BL_DONE is to be set by ldlm_cancel_callback() when lock + * cache is dropped to let ldlm_callback_handler() return EINVAL to the + * server. It is used when ELC RPC is already prepared and is waiting + * for rpc_lock, too late to send a separate CANCEL RPC. */ +#define LDLM_FL_BL_AST 0x400000000000ULL +#define LDLM_FL_BL_DONE 0x800000000000ULL +/* Don't put lock into the LRU list, so that it is not canceled due to aging. + * Used by MGC locks, they are cancelled only at unmount or by callback. */ +#define LDLM_FL_NO_LRU 0x1000000000000ULL + +/** + * The blocking callback is overloaded to perform two functions. These flags + * indicate which operation should be performed. + */ +#define LDLM_CB_BLOCKING 1 +#define LDLM_CB_CANCELING 2 + +/** + * \name Lock Compatibility Matrix. + * + * A lock has both a type (extent, flock, inode bits, or plain) and a mode. + * Lock types are described in their respective implementation files: + * ldlm_{extent,flock,inodebits,plain}.c. + * + * There are six lock modes along with a compatibility matrix to indicate if + * two locks are compatible. 
+ * + * - EX: Exclusive mode. Before a new file is created, MDS requests EX lock + * on the parent. + * - PW: Protective Write (normal write) mode. When a client requests a write + * lock from an OST, a lock with PW mode will be issued. + * - PR: Protective Read (normal read) mode. When a client requests a read from + * an OST, a lock with PR mode will be issued. Also, if the client opens a + * file for execution, it is granted a lock with PR mode. + * - CW: Concurrent Write mode. The type of lock that the MDS grants if a client + * requests a write lock during a file open operation. + * - CR Concurrent Read mode. When a client performs a path lookup, MDS grants + * an inodebit lock with the CR mode on the intermediate path component. + * - NL Null mode. + * + * <PRE> + * NL CR CW PR PW EX + * NL 1 1 1 1 1 1 + * CR 1 1 1 1 1 0 + * CW 1 1 1 0 0 0 + * PR 1 1 0 1 0 0 + * PW 1 1 0 0 0 0 + * EX 1 0 0 0 0 0 + * </PRE> + */ +/** @{ */ +#define LCK_COMPAT_EX LCK_NL +#define LCK_COMPAT_PW (LCK_COMPAT_EX | LCK_CR) +#define LCK_COMPAT_PR (LCK_COMPAT_PW | LCK_PR) +#define LCK_COMPAT_CW (LCK_COMPAT_PW | LCK_CW) +#define LCK_COMPAT_CR (LCK_COMPAT_CW | LCK_PR | LCK_PW) +#define LCK_COMPAT_NL (LCK_COMPAT_CR | LCK_EX | LCK_GROUP) +#define LCK_COMPAT_GROUP (LCK_GROUP | LCK_NL) +#define LCK_COMPAT_COS (LCK_COS) +/** @} Lock Compatibility Matrix */ + +extern ldlm_mode_t lck_compat_array[]; + +static inline void lockmode_verify(ldlm_mode_t mode) +{ + LASSERT(mode > LCK_MINMODE && mode < LCK_MAXMODE); +} + +static inline int lockmode_compat(ldlm_mode_t exist_mode, ldlm_mode_t new_mode) +{ + return (lck_compat_array[exist_mode] & new_mode); +} + +/* + * + * cluster name spaces + * + */ + +#define DLM_OST_NAMESPACE 1 +#define DLM_MDS_NAMESPACE 2 + +/* XXX + - do we just separate this by security domains and use a prefix for + multiple namespaces in the same domain? 
+ - +*/ + +/** + * Locking rules for LDLM: + * + * lr_lock + * + * lr_lock + * waiting_locks_spinlock + * + * lr_lock + * led_lock + * + * lr_lock + * ns_lock + * + * lr_lvb_mutex + * lr_lock + * + */ + +struct ldlm_pool; +struct ldlm_lock; +struct ldlm_resource; +struct ldlm_namespace; + +/** + * Operations on LDLM pools. + * LDLM pool is a pool of locks in the namespace without any implicitly + * specified limits. + * Locks in the pool are organized in LRU. + * Local memory pressure or server instructions (e.g. mempressure on server) + * can trigger freeing of locks from the pool + */ +struct ldlm_pool_ops { + /** Recalculate pool \a pl usage */ + int (*po_recalc)(struct ldlm_pool *pl); + /** Cancel at least \a nr locks from pool \a pl */ + int (*po_shrink)(struct ldlm_pool *pl, int nr, + unsigned int gfp_mask); + int (*po_setup)(struct ldlm_pool *pl, int limit); +}; + +/** One second for pools thread check interval. Each pool has own period. */ +#define LDLM_POOLS_THREAD_PERIOD (1) + +/** ~6% margin for modest pools. See ldlm_pool.c for details. */ +#define LDLM_POOLS_MODEST_MARGIN_SHIFT (4) + +/** Default recalc period for server side pools in sec. */ +#define LDLM_POOL_SRV_DEF_RECALC_PERIOD (1) + +/** Default recalc period for client side pools in sec. */ +#define LDLM_POOL_CLI_DEF_RECALC_PERIOD (10) + +/** + * LDLM pool structure to track granted locks. + * For purposes of determining when to release locks on e.g. memory pressure. + * This feature is commonly referred to as lru_resize. + */ +struct ldlm_pool { + /** Pool proc directory. */ + proc_dir_entry_t *pl_proc_dir; + /** Pool name, must be long enough to hold compound proc entry name. */ + char pl_name[100]; + /** Lock for protecting SLV/CLV updates. */ + spinlock_t pl_lock; + /** Number of allowed locks in pool, both client and server side. */ + atomic_t pl_limit; + /** Number of granted locks in pool. */ + atomic_t pl_granted; + /** Grant rate per T.
*/ + atomic_t pl_grant_rate; + /** Cancel rate per T. */ + atomic_t pl_cancel_rate; + /** Server lock volume (SLV). Protected by pl_lock. */ + __u64 pl_server_lock_volume; + /** Current biggest client lock volume. Protected by pl_lock. */ + __u64 pl_client_lock_volume; + /** Lock volume factor. SLV on client is calculated as following: + * server_slv * lock_volume_factor. */ + atomic_t pl_lock_volume_factor; + /** Time when last SLV from server was obtained. */ + time_t pl_recalc_time; + /** Recalculation period for pool. */ + time_t pl_recalc_period; + /** Recalculation and shrink operations. */ + struct ldlm_pool_ops *pl_ops; + /** Number of planned locks for next period. */ + int pl_grant_plan; + /** Pool statistics. */ + struct lprocfs_stats *pl_stats; +}; + +typedef int (*ldlm_res_policy)(struct ldlm_namespace *, struct ldlm_lock **, + void *req_cookie, ldlm_mode_t mode, __u64 flags, + void *data); + +typedef int (*ldlm_cancel_for_recovery)(struct ldlm_lock *lock); + +/** + * LVB operations. + * LVB is Lock Value Block. This is a special opaque (to LDLM) value that could + * be associated with an LDLM lock and transferred from client to server and + * back. + * + * Currently LVBs are used by: + * - OSC-OST code to maintain current object size/times + * - layout lock code to return the layout when the layout lock is granted + */ +struct ldlm_valblock_ops { + int (*lvbo_init)(struct ldlm_resource *res); + int (*lvbo_update)(struct ldlm_resource *res, + struct ptlrpc_request *r, + int increase); + int (*lvbo_free)(struct ldlm_resource *res); + /* Return size of lvb data appropriate RPC size can be reserved */ + int (*lvbo_size)(struct ldlm_lock *lock); + /* Called to fill in lvb data to RPC buffer @buf */ + int (*lvbo_fill)(struct ldlm_lock *lock, void *buf, int buflen); +}; + +/** + * LDLM pools related, type of lock pool in the namespace. 
+ * Greedy means release cached locks aggressively + */ +typedef enum { + LDLM_NAMESPACE_GREEDY = 1 << 0, + LDLM_NAMESPACE_MODEST = 1 << 1 +} ldlm_appetite_t; + +/** + * Default values for the "max_nolock_size", "contention_time" and + * "contended_locks" namespace tunables. + */ +#define NS_DEFAULT_MAX_NOLOCK_BYTES 0 +#define NS_DEFAULT_CONTENTION_SECONDS 2 +#define NS_DEFAULT_CONTENDED_LOCKS 32 + +struct ldlm_ns_bucket { + /** back pointer to namespace */ + struct ldlm_namespace *nsb_namespace; + /** + * Estimated lock callback time. Used by adaptive timeout code to + * avoid spurious client evictions due to unresponsiveness when in + * fact the network or overall system load is at fault + */ + struct adaptive_timeout nsb_at_estimate; +}; + +enum { + /** LDLM namespace lock stats */ + LDLM_NSS_LOCKS = 0, + LDLM_NSS_LAST +}; + +typedef enum { + /** invalid type */ + LDLM_NS_TYPE_UNKNOWN = 0, + /** mdc namespace */ + LDLM_NS_TYPE_MDC, + /** mds namespace */ + LDLM_NS_TYPE_MDT, + /** osc namespace */ + LDLM_NS_TYPE_OSC, + /** ost namespace */ + LDLM_NS_TYPE_OST, + /** mgc namespace */ + LDLM_NS_TYPE_MGC, + /** mgs namespace */ + LDLM_NS_TYPE_MGT, +} ldlm_ns_type_t; + +/** + * LDLM Namespace. + * + * Namespace serves to contain locks related to a particular service. + * There are two kinds of namespaces: + * - Server namespace has knowledge of all locks and is therefore authoritative + * to make decisions like what locks could be granted and what conflicts + * exist during new lock enqueue. + * - Client namespace only has limited knowledge about locks in the namespace, + * only seeing locks held by the client. + * + * Every Lustre service has one server namespace present on the server serving + * that service. Every client connected to the service has a client namespace + * for it. + * Every lock obtained by client in that namespace is actually represented by + * two in-memory locks. One on the server and one on the client.
The locks are + * linked by a special cookie by which one node can tell to the other which lock + * it actually means during communications. Such locks are called remote locks. + * The locks held by server only without any reference to a client are called + * local locks. + */ +struct ldlm_namespace { + /** Backward link to OBD, required for LDLM pool to store new SLV. */ + struct obd_device *ns_obd; + + /** Flag indicating if namespace is on client instead of server */ + ldlm_side_t ns_client; + + /** Resource hash table for namespace. */ + cfs_hash_t *ns_rs_hash; + + /** serialize */ + spinlock_t ns_lock; + + /** big refcount (by bucket) */ + atomic_t ns_bref; + + /** + * Namespace connect flags supported by server (may be changed via + * /proc, LRU resize may be disabled/enabled). + */ + __u64 ns_connect_flags; + + /** Client side original connect flags supported by server. */ + __u64 ns_orig_connect_flags; + + /* namespace proc dir entry */ + struct proc_dir_entry *ns_proc_dir_entry; + + /** + * Position in global namespace list linking all namespaces on + * the node. + */ + struct list_head ns_list_chain; + + /** + * List of unused locks for this namespace. This list is also called + * LRU lock list. + * Unused locks are locks with zero reader/writer reference counts. + * This list is only used on clients for lock caching purposes. + * When we want to release some locks voluntarily or if server wants + * us to release some locks due to e.g. memory pressure, we take locks + * to release from the head of this list. + * Locks are linked via l_lru field in \see struct ldlm_lock. + */ + struct list_head ns_unused_list; + /** Number of locks in the LRU list above */ + int ns_nr_unused; + + /** + * Maximum number of locks permitted in the LRU. If 0, means locks + * are managed by pools and there is no preset limit, rather it is all + * controlled by available memory on this client and on server. 
+ */ + unsigned int ns_max_unused; + /** Maximum allowed age (last used time) for locks in the LRU */ + unsigned int ns_max_age; + /** + * Server only: number of times we evicted clients due to lack of reply + * to ASTs. + */ + unsigned int ns_timeouts; + /** + * Number of seconds since the file change time after which the + * MDT will return an UPDATE lock along with a LOOKUP lock. + * This allows the client to start caching negative dentries + * for a directory and may save an RPC for a later stat. + */ + unsigned int ns_ctime_age_limit; + + /** + * Used to rate-limit ldlm_namespace_dump calls. + * \see ldlm_namespace_dump. Increased by 10 seconds every time + * it is called. + */ + cfs_time_t ns_next_dump; + + /** "policy" function that does actual lock conflict determination */ + ldlm_res_policy ns_policy; + + /** + * LVB operations for this namespace. + * \see struct ldlm_valblock_ops + */ + struct ldlm_valblock_ops *ns_lvbo; + + /** + * Used by filter code to store pointer to OBD of the service. + * Should be dropped in favor of \a ns_obd + */ + void *ns_lvbp; + + /** + * Wait queue used by __ldlm_namespace_free. Gets woken up every time + * a resource is removed. + */ + wait_queue_head_t ns_waitq; + /** LDLM pool structure for this namespace */ + struct ldlm_pool ns_pool; + /** Definition of how eagerly unused locks will be released from LRU */ + ldlm_appetite_t ns_appetite; + + /** + * If more than \a ns_contended_locks are found, the resource is + * considered to be contended. Lock enqueues might specify that no + * contended locks should be granted + */ + unsigned ns_contended_locks; + + /** + * The resources in this namespace remember contended state during + * \a ns_contention_time, in seconds. + */ + unsigned ns_contention_time; + + /** + * Limit size of contended extent locks, in bytes. + * If extended lock is requested for more then this many bytes and + * caller instructs us not to grant contended locks, we would disregard + * such a request. 
+ */ + unsigned ns_max_nolock_size; + + /** Limit of parallel AST RPC count. */ + unsigned ns_max_parallel_ast; + + /** Callback to cancel locks before replaying it during recovery. */ + ldlm_cancel_for_recovery ns_cancel_for_recovery; + + /** LDLM lock stats */ + struct lprocfs_stats *ns_stats; + + /** + * Flag to indicate namespace is being freed. Used to determine if + * recalculation of LDLM pool statistics should be skipped. + */ + unsigned ns_stopping:1; +}; + +/** + * Returns 1 if namespace \a ns is a client namespace. + */ +static inline int ns_is_client(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | + LDLM_NAMESPACE_SERVER))); + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || + ns->ns_client == LDLM_NAMESPACE_SERVER); + return ns->ns_client == LDLM_NAMESPACE_CLIENT; +} + +/** + * Returns 1 if namespace \a ns is a server namespace. + */ +static inline int ns_is_server(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + LASSERT(!(ns->ns_client & ~(LDLM_NAMESPACE_CLIENT | + LDLM_NAMESPACE_SERVER))); + LASSERT(ns->ns_client == LDLM_NAMESPACE_CLIENT || + ns->ns_client == LDLM_NAMESPACE_SERVER); + return ns->ns_client == LDLM_NAMESPACE_SERVER; +} + +/** + * Returns 1 if namespace \a ns supports early lock cancel (ELC). + */ +static inline int ns_connect_cancelset(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_CANCELSET); +} + +/** + * Returns 1 if this namespace supports lru_resize. + */ +static inline int ns_connect_lru_resize(struct ldlm_namespace *ns) +{ + LASSERT(ns != NULL); + return !!(ns->ns_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline void ns_register_cancel(struct ldlm_namespace *ns, + ldlm_cancel_for_recovery arg) +{ + LASSERT(ns != NULL); + ns->ns_cancel_for_recovery = arg; +} + +struct ldlm_lock; + +/** Type for blocking callback function of a lock. 
*/ +typedef int (*ldlm_blocking_callback)(struct ldlm_lock *lock, + struct ldlm_lock_desc *new, void *data, + int flag); +/** Type for completion callback function of a lock. */ +typedef int (*ldlm_completion_callback)(struct ldlm_lock *lock, __u64 flags, + void *data); +/** Type for glimpse callback function of a lock. */ +typedef int (*ldlm_glimpse_callback)(struct ldlm_lock *lock, void *data); +/** Type for weight callback function of a lock. */ +typedef unsigned long (*ldlm_weigh_callback)(struct ldlm_lock *lock); + +/** Work list for sending GL ASTs to multiple locks. */ +struct ldlm_glimpse_work { + struct ldlm_lock *gl_lock; /* lock to glimpse */ + struct list_head gl_list; /* linkage to other gl work structs */ + __u32 gl_flags;/* see LDLM_GL_WORK_* below */ + union ldlm_gl_desc *gl_desc; /* glimpse descriptor to be packed in + * glimpse callback request */ +}; + +/** The ldlm_glimpse_work is allocated on the stack and should not be freed. */ +#define LDLM_GL_WORK_NOFREE 0x1 + +/** Interval node data for each LDLM_EXTENT lock. */ +struct ldlm_interval { + struct interval_node li_node; /* node for tree management */ + struct list_head li_group; /* the locks which have the same + * policy - group of the policy */ +}; +#define to_ldlm_interval(n) container_of(n, struct ldlm_interval, li_node) + +/** + * Interval tree for extent locks. + * The interval tree must be accessed under the resource lock. + * Interval trees are used for granted extent locks to speed up conflicts + * lookup. See ldlm/interval_tree.c for more details. + */ +struct ldlm_interval_tree { + /** Tree size. */ + int lit_size; + ldlm_mode_t lit_mode; /* lock mode */ + struct interval_node *lit_root; /* actual ldlm_interval */ +}; + +/** Whether to track references to exports by LDLM locks. */ +#define LUSTRE_TRACKS_LOCK_EXP_REFS (0) + +/** Cancel flags. */ +typedef enum { + LCF_ASYNC = 0x1, /* Cancel locks asynchronously. 
*/ + LCF_LOCAL = 0x2, /* Cancel locks locally, not notifing server */ + LCF_BL_AST = 0x4, /* Cancel locks marked as LDLM_FL_BL_AST + * in the same RPC */ +} ldlm_cancel_flags_t; + +struct ldlm_flock { + __u64 start; + __u64 end; + __u64 owner; + __u64 blocking_owner; + struct obd_export *blocking_export; + /* Protected by the hash lock */ + __u32 blocking_refs; + __u32 pid; +}; + +typedef union { + struct ldlm_extent l_extent; + struct ldlm_flock l_flock; + struct ldlm_inodebits l_inodebits; +} ldlm_policy_data_t; + +void ldlm_convert_policy_to_wire(ldlm_type_t type, + const ldlm_policy_data_t *lpolicy, + ldlm_wire_policy_data_t *wpolicy); +void ldlm_convert_policy_to_local(struct obd_export *exp, ldlm_type_t type, + const ldlm_wire_policy_data_t *wpolicy, + ldlm_policy_data_t *lpolicy); + +enum lvb_type { + LVB_T_NONE = 0, + LVB_T_OST = 1, + LVB_T_LQUOTA = 2, + LVB_T_LAYOUT = 3, +}; + +/** + * LDLM lock structure + * + * Represents a single LDLM lock and its state in memory. Each lock is + * associated with a single ldlm_resource, the object which is being + * locked. There may be multiple ldlm_locks on a single resource, + * depending on the lock type and whether the locks are conflicting or + * not. + */ +struct ldlm_lock { + /** + * Local lock handle. + * When remote side wants to tell us about a lock, they address + * it by this opaque handle. The handle does not hold a + * reference on the ldlm_lock, so it can be safely passed to + * other threads or nodes. When the lock needs to be accessed + * from the handle, it is looked up again in the lock table, and + * may no longer exist. + * + * Must be first in the structure. + */ + struct portals_handle l_handle; + /** + * Lock reference count. + * This is how many users have pointers to actual structure, so that + * we do not accidentally free lock structure that is in use. + */ + atomic_t l_refc; + /** + * Internal spinlock protects l_resource. We should hold this lock + * first before taking res_lock. 
+ */ + spinlock_t l_lock; + /** + * Pointer to actual resource this lock is in. + * ldlm_lock_change_resource() can change this. + */ + struct ldlm_resource *l_resource; + /** + * List item for client side LRU list. + * Protected by ns_lock in struct ldlm_namespace. + */ + struct list_head l_lru; + /** + * Linkage to resource's lock queues according to current lock state. + * (could be granted, waiting or converting) + * Protected by lr_lock in struct ldlm_resource. + */ + struct list_head l_res_link; + /** + * Tree node for ldlm_extent. + */ + struct ldlm_interval *l_tree_node; + /** + * Per export hash of locks. + * Protected by per-bucket exp->exp_lock_hash locks. + */ + struct hlist_node l_exp_hash; + /** + * Per export hash of flock locks. + * Protected by per-bucket exp->exp_flock_hash locks. + */ + struct hlist_node l_exp_flock_hash; + /** + * Requested mode. + * Protected by lr_lock. + */ + ldlm_mode_t l_req_mode; + /** + * Granted mode, also protected by lr_lock. + */ + ldlm_mode_t l_granted_mode; + /** Lock completion handler pointer. Called when lock is granted. */ + ldlm_completion_callback l_completion_ast; + /** + * Lock blocking AST handler pointer. + * It plays two roles: + * - as a notification of an attempt to queue a conflicting lock (once) + * - as a notification when the lock is being cancelled. + * + * As such it's typically called twice: once for the initial conflict + * and then once more when the last user went away and the lock is + * cancelled (could happen recursively). + */ + ldlm_blocking_callback l_blocking_ast; + /** + * Lock glimpse handler. + * Glimpse handler is used to obtain LVB updates from a client by + * server + */ + ldlm_glimpse_callback l_glimpse_ast; + + /** XXX apparently unused "weight" handler. To be removed? */ + ldlm_weigh_callback l_weigh_ast; + + /** + * Lock export. + * This is a pointer to actual client export for locks that were granted + * to clients. Used server-side. 
+ */ + struct obd_export *l_export; + /** + * Lock connection export. + * Pointer to server export on a client. + */ + struct obd_export *l_conn_export; + + /** + * Remote lock handle. + * If the lock is remote, this is the handle of the other side lock + * (l_handle) + */ + struct lustre_handle l_remote_handle; + + /** + * Representation of private data specific for a lock type. + * Examples are: extent range for extent lock or bitmask for ibits locks + */ + ldlm_policy_data_t l_policy_data; + + /** + * Lock state flags. + * Like whenever we receive any blocking requests for this lock, etc. + * Protected by lr_lock. + */ + __u64 l_flags; + /** + * Lock r/w usage counters. + * Protected by lr_lock. + */ + __u32 l_readers; + __u32 l_writers; + /** + * If the lock is granted, a process sleeps on this waitq to learn when + * it's no longer in use. If the lock is not granted, a process sleeps + * on this waitq to learn when it becomes granted. + */ + wait_queue_head_t l_waitq; + + /** + * Seconds. It will be updated if there is any activity related to + * the lock, e.g. enqueue the lock or send blocking AST. + */ + cfs_time_t l_last_activity; + + /** + * Time last used by e.g. being matched by lock match. + * Jiffies. Should be converted to time if needed. + */ + cfs_time_t l_last_used; + + /** Originally requested extent for the extent lock. */ + struct ldlm_extent l_req_extent; + + unsigned int l_failed:1, + /** + * Set for locks that were removed from class hash table and will be + * destroyed when last reference to them is released. Set by + * ldlm_lock_destroy_internal(). + * + * Protected by lock and resource locks. + */ + l_destroyed:1, + /* + * it's set in lock_res_and_lock() and unset in unlock_res_and_lock(). + * + * NB: compared with check_res_locked(), checking this bit is cheaper. 
+ * Also, spin_is_locked() is deprecated for kernel code; one reason is + * because it works only for SMP so user needs to add extra macros like + * LASSERT_SPIN_LOCKED for uniprocessor kernels. + */ + l_res_locked:1, + /* + * It's set once we call ldlm_add_waiting_lock_res_locked() + * to start the lock-timeout timer and it will never be reset. + * + * Protected by lock_res_and_lock(). + */ + l_waited:1, + /** Flag whether this is a server namespace lock. */ + l_ns_srv:1; + + /* + * Client-side-only members. + */ + + enum lvb_type l_lvb_type; + + /** + * Temporary storage for a LVB received during an enqueue operation. + */ + __u32 l_lvb_len; + void *l_lvb_data; + + /** Private storage for lock user. Opaque to LDLM. */ + void *l_ast_data; + + /* + * Server-side-only members. + */ + + /** + * Connection cookie for the client originating the operation. + * Used by Commit on Share (COS) code. Currently only used for + * inodebits locks on MDS. + */ + __u64 l_client_cookie; + + /** + * List item for locks waiting for cancellation from clients. + * The lists this could be linked into are: + * waiting_locks_list (protected by waiting_locks_spinlock), + * then if the lock timed out, it is moved to + * expired_lock_thread.elt_expired_locks for further processing. + * Protected by elt_lock. + */ + struct list_head l_pending_chain; + + /** + * Set when lock is sent a blocking AST. Time in seconds when timeout + * is reached and client holding this lock could be evicted. + * This timeout could be further extended by e.g. certain IO activity + * under this lock. + * \see ost_rw_prolong_locks + */ + cfs_time_t l_callback_timeout; + + /** Local PID of process which created this lock. */ + __u32 l_pid; + + /** + * Number of times blocking AST was sent for this lock. + * This is for debugging. Valid values are 0 and 1, if there is an + * attempt to send blocking AST more than once, an assertion would be + * hit. 
\see ldlm_work_bl_ast_lock + */ + int l_bl_ast_run; + /** List item ldlm_add_ast_work_item() for case of blocking ASTs. */ + struct list_head l_bl_ast; + /** List item ldlm_add_ast_work_item() for case of completion ASTs. */ + struct list_head l_cp_ast; + /** For ldlm_add_ast_work_item() for "revoke" AST used in COS. */ + struct list_head l_rk_ast; + + /** + * Pointer to a conflicting lock that caused blocking AST to be sent + * for this lock + */ + struct ldlm_lock *l_blocking_lock; + + /** + * Protected by lr_lock, linkages to "skip lists". + * For more explanations of skip lists see ldlm/ldlm_inodebits.c + */ + struct list_head l_sl_mode; + struct list_head l_sl_policy; + + /** Reference tracking structure to debug leaked locks. */ + struct lu_ref l_reference; +#if LUSTRE_TRACKS_LOCK_EXP_REFS + /* Debugging stuff for bug 20498, for tracking export references. */ + /** number of export references taken */ + int l_exp_refs_nr; + /** link all locks referencing one export */ + struct list_head l_exp_refs_link; + /** referenced export object */ + struct obd_export *l_exp_refs_target; +#endif + /** + * export blocking dlm lock list, protected by + * l_export->exp_bl_list_lock. + * Lock order of waiting_lists_spinlock, exp_bl_list_lock and res lock + * is: res lock -> exp_bl_list_lock -> wanting_lists_spinlock. + */ + struct list_head l_exp_list; +}; + +/** + * LDLM resource description. + * Basically, resource is a representation for a single object. + * Object has a name which is currently 4 64-bit integers. LDLM user is + * responsible for creation of a mapping between objects it wants to be + * protected and resource names. + * + * A resource can only hold locks of a single lock type, though there may be + * multiple ldlm_locks on a single resource, depending on the lock type and + * whether the locks are conflicting or not. + */ +struct ldlm_resource { + struct ldlm_ns_bucket *lr_ns_bucket; + + /** + * List item for list in namespace hash. 
+ * protected by ns_lock + */ + struct hlist_node lr_hash; + + /** Spinlock to protect locks under this resource. */ + spinlock_t lr_lock; + + /** + * protected by lr_lock + * @{ */ + /** List of locks in granted state */ + struct list_head lr_granted; + /** List of locks waiting to change their granted mode (converted) */ + struct list_head lr_converting; + /** + * List of locks that could not be granted due to conflicts and + * that are waiting for conflicts to go away */ + struct list_head lr_waiting; + /** @} */ + + /* XXX No longer needed? Remove ASAP */ + ldlm_mode_t lr_most_restr; + + /** Type of locks this resource can hold. Only one type per resource. */ + ldlm_type_t lr_type; /* LDLM_{PLAIN,EXTENT,FLOCK,IBITS} */ + + /** Resource name */ + struct ldlm_res_id lr_name; + /** Reference count for this resource */ + atomic_t lr_refcount; + + /** + * Interval trees (only for extent locks) for all modes of this resource + */ + struct ldlm_interval_tree lr_itree[LCK_MODE_NUM]; + + /** + * Server-side-only lock value block elements. + * To serialize lvbo_init. + */ + struct mutex lr_lvb_mutex; + int lr_lvb_len; + /** protected by lr_lock */ + void *lr_lvb_data; + + /** When the resource was considered as contended. */ + cfs_time_t lr_contention_time; + /** List of references to this resource. For debugging. 
*/ + struct lu_ref lr_reference; + + struct inode *lr_lvb_inode; +}; + +static inline bool ldlm_has_layout(struct ldlm_lock *lock) +{ + return lock->l_resource->lr_type == LDLM_IBITS && + lock->l_policy_data.l_inodebits.bits & MDS_INODELOCK_LAYOUT; +} + +static inline char * +ldlm_ns_name(struct ldlm_namespace *ns) +{ + return ns->ns_rs_hash->hs_name; +} + +static inline struct ldlm_namespace * +ldlm_res_to_ns(struct ldlm_resource *res) +{ + return res->lr_ns_bucket->nsb_namespace; +} + +static inline struct ldlm_namespace * +ldlm_lock_to_ns(struct ldlm_lock *lock) +{ + return ldlm_res_to_ns(lock->l_resource); +} + +static inline char * +ldlm_lock_to_ns_name(struct ldlm_lock *lock) +{ + return ldlm_ns_name(ldlm_lock_to_ns(lock)); +} + +static inline struct adaptive_timeout * +ldlm_lock_to_ns_at(struct ldlm_lock *lock) +{ + return &lock->l_resource->lr_ns_bucket->nsb_at_estimate; +} + +static inline int ldlm_lvbo_init(struct ldlm_resource *res) +{ + struct ldlm_namespace *ns = ldlm_res_to_ns(res); + + if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_init != NULL) + return ns->ns_lvbo->lvbo_init(res); + + return 0; +} + +static inline int ldlm_lvbo_size(struct ldlm_lock *lock) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + if (ns->ns_lvbo != NULL && ns->ns_lvbo->lvbo_size != NULL) + return ns->ns_lvbo->lvbo_size(lock); + + return 0; +} + +static inline int ldlm_lvbo_fill(struct ldlm_lock *lock, void *buf, int len) +{ + struct ldlm_namespace *ns = ldlm_lock_to_ns(lock); + + if (ns->ns_lvbo != NULL) { + LASSERT(ns->ns_lvbo->lvbo_fill != NULL); + return ns->ns_lvbo->lvbo_fill(lock, buf, len); + } + return 0; +} + +struct ldlm_ast_work { + struct ldlm_lock *w_lock; + int w_blocking; + struct ldlm_lock_desc w_desc; + struct list_head w_list; + int w_flags; + void *w_data; + int w_datalen; +}; + +/** + * Common ldlm_enqueue parameters + */ +struct ldlm_enqueue_info { + __u32 ei_type; /** Type of the lock being enqueued. 
*/ + __u32 ei_mode; /** Mode of the lock being enqueued. */ + void *ei_cb_bl; /** blocking lock callback */ + void *ei_cb_cp; /** lock completion callback */ + void *ei_cb_gl; /** lock glimpse callback */ + void *ei_cb_wg; /** lock weigh callback */ + void *ei_cbdata; /** Data to be passed into callbacks. */ +}; + +extern struct obd_ops ldlm_obd_ops; + +extern char *ldlm_lockname[]; +extern char *ldlm_typename[]; +extern char *ldlm_it2str(int it); + +/** + * Just a fancy CDEBUG call with log level preset to LDLM_DEBUG. + * For the cases where we do not have actual lock to print along + * with a debugging message that is ldlm-related + */ +#define LDLM_DEBUG_NOLOCK(format, a...) \ + CDEBUG(D_DLMTRACE, "### " format "\n" , ##a) + +/** + * Support function for lock information printing into debug logs. + * \see LDLM_DEBUG + */ +#define ldlm_lock_debug(msgdata, mask, cdls, lock, fmt, a...) do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _ldlm_lock_debug(lock, msgdata, fmt, ##a); \ +} while(0) + +void _ldlm_lock_debug(struct ldlm_lock *lock, + struct libcfs_debug_msg_data *data, + const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Rate-limited version of lock printing function. + */ +#define LDLM_DEBUG_LIMIT(mask, lock, fmt, a...) do { \ + static cfs_debug_limit_state_t _ldlm_cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, mask, &_ldlm_cdls); \ + ldlm_lock_debug(&msgdata, mask, &_ldlm_cdls, lock, "### " fmt , ##a);\ +} while (0) + +#define LDLM_ERROR(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_ERROR, lock, fmt, ## a) +#define LDLM_WARN(lock, fmt, a...) LDLM_DEBUG_LIMIT(D_WARNING, lock, fmt, ## a) + +/** Non-rate-limited lock printing function for debugging purposes. */ +#define LDLM_DEBUG(lock, fmt, a...) 
do { \ + if (likely(lock != NULL)) { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, D_DLMTRACE, NULL); \ + ldlm_lock_debug(&msgdata, D_DLMTRACE, NULL, lock, \ + "### " fmt , ##a); \ + } else { \ + LDLM_DEBUG_NOLOCK("no dlm lock: " fmt, ##a); \ + } \ +} while (0) + +typedef int (*ldlm_processing_policy)(struct ldlm_lock *lock, __u64 *flags, + int first_enq, ldlm_error_t *err, + struct list_head *work_list); + +/** + * Return values for lock iterators. + * Also used during deciding of lock grants and cancellations. + */ +#define LDLM_ITER_CONTINUE 1 /* keep iterating */ +#define LDLM_ITER_STOP 2 /* stop iterating */ + +typedef int (*ldlm_iterator_t)(struct ldlm_lock *, void *); +typedef int (*ldlm_res_iterator_t)(struct ldlm_resource *, void *); + +/** \defgroup ldlm_iterator Lock iterators + * + * LDLM provides for a way to iterate through every lock on a resource or + * namespace or every resource in a namespace. + * @{ */ +int ldlm_resource_foreach(struct ldlm_resource *res, ldlm_iterator_t iter, + void *closure); +void ldlm_namespace_foreach(struct ldlm_namespace *ns, ldlm_iterator_t iter, + void *closure); +int ldlm_resource_iterate(struct ldlm_namespace *, const struct ldlm_res_id *, + ldlm_iterator_t iter, void *data); +/** @} ldlm_iterator */ + +int ldlm_replay_locks(struct obd_import *imp); + +/* ldlm_flock.c */ +int ldlm_flock_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); + +/* ldlm_extent.c */ +__u64 ldlm_extent_shift_kms(struct ldlm_lock *lock, __u64 old_kms); + +struct ldlm_callback_suite { + ldlm_completion_callback lcs_completion; + ldlm_blocking_callback lcs_blocking; + ldlm_glimpse_callback lcs_glimpse; + ldlm_weigh_callback lcs_weigh; +}; + +/* ldlm_lockd.c */ +int ldlm_del_waiting_lock(struct ldlm_lock *lock); +int ldlm_refresh_waiting_lock(struct ldlm_lock *lock, int timeout); +int ldlm_get_ref(void); +void ldlm_put_ref(void); +int ldlm_init_export(struct obd_export *exp); +void ldlm_destroy_export(struct obd_export *exp); +struct 
ldlm_lock *ldlm_request_lock(struct ptlrpc_request *req); + +/* ldlm_lock.c */ +void ldlm_register_intent(struct ldlm_namespace *ns, ldlm_res_policy arg); +void ldlm_lock2handle(const struct ldlm_lock *lock, + struct lustre_handle *lockh); +struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, __u64 flags); +void ldlm_cancel_callback(struct ldlm_lock *); +int ldlm_lock_remove_from_lru(struct ldlm_lock *); +int ldlm_lock_set_data(struct lustre_handle *, void *); + +/** + * Obtain a lock reference by its handle. + */ +static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h) +{ + return __ldlm_handle2lock(h, 0); +} + +#define LDLM_LOCK_REF_DEL(lock) \ + lu_ref_del(&lock->l_reference, "handle", current) + +static inline struct ldlm_lock * +ldlm_handle2lock_long(const struct lustre_handle *h, __u64 flags) +{ + struct ldlm_lock *lock; + + lock = __ldlm_handle2lock(h, flags); + if (lock != NULL) + LDLM_LOCK_REF_DEL(lock); + return lock; +} + +/** + * Update Lock Value Block Operations (LVBO) on a resource taking into account + * data from reqest \a r + */ +static inline int ldlm_res_lvbo_update(struct ldlm_resource *res, + struct ptlrpc_request *r, int increase) +{ + if (ldlm_res_to_ns(res)->ns_lvbo && + ldlm_res_to_ns(res)->ns_lvbo->lvbo_update) { + return ldlm_res_to_ns(res)->ns_lvbo->lvbo_update(res, r, + increase); + } + return 0; +} + +int ldlm_error2errno(ldlm_error_t error); +ldlm_error_t ldlm_errno2error(int err_no); /* don't call it `errno': this + * confuses user-space. */ +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void ldlm_dump_export_locks(struct obd_export *exp); +#endif + +/** + * Release a temporary lock reference obtained by ldlm_handle2lock() or + * __ldlm_handle2lock(). + */ +#define LDLM_LOCK_PUT(lock) \ +do { \ + LDLM_LOCK_REF_DEL(lock); \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +/** + * Release a lock reference obtained by some other means (see + * LDLM_LOCK_PUT()). 
+ */ +#define LDLM_LOCK_RELEASE(lock) \ +do { \ + /*LDLM_DEBUG((lock), "put");*/ \ + ldlm_lock_put(lock); \ +} while (0) + +#define LDLM_LOCK_GET(lock) \ +({ \ + ldlm_lock_get(lock); \ + /*LDLM_DEBUG((lock), "get");*/ \ + lock; \ +}) + +#define ldlm_lock_list_put(head, member, count) \ +({ \ + struct ldlm_lock *_lock, *_next; \ + int c = count; \ + list_for_each_entry_safe(_lock, _next, head, member) { \ + if (c-- == 0) \ + break; \ + list_del_init(&_lock->member); \ + LDLM_LOCK_RELEASE(_lock); \ + } \ + LASSERT(c <= 0); \ +}) + +struct ldlm_lock *ldlm_lock_get(struct ldlm_lock *lock); +void ldlm_lock_put(struct ldlm_lock *lock); +void ldlm_lock_destroy(struct ldlm_lock *lock); +void ldlm_lock2desc(struct ldlm_lock *lock, struct ldlm_lock_desc *desc); +void ldlm_lock_addref(struct lustre_handle *lockh, __u32 mode); +int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode); +void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode); +void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode); +void ldlm_lock_fail_match_locked(struct ldlm_lock *lock); +void ldlm_lock_fail_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match(struct ldlm_lock *lock); +void ldlm_lock_allow_match_locked(struct ldlm_lock *lock); +ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, __u64 flags, + const struct ldlm_res_id *, ldlm_type_t type, + ldlm_policy_data_t *, ldlm_mode_t mode, + struct lustre_handle *, int unref); +ldlm_mode_t ldlm_revalidate_lock_handle(struct lustre_handle *lockh, + __u64 *bits); +struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, + __u32 *flags); +void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode); +void ldlm_lock_cancel(struct ldlm_lock *lock); +void ldlm_reprocess_all(struct ldlm_resource *res); +void ldlm_reprocess_all_ns(struct ldlm_namespace *ns); +void ldlm_lock_dump_handle(int level, struct lustre_handle *); +void ldlm_unlink_lock_skiplist(struct ldlm_lock *req); + +/* resource.c 
*/ +struct ldlm_namespace * +ldlm_namespace_new(struct obd_device *obd, char *name, + ldlm_side_t client, ldlm_appetite_t apt, + ldlm_ns_type_t ns_type); +int ldlm_namespace_cleanup(struct ldlm_namespace *ns, __u64 flags); +void ldlm_namespace_free(struct ldlm_namespace *ns, + struct obd_import *imp, int force); +void ldlm_namespace_register(struct ldlm_namespace *ns, ldlm_side_t client); +void ldlm_namespace_unregister(struct ldlm_namespace *ns, ldlm_side_t client); +void ldlm_namespace_move_locked(struct ldlm_namespace *ns, ldlm_side_t client); +struct ldlm_namespace *ldlm_namespace_first_locked(ldlm_side_t client); +void ldlm_namespace_get(struct ldlm_namespace *ns); +void ldlm_namespace_put(struct ldlm_namespace *ns); +int ldlm_proc_setup(void); +#ifdef LPROCFS +void ldlm_proc_cleanup(void); +#else +static inline void ldlm_proc_cleanup(void) {} +#endif + +/* resource.c - internal */ +struct ldlm_resource *ldlm_resource_get(struct ldlm_namespace *ns, + struct ldlm_resource *parent, + const struct ldlm_res_id *, + ldlm_type_t type, int create); +struct ldlm_resource *ldlm_resource_getref(struct ldlm_resource *res); +int ldlm_resource_putref(struct ldlm_resource *res); +void ldlm_resource_add_lock(struct ldlm_resource *res, + struct list_head *head, + struct ldlm_lock *lock); +void ldlm_resource_unlink_lock(struct ldlm_lock *lock); +void ldlm_res2desc(struct ldlm_resource *res, struct ldlm_resource_desc *desc); +void ldlm_dump_all_namespaces(ldlm_side_t client, int level); +void ldlm_namespace_dump(int level, struct ldlm_namespace *); +void ldlm_resource_dump(int level, struct ldlm_resource *); +int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *, + const struct ldlm_res_id *); + +#define LDLM_RESOURCE_ADDREF(res) do { \ + lu_ref_add_atomic(&(res)->lr_reference, __FUNCTION__, current); \ +} while (0) + +#define LDLM_RESOURCE_DELREF(res) do { \ + lu_ref_del(&(res)->lr_reference, __FUNCTION__, current); \ +} while (0) + +/* ldlm_request.c */ 
+int ldlm_expired_completion_wait(void *data); +/** \defgroup ldlm_local_ast Default AST handlers for local locks + * These AST handlers are typically used for server-side local locks and are + * also used by client-side lock handlers to perform minimum level base + * processing. + * @{ */ +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock); +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag); +int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp); +int ldlm_completion_ast_async(struct ldlm_lock *lock, __u64 flags, void *data); +int ldlm_completion_ast(struct ldlm_lock *lock, __u64 flags, void *data); +/** @} ldlm_local_ast */ + +/** \defgroup ldlm_cli_api API to operate on locks from actual LDLM users. + * These are typically used by client and server (*_local versions) + * to obtain and release locks. + * @{ */ +int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, + struct ldlm_enqueue_info *einfo, + const struct ldlm_res_id *res_id, + ldlm_policy_data_t const *policy, __u64 *flags, + void *lvb, __u32 lvb_len, enum lvb_type lvb_type, + struct lustre_handle *lockh, int async); +int ldlm_prep_enqueue_req(struct obd_export *exp, + struct ptlrpc_request *req, + struct list_head *cancels, + int count); +int ldlm_prep_elc_req(struct obd_export *exp, + struct ptlrpc_request *req, + int version, int opc, int canceloff, + struct list_head *cancels, int count); + +struct ptlrpc_request *ldlm_enqueue_pack(struct obd_export *exp, int lvb_len); +int ldlm_handle_enqueue0(struct ldlm_namespace *ns, struct ptlrpc_request *req, + const struct ldlm_request *dlm_req, + const struct ldlm_callback_suite *cbs); +int ldlm_cli_enqueue_fini(struct obd_export *exp, struct ptlrpc_request *req, + ldlm_type_t type, __u8 with_policy, ldlm_mode_t mode, + __u64 *flags, void *lvb, __u32 lvb_len, + struct lustre_handle *lockh, int rc); +int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, + const struct ldlm_res_id 
*res_id, + ldlm_type_t type, ldlm_policy_data_t *policy, + ldlm_mode_t mode, __u64 *flags, + ldlm_blocking_callback blocking, + ldlm_completion_callback completion, + ldlm_glimpse_callback glimpse, + void *data, __u32 lvb_len, enum lvb_type lvb_type, + const __u64 *client_cookie, + struct lustre_handle *lockh); +int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new, + void *data, __u32 data_len); +int ldlm_cli_convert(struct lustre_handle *, int new_mode, __u32 *flags); +int ldlm_cli_update_pool(struct ptlrpc_request *req); +int ldlm_cli_cancel(struct lustre_handle *lockh, + ldlm_cancel_flags_t cancel_flags); +int ldlm_cli_cancel_unused(struct ldlm_namespace *, const struct ldlm_res_id *, + ldlm_cancel_flags_t flags, void *opaque); +int ldlm_cli_cancel_unused_resource(struct ldlm_namespace *ns, + const struct ldlm_res_id *res_id, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + ldlm_cancel_flags_t flags, + void *opaque); +int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *head, + int count, ldlm_cancel_flags_t flags); +int ldlm_cancel_resource_local(struct ldlm_resource *res, + struct list_head *cancels, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, int lock_flags, + ldlm_cancel_flags_t cancel_flags, void *opaque); +int ldlm_cli_cancel_list_local(struct list_head *cancels, int count, + ldlm_cancel_flags_t flags); +int ldlm_cli_cancel_list(struct list_head *head, int count, + struct ptlrpc_request *req, ldlm_cancel_flags_t flags); +/** @} ldlm_cli_api */ + +/* mds/handler.c */ +/* This has to be here because recursive inclusion sucks. 
*/ +int intent_disposition(struct ldlm_reply *rep, int flag); +void intent_set_disposition(struct ldlm_reply *rep, int flag); + + +/* ioctls for trying requests */ +#define IOC_LDLM_TYPE 'f' +#define IOC_LDLM_MIN_NR 40 + +#define IOC_LDLM_TEST _IOWR('f', 40, long) +#define IOC_LDLM_DUMP _IOWR('f', 41, long) +#define IOC_LDLM_REGRESS_START _IOWR('f', 42, long) +#define IOC_LDLM_REGRESS_STOP _IOWR('f', 43, long) +#define IOC_LDLM_MAX_NR 43 + +/** + * "Modes" of acquiring lock_res, necessary to tell lockdep that taking more + * than one lock_res is dead-lock safe. + */ +enum lock_res_type { + LRT_NORMAL, + LRT_NEW +}; + +/** Lock resource. */ +static inline void lock_res(struct ldlm_resource *res) +{ + spin_lock(&res->lr_lock); +} + +/** Lock resource with a way to instruct lockdep code about nestedness-safe. */ +static inline void lock_res_nested(struct ldlm_resource *res, + enum lock_res_type mode) +{ + spin_lock_nested(&res->lr_lock, mode); +} + +/** Unlock resource. */ +static inline void unlock_res(struct ldlm_resource *res) +{ + spin_unlock(&res->lr_lock); +} + +/** Check if resource is already locked, assert if not. */ +static inline void check_res_locked(struct ldlm_resource *res) +{ + LASSERT(spin_is_locked(&res->lr_lock)); +} + +struct ldlm_resource * lock_res_and_lock(struct ldlm_lock *lock); +void unlock_res_and_lock(struct ldlm_lock *lock); + +/* ldlm_pool.c */ +/** \defgroup ldlm_pools Various LDLM pool related functions + * There are not used outside of ldlm. 
+ * @{ + */ +void ldlm_pools_recalc(ldlm_side_t client); +int ldlm_pools_init(void); +void ldlm_pools_fini(void); + +int ldlm_pool_init(struct ldlm_pool *pl, struct ldlm_namespace *ns, + int idx, ldlm_side_t client); +int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, + unsigned int gfp_mask); +void ldlm_pool_fini(struct ldlm_pool *pl); +int ldlm_pool_setup(struct ldlm_pool *pl, int limit); +int ldlm_pool_recalc(struct ldlm_pool *pl); +__u32 ldlm_pool_get_lvf(struct ldlm_pool *pl); +__u64 ldlm_pool_get_slv(struct ldlm_pool *pl); +__u64 ldlm_pool_get_clv(struct ldlm_pool *pl); +__u32 ldlm_pool_get_limit(struct ldlm_pool *pl); +void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv); +void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv); +void ldlm_pool_set_limit(struct ldlm_pool *pl, __u32 limit); +void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock); +void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock); +/** @} */ + +#endif +/** @} LDLM */ diff --git a/drivers/staging/lustre/lustre/include/lustre_eacl.h b/drivers/staging/lustre/lustre/include/lustre_eacl.h new file mode 100644 index 000000000000..b94f76a3301b --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_eacl.h @@ -0,0 +1,95 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/include/lustre_eacl.h + * + * Extended ACL xattr structures and conversion helpers. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_EACL_H +#define _LUSTRE_EACL_H + +/** \defgroup eacl eacl + * + * @{ + */ + +#ifdef CONFIG_FS_POSIX_ACL + +#include <linux/posix_acl_xattr.h> + +typedef struct { + __u16 e_tag; + __u16 e_perm; + __u32 e_id; + __u32 e_stat; +} ext_acl_xattr_entry; + +typedef struct { + __u32 a_count; + ext_acl_xattr_entry a_entries[0]; +} ext_acl_xattr_header; + +#define CFS_ACL_XATTR_SIZE(count, prefix) \ + (sizeof(prefix ## _header) + (count) * sizeof(prefix ## _entry)) + +#define CFS_ACL_XATTR_COUNT(size, prefix) \ + (((size) - sizeof(prefix ## _header)) / sizeof(prefix ## _entry)) + + +extern ext_acl_xattr_header * +lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size); +extern int +lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, int size, + posix_acl_xattr_header **out); +extern void +lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size); +extern void +lustre_ext_acl_xattr_free(ext_acl_xattr_header *header); +extern int +lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header, + posix_acl_xattr_header **out); +extern ext_acl_xattr_header * 
+lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header); + +#endif /* CONFIG_FS_POSIX_ACL */ + +/** @} eacl */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_export.h b/drivers/staging/lustre/lustre/include/lustre_export.h new file mode 100644 index 000000000000..d61c020a4643 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_export.h @@ -0,0 +1,389 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ +/** \defgroup obd_export PortalRPC export definitions + * + * @{ + */ + +#ifndef __EXPORT_H +#define __EXPORT_H + +/** \defgroup export export + * + * @{ + */ + +#include <lprocfs_status.h> +#include <lustre/lustre_idl.h> +#include <lustre_dlm.h> + +struct mds_client_data; +struct mdt_client_data; +struct mds_idmap_table; +struct mdt_idmap_table; + +/** + * Target-specific export data + */ +struct tg_export_data { + /** Protects ted_lcd below */ + struct mutex ted_lcd_lock; + /** Per-client data for each export */ + struct lsd_client_data *ted_lcd; + /** Offset of record in last_rcvd file */ + loff_t ted_lr_off; + /** Client index in last_rcvd file */ + int ted_lr_idx; +}; + +/** + * MDT-specific export data + */ +struct mdt_export_data { + struct tg_export_data med_ted; + /** List of all files opened by client on this MDT */ + struct list_head med_open_head; + spinlock_t med_open_lock; /* med_open_head, mfd_list */ + /** Bitmask of all ibit locks this MDT understands */ + __u64 med_ibits_known; + struct mutex med_idmap_mutex; + struct lustre_idmap_table *med_idmap; +}; + +struct ec_export_data { /* echo client */ + struct list_head eced_locks; +}; + +/* In-memory access to client data from OST struct */ +/** Filter (oss-side) specific export data */ +struct filter_export_data { + struct tg_export_data fed_ted; + spinlock_t fed_lock; /**< protects fed_mod_list */ + long fed_dirty; /* in bytes */ + long fed_grant; /* in bytes */ + struct list_head fed_mod_list; /* files being modified */ + int fed_mod_count;/* items in fed_mod_list */ + long fed_pending; /* bytes just being written */ + __u32 fed_group; + __u8 fed_pagesize; /* log2 of client page size */ +}; + +struct mgs_export_data { + struct list_head med_clients; /* mgc fs client via this exp */ + spinlock_t med_lock; /* protect med_clients */ +}; + +/** + * per-NID statistics structure. 
+ * It tracks access patterns to this export on a per-client-NID basis + */ +struct nid_stat { + lnet_nid_t nid; + struct hlist_node nid_hash; + struct list_head nid_list; + struct obd_device *nid_obd; + struct proc_dir_entry *nid_proc; + struct lprocfs_stats *nid_stats; + struct lprocfs_stats *nid_ldlm_stats; + atomic_t nid_exp_ref_count; /* for obd_nid_stats_hash + exp_nid_stats */ +}; + +#define nidstat_getref(nidstat) \ +do { \ + atomic_inc(&(nidstat)->nid_exp_ref_count); \ +} while(0) + +#define nidstat_putref(nidstat) \ +do { \ + atomic_dec(&(nidstat)->nid_exp_ref_count); \ + LASSERTF(atomic_read(&(nidstat)->nid_exp_ref_count) >= 0, \ + "stat %p nid_exp_ref_count < 0\n", nidstat); \ +} while(0) + +enum obd_option { + OBD_OPT_FORCE = 0x0001, + OBD_OPT_FAILOVER = 0x0002, + OBD_OPT_ABORT_RECOV = 0x0004, +}; + +/** + * Export structure. Represents target-side of connection in portals. + * Also used in Lustre to connect between layers on the same node when + * there is no network-connection in-between. + * For every connected client there is an export structure on the server + * attached to the same obd device. + */ +struct obd_export { + /** + * Export handle, it's id is provided to client on connect + * Subsequent client RPCs contain this handle id to identify + * what export they are talking to. + */ + struct portals_handle exp_handle; + atomic_t exp_refcount; + /** + * Set of counters below is to track where export references are + * kept. The exp_rpc_count is used for reconnect handling also, + * the cb_count and locks_count are for debug purposes only for now. 
+ * The sum of them should be less than exp_refcount by 3 + */ + atomic_t exp_rpc_count; /* RPC references */ + atomic_t exp_cb_count; /* Commit callback references */ + /** Number of queued replay requests to be processed */ + atomic_t exp_replay_count; + atomic_t exp_locks_count; /** Lock references */ +#if LUSTRE_TRACKS_LOCK_EXP_REFS + struct list_head exp_locks_list; + spinlock_t exp_locks_list_guard; +#endif + /** UUID of client connected to this export */ + struct obd_uuid exp_client_uuid; + /** To link all exports on an obd device */ + struct list_head exp_obd_chain; + struct hlist_node exp_uuid_hash; /** uuid-export hash*/ + struct hlist_node exp_nid_hash; /** nid-export hash */ + /** + * All exports eligible for ping evictor are linked into a list + * through this field in "most time since last request on this export" + * order + * protected by obd_dev_lock + */ + struct list_head exp_obd_chain_timed; + /** Obd device of this export */ + struct obd_device *exp_obd; + /** + * "reverse" import to send requests (e.g. from ldlm) back to client + * exp_lock protects its change + */ + struct obd_import *exp_imp_reverse; + struct nid_stat *exp_nid_stats; + struct lprocfs_stats *exp_md_stats; + /** Active connection */ + struct ptlrpc_connection *exp_connection; + /** Connection count value from last successful reconnect rpc */ + __u32 exp_conn_cnt; + /** Hash list of all ldlm locks granted on this export */ + cfs_hash_t *exp_lock_hash; + /** + * Hash list for Posix lock deadlock detection, added with + * ldlm_lock::l_exp_flock_hash. 
+ */ + cfs_hash_t *exp_flock_hash; + struct list_head exp_outstanding_replies; + struct list_head exp_uncommitted_replies; + spinlock_t exp_uncommitted_replies_lock; + /** Last committed transno for this export */ + __u64 exp_last_committed; + /** When was last request received */ + cfs_time_t exp_last_request_time; + /** On replay all requests waiting for replay are linked here */ + struct list_head exp_req_replay_queue; + /** + * protects exp_flags, exp_outstanding_replies and the change + * of exp_imp_reverse + */ + spinlock_t exp_lock; + /** Compatibility flags for this export are embedded into + * exp_connect_data */ + struct obd_connect_data exp_connect_data; + enum obd_option exp_flags; + unsigned long exp_failed:1, + exp_in_recovery:1, + exp_disconnected:1, + exp_connecting:1, + /** VBR: export missed recovery */ + exp_delayed:1, + /** VBR: failed version checking */ + exp_vbr_failed:1, + exp_req_replay_needed:1, + exp_lock_replay_needed:1, + exp_need_sync:1, + exp_flvr_changed:1, + exp_flvr_adapt:1, + exp_libclient:1, /* liblustre client? */ + /* client timed out and tried to reconnect, + * but couldn't because of active rpcs */ + exp_abort_active_req:1, + /* if to swap nidtbl entries for 2.2 clients. + * Only used by the MGS to fix LU-1644. 
*/ + exp_need_mne_swab:1; + /* also protected by exp_lock */ + enum lustre_sec_part exp_sp_peer; + struct sptlrpc_flavor exp_flvr; /* current */ + struct sptlrpc_flavor exp_flvr_old[2]; /* about-to-expire */ + cfs_time_t exp_flvr_expire[2]; /* seconds */ + + /** protects exp_hp_rpcs */ + spinlock_t exp_rpc_lock; + struct list_head exp_hp_rpcs; /* (potential) HP RPCs */ + + /** blocking dlm lock list, protected by exp_bl_list_lock */ + struct list_head exp_bl_list; + spinlock_t exp_bl_list_lock; + + /** Target specific data */ + union { + struct tg_export_data eu_target_data; + struct mdt_export_data eu_mdt_data; + struct filter_export_data eu_filter_data; + struct ec_export_data eu_ec_data; + struct mgs_export_data eu_mgs_data; + } u; +}; + +#define exp_target_data u.eu_target_data +#define exp_mdt_data u.eu_mdt_data +#define exp_filter_data u.eu_filter_data +#define exp_ec_data u.eu_ec_data + +static inline __u64 *exp_connect_flags_ptr(struct obd_export *exp) +{ + return &exp->exp_connect_data.ocd_connect_flags; +} + +static inline __u64 exp_connect_flags(struct obd_export *exp) +{ + return *exp_connect_flags_ptr(exp); +} + +static inline int exp_max_brw_size(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_BRW_SIZE) + return exp->exp_connect_data.ocd_brw_size; + + return ONE_MB_BRW_SIZE; +} + +static inline int exp_connect_multibulk(struct obd_export *exp) +{ + return exp_max_brw_size(exp) > ONE_MB_BRW_SIZE; +} + +static inline int exp_expired(struct obd_export *exp, cfs_duration_t age) +{ + LASSERT(exp->exp_delayed); + return cfs_time_before(cfs_time_add(exp->exp_last_request_time, age), + cfs_time_current_sec()); +} + +static inline int exp_connect_cancelset(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_CANCELSET); +} + +static inline int exp_connect_lru_resize(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & 
OBD_CONNECT_LRU_RESIZE); +} + +static inline int exp_connect_rmtclient(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_RMT_CLIENT); +} + +static inline int client_is_remote(struct obd_export *exp) +{ + struct obd_import *imp = class_exp2cliimp(exp); + + return !!(imp->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_RMT_CLIENT); +} + +static inline int exp_connect_vbr(struct obd_export *exp) +{ + LASSERT(exp != NULL); + LASSERT(exp->exp_connection); + return !!(exp_connect_flags(exp) & OBD_CONNECT_VBR); +} + +static inline int exp_connect_som(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp_connect_flags(exp) & OBD_CONNECT_SOM); +} + +static inline int exp_connect_umask(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_UMASK); +} + +static inline int imp_connect_lru_resize(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + return !!(ocd->ocd_connect_flags & OBD_CONNECT_LRU_RESIZE); +} + +static inline int exp_connect_layout(struct obd_export *exp) +{ + return !!(exp_connect_flags(exp) & OBD_CONNECT_LAYOUTLOCK); +} + +static inline bool exp_connect_lvb_type(struct obd_export *exp) +{ + LASSERT(exp != NULL); + if (exp_connect_flags(exp) & OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +static inline bool imp_connect_lvb_type(struct obd_import *imp) +{ + struct obd_connect_data *ocd; + + LASSERT(imp != NULL); + ocd = &imp->imp_connect_data; + if (ocd->ocd_connect_flags & OBD_CONNECT_LVB_TYPE) + return true; + else + return false; +} + +extern struct obd_export *class_conn2export(struct lustre_handle *conn); +extern struct obd_device *class_conn2obd(struct lustre_handle *conn); + +/** @} export */ + +#endif /* __EXPORT_H */ +/** @} obd_export */ diff --git a/drivers/staging/lustre/lustre/include/lustre_fid.h b/drivers/staging/lustre/lustre/include/lustre_fid.h new file mode 100644 index 
000000000000..7d20cba07287 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_fid.h @@ -0,0 +1,762 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_fid.h + * + * Author: Yury Umanets <umka@clusterfs.com> + */ + +#ifndef __LINUX_FID_H +#define __LINUX_FID_H + +/** \defgroup fid fid + * + * @{ + * + * http://wiki.lustre.org/index.php/Architecture_-_Interoperability_fids_zfs + * describes the FID namespace and interoperability requirements for FIDs. + * The important parts of that document are included here for reference. + * + * FID + * File IDentifier generated by client from range allocated by the SEQuence + * service and stored in struct lu_fid. 
The FID is composed of three parts: + * SEQuence, ObjectID, and VERsion. The SEQ component is a filesystem + * unique 64-bit integer, and only one client is ever assigned any SEQ value. + * The first 0x400 FID_SEQ_NORMAL [2^33, 2^33 + 0x400] values are reserved + * for system use. The OID component is a 32-bit value generated by the + * client on a per-SEQ basis to allow creating many unique FIDs without + * communication with the server. The VER component is a 32-bit value that + * distinguishes between different FID instantiations, such as snapshots or + * separate subtrees within the filesystem. FIDs with the same VER field + * are considered part of the same namespace. + * + * OLD filesystems are those upgraded from Lustre 1.x that predate FIDs, and + * MDTs use 32-bit ldiskfs internal inode/generation numbers (IGIFs), while + * OSTs use 64-bit Lustre object IDs and generation numbers. + * + * NEW filesystems are those formatted since the introduction of FIDs. + * + * IGIF + * Inode and Generation In FID, a surrogate FID used to globally identify + * an existing object on OLD formatted MDT file system. This would only be + * used on MDT0 in a DNE filesystem, because there cannot be more than one + * MDT in an OLD formatted filesystem. Belongs to sequence in [12, 2^32 - 1] + * range, where inode number is stored in SEQ, and inode generation is in OID. + * NOTE: This assumes no more than 2^32-1 inodes exist in the MDT filesystem, + * which is the maximum possible for an ldiskfs backend. It also assumes + * that the reserved ext3/ext4/ldiskfs inode numbers [0-11] are never visible + * to clients, which has always been true. + * + * IDIF + * object ID In FID, a surrogate FID used to globally identify an existing + * OST object on OLD formatted OST file system. Belongs to a sequence in + * [2^32, 2^33 - 1]. 
Sequence number is calculated as: + * + * 1 << 32 | (ost_index << 16) | ((objid >> 32) & 0xffff) + * + * that is, SEQ consists of 16-bit OST index, and higher 16 bits of object + * ID. The generation of unique SEQ values per OST allows the IDIF FIDs to + * be identified in the FLD correctly. The OID field is calculated as: + * + * objid & 0xffffffff + * + * that is, it consists of lower 32 bits of object ID. For objects within + * the IDIF range, object ID extraction will be: + * + * o_id = (fid->f_seq & 0xffff) << 32 | fid->f_oid; + * o_seq = 0; // formerly group number + * + * NOTE: This assumes that no more than 2^48-1 objects have ever been created + * on any OST, and that no more than 65535 OSTs are in use. Both are very + * reasonable assumptions, i.e. an IDIF can uniquely map all objects assuming + * a maximum creation rate of 1M objects per second for a maximum of 9 years, + * or combinations thereof. + * + * OST_MDT0 + * Surrogate FID used to identify an existing object on OLD formatted OST + * filesystem. Belongs to the reserved SEQuence 0, and is used prior to + * the introduction of FID-on-OST, at which point IDIF will be used to + * identify objects as residing on a specific OST. + * + * LLOG + * For Lustre Log objects the object sequence 1 is used. This is compatible + * with both OLD and NEW namespaces, as this SEQ number is in the + * ext3/ldiskfs reserved inode range and does not conflict with IGIF + * sequence numbers. + * + * ECHO + * For testing OST IO performance the object sequence 2 is used. This is + * compatible with both OLD and NEW namespaces, as this SEQ number is in + * the ext3/ldiskfs reserved inode range and does not conflict with IGIF + * sequence numbers. + * + * OST_MDT1 .. OST_MAX + * For testing with multiple MDTs the object sequence 3 through 9 is used, + * allowing direct mapping of MDTs 1 through 7 respectively, for a total + * of 8 MDTs including OST_MDT0. This matches the legacy CMD project "group" + * mappings. 
However, this SEQ range is only for testing prior to any + * production DNE release, as the objects in this range conflict across all + * OSTs, as the OST index is not part of the FID. For production DNE usage, + * OST objects created by MDT1+ will use FID_SEQ_NORMAL FIDs. + * + * DLM OST objid to IDIF mapping + * For compatibility with existing OLD OST network protocol structures, the + * FID must map onto the o_id and o_seq in a manner that ensures existing + * objects are identified consistently for IO, as well as onto the LDLM + * namespace to ensure that for IDIFs there is only a single resource name + * for any object in the DLM. The OLD OST object DLM resource mapping is: + * + * resource[] = {o_id, o_seq, 0, 0}; // o_seq == 0 for production releases + * + * The NEW OST object DLM resource mapping is the same for both MDT and OST: + * + * resource[] = {SEQ, OID, VER, HASH}; + * + * NOTE: for mapping IDIF values to DLM resource names the o_id may be + * larger than the 2^33 reserved sequence numbers for IDIF, so it is possible + * for the o_id numbers to overlap FID SEQ numbers in the resource. However, + * in all production releases the OLD o_seq field is always zero, and all + * valid FID OID values are non-zero, so the lock resources will not collide. + * Even so, the MDT and OST resources are also in different LDLM namespaces. 
+ */ + +#include <linux/libcfs/libcfs.h> +#include <lustre/lustre_idl.h> +#include <lustre_req_layout.h> +#include <lustre_mdt.h> +#include <obd.h> + + +struct lu_site; +struct lu_context; + +/* Whole sequences space range and zero range definitions */ +extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE; +extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE; +extern const struct lu_fid LUSTRE_BFL_FID; +extern const struct lu_fid LU_OBF_FID; +extern const struct lu_fid LU_DOT_LUSTRE_FID; + +enum { + /* + * This is how many metadata FIDs may be allocated in one sequence(128k) + */ + LUSTRE_METADATA_SEQ_MAX_WIDTH = 0x0000000000020000ULL, + + /* + * This is how many data FIDs could be allocated in one sequence(4B - 1) + */ + LUSTRE_DATA_SEQ_MAX_WIDTH = 0x00000000FFFFFFFFULL, + + /* + * How many sequences to allocate to a client at once. + */ + LUSTRE_SEQ_META_WIDTH = 0x0000000000000001ULL, + + /* + * seq allocation pool size. + */ + LUSTRE_SEQ_BATCH_WIDTH = LUSTRE_SEQ_META_WIDTH * 1000, + + /* + * This is how many sequences may be in one super-sequence allocated to + * MDTs. 
+ */ + LUSTRE_SEQ_SUPER_WIDTH = ((1ULL << 30ULL) * LUSTRE_SEQ_META_WIDTH) +}; + +enum { + /** 2^6 FIDs for OI containers */ + OSD_OI_FID_OID_BITS = 6, + /** reserve enough FIDs in case we want more in the future */ + OSD_OI_FID_OID_BITS_MAX = 10, +}; + +/** special OID for local objects */ +enum local_oid { + /** \see fld_mod_init */ + FLD_INDEX_OID = 3UL, + /** \see fid_mod_init */ + FID_SEQ_CTL_OID = 4UL, + FID_SEQ_SRV_OID = 5UL, + /** \see mdd_mod_init */ + MDD_ROOT_INDEX_OID = 6UL, /* deprecated in 2.4 */ + MDD_ORPHAN_OID = 7UL, /* deprecated in 2.4 */ + MDD_LOV_OBJ_OID = 8UL, + MDD_CAPA_KEYS_OID = 9UL, + /** \see mdt_mod_init */ + LAST_RECV_OID = 11UL, + OSD_FS_ROOT_OID = 13UL, + ACCT_USER_OID = 15UL, + ACCT_GROUP_OID = 16UL, + LFSCK_BOOKMARK_OID = 17UL, + OTABLE_IT_OID = 18UL, + /* These two definitions are obsolete + * OFD_GROUP0_LAST_OID = 20UL, + * OFD_GROUP4K_LAST_OID = 20UL+4096, + */ + OFD_LAST_GROUP_OID = 4117UL, + LLOG_CATALOGS_OID = 4118UL, + MGS_CONFIGS_OID = 4119UL, + OFD_HEALTH_CHECK_OID = 4120UL, + MDD_LOV_OBJ_OSEQ = 4121UL, + LFSCK_NAMESPACE_OID = 4122UL, + REMOTE_PARENT_DIR_OID = 4123UL, +}; + +static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_FILE; + fid->f_oid = oid; + fid->f_ver = 0; +} + +static inline void lu_local_name_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_NAME; + fid->f_oid = oid; + fid->f_ver = 0; +} + +/* For new FS (>= 2.4), the root FID will be changed to + * [FID_SEQ_ROOT:1:0], for existing FS, (upgraded to 2.4), + * the root FID will still be IGIF */ +static inline int fid_is_root(const struct lu_fid *fid) +{ + return unlikely((fid_seq(fid) == FID_SEQ_ROOT && + fid_oid(fid) == 1)); +} + +static inline int fid_is_dot_lustre(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE); +} + +static inline int fid_is_obf(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == 
FID_SEQ_DOT_LUSTRE && + fid_oid(fid) == FID_OID_DOT_LUSTRE_OBF); +} + +static inline int fid_is_otable_it(const struct lu_fid *fid) +{ + return unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE && + fid_oid(fid) == OTABLE_IT_OID); +} + +static inline int fid_is_acct(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_LOCAL_FILE && + (fid_oid(fid) == ACCT_USER_OID || + fid_oid(fid) == ACCT_GROUP_OID); +} + +static inline int fid_is_quota(const struct lu_fid *fid) +{ + return fid_seq(fid) == FID_SEQ_QUOTA || + fid_seq(fid) == FID_SEQ_QUOTA_GLB; +} + +static inline int fid_is_namespace_visible(const struct lu_fid *fid) +{ + const __u64 seq = fid_seq(fid); + + /* Here, we cannot distinguish whether the normal FID is for OST + * object or not. It is caller's duty to check more if needed. */ + return (!fid_is_last_id(fid) && + (fid_seq_is_norm(seq) || fid_seq_is_igif(seq))) || + fid_is_root(fid) || fid_is_dot_lustre(fid); +} + +static inline int fid_seq_in_fldb(__u64 seq) +{ + return fid_seq_is_igif(seq) || fid_seq_is_norm(seq) || + fid_seq_is_root(seq) || fid_seq_is_dot(seq); +} + +static inline void lu_last_id_fid(struct lu_fid *fid, __u64 seq) +{ + if (fid_seq_is_mdt0(seq)) { + fid->f_seq = fid_idif_seq(0, 0); + } else { + LASSERTF(fid_seq_is_norm(seq) || fid_seq_is_echo(seq) || + fid_seq_is_idif(seq), LPX64"\n", seq); + fid->f_seq = seq; + } + fid->f_oid = 0; + fid->f_ver = 0; +} + +enum lu_mgr_type { + LUSTRE_SEQ_SERVER, + LUSTRE_SEQ_CONTROLLER +}; + +struct lu_server_seq; + +/* Client sequence manager interface. */ +struct lu_client_seq { + /* Sequence-controller export. */ + struct obd_export *lcs_exp; + struct mutex lcs_mutex; + + /* + * Range of allowed for allocation sequeces. When using lu_client_seq on + * clients, this contains meta-sequence range. And for servers this + * contains super-sequence range. 
+ */ + struct lu_seq_range lcs_space; + + /* Seq related proc */ + proc_dir_entry_t *lcs_proc_dir; + + /* This holds last allocated fid in last obtained seq */ + struct lu_fid lcs_fid; + + /* LUSTRE_SEQ_METADATA or LUSTRE_SEQ_DATA */ + enum lu_cli_type lcs_type; + + /* + * Service uuid, passed from MDT + seq name to form unique seq name to + * use it with procfs. + */ + char lcs_name[80]; + + /* + * Sequence width, that is how many objects may be allocated in one + * sequence. Default value for it is LUSTRE_SEQ_MAX_WIDTH. + */ + __u64 lcs_width; + + /* Seq-server for direct talking */ + struct lu_server_seq *lcs_srv; + + /* wait queue for fid allocation and update indicator */ + wait_queue_head_t lcs_waitq; + int lcs_update; +}; + +/* server sequence manager interface */ +struct lu_server_seq { + /* Available sequences space */ + struct lu_seq_range lss_space; + + /* keeps highwater in lsr_end for seq allocation algorithm */ + struct lu_seq_range lss_lowater_set; + struct lu_seq_range lss_hiwater_set; + + /* + * Device for server side seq manager needs (saving sequences to backing + * store). + */ + struct dt_device *lss_dev; + + /* /seq file object device */ + struct dt_object *lss_obj; + + /* Seq related proc */ + proc_dir_entry_t *lss_proc_dir; + + /* LUSTRE_SEQ_SERVER or LUSTRE_SEQ_CONTROLLER */ + enum lu_mgr_type lss_type; + + /* Client interface to request controller */ + struct lu_client_seq *lss_cli; + + /* Mutex for protecting allocation */ + struct mutex lss_mutex; + + /* + * Service uuid, passed from MDT + seq name to form unique seq name to + * use it with procfs. + */ + char lss_name[80]; + + /* + * Allocation chunks for super and meta sequences. Default values are + * LUSTRE_SEQ_SUPER_WIDTH and LUSTRE_SEQ_META_WIDTH. 
+ */ + __u64 lss_width; + + /* + * minimum lss_alloc_set size that should be allocated from + * lss_space + */ + __u64 lss_set_width; + + /* sync is needed for update operation */ + __u32 lss_need_sync; + + /** + * Pointer to site object, required to access site fld. + */ + struct seq_server_site *lss_site; +}; + +int seq_query(struct com_thread_info *info); +int seq_handle(struct ptlrpc_request *req); + +/* Server methods */ +int seq_server_init(struct lu_server_seq *seq, + struct dt_device *dev, + const char *prefix, + enum lu_mgr_type type, + struct seq_server_site *ss, + const struct lu_env *env); + +void seq_server_fini(struct lu_server_seq *seq, + const struct lu_env *env); + +int seq_server_alloc_super(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_alloc_meta(struct lu_server_seq *seq, + struct lu_seq_range *out, + const struct lu_env *env); + +int seq_server_set_cli(struct lu_server_seq *seq, + struct lu_client_seq *cli, + const struct lu_env *env); + +/* Client methods */ +int seq_client_init(struct lu_client_seq *seq, + struct obd_export *exp, + enum lu_cli_type type, + const char *prefix, + struct lu_server_seq *srv); + +void seq_client_fini(struct lu_client_seq *seq); + +void seq_client_flush(struct lu_client_seq *seq); + +int seq_client_alloc_fid(const struct lu_env *env, struct lu_client_seq *seq, + struct lu_fid *fid); +int seq_client_get_seq(const struct lu_env *env, struct lu_client_seq *seq, + seqno_t *seqnr); +int seq_site_fini(const struct lu_env *env, struct seq_server_site *ss); +/* Fids common stuff */ +int fid_is_local(const struct lu_env *env, + struct lu_site *site, const struct lu_fid *fid); + +int client_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type); +int client_fid_fini(struct obd_device *obd); + +/* fid locking */ + +struct ldlm_namespace; + +/* + * Build (DLM) resource name from FID. 
+ * + * NOTE: until Lustre 1.8.7/2.1.1 the fid_ver() was packed into name[2], + * but was moved into name[1] along with the OID to avoid consuming the + * renaming name[2,3] fields that need to be used for the quota identifier. + */ +static inline struct ldlm_res_id * +fid_build_reg_res_name(const struct lu_fid *f, + struct ldlm_res_id *name) +{ + memset(name, 0, sizeof *name); + name->name[LUSTRE_RES_ID_SEQ_OFF] = fid_seq(f); + name->name[LUSTRE_RES_ID_VER_OID_OFF] = fid_ver_oid(f); + return name; +} + +/* + * Build (DLM) resource identifier from global quota FID and quota ID. + */ +static inline struct ldlm_res_id * +fid_build_quota_resid(const struct lu_fid *glb_fid, union lquota_id *qid, + struct ldlm_res_id *res) +{ + fid_build_reg_res_name(glb_fid, res); + res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF] = fid_seq(&qid->qid_fid); + res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] = fid_ver_oid(&qid->qid_fid); + return res; +} + +/* + * Extract global FID and quota ID from resource name + */ +static inline void fid_extract_quota_resid(struct ldlm_res_id *res, + struct lu_fid *glb_fid, + union lquota_id *qid) +{ + glb_fid->f_seq = res->name[LUSTRE_RES_ID_SEQ_OFF]; + glb_fid->f_oid = (__u32)res->name[LUSTRE_RES_ID_VER_OID_OFF]; + glb_fid->f_ver = (__u32)(res->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32); + + qid->qid_fid.f_seq = res->name[LUSTRE_RES_ID_QUOTA_SEQ_OFF]; + qid->qid_fid.f_oid = (__u32)res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF]; + qid->qid_fid.f_ver = + (__u32)(res->name[LUSTRE_RES_ID_QUOTA_VER_OID_OFF] >> 32); +} + +/* + * Return true if resource is for object identified by fid. 
+ */ +static inline int fid_res_name_eq(const struct lu_fid *f, + const struct ldlm_res_id *name) +{ + return name->name[LUSTRE_RES_ID_SEQ_OFF] == fid_seq(f) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == fid_ver_oid(f); +} + +/* reverse function of fid_build_reg_res_name(): the low 32 bits of the VER_OID slot become f_oid, the high 32 bits become f_ver */ +static inline void fid_build_from_res_name(struct lu_fid *f, + const struct ldlm_res_id *name) +{ + fid_zero(f); + f->f_seq = name->name[LUSTRE_RES_ID_SEQ_OFF]; + f->f_oid = name->name[LUSTRE_RES_ID_VER_OID_OFF] & 0xffffffff; + f->f_ver = name->name[LUSTRE_RES_ID_VER_OID_OFF] >> 32; + LASSERT(fid_res_name_eq(f, name)); +} + +static inline struct ldlm_res_id * +fid_build_pdo_res_name(const struct lu_fid *f, + unsigned int hash, + struct ldlm_res_id *name) +{ + fid_build_reg_res_name(f, name); + name->name[LUSTRE_RES_ID_HSH_OFF] = hash; /* regular name plus \a hash in the HSH slot */ + return name; +} + +/** + * Build DLM resource name from object id & seq, which will be removed + * finally, when we replace ost_id with FID in data stack. + * + * Currently, resid from the old client, whose res[0] = object_id, + * res[1] = object_seq, is just the opposite of the Metadata + * resid, where, res[0] = fid->f_seq, res[1] = fid->f_oid. + * To unify the resid identification, we will reverse the data + * resid to keep it same with Metadata resid, i.e. + * + * For resid from the old client, + * res[0] = objid, res[1] = 0, still keep the original order, + * for compatibility. + * + * For new resid + * res will be built from normal FID directly, i.e. res[0] = f_seq, + * res[1] = f_oid + f_ver. 
+ */ +static inline void ostid_build_res_name(struct ost_id *oi, + struct ldlm_res_id *name) +{ + memset(name, 0, sizeof *name); + if (fid_seq_is_mdt0(ostid_seq(oi))) { + name->name[LUSTRE_RES_ID_SEQ_OFF] = ostid_id(oi); + name->name[LUSTRE_RES_ID_VER_OID_OFF] = ostid_seq(oi); + } else { + fid_build_reg_res_name((struct lu_fid *)oi, name); + } +} + +static inline void ostid_res_name_to_id(struct ost_id *oi, + struct ldlm_res_id *name) +{ + if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_SEQ_OFF])) { + /* old resid; NOTE(review): ost_fid_from_resid() tests the VER_OID slot for this same case while this function tests the SEQ slot -- confirm which is intended */ + ostid_set_seq(oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]); + ostid_set_id(oi, name->name[LUSTRE_RES_ID_SEQ_OFF]); + } else { + /* new resid */ + fid_build_from_res_name((struct lu_fid *)oi, name); + } +} + +/** + * Return true if the resource is for the object identified by this id & group. + */ +static inline int ostid_res_name_eq(struct ost_id *oi, + struct ldlm_res_id *name) +{ + /* Note: it is just a trick here to save some effort, probably the + * correct way would be to turn them into the FID and compare */ + if (fid_seq_is_mdt0(ostid_seq(oi))) { + return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_id(oi) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_seq(oi); + } else { + return name->name[LUSTRE_RES_ID_SEQ_OFF] == ostid_seq(oi) && + name->name[LUSTRE_RES_ID_VER_OID_OFF] == ostid_id(oi); + } +} + +/* The same as osc_build_res_name() */ +static inline void ost_fid_build_resid(const struct lu_fid *fid, + struct ldlm_res_id *resname) +{ + if (fid_is_mdt0(fid) || fid_is_idif(fid)) { + struct ost_id oi; + oi.oi.oi_id = 0; /* gcc 4.7.2 complains otherwise */ + if (fid_to_ostid(fid, &oi) != 0) + return; /* NOTE(review): resname is left unwritten on this path and the caller gets no error indication */ + ostid_build_res_name(&oi, resname); + } else { + fid_build_reg_res_name(fid, resname); + } +} + +static inline void 
ost_fid_from_resid(struct lu_fid *fid, + const struct ldlm_res_id *name) +{ + if (fid_seq_is_mdt0(name->name[LUSTRE_RES_ID_VER_OID_OFF])) { + /* old resid */ + struct ost_id oi; + ostid_set_seq(&oi, name->name[LUSTRE_RES_ID_VER_OID_OFF]); 
+ ostid_set_id(&oi, name->name[LUSTRE_RES_ID_SEQ_OFF]); + ostid_to_fid(fid, &oi, 0); + } else { + /* new resid */ + fid_build_from_res_name(fid, name); + } +} + +/** + * Flatten 128-bit FID values into a 64-bit value for use as an inode number. + * For non-IGIF FIDs this starts just over 2^32, and continues without + * conflict until 2^64, at which point we wrap the high 24 bits of the SEQ + * into the range where there may not be many OID values in use, to minimize + * the risk of conflict. + * + * Suppose LUSTRE_SEQ_MAX_WIDTH less than (1 << 24) which is currently true, + * the time between re-used inode numbers is very long - 2^40 SEQ numbers, + * or about 2^40 client mounts, if clients create less than 2^24 files/mount. + */ +static inline __u64 fid_flatten(const struct lu_fid *fid) +{ + __u64 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + RETURN(ino); + } + + seq = fid_seq(fid); + + ino = (seq << 24) + ((seq >> 24) & 0xffffff0000ULL) + fid_oid(fid); + + RETURN(ino ? ino : fid_oid(fid)); +} + +static inline __u32 fid_hash(const struct lu_fid *f, int bits) +{ + /* all objects with same id and different versions will belong to same + * collisions list. */ + return cfs_hash_long(fid_flatten(f), bits); +} + +/** + * map fid to 32 bit value for ino on 32bit systems. */ +static inline __u32 fid_flatten32(const struct lu_fid *fid) +{ + __u32 ino; + __u64 seq; + + if (fid_is_igif(fid)) { + ino = lu_igif_ino(fid); + RETURN(ino); + } + + seq = fid_seq(fid) - FID_SEQ_START; + + /* Map the high bits of the OID into higher bits of the inode number so + * that inodes generated at about the same time have a reduced chance + * of collisions. This will give a period of 2^12 = 1024 unique clients + * (from SEQ) and up to min(LUSTRE_SEQ_MAX_WIDTH, 2^20) = 128k objects + * (from OID), or up to 128M inodes without collisions for new files. NOTE(review): 2^12 is 4096, not 1024; confirm the intended client count. 
*/ + ino = ((seq & 0x000fffffULL) << 12) + ((seq >> 8) & 0xfffff000) + + (seq >> (64 - (40-8)) & 0xffffff00) + + (fid_oid(fid) & 0xff000fff) + ((fid_oid(fid) & 0x00fff000) << 8); + + RETURN(ino ? ino : fid_oid(fid)); +} + +static inline int lu_fid_diff(struct lu_fid *fid1, struct lu_fid *fid2) +{ + LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID", fid2:"DFID"\n", + PFID(fid1), PFID(fid2)); + + if (fid_is_idif(fid1) && fid_is_idif(fid2)) + return fid_idif_id(fid1->f_seq, fid1->f_oid, fid1->f_ver) - + fid_idif_id(fid2->f_seq, fid2->f_oid, fid2->f_ver); + + return fid_oid(fid1) - fid_oid(fid2); +} + +#define LUSTRE_SEQ_SRV_NAME "seq_srv" +#define LUSTRE_SEQ_CTL_NAME "seq_ctl" + +/* Range common stuff: field-by-field byte-order conversion of struct lu_seq_range from \a src into \a dst */ +static inline void range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_le64(src->lsr_start); + dst->lsr_end = cpu_to_le64(src->lsr_end); + dst->lsr_index = cpu_to_le32(src->lsr_index); + dst->lsr_flags = cpu_to_le32(src->lsr_flags); +} + +static inline void range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = le64_to_cpu(src->lsr_start); + dst->lsr_end = le64_to_cpu(src->lsr_end); + dst->lsr_index = le32_to_cpu(src->lsr_index); + dst->lsr_flags = le32_to_cpu(src->lsr_flags); +} + +static inline void range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_be64(src->lsr_start); + dst->lsr_end = cpu_to_be64(src->lsr_end); + dst->lsr_index = cpu_to_be32(src->lsr_index); + dst->lsr_flags = cpu_to_be32(src->lsr_flags); +} + +static inline void range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = be64_to_cpu(src->lsr_start); + dst->lsr_end = be64_to_cpu(src->lsr_end); + dst->lsr_index = be32_to_cpu(src->lsr_index); + dst->lsr_flags = be32_to_cpu(src->lsr_flags); +} + +/** @} fid */ + +#endif /* __LINUX_FID_H */ diff 
--git a/drivers/staging/lustre/lustre/include/lustre_fld.h 
b/drivers/staging/lustre/lustre/include/lustre_fld.h new file mode 100644 index 000000000000..11e034a65b17 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_fld.h @@ -0,0 +1,202 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2013, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_FLD_H +#define __LINUX_FLD_H + +/** \defgroup fld fld + * + * @{ + */ + +#include <lustre/lustre_idl.h> +#include <lustre_mdt.h> +#include <dt_object.h> + +#include <linux/libcfs/libcfs.h> + +struct lu_client_fld; +struct lu_server_fld; +struct lu_fld_hash; +struct fld_cache; + +extern const struct dt_index_features fld_index_features; +extern const char fld_index_name[]; + +/* + * FLD (Fid Location Database) interface. 
+ */ +enum { + LUSTRE_CLI_FLD_HASH_DHT = 0, + LUSTRE_CLI_FLD_HASH_RRB +}; + + +struct lu_fld_target { + struct list_head ft_chain; + struct obd_export *ft_exp; + struct lu_server_fld *ft_srv; + __u64 ft_idx; +}; + +struct lu_server_fld { + /** + * Fld dir proc entry. */ + proc_dir_entry_t *lsf_proc_dir; + + /** + * /fld file object device */ + struct dt_object *lsf_obj; + + /** + * super sequence controller export, needed to forward fld + * lookup request. */ + struct obd_export *lsf_control_exp; + + /** + * Client FLD cache. */ + struct fld_cache *lsf_cache; + + /** + * Protect index modifications */ + struct mutex lsf_lock; + + /** + * Fld service name in form "fld-srv-lustre-MDTXXX" */ + char lsf_name[80]; + +}; + +struct lu_client_fld { + /** + * Client side proc entry. */ + proc_dir_entry_t *lcf_proc_dir; + + /** + * List of exports client FLD knows about. */ + struct list_head lcf_targets; + + /** + * Current hash to be used to choose an export. */ + struct lu_fld_hash *lcf_hash; + + /** + * Exports count. */ + int lcf_count; + + /** + * Lock protecting exports list and fld_hash. */ + spinlock_t lcf_lock; + + /** + * Client FLD cache. */ + struct fld_cache *lcf_cache; + + /** + * Client fld proc entry name. */ + char lcf_name[80]; + + const struct lu_context *lcf_ctx; + + int lcf_flags; +}; + +/** + * number of blocks to reserve for particular operations. Should be function of + * ... something. Stub for now. 
+ */ +enum { + /* one insert operation can involve two deletes and one insert */ + FLD_TXN_INDEX_INSERT_CREDITS = 60, + FLD_TXN_INDEX_DELETE_CREDITS = 20, +}; + +int fld_query(struct com_thread_info *info); + +/* Server methods */ +int fld_server_init(const struct lu_env *env, struct lu_server_fld *fld, + struct dt_device *dt, const char *prefix, int mds_node_id, + int type); + +void fld_server_fini(const struct lu_env *env, struct lu_server_fld *fld); + +int fld_declare_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + struct lu_seq_range *new, + struct thandle *th); + +int fld_server_create(const struct lu_env *env, + struct lu_server_fld *fld, + struct lu_seq_range *add_range, + struct thandle *th); + +int fld_insert_entry(const struct lu_env *env, + struct lu_server_fld *fld, + const struct lu_seq_range *range); + +int fld_server_lookup(const struct lu_env *env, struct lu_server_fld *fld, + seqno_t seq, struct lu_seq_range *range); + +/* Client methods */ +int fld_client_init(struct lu_client_fld *fld, + const char *prefix, int hash); + +void fld_client_fini(struct lu_client_fld *fld); + +void fld_client_flush(struct lu_client_fld *fld); + +int fld_client_lookup(struct lu_client_fld *fld, seqno_t seq, mdsno_t *mds, + __u32 flags, const struct lu_env *env); + +int fld_client_create(struct lu_client_fld *fld, + struct lu_seq_range *range, + const struct lu_env *env); + +int fld_client_delete(struct lu_client_fld *fld, + seqno_t seq, + const struct lu_env *env); + +int fld_client_add_target(struct lu_client_fld *fld, + struct lu_fld_target *tar); + +int fld_client_del_target(struct lu_client_fld *fld, + __u64 idx); + +void fld_client_proc_fini(struct lu_client_fld *fld); + +/** @} fld */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_fsfilt.h b/drivers/staging/lustre/lustre/include/lustre_fsfilt.h new file mode 100644 index 000000000000..9dcc332cb2f3 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_fsfilt.h 
@@ -0,0 +1,48 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_fsfilt.h + * + * Filesystem interface helper. + */ + +#ifndef _LUSTRE_FSFILT_H +#define _LUSTRE_FSFILT_H + +#include <linux/lustre_fsfilt.h> + +#define LU221_BAD_TIME (0x80000000U + 24 * 3600) + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_ha.h b/drivers/staging/lustre/lustre/include/lustre_ha.h new file mode 100644 index 000000000000..105f6d61eef0 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_ha.h @@ -0,0 +1,67 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _LUSTRE_HA_H +#define _LUSTRE_HA_H + +/** \defgroup ha ha + * + * @{ + */ + +struct obd_import; +struct obd_export; +struct obd_device; +struct ptlrpc_request; + + +int ptlrpc_replay(struct obd_import *imp); +int ptlrpc_resend(struct obd_import *imp); +void ptlrpc_free_committed(struct obd_import *imp); +void ptlrpc_wake_delayed(struct obd_import *imp); +int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid, int async); +int ptlrpc_set_import_active(struct obd_import *imp, int active); +void ptlrpc_activate_import(struct obd_import *imp); +void ptlrpc_deactivate_import(struct obd_import *imp); +void ptlrpc_invalidate_import(struct obd_import *imp); +void ptlrpc_fail_import(struct obd_import *imp, __u32 conn_cnt); +int ptlrpc_check_suspend(void); +void ptlrpc_activate_timeouts(struct obd_import *imp); +void ptlrpc_deactivate_timeouts(struct obd_import *imp); + +/** @} ha */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_handles.h b/drivers/staging/lustre/lustre/include/lustre_handles.h new file mode 100644 index 000000000000..fcd40f33426a --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_handles.h @@ -0,0 +1,93 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LUSTRE_HANDLES_H_ +#define __LUSTRE_HANDLES_H_ + +/** \defgroup handles handles + * + * @{ + */ + +#include <linux/lustre_handles.h> + +#include <linux/libcfs/libcfs.h> + + +struct portals_handle_ops { + void (*hop_addref)(void *object); + void (*hop_free)(void *object, int size); +}; + +/* These handles are most easily used by having them appear at the very top of + * whatever object that you want to make handles for. ie: + * + * struct ldlm_lock { + * struct portals_handle handle; + * ... + * }; + * + * Now you're able to assign the results of cookie2handle directly to an + * ldlm_lock. If it's not at the top, you'll want to use container_of() + * to compute the start of the structure based on the handle field. */ +struct portals_handle { + struct list_head h_link; + __u64 h_cookie; + struct portals_handle_ops *h_ops; + + /* newly added fields to handle the RCU issue. 
-jxiong */ + cfs_rcu_head_t h_rcu; + spinlock_t h_lock; + unsigned int h_size:31; + unsigned int h_in:1; +}; +#define RCU2HANDLE(rcu) container_of(rcu, struct portals_handle, h_rcu) + +/* handles.c */ + +/* Add a handle to the hash table */ +void class_handle_hash(struct portals_handle *, + struct portals_handle_ops *ops); +void class_handle_unhash(struct portals_handle *); +void class_handle_hash_back(struct portals_handle *); +void *class_handle2object(__u64 cookie); +void class_handle_free_cb(cfs_rcu_head_t *); +int class_handle_init(void); +void class_handle_cleanup(void); + +/** @} handles */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_idmap.h b/drivers/staging/lustre/lustre/include/lustre_idmap.h new file mode 100644 index 000000000000..084bdd6ab4db --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_idmap.h @@ -0,0 +1,104 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/lustre/include/lustre_idmap.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_IDMAP_H +#define _LUSTRE_IDMAP_H + +/** \defgroup idmap idmap + * + * @{ + */ + +#include <linux/libcfs/libcfs.h> + +#define CFS_NGROUPS_PER_BLOCK ((int)(PAGE_CACHE_SIZE / sizeof(gid_t))) + +#define CFS_GROUP_AT(gi, i) \ + ((gi)->blocks[(i) / CFS_NGROUPS_PER_BLOCK][(i) % CFS_NGROUPS_PER_BLOCK]) + +enum { + CFS_IC_NOTHING = 0, /* convert nothing */ + CFS_IC_ALL = 1, /* convert all items */ + CFS_IC_MAPPED = 2, /* convert mapped uid/gid */ + CFS_IC_UNMAPPED = 3 /* convert unmapped uid/gid */ +}; + +#define CFS_IDMAP_NOTFOUND (-1) + +#define CFS_IDMAP_HASHSIZE 32 + +enum lustre_idmap_idx { + RMT_UIDMAP_IDX, + LCL_UIDMAP_IDX, + RMT_GIDMAP_IDX, + LCL_GIDMAP_IDX, + CFS_IDMAP_N_HASHES +}; + +struct lustre_idmap_table { + spinlock_t lit_lock; + struct list_head lit_idmaps[CFS_IDMAP_N_HASHES][CFS_IDMAP_HASHSIZE]; +}; + +struct lu_ucred; + +extern void lustre_groups_from_list(group_info_t *ginfo, gid_t *glist); +extern void lustre_groups_sort(group_info_t *group_info); +extern int lustre_in_group_p(struct lu_ucred *mu, gid_t grp); + +extern int lustre_idmap_add(struct lustre_idmap_table *t, + uid_t ruid, uid_t luid, + gid_t rgid, gid_t lgid); +extern int lustre_idmap_del(struct lustre_idmap_table *t, + uid_t ruid, uid_t luid, + gid_t rgid, gid_t lgid); +extern int lustre_idmap_lookup_uid(struct lu_ucred *mu, + struct lustre_idmap_table *t, + int reverse, uid_t uid); +extern int lustre_idmap_lookup_gid(struct lu_ucred *mu, + struct lustre_idmap_table *t, + int reverse, gid_t gid); +extern struct lustre_idmap_table *lustre_idmap_init(void); +extern void lustre_idmap_fini(struct lustre_idmap_table *t); + +/** @} idmap */ + 
+#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_import.h b/drivers/staging/lustre/lustre/include/lustre_import.h new file mode 100644 index 000000000000..3a5dd6a94c08 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_import.h @@ -0,0 +1,367 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** \defgroup obd_import PtlRPC import definitions + * Imports are client-side representation of remote obd target. 
+ * + * @{ + */ + +#ifndef __IMPORT_H +#define __IMPORT_H + +/** \defgroup export export + * + * @{ + */ + +#include <lustre_handles.h> +#include <lustre/lustre_idl.h> + + +/** + * Adaptive Timeout stuff + * + * @{ + */ +#define D_ADAPTTO D_OTHER +#define AT_BINS 4 /* "bin" means "N seconds of history" */ +#define AT_FLG_NOHIST 0x1 /* use last reported value only */ + +struct adaptive_timeout { + time_t at_binstart; /* bin start time */ + unsigned int at_hist[AT_BINS]; /* timeout history bins */ + unsigned int at_flags; + unsigned int at_current; /* current timeout value */ + unsigned int at_worst_ever; /* worst-ever timeout value */ + time_t at_worst_time; /* worst-ever timeout timestamp */ + spinlock_t at_lock; +}; + +struct ptlrpc_at_array { + struct list_head *paa_reqs_array; /** array to hold requests */ + __u32 paa_size; /** the size of array */ + __u32 paa_count; /** the total count of reqs */ + time_t paa_deadline; /** the earliest deadline of reqs */ + __u32 *paa_reqs_count; /** the count of reqs in each entry */ +}; + +#define IMP_AT_MAX_PORTALS 8 +struct imp_at { + int iat_portal[IMP_AT_MAX_PORTALS]; + struct adaptive_timeout iat_net_latency; + struct adaptive_timeout iat_service_estimate[IMP_AT_MAX_PORTALS]; +}; + + +/** @} */ + +/** Possible import states */ +enum lustre_imp_state { + LUSTRE_IMP_CLOSED = 1, + LUSTRE_IMP_NEW = 2, + LUSTRE_IMP_DISCON = 3, + LUSTRE_IMP_CONNECTING = 4, + LUSTRE_IMP_REPLAY = 5, + LUSTRE_IMP_REPLAY_LOCKS = 6, + LUSTRE_IMP_REPLAY_WAIT = 7, + LUSTRE_IMP_RECOVER = 8, + LUSTRE_IMP_FULL = 9, + LUSTRE_IMP_EVICTED = 10, +}; + +/** Returns the text string name of numeric import state \a state. The name table is indexed directly by \a state (slot 0 is the UNKNOWN placeholder) and \a state is asserted to be <= LUSTRE_IMP_EVICTED. 
*/ +static inline char * ptlrpc_import_state_name(enum lustre_imp_state state) +{ + static char* import_state_names[] = { + "<UNKNOWN>", "CLOSED", "NEW", "DISCONN", + "CONNECTING", "REPLAY", "REPLAY_LOCKS", "REPLAY_WAIT", + "RECOVER", "FULL", "EVICTED", + }; + + LASSERT (state <= LUSTRE_IMP_EVICTED); + return import_state_names[state]; +} 
+ +/** + * List of import event types + */ +enum obd_import_event { + IMP_EVENT_DISCON = 0x808001, + IMP_EVENT_INACTIVE = 0x808002, + IMP_EVENT_INVALIDATE = 0x808003, + IMP_EVENT_ACTIVE = 0x808004, + IMP_EVENT_OCD = 0x808005, + IMP_EVENT_DEACTIVATE = 0x808006, + IMP_EVENT_ACTIVATE = 0x808007, +}; + +/** + * Definition of import connection structure + */ +struct obd_import_conn { + /** Item for linking connections together */ + struct list_head oic_item; + /** Pointer to actual PortalRPC connection */ + struct ptlrpc_connection *oic_conn; + /** uuid of remote side */ + struct obd_uuid oic_uuid; + /** + * Time (64 bit jiffies) of last connection attempt on this connection + */ + __u64 oic_last_attempt; +}; + +/* state history */ +#define IMP_STATE_HIST_LEN 16 +struct import_state_hist { + enum lustre_imp_state ish_state; + time_t ish_time; +}; + +/** + * Definition of PortalRPC import structure. + * An import represents the client-side view of a remote target. + */ +struct obd_import { + /** Local handle (== id) for this import. */ + struct portals_handle imp_handle; + /** Reference counter */ + atomic_t imp_refcount; + struct lustre_handle imp_dlm_handle; /* client's ldlm export */ + /** Currently active connection */ + struct ptlrpc_connection *imp_connection; + /** PortalRPC client structure for this import */ + struct ptlrpc_client *imp_client; + /** List element for linking into pinger chain */ + struct list_head imp_pinger_chain; + /** List element for linking into chain for destruction */ + struct list_head imp_zombie_chain; + + /** + * Lists of requests that are retained for replay, waiting for a reply, + * or waiting for recovery to complete, respectively. 
+ * @{ + */ + struct list_head imp_replay_list; + struct list_head imp_sending_list; + struct list_head imp_delayed_list; + /** @} */ + + /** obd device for this import */ + struct obd_device *imp_obd; + + /** + * some security-related fields + * @{ + */ + struct ptlrpc_sec *imp_sec; + struct mutex imp_sec_mutex; + cfs_time_t imp_sec_expire; + /** @} */ + + /** Wait queue for those who need to wait for recovery completion */ + wait_queue_head_t imp_recovery_waitq; + + /** Number of requests currently in-flight */ + atomic_t imp_inflight; + /** Number of requests currently unregistering */ + atomic_t imp_unregistering; + /** Number of replay requests inflight */ + atomic_t imp_replay_inflight; + /** Number of currently happening import invalidations */ + atomic_t imp_inval_count; + /** Number of request timeouts */ + atomic_t imp_timeouts; + /** Current import state */ + enum lustre_imp_state imp_state; + /** History of import states */ + struct import_state_hist imp_state_hist[IMP_STATE_HIST_LEN]; + int imp_state_hist_idx; + /** Current import generation. Incremented on every reconnect */ + int imp_generation; + /** Incremented every time we send reconnection request */ + __u32 imp_conn_cnt; + /** + * \see ptlrpc_free_committed remembers imp_generation value here + * after a check to save on unnecessary replay list iterations + */ + int imp_last_generation_checked; + /** Last transno we replayed */ + __u64 imp_last_replay_transno; + /** Last transno committed on remote side */ + __u64 imp_peer_committed_transno; + /** + * \see ptlrpc_free_committed remembers last_transno since its last + * check here and if last_transno did not change since last run of + * ptlrpc_free_committed and import generation is the same, we can + * skip looking for requests to remove from replay list as optimisation + */ + __u64 imp_last_transno_checked; + /** + * Remote export handle. This is how remote side knows what export + * we are talking to. 
Filled from response to connect request + */ + struct lustre_handle imp_remote_handle; + /** When to perform next ping. time in jiffies. */ + cfs_time_t imp_next_ping; + /** When we last successfully connected. time in 64bit jiffies */ + __u64 imp_last_success_conn; + + /** List of all possible connections for import. */ + struct list_head imp_conn_list; + /** + * Current connection. \a imp_connection is imp_conn_current->oic_conn + */ + struct obd_import_conn *imp_conn_current; + + /** Protects flags, level, generation, conn_cnt, *_list */ + spinlock_t imp_lock; + + /* flags */ + unsigned long imp_no_timeout:1, /* timeouts are disabled */ + imp_invalid:1, /* evicted */ + /* administratively disabled */ + imp_deactive:1, + /* try to recover the import */ + imp_replayable:1, + /* don't run recovery (timeout instead) */ + imp_dlm_fake:1, + /* use 1/2 timeout on MDS' OSCs */ + imp_server_timeout:1, + /* VBR: imp in delayed recovery */ + imp_delayed_recovery:1, + /* VBR: if gap was found then no lock replays + */ + imp_no_lock_replay:1, + /* recovery by versions has failed */ + imp_vbr_failed:1, + /* force an immediate ping */ + imp_force_verify:1, + /* force a scheduled ping */ + imp_force_next_verify:1, + /* pingable */ + imp_pingable:1, + /* resend for replay */ + imp_resend_replay:1, + /* disable normal recovery, for test only. 
*/ + imp_no_pinger_recover:1, + /* need IR MNE swab */ + imp_need_mne_swab:1, + /* import must be reconnected instead of + * choosing a new connection */ + imp_force_reconnect:1, + /* import has tried to connect with server */ + imp_connect_tried:1; + __u32 imp_connect_op; + struct obd_connect_data imp_connect_data; + __u64 imp_connect_flags_orig; + int imp_connect_error; + + __u32 imp_msg_magic; + __u32 imp_msghdr_flags; /* adjusted based on server capability */ + + struct ptlrpc_request_pool *imp_rq_pool; /* emergency request pool */ + + struct imp_at imp_at; /* adaptive timeout data */ + time_t imp_last_reply_time; /* for health check */ +}; + +typedef void (*obd_import_callback)(struct obd_import *imp, void *closure, + int event, void *event_arg, void *cb_data); + +/** + * Structure for import observer. + * It is possible to register an "observer" on an import and every time + * something happens to an import (like connect/evict/disconnect) + * the observer will get its callback called with the event type + */ +struct obd_import_observer { + struct list_head oio_chain; + obd_import_callback oio_cb; + void *oio_cb_data; +}; + +void class_observe_import(struct obd_import *imp, obd_import_callback cb, + void *cb_data); +void class_unobserve_import(struct obd_import *imp, obd_import_callback cb, + void *cb_data); +void class_notify_import_observers(struct obd_import *imp, int event, + void *event_arg); + +/* import.c */ +static inline unsigned int at_est2timeout(unsigned int val) +{ + /* add an arbitrary minimum: 125% +5 sec */ + return (val + (val >> 2) + 5); +} + +static inline unsigned int at_timeout2est(unsigned int val) +{ + /* restore estimate value from timeout: e=4/5(t-5); the max() against 5U keeps the result at least 1 */ + LASSERT(val); + return (max((val << 2) / 5, 5U) - 4); +} + +static inline void at_reset(struct adaptive_timeout *at, int val) { + at->at_current = val; + at->at_worst_ever = val; + at->at_worst_time = cfs_time_current_sec(); +} +static inline void 
at_init(struct adaptive_timeout *at, int val, int 
flags) { + memset(at, 0, sizeof(*at)); + spin_lock_init(&at->at_lock); + at->at_flags = flags; + at_reset(at, val); +} +extern unsigned int at_min; +static inline int at_get(struct adaptive_timeout *at) { + return (at->at_current > at_min) ? at->at_current : at_min; +} +int at_measured(struct adaptive_timeout *at, unsigned int val); +int import_at_get_index(struct obd_import *imp, int portal); +extern unsigned int at_max; +#define AT_OFF (at_max == 0) + +/* genops.c */ +struct obd_export; +extern struct obd_import *class_exp2cliimp(struct obd_export *); +extern struct obd_import *class_conn2cliimp(struct lustre_handle *); + +/** @} import */ + +#endif /* __IMPORT_H */ + +/** @} obd_import */ diff --git a/drivers/staging/lustre/lustre/include/lustre_lib.h b/drivers/staging/lustre/lustre/include/lustre_lib.h new file mode 100644 index 000000000000..bdfc5391c6d2 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_lib.h @@ -0,0 +1,667 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_lib.h + * + * Basic Lustre library routines. + */ + +#ifndef _LUSTRE_LIB_H +#define _LUSTRE_LIB_H + +/** \defgroup lib lib + * + * @{ + */ + +#include <linux/libcfs/libcfs.h> +#include <lustre/lustre_idl.h> +#include <lustre_ver.h> +#include <lustre_cfg.h> +#include <linux/lustre_lib.h> + +/* target.c */ +struct ptlrpc_request; +struct obd_export; +struct lu_target; +struct l_wait_info; +#include <lustre_ha.h> +#include <lustre_net.h> +#include <lvfs.h> + + +int target_pack_pool_reply(struct ptlrpc_request *req); +int do_set_info_async(struct obd_import *imp, + int opcode, int version, + obd_count keylen, void *key, + obd_count vallen, void *val, + struct ptlrpc_request_set *set); + +#define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */ +#define OBD_MAX_IOCTL_BUFFER CONFIG_LUSTRE_OBD_MAX_IOCTL_BUFFER + +void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id); + +/* client.c */ + +int client_sanobd_setup(struct obd_device *obddev, struct lustre_cfg* lcfg); +struct client_obd *client_conn2cli(struct lustre_handle *conn); + +struct md_open_data; +struct obd_client_handle { + struct lustre_handle och_fh; + struct lu_fid och_fid; + struct md_open_data *och_mod; + __u32 och_magic; + int och_flags; +}; +#define OBD_CLIENT_HANDLE_MAGIC 0xd15ea5ed + +/* statfs_pack.c */ +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs); +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs); + +/* l_lock.c */ +struct lustre_lock { + int l_depth; + task_t *l_owner; + struct semaphore l_sem; + spinlock_t l_spin; +}; + +void l_lock_init(struct lustre_lock *); +void l_lock(struct lustre_lock *); +void 
l_unlock(struct lustre_lock *); +int l_has_lock(struct lustre_lock *); + +/* + * For md echo client + */ +enum md_echo_cmd { + ECHO_MD_CREATE = 1, /* Open/Create file on MDT */ + ECHO_MD_MKDIR = 2, /* Mkdir on MDT */ + ECHO_MD_DESTROY = 3, /* Unlink file on MDT */ + ECHO_MD_RMDIR = 4, /* Rmdir on MDT */ + ECHO_MD_LOOKUP = 5, /* Lookup on MDT */ + ECHO_MD_GETATTR = 6, /* Getattr on MDT */ + ECHO_MD_SETATTR = 7, /* Setattr on MDT */ + ECHO_MD_ALLOC_FID = 8, /* Get FIDs from MDT */ +}; + +/* + * OBD IOCTLS + */ +#define OBD_IOCTL_VERSION 0x00010004 + +struct obd_ioctl_data { + __u32 ioc_len; + __u32 ioc_version; + + union { + __u64 ioc_cookie; + __u64 ioc_u64_1; + }; + union { + __u32 ioc_conn1; + __u32 ioc_u32_1; + }; + union { + __u32 ioc_conn2; + __u32 ioc_u32_2; + }; + + struct obdo ioc_obdo1; + struct obdo ioc_obdo2; + + obd_size ioc_count; + obd_off ioc_offset; + __u32 ioc_dev; + __u32 ioc_command; + + __u64 ioc_nid; + __u32 ioc_nal; + __u32 ioc_type; + + /* buffers the kernel will treat as user pointers */ + __u32 ioc_plen1; + char *ioc_pbuf1; + __u32 ioc_plen2; + char *ioc_pbuf2; + + /* inline buffers for various arguments */ + __u32 ioc_inllen1; + char *ioc_inlbuf1; + __u32 ioc_inllen2; + char *ioc_inlbuf2; + __u32 ioc_inllen3; + char *ioc_inlbuf3; + __u32 ioc_inllen4; + char *ioc_inlbuf4; + + char ioc_bulk[0]; +}; + +struct obd_ioctl_hdr { + __u32 ioc_len; + __u32 ioc_version; +}; + +static inline int obd_ioctl_packlen(struct obd_ioctl_data *data) +{ + int len = cfs_size_round(sizeof(struct obd_ioctl_data)); + len += cfs_size_round(data->ioc_inllen1); + len += cfs_size_round(data->ioc_inllen2); + len += cfs_size_round(data->ioc_inllen3); + len += cfs_size_round(data->ioc_inllen4); + return len; +} + + +static inline int obd_ioctl_is_invalid(struct obd_ioctl_data *data) +{ + if (data->ioc_len > (1<<30)) { + CERROR("OBD ioctl: ioc_len larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen1 > (1<<30)) { + CERROR("OBD ioctl: ioc_inllen1 larger than 
1<<30\n"); + return 1; + } + if (data->ioc_inllen2 > (1<<30)) { + CERROR("OBD ioctl: ioc_inllen2 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen3 > (1<<30)) { + CERROR("OBD ioctl: ioc_inllen3 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inllen4 > (1<<30)) { + CERROR("OBD ioctl: ioc_inllen4 larger than 1<<30\n"); + return 1; + } + if (data->ioc_inlbuf1 && !data->ioc_inllen1) { + CERROR("OBD ioctl: inlbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf2 && !data->ioc_inllen2) { + CERROR("OBD ioctl: inlbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf3 && !data->ioc_inllen3) { + CERROR("OBD ioctl: inlbuf3 pointer but 0 length\n"); + return 1; + } + if (data->ioc_inlbuf4 && !data->ioc_inllen4) { + CERROR("OBD ioctl: inlbuf4 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf1 && !data->ioc_plen1) { + CERROR("OBD ioctl: pbuf1 pointer but 0 length\n"); + return 1; + } + if (data->ioc_pbuf2 && !data->ioc_plen2) { + CERROR("OBD ioctl: pbuf2 pointer but 0 length\n"); + return 1; + } + if (data->ioc_plen1 && !data->ioc_pbuf1) { + CERROR("OBD ioctl: plen1 set but NULL pointer\n"); + return 1; + } + if (data->ioc_plen2 && !data->ioc_pbuf2) { + CERROR("OBD ioctl: plen2 set but NULL pointer\n"); + return 1; + } + if (obd_ioctl_packlen(data) > data->ioc_len) { + CERROR("OBD ioctl: packlen exceeds ioc_len (%d > %d)\n", + obd_ioctl_packlen(data), data->ioc_len); + return 1; + } + return 0; +} + + +#include <obd_support.h> + +/* function defined in lustre/obdclass/<platform>/<platform>-module.c */ +int obd_ioctl_getdata(char **buf, int *len, void *arg); +int obd_ioctl_popdata(void *arg, void *data, int len); + +static inline void obd_ioctl_freedata(char *buf, int len) +{ + ENTRY; + + OBD_FREE_LARGE(buf, len); + EXIT; + return; +} + +/* + * BSD ioctl description: + * #define IOC_V1 _IOR(g, n1, long) + * #define IOC_V2 _IOW(g, n2, long) + * + * ioctl(f, IOC_V1, arg); + * arg will be treated as a long value, + * + 
* ioctl(f, IOC_V2, arg) + * arg will be treated as a pointer, bsd will call + * copyin(buf, arg, sizeof(long)) + * + * To make BSD ioctl handles argument correctly and simplely, + * we change _IOR to _IOWR so BSD will copyin obd_ioctl_data + * for us. Does this change affect Linux? (XXX Liang) + */ +#define OBD_IOC_CREATE _IOWR('f', 101, OBD_IOC_DATA_TYPE) +#define OBD_IOC_DESTROY _IOW ('f', 104, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PREALLOCATE _IOWR('f', 105, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_SETATTR _IOW ('f', 107, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETATTR _IOWR ('f', 108, OBD_IOC_DATA_TYPE) +#define OBD_IOC_READ _IOWR('f', 109, OBD_IOC_DATA_TYPE) +#define OBD_IOC_WRITE _IOWR('f', 110, OBD_IOC_DATA_TYPE) + + +#define OBD_IOC_STATFS _IOWR('f', 113, OBD_IOC_DATA_TYPE) +#define OBD_IOC_SYNC _IOW ('f', 114, OBD_IOC_DATA_TYPE) +#define OBD_IOC_READ2 _IOWR('f', 115, OBD_IOC_DATA_TYPE) +#define OBD_IOC_FORMAT _IOWR('f', 116, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PARTITION _IOWR('f', 117, OBD_IOC_DATA_TYPE) +#define OBD_IOC_COPY _IOWR('f', 120, OBD_IOC_DATA_TYPE) +#define OBD_IOC_MIGR _IOWR('f', 121, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PUNCH _IOWR('f', 122, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_MODULE_DEBUG _IOWR('f', 124, OBD_IOC_DATA_TYPE) +#define OBD_IOC_BRW_READ _IOWR('f', 125, OBD_IOC_DATA_TYPE) +#define OBD_IOC_BRW_WRITE _IOWR('f', 126, OBD_IOC_DATA_TYPE) +#define OBD_IOC_NAME2DEV _IOWR('f', 127, OBD_IOC_DATA_TYPE) +#define OBD_IOC_UUID2DEV _IOWR('f', 130, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_GETNAME _IOWR('f', 131, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETMDNAME _IOR('f', 131, char[MAX_OBD_NAME]) +#define OBD_IOC_GETDTNAME OBD_IOC_GETNAME + +#define OBD_IOC_LOV_GET_CONFIG _IOWR('f', 132, OBD_IOC_DATA_TYPE) +#define OBD_IOC_CLIENT_RECOVER _IOW ('f', 133, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PING_TARGET _IOW ('f', 136, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_DEC_FS_USE_COUNT _IO ('f', 139 ) +#define OBD_IOC_NO_TRANSNO _IOW ('f', 140, OBD_IOC_DATA_TYPE) +#define 
OBD_IOC_SET_READONLY _IOW ('f', 141, OBD_IOC_DATA_TYPE) +#define OBD_IOC_ABORT_RECOVERY _IOR ('f', 142, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_ROOT_SQUASH _IOWR('f', 143, OBD_IOC_DATA_TYPE) + +#define OBD_GET_VERSION _IOWR ('f', 144, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_GSS_SUPPORT _IOWR('f', 145, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CLOSE_UUID _IOWR ('f', 147, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CHANGELOG_SEND _IOW ('f', 148, OBD_IOC_DATA_TYPE) +#define OBD_IOC_GETDEVICE _IOWR ('f', 149, OBD_IOC_DATA_TYPE) +#define OBD_IOC_FID2PATH _IOWR ('f', 150, OBD_IOC_DATA_TYPE) +/* see also <lustre/lustre_user.h> for ioctls 151-153 */ +/* OBD_IOC_LOV_SETSTRIPE: See also LL_IOC_LOV_SETSTRIPE */ +#define OBD_IOC_LOV_SETSTRIPE _IOW ('f', 154, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LOV_GETSTRIPE: See also LL_IOC_LOV_GETSTRIPE */ +#define OBD_IOC_LOV_GETSTRIPE _IOW ('f', 155, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LOV_SETEA: See also LL_IOC_LOV_SETEA */ +#define OBD_IOC_LOV_SETEA _IOW ('f', 156, OBD_IOC_DATA_TYPE) +/* see <lustre/lustre_user.h> for ioctls 157-159 */ +/* OBD_IOC_QUOTACHECK: See also LL_IOC_QUOTACHECK */ +#define OBD_IOC_QUOTACHECK _IOW ('f', 160, int) +/* OBD_IOC_POLL_QUOTACHECK: See also LL_IOC_POLL_QUOTACHECK */ +#define OBD_IOC_POLL_QUOTACHECK _IOR ('f', 161, struct if_quotacheck *) +/* OBD_IOC_QUOTACTL: See also LL_IOC_QUOTACTL */ +#define OBD_IOC_QUOTACTL _IOWR('f', 162, struct if_quotactl) +/* see also <lustre/lustre_user.h> for ioctls 163-176 */ +#define OBD_IOC_CHANGELOG_REG _IOW ('f', 177, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_DEREG _IOW ('f', 178, struct obd_ioctl_data) +#define OBD_IOC_CHANGELOG_CLEAR _IOW ('f', 179, struct obd_ioctl_data) +#define OBD_IOC_RECORD _IOWR('f', 180, OBD_IOC_DATA_TYPE) +#define OBD_IOC_ENDRECORD _IOWR('f', 181, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PARSE _IOWR('f', 182, OBD_IOC_DATA_TYPE) +#define OBD_IOC_DORECORD _IOWR('f', 183, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PROCESS_CFG _IOWR('f', 184, OBD_IOC_DATA_TYPE) +#define 
OBD_IOC_DUMP_LOG _IOWR('f', 185, OBD_IOC_DATA_TYPE) +#define OBD_IOC_CLEAR_LOG _IOWR('f', 186, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PARAM _IOW ('f', 187, OBD_IOC_DATA_TYPE) +#define OBD_IOC_POOL _IOWR('f', 188, OBD_IOC_DATA_TYPE) +#define OBD_IOC_REPLACE_NIDS _IOWR('f', 189, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_CATLOGLIST _IOWR('f', 190, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_INFO _IOWR('f', 191, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_PRINT _IOWR('f', 192, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CANCEL _IOWR('f', 193, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_REMOVE _IOWR('f', 194, OBD_IOC_DATA_TYPE) +#define OBD_IOC_LLOG_CHECK _IOWR('f', 195, OBD_IOC_DATA_TYPE) +/* OBD_IOC_LLOG_CATINFO is deprecated */ +#define OBD_IOC_LLOG_CATINFO _IOWR('f', 196, OBD_IOC_DATA_TYPE) + +#define ECHO_IOC_GET_STRIPE _IOWR('f', 200, OBD_IOC_DATA_TYPE) +#define ECHO_IOC_SET_STRIPE _IOWR('f', 201, OBD_IOC_DATA_TYPE) +#define ECHO_IOC_ENQUEUE _IOWR('f', 202, OBD_IOC_DATA_TYPE) +#define ECHO_IOC_CANCEL _IOWR('f', 203, OBD_IOC_DATA_TYPE) + +#define OBD_IOC_GET_OBJ_VERSION _IOR('f', 210, OBD_IOC_DATA_TYPE) + +/* <lustre/lustre_user.h> defines ioctl number 218-219 */ +#define OBD_IOC_GET_MNTOPT _IOW('f', 220, mntopt_t) + +#define OBD_IOC_ECHO_MD _IOR('f', 221, struct obd_ioctl_data) +#define OBD_IOC_ECHO_ALLOC_SEQ _IOWR('f', 222, struct obd_ioctl_data) + +#define OBD_IOC_START_LFSCK _IOWR('f', 230, OBD_IOC_DATA_TYPE) +#define OBD_IOC_STOP_LFSCK _IOW('f', 231, OBD_IOC_DATA_TYPE) +#define OBD_IOC_PAUSE_LFSCK _IOW('f', 232, OBD_IOC_DATA_TYPE) + +/* XXX _IOWR('f', 250, long) has been defined in + * libcfs/include/libcfs/libcfs_private.h for debug, don't use it + */ + +/* Until such time as we get_info the per-stripe maximum from the OST, + * we define this to be 2T - 4k, which is the ext3 maxbytes. 
*/ +#define LUSTRE_STRIPE_MAXBYTES 0x1fffffff000ULL + +/* Special values for removing the LOV EA from disk */ +#define LOVEA_DELETE_VALUES(size, count, offset) (size == 0 && count == 0 && \ + offset == (typeof(offset))(-1)) + +/* #define POISON_BULK 0 */ + +/* + * l_wait_event is a flexible sleeping function, permitting simple caller + * configuration of interrupt and timeout sensitivity along with actions to + * be performed in the event of either exception. + * + * The first form of usage looks like this: + * + * struct l_wait_info lwi = LWI_TIMEOUT_INTR(timeout, timeout_handler, + * intr_handler, callback_data); + * rc = l_wait_event(waitq, condition, &lwi); + * + * l_wait_event() makes the current process wait on 'waitq' until 'condition' + * is TRUE or a "killable" signal (SIGTERM, SIGKILL, SIGINT) is pending. It + * returns 0 to signify 'condition' is TRUE, but if a signal wakes it before + * 'condition' becomes true, it optionally calls the specified 'intr_handler' + * if not NULL, and returns -EINTR. + * + * If a non-zero timeout is specified, signals are ignored until the timeout + * has expired. At this time, if 'timeout_handler' is not NULL it is called. + * If it returns FALSE l_wait_event() continues to wait as described above with + * signals enabled. Otherwise it returns -ETIMEDOUT. + * + * LWI_INTR(intr_handler, callback_data) is shorthand for + * LWI_TIMEOUT_INTR(0, NULL, intr_handler, callback_data) + * + * The second form of usage looks like this: + * + * struct l_wait_info lwi = LWI_TIMEOUT(timeout, timeout_handler, + * callback_data); + * rc = l_wait_event(waitq, condition, &lwi); + * + * This form is the same as the first except that it COMPLETELY IGNORES + * SIGNALS. The caller must therefore beware that if 'timeout' is zero, or if + * 'timeout_handler' is not NULL and returns FALSE, then the ONLY thing that + * can unblock the current process is 'condition' becoming TRUE. 
+ * + * Another form of usage is: + * struct l_wait_info lwi = LWI_TIMEOUT_INTERVAL(timeout, interval, + * timeout_handler, callback_data); + * rc = l_wait_event(waitq, condition, &lwi); + * This is the same as the previous case, but condition is checked once every + * 'interval' jiffies (if non-zero). + * + * Subtle synchronization point: this macro does *not* necessarily take the + * wait-queue spin-lock before returning, and, hence, the following idiom is + * safe ONLY when caller provides some external locking: + * + * Thread1 Thread2 + * + * l_wait_event(&obj->wq, ....); (1) + * + * wake_up(&obj->wq): (2) + * spin_lock(&q->lock); (2.1) + * __wake_up_common(q, ...); (2.2) + * spin_unlock(&q->lock, flags); (2.3) + * + * OBD_FREE_PTR(obj); (3) + * + * As l_wait_event() may "short-cut" execution and return without taking + * wait-queue spin-lock, some additional synchronization is necessary to + * guarantee that step (3) can begin only after (2.3) finishes. + * + * XXX nikita: some ptlrpc daemon threads have races of that sort. 
+ * + */ +static inline int back_to_sleep(void *arg) +{ + return 0; +} + +#define LWI_ON_SIGNAL_NOOP ((void (*)(void *))(-1)) + +struct l_wait_info { + cfs_duration_t lwi_timeout; + cfs_duration_t lwi_interval; + int lwi_allow_intr; + int (*lwi_on_timeout)(void *); + void (*lwi_on_signal)(void *); + void *lwi_cb_data; +}; + +/* NB: LWI_TIMEOUT ignores signals completely */ +#define LWI_TIMEOUT(time, cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTERVAL(time, interval, cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = cb, \ + .lwi_cb_data = data, \ + .lwi_interval = interval, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTR(time, time_cb, sig_cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = time_cb, \ + .lwi_on_signal = sig_cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 0 \ +}) + +#define LWI_TIMEOUT_INTR_ALL(time, time_cb, sig_cb, data) \ +((struct l_wait_info) { \ + .lwi_timeout = time, \ + .lwi_on_timeout = time_cb, \ + .lwi_on_signal = sig_cb, \ + .lwi_cb_data = data, \ + .lwi_interval = 0, \ + .lwi_allow_intr = 1 \ +}) + +#define LWI_INTR(cb, data) LWI_TIMEOUT_INTR(0, NULL, cb, data) + + +/* + * wait for @condition to become true, but no longer than timeout, specified + * by @info. + */ +#define __l_wait_event(wq, condition, info, ret, l_add_wait) \ +do { \ + wait_queue_t __wait; \ + cfs_duration_t __timeout = info->lwi_timeout; \ + sigset_t __blocked; \ + int __allow_intr = info->lwi_allow_intr; \ + \ + ret = 0; \ + if (condition) \ + break; \ + \ + init_waitqueue_entry_current(&__wait); \ + l_add_wait(&wq, &__wait); \ + \ + /* Block all signals (just the non-fatal ones if no timeout). 
*/ \ + if (info->lwi_on_signal != NULL && (__timeout == 0 || __allow_intr)) \ + __blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS); \ + else \ + __blocked = cfs_block_sigsinv(0); \ + \ + for (;;) { \ + unsigned __wstate; \ + \ + __wstate = info->lwi_on_signal != NULL && \ + (__timeout == 0 || __allow_intr) ? \ + TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE; \ + \ + set_current_state(TASK_INTERRUPTIBLE); \ + \ + if (condition) \ + break; \ + \ + if (__timeout == 0) { \ + waitq_wait(&__wait, __wstate); \ + } else { \ + cfs_duration_t interval = info->lwi_interval? \ + min_t(cfs_duration_t, \ + info->lwi_interval,__timeout):\ + __timeout; \ + cfs_duration_t remaining = waitq_timedwait(&__wait,\ + __wstate, \ + interval); \ + __timeout = cfs_time_sub(__timeout, \ + cfs_time_sub(interval, remaining));\ + if (__timeout == 0) { \ + if (info->lwi_on_timeout == NULL || \ + info->lwi_on_timeout(info->lwi_cb_data)) { \ + ret = -ETIMEDOUT; \ + break; \ + } \ + /* Take signals after the timeout expires. */ \ + if (info->lwi_on_signal != NULL) \ + (void)cfs_block_sigsinv(LUSTRE_FATAL_SIGS);\ + } \ + } \ + \ + if (condition) \ + break; \ + if (cfs_signal_pending()) { \ + if (info->lwi_on_signal != NULL && \ + (__timeout == 0 || __allow_intr)) { \ + if (info->lwi_on_signal != LWI_ON_SIGNAL_NOOP) \ + info->lwi_on_signal(info->lwi_cb_data);\ + ret = -EINTR; \ + break; \ + } \ + /* We have to do this here because some signals */ \ + /* are not blockable - ie from strace(1). */ \ + /* In these cases we want to schedule_timeout() */ \ + /* again, because we don't want that to return */ \ + /* -EINTR when the RPC actually succeeded. */ \ + /* the recalc_sigpending() below will deliver the */ \ + /* signal properly. 
*/ \ + cfs_clear_sigpending(); \ + } \ + } \ + \ + cfs_restore_sigs(__blocked); \ + \ + set_current_state(TASK_RUNNING); \ + remove_wait_queue(&wq, &__wait); \ +} while (0) + + + +#define l_wait_event(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue); \ + __ret; \ +}) + +#define l_wait_event_exclusive(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue_exclusive); \ + __ret; \ +}) + +#define l_wait_event_exclusive_head(wq, condition, info) \ +({ \ + int __ret; \ + struct l_wait_info *__info = (info); \ + \ + __l_wait_event(wq, condition, __info, \ + __ret, add_wait_queue_exclusive_head); \ + __ret; \ +}) + +#define l_wait_condition(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event(wq, condition, &lwi); \ +}) + +#define l_wait_condition_exclusive(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event_exclusive(wq, condition, &lwi); \ +}) + +#define l_wait_condition_exclusive_head(wq, condition) \ +({ \ + struct l_wait_info lwi = { 0 }; \ + l_wait_event_exclusive_head(wq, condition, &lwi); \ +}) + +#define LIBLUSTRE_CLIENT (0) + +/** @} lib */ + +#endif /* _LUSTRE_LIB_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre_linkea.h b/drivers/staging/lustre/lustre/include/lustre_linkea.h new file mode 100644 index 000000000000..5790be913bf6 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_linkea.h @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, Intel Corporation. + * Use is subject to license terms. + * + * Author: di wang <di.wang@intel.com> + */ + +struct linkea_data { + /** + * Buffer to keep link EA body. + */ + struct lu_buf *ld_buf; + /** + * The matched header, entry and its length in the EA + */ + struct link_ea_header *ld_leh; + struct link_ea_entry *ld_lee; + int ld_reclen; +}; + +int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf); +int linkea_init(struct linkea_data *ldata); +void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, + struct lu_name *lname, struct lu_fid *pfid); +int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid); +void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname); +int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid); + +#define LINKEA_NEXT_ENTRY(ldata) \ + (struct link_ea_entry *)((char *)ldata.ld_lee + ldata.ld_reclen) + +#define LINKEA_FIRST_ENTRY(ldata) \ + (struct link_ea_entry *)(ldata.ld_leh + 1) diff --git a/drivers/staging/lustre/lustre/include/lustre_lite.h b/drivers/staging/lustre/lustre/include/lustre_lite.h new file mode 100644 index 000000000000..25f8bfaccef3 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_lite.h @@ -0,0 +1,147 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER 
OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LL_H +#define _LL_H + +/** \defgroup lite lite + * + * @{ + */ + +#include <linux/lustre_lite.h> + +#include <obd_class.h> +#include <obd_ost.h> +#include <lustre_net.h> +#include <lustre_mds.h> +#include <lustre_ha.h> + +/* 4UL * 1024 * 1024 */ +#define LL_MAX_BLKSIZE_BITS (22) +#define LL_MAX_BLKSIZE (1UL<<LL_MAX_BLKSIZE_BITS) + +#include <lustre/lustre_user.h> + + +struct lustre_rw_params { + int lrp_lock_mode; + ldlm_policy_data_t lrp_policy; + obd_flag lrp_brw_flags; + int lrp_ast_flags; +}; + +/* + * XXX nikita: this function lives in the header because it is used by both + * llite kernel module and liblustre library, and there is no (?) better place + * to put it in. 
+ */ +static inline void lustre_build_lock_params(int cmd, unsigned long open_flags, + __u64 connect_flags, + loff_t pos, ssize_t len, + struct lustre_rw_params *params) +{ + params->lrp_lock_mode = (cmd == OBD_BRW_READ) ? LCK_PR : LCK_PW; + params->lrp_brw_flags = 0; + + params->lrp_policy.l_extent.start = pos; + params->lrp_policy.l_extent.end = pos + len - 1; + /* + * for now O_APPEND always takes local locks. + */ + if (cmd == OBD_BRW_WRITE && (open_flags & O_APPEND)) { + params->lrp_policy.l_extent.start = 0; + params->lrp_policy.l_extent.end = OBD_OBJECT_EOF; + } else if (LIBLUSTRE_CLIENT && (connect_flags & OBD_CONNECT_SRVLOCK)) { + /* + * liblustre: OST-side locking for all non-O_APPEND + * reads/writes. + */ + params->lrp_lock_mode = LCK_NL; + params->lrp_brw_flags = OBD_BRW_SRVLOCK; + } else { + /* + * nothing special for the kernel. In the future llite may use + * OST-side locks for small writes into highly contended + * files. + */ + } + params->lrp_ast_flags = (open_flags & O_NONBLOCK) ? + LDLM_FL_BLOCK_NOWAIT : 0; +} + +/* + * This is embedded into liblustre and llite super-blocks to keep track of + * connect flags (capabilities) supported by all imports given mount is + * connected to. + */ +struct lustre_client_ocd { + /* + * This is conjunction of connect_flags across all imports (LOVs) this + * mount is connected to. This field is updated by cl_ocd_update() + * under ->lco_lock. + */ + __u64 lco_flags; + struct mutex lco_lock; + struct obd_export *lco_md_exp; + struct obd_export *lco_dt_exp; +}; + +/* + * Chain of hash overflow pages. + */ +struct ll_dir_chain { + /* XXX something. 
Later */ +}; + +static inline void ll_dir_chain_init(struct ll_dir_chain *chain) +{ +} + +static inline void ll_dir_chain_fini(struct ll_dir_chain *chain) +{ +} + +static inline unsigned long hash_x_index(__u64 hash, int hash64) +{ + if (BITS_PER_LONG == 32 && hash64) + hash >>= 32; + return ~0UL - hash; +} + +/** @} lite */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_log.h b/drivers/staging/lustre/lustre/include/lustre_log.h new file mode 100644 index 000000000000..714ab378e431 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_log.h @@ -0,0 +1,576 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/include/lustre_log.h + * + * Generic infrastructure for managing a collection of logs. + * These logs are used for: + * + * - orphan recovery: OST adds record on create + * - mtime/size consistency: the OST adds a record on first write + * - open/unlinked objects: OST adds a record on destroy + * + * - mds unlink log: the MDS adds an entry upon delete + * + * - raid1 replication log between OST's + * - MDS replication logs + */ + +#ifndef _LUSTRE_LOG_H +#define _LUSTRE_LOG_H + +/** \defgroup log log + * + * @{ + */ + +#include <linux/lustre_log.h> + +#include <obd_class.h> +#include <obd_ost.h> +#include <lustre/lustre_idl.h> +#include <dt_object.h> + +#define LOG_NAME_LIMIT(logname, name) \ + snprintf(logname, sizeof(logname), "LOGS/%s", name) +#define LLOG_EEMPTY 4711 + +enum llog_open_param { + LLOG_OPEN_EXISTS = 0x0000, + LLOG_OPEN_NEW = 0x0001, +}; + +struct plain_handle_data { + struct list_head phd_entry; + struct llog_handle *phd_cat_handle; + struct llog_cookie phd_cookie; /* cookie of this log in its cat */ +}; + +struct cat_handle_data { + struct list_head chd_head; + struct llog_handle *chd_current_log; /* currently open log */ + struct llog_handle *chd_next_log; /* llog to be used next */ +}; + +static inline void logid_to_fid(struct llog_logid *id, struct lu_fid *fid) +{ + /* For compatibility purposes we identify pre-OSD (~< 2.3.51 MDS) + * logid's by non-zero ogen (inode generation) and convert them + * into IGIF */ + if (id->lgl_ogen == 0) { + fid->f_seq = id->lgl_oi.oi.oi_seq; + fid->f_oid = id->lgl_oi.oi.oi_id; + fid->f_ver = 0; + } else { + lu_igif_build(fid, id->lgl_oi.oi.oi_id, id->lgl_ogen); + } +} + +static inline void fid_to_logid(struct lu_fid *fid, struct llog_logid *id) +{ + id->lgl_oi.oi.oi_seq = fid->f_seq; + id->lgl_oi.oi.oi_id = fid->f_oid; + id->lgl_ogen = 0; +} + +static inline void logid_set_id(struct llog_logid *log_id, __u64 id) +{ + log_id->lgl_oi.oi.oi_id = id; +} + +static inline __u64 logid_id(struct 
/**
 * Arguments for processing a llog catalog: the callback to run on each
 * record, the opaque data handed to that callback, and the position at
 * which processing starts.
 */
struct llog_process_data {
	/**
	 * Any useful data needed while processing the catalog. This is
	 * passed later to the process callback.
	 */
	void *lpd_data;
	/**
	 * Catalog process callback function, called for each record
	 * in the catalog.
	 */
	llog_cb_t lpd_cb;
	/**
	 * Start processing the catalog from startcat/startidx.
	 */
	int lpd_startcat;
	int lpd_startidx;
};
+ */ + int lpcd_last_idx; +}; + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle); +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf, struct thandle *th); +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th); +int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf); +int llog_cat_cancel_records(const struct lu_env *env, + struct llog_handle *cathandle, int count, + struct llog_cookie *cookies); +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cb, + void *data, int startcat, int startidx, bool fork); +int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, int startidx); +int llog_cat_reverse_process(const struct lu_env *env, + struct llog_handle *cat_llh, llog_cb_t cb, + void *data); +int llog_cat_init_and_process(const struct lu_env *env, + struct llog_handle *llh); + +/* llog_obd.c */ +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, struct llog_operations *op); +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt); +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *); +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags); +int llog_obd_add(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_rec_hdr *rec, struct lov_stripe_md *lsm, + struct llog_cookie *logcookies, int numcookies); +int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, + struct lov_stripe_md *lsm, int count, + struct llog_cookie *cookies, int flags); + +int obd_llog_init(struct obd_device *obd, struct obd_llog_group *olg, + struct 
/**
 * Table of llog methods.
 *
 * A table is attached to a context through llog_ctxt::loc_logops and to a
 * log handle through llog_handle::lgh_logops. Entries may be NULL: the
 * inline wrappers in this header (llog_destroy(), llog_next_block(),
 * llog_prev_block(), llog_connect()) return -EOPNOTSUPP when the method
 * they dispatch to is not provided.
 */
struct llog_operations {
	/* called by llog_destroy() */
	int (*lop_destroy)(const struct lu_env *env,
			   struct llog_handle *handle);
	/* called by llog_next_block() */
	int (*lop_next_block)(const struct lu_env *env, struct llog_handle *h,
			      int *curr_idx, int next_idx, __u64 *offset,
			      void *buf, int len);
	/* called by llog_prev_block() */
	int (*lop_prev_block)(const struct lu_env *env, struct llog_handle *h,
			      int prev_idx, void *buf, int len);
	int (*lop_read_header)(const struct lu_env *env,
			       struct llog_handle *handle);
	int (*lop_setup)(const struct lu_env *env, struct obd_device *obd,
			 struct obd_llog_group *olg, int ctxt_idx,
			 struct obd_device *disk_obd);
	int (*lop_sync)(struct llog_ctxt *ctxt, struct obd_export *exp,
			int flags);
	int (*lop_cleanup)(const struct lu_env *env, struct llog_ctxt *ctxt);
	int (*lop_cancel)(const struct lu_env *env, struct llog_ctxt *ctxt,
			  struct lov_stripe_md *lsm, int count,
			  struct llog_cookie *cookies, int flags);
	/* called by llog_connect() */
	int (*lop_connect)(struct llog_ctxt *ctxt, struct llog_logid *logid,
			   struct llog_gen *gen, struct obd_uuid *uuid);
	/**
	 * Any llog file must be opened first using llog_open(). An llog can
	 * be opened by name, by logid, or with neither; in the last case a
	 * new logid will be generated.
	 */
	int (*lop_open)(const struct lu_env *env, struct llog_handle *lgh,
			struct llog_logid *logid, char *name,
			enum llog_open_param);
	/**
	 * An opened llog may not exist; this must be checked where needed
	 * using the llog_exist() call.
	 */
	int (*lop_exist)(struct llog_handle *lgh);
	/**
	 * Closes the llog file and calls llog_free_handle() implicitly.
	 * Any opened llog must be closed by a llog_close() call.
	 */
	int (*lop_close)(const struct lu_env *env, struct llog_handle *handle);
	/**
	 * Create a new llog file. The llog must be opened.
	 * Must be used only for local llog operations.
	 */
	int (*lop_declare_create)(const struct lu_env *env,
				  struct llog_handle *handle,
				  struct thandle *th);
	int (*lop_create)(const struct lu_env *env, struct llog_handle *handle,
			  struct thandle *th);
	/**
	 * Write a new record in the llog. It usually appends records but
	 * can edit existing records too.
	 */
	int (*lop_declare_write_rec)(const struct lu_env *env,
				     struct llog_handle *lgh,
				     struct llog_rec_hdr *rec,
				     int idx, struct thandle *th);
	int (*lop_write_rec)(const struct lu_env *env,
			     struct llog_handle *loghandle,
			     struct llog_rec_hdr *rec,
			     struct llog_cookie *cookie, int cookiecount,
			     void *buf, int idx, struct thandle *th);
	/**
	 * Add a new record in the llog catalog. Does the same as
	 * llog_write_rec() but using the llog catalog.
	 */
	int (*lop_declare_add)(const struct lu_env *env,
			       struct llog_handle *lgh,
			       struct llog_rec_hdr *rec, struct thandle *th);
	int (*lop_add)(const struct lu_env *env, struct llog_handle *lgh,
		       struct llog_rec_hdr *rec, struct llog_cookie *cookie,
		       void *buf, struct thandle *th);
	/* Old llog_add version, used in MDS-LOV-OSC now and will be gone
	 * with the LOD/OSP replacement */
	int (*lop_obd_add)(const struct lu_env *env, struct llog_ctxt *ctxt,
			   struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
			   struct llog_cookie *logcookies, int numcookies);
};
plain_handle_data phd; + struct cat_handle_data chd; + } u; + char *lgh_name; + void *private_data; + struct llog_operations *lgh_logops; + atomic_t lgh_refcount; +}; + +/* llog_lvfs.c */ +extern struct llog_operations llog_lvfs_ops; + +/* llog_osd.c */ +extern struct llog_operations llog_osd_ops; +int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, + struct llog_catid *idarray); +int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, + struct llog_catid *idarray); + +#define LLOG_CTXT_FLAG_UNINITIALIZED 0x00000001 +#define LLOG_CTXT_FLAG_STOP 0x00000002 + +struct llog_ctxt { + int loc_idx; /* my index the obd array of ctxt's */ + struct obd_device *loc_obd; /* points back to the containing obd*/ + struct obd_llog_group *loc_olg; /* group containing that ctxt */ + struct obd_export *loc_exp; /* parent "disk" export (e.g. MDS) */ + struct obd_import *loc_imp; /* to use in RPC's: can be backward + pointing import */ + struct llog_operations *loc_logops; + struct llog_handle *loc_handle; + struct mutex loc_mutex; /* protect loc_imp */ + atomic_t loc_refcount; + long loc_flags; /* flags, see above defines */ + struct dt_object *loc_dir; +}; + +#define LLOG_PROC_BREAK 0x0001 +#define LLOG_DEL_RECORD 0x0002 + +static inline int llog_obd2ops(struct llog_ctxt *ctxt, + struct llog_operations **lop) +{ + if (ctxt == NULL) + return -ENOTCONN; + + *lop = ctxt->loc_logops; + if (*lop == NULL) + return -EOPNOTSUPP; + + return 0; +} + +static inline int llog_handle2ops(struct llog_handle *loghandle, + struct llog_operations **lop) +{ + if (loghandle == NULL || loghandle->lgh_logops == NULL) + return -EINVAL; + + *lop = loghandle->lgh_logops; + return 0; +} + +static inline int llog_data_len(int len) +{ + return cfs_size_round(len); +} + +static inline struct llog_ctxt *llog_ctxt_get(struct llog_ctxt *ctxt) +{ + atomic_inc(&ctxt->loc_refcount); + CDEBUG(D_INFO, "GETting ctxt %p : new refcount %d\n", 
ctxt, + atomic_read(&ctxt->loc_refcount)); + return ctxt; +} + +static inline void llog_ctxt_put(struct llog_ctxt *ctxt) +{ + if (ctxt == NULL) + return; + LASSERT_ATOMIC_GT_LT(&ctxt->loc_refcount, 0, LI_POISON); + CDEBUG(D_INFO, "PUTting ctxt %p : new refcount %d\n", ctxt, + atomic_read(&ctxt->loc_refcount) - 1); + __llog_ctxt_put(NULL, ctxt); +} + +static inline void llog_group_init(struct obd_llog_group *olg, int group) +{ + init_waitqueue_head(&olg->olg_waitq); + spin_lock_init(&olg->olg_lock); + mutex_init(&olg->olg_cat_processing); + olg->olg_seq = group; +} + +static inline int llog_group_set_ctxt(struct obd_llog_group *olg, + struct llog_ctxt *ctxt, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] != NULL) { + spin_unlock(&olg->olg_lock); + return -EEXIST; + } + olg->olg_ctxts[index] = ctxt; + spin_unlock(&olg->olg_lock); + return 0; +} + +static inline struct llog_ctxt *llog_group_get_ctxt(struct obd_llog_group *olg, + int index) +{ + struct llog_ctxt *ctxt; + + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + + spin_lock(&olg->olg_lock); + if (olg->olg_ctxts[index] == NULL) + ctxt = NULL; + else + ctxt = llog_ctxt_get(olg->olg_ctxts[index]); + spin_unlock(&olg->olg_lock); + return ctxt; +} + +static inline void llog_group_clear_ctxt(struct obd_llog_group *olg, int index) +{ + LASSERT(index >= 0 && index < LLOG_MAX_CTXTS); + spin_lock(&olg->olg_lock); + olg->olg_ctxts[index] = NULL; + spin_unlock(&olg->olg_lock); +} + +static inline struct llog_ctxt *llog_get_context(struct obd_device *obd, + int index) +{ + return llog_group_get_ctxt(&obd->obd_olg, index); +} + +static inline int llog_group_ctxt_null(struct obd_llog_group *olg, int index) +{ + return (olg->olg_ctxts[index] == NULL); +} + +static inline int llog_ctxt_null(struct obd_device *obd, int index) +{ + return (llog_group_ctxt_null(&obd->obd_olg, index)); +} + +static inline int llog_destroy(const struct lu_env *env, + struct 
llog_handle *handle) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_destroy == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_destroy(env, handle); + RETURN(rc); +} + +static inline int llog_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_next_block == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_next_block(env, loghandle, cur_idx, next_idx, + cur_offset, buf, len); + RETURN(rc); +} + +static inline int llog_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_prev_block == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_prev_block(env, loghandle, prev_idx, buf, len); + RETURN(rc); +} + +static inline int llog_connect(struct llog_ctxt *ctxt, + struct llog_logid *logid, struct llog_gen *gen, + struct obd_uuid *uuid) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_obd2ops(ctxt, &lop); + if (rc) + RETURN(rc); + if (lop->lop_connect == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_connect(ctxt, logid, gen, uuid); + RETURN(rc); +} + +/* llog.c */ +int llog_exist(struct llog_handle *loghandle); +int llog_declare_create(const struct lu_env *env, + struct llog_handle *loghandle, struct thandle *th); +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th); +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th); +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie 
*logcookies, + int numcookies, void *buf, int idx, struct thandle *th); +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + void *buf, struct thandle *th); +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th); +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg); +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name); +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name); +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + int cookiecount, void *buf, int idx); + +/** @} log */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_mdc.h b/drivers/staging/lustre/lustre/include/lustre_mdc.h new file mode 100644 index 000000000000..fb1561a809b9 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_mdc.h @@ -0,0 +1,176 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_mdc.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_MDC_H +#define _LUSTRE_MDC_H + +/** \defgroup mdc mdc + * + * @{ + */ + +# include <linux/fs.h> +# include <linux/dcache.h> +# ifdef CONFIG_FS_POSIX_ACL +# include <linux/posix_acl_xattr.h> +# endif /* CONFIG_FS_POSIX_ACL */ +# include <linux/lustre_intent.h> +#include <lustre_handles.h> +#include <linux/libcfs/libcfs.h> +#include <obd_class.h> +#include <lustre/lustre_idl.h> +#include <lustre_lib.h> +#include <lustre_dlm.h> +#include <lustre_export.h> + +struct ptlrpc_client; +struct obd_export; +struct ptlrpc_request; +struct obd_device; + +struct mdc_rpc_lock { + struct mutex rpcl_mutex; + struct lookup_intent *rpcl_it; + int rpcl_fakes; +}; + +#define MDC_FAKE_RPCL_IT ((void *)0x2c0012bfUL) + +static inline void mdc_init_rpc_lock(struct mdc_rpc_lock *lck) +{ + mutex_init(&lck->rpcl_mutex); + lck->rpcl_it = NULL; +} + +static inline void mdc_get_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ + ENTRY; + + if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP)) + return; + + /* This would normally block until the existing request finishes. 
+ * If fail_loc is set it will block until the regular request is + * done, then set rpcl_it to MDC_FAKE_RPCL_IT. Once that is set + * it will only be cleared when all fake requests are finished. + * Only when all fake requests are finished can normal requests + * be sent, to ensure they are recoverable again. */ + again: + mutex_lock(&lck->rpcl_mutex); + + if (CFS_FAIL_CHECK_QUIET(OBD_FAIL_MDC_RPCS_SEM)) { + lck->rpcl_it = MDC_FAKE_RPCL_IT; + lck->rpcl_fakes++; + mutex_unlock(&lck->rpcl_mutex); + return; + } + + /* This will only happen when the CFS_FAIL_CHECK() was + * just turned off but there are still requests in progress. + * Wait until they finish. It doesn't need to be efficient + * in this extremely rare case, just have low overhead in + * the common case when it isn't true. */ + while (unlikely(lck->rpcl_it == MDC_FAKE_RPCL_IT)) { + mutex_unlock(&lck->rpcl_mutex); + schedule_timeout(cfs_time_seconds(1) / 4); + goto again; + } + + LASSERT(lck->rpcl_it == NULL); + lck->rpcl_it = it; +} + +static inline void mdc_put_rpc_lock(struct mdc_rpc_lock *lck, + struct lookup_intent *it) +{ + if (it != NULL && (it->it_op == IT_GETATTR || it->it_op == IT_LOOKUP)) + goto out; + + if (lck->rpcl_it == MDC_FAKE_RPCL_IT) { /* OBD_FAIL_MDC_RPCS_SEM */ + mutex_lock(&lck->rpcl_mutex); + + LASSERTF(lck->rpcl_fakes > 0, "%d\n", lck->rpcl_fakes); + lck->rpcl_fakes--; + + if (lck->rpcl_fakes == 0) + lck->rpcl_it = NULL; + + } else { + LASSERTF(it == lck->rpcl_it, "%p != %p\n", it, lck->rpcl_it); + lck->rpcl_it = NULL; + } + + mutex_unlock(&lck->rpcl_mutex); + out: + EXIT; +} + +static inline void mdc_update_max_ea_from_body(struct obd_export *exp, + struct mdt_body *body) +{ + if (body->valid & OBD_MD_FLMODEASIZE) { + if (exp->exp_obd->u.cli.cl_max_mds_easize < body->max_mdsize) + exp->exp_obd->u.cli.cl_max_mds_easize = + body->max_mdsize; + if (exp->exp_obd->u.cli.cl_max_mds_cookiesize < + body->max_cookiesize) + exp->exp_obd->u.cli.cl_max_mds_cookiesize = + body->max_cookiesize; 
+ } +} + + +struct mdc_cache_waiter { + struct list_head mcw_entry; + wait_queue_head_t mcw_waitq; +}; + +/* mdc/mdc_locks.c */ +int it_disposition(struct lookup_intent *it, int flag); +void it_clear_disposition(struct lookup_intent *it, int flag); +void it_set_disposition(struct lookup_intent *it, int flag); +int it_open_error(int phase, struct lookup_intent *it); + +/** @} mdc */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_mds.h b/drivers/staging/lustre/lustre/include/lustre_mds.h new file mode 100644 index 000000000000..b386f87471e3 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_mds.h @@ -0,0 +1,81 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/include/lustre_mds.h + * + * MDS data structures. + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_MDS_H +#define _LUSTRE_MDS_H + +/** \defgroup mds mds + * + * @{ + */ + +#include <lustre_handles.h> +#include <linux/libcfs/libcfs.h> +#include <lustre/lustre_idl.h> +#include <lustre_lib.h> +#include <lustre_dlm.h> +#include <lustre_export.h> + +struct mds_group_info { + struct obd_uuid *uuid; + int group; +}; + +struct mds_capa_info { + struct obd_uuid *uuid; + struct lustre_capa_key *capa; +}; + +#define MDD_OBD_NAME "mdd_obd" +#define MDD_OBD_UUID "mdd_obd_uuid" + +static inline int md_should_create(__u64 flags) +{ + return !(flags & MDS_OPEN_DELAY_CREATE || + !(flags & FMODE_WRITE)); +} + +/* these are local flags, used only on the client, private */ +#define M_CHECK_STALE 0200000000 + +/** @} mds */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_mdt.h b/drivers/staging/lustre/lustre/include/lustre_mdt.h new file mode 100644 index 000000000000..dba26a6cfa38 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_mdt.h @@ -0,0 +1,84 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef __LINUX_MDT_H +#define __LINUX_MDT_H + +/** \defgroup mdt mdt + * + * @{ + */ + +#include <lustre/lustre_idl.h> +#include <lustre_req_layout.h> +#include <md_object.h> +#include <dt_object.h> +#include <linux/libcfs/libcfs.h> + +/* + * Common thread info for mdt, seq and fld + */ +struct com_thread_info { + /* + * for req-layout interface. + */ + struct req_capsule *cti_pill; +}; + +enum { + ESERIOUS = 0x0001000 +}; + +static inline int err_serious(int rc) +{ + LASSERT(rc < 0); + LASSERT(-rc < ESERIOUS); + return -(-rc | ESERIOUS); +} + +static inline int clear_serious(int rc) +{ + if (rc < 0) + rc = -(-rc & ~ESERIOUS); + return rc; +} + +static inline int is_serious(int rc) +{ + return (rc < 0 && -rc & ESERIOUS); +} + +/** @} mdt */ + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_net.h b/drivers/staging/lustre/lustre/include/lustre_net.h new file mode 100644 index 000000000000..293dd90e5b6c --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_net.h @@ -0,0 +1,3451 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2010, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/** \defgroup PtlRPC Portal RPC and networking module. + * + * PortalRPC is the layer used by rest of lustre code to achieve network + * communications: establish connections with corresponding export and import + * states, listen for a service, send and receive RPCs. + * PortalRPC also includes base recovery framework: packet resending and + * replaying, reconnections, pinger. + * + * PortalRPC utilizes LNet as its transport layer. 
+ * + * @{ + */ + + +#ifndef _LUSTRE_NET_H +#define _LUSTRE_NET_H + +/** \defgroup net net + * + * @{ + */ + +#include <linux/lustre_net.h> + +#include <linux/libcfs/libcfs.h> +// #include <obd.h> +#include <linux/lnet/lnet.h> +#include <lustre/lustre_idl.h> +#include <lustre_ha.h> +#include <lustre_sec.h> +#include <lustre_import.h> +#include <lprocfs_status.h> +#include <lu_object.h> +#include <lustre_req_layout.h> + +#include <obd_support.h> +#include <lustre_ver.h> + +/* MD flags we _always_ use */ +#define PTLRPC_MD_OPTIONS 0 + +/** + * Max # of bulk operations in one request. + * In order for the client and server to properly negotiate the maximum + * possible transfer size, PTLRPC_BULK_OPS_COUNT must be a power-of-two + * value. The client is free to limit the actual RPC size for any bulk + * transfer via cl_max_pages_per_rpc to some non-power-of-two value. */ +#define PTLRPC_BULK_OPS_BITS 2 +#define PTLRPC_BULK_OPS_COUNT (1U << PTLRPC_BULK_OPS_BITS) +/** + * PTLRPC_BULK_OPS_MASK is for the convenience of the client only, and + * should not be used on the server at all. Otherwise, it imposes a + * protocol limitation on the maximum RPC size that can be used by any + * RPC sent to that server in the future. Instead, the server should + * use the negotiated per-client ocd_brw_size to determine the bulk + * RPC count. */ +#define PTLRPC_BULK_OPS_MASK (~((__u64)PTLRPC_BULK_OPS_COUNT - 1)) + +/** + * Define maxima for bulk I/O. + * + * A single PTLRPC BRW request is sent via up to PTLRPC_BULK_OPS_COUNT + * of LNET_MTU sized RDMA transfers. Clients and servers negotiate the + * currently supported maximum between peers at connect via ocd_brw_size. 
+ */ +#define PTLRPC_MAX_BRW_BITS (LNET_MTU_BITS + PTLRPC_BULK_OPS_BITS) +#define PTLRPC_MAX_BRW_SIZE (1 << PTLRPC_MAX_BRW_BITS) +#define PTLRPC_MAX_BRW_PAGES (PTLRPC_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT) + +#define ONE_MB_BRW_SIZE (1 << LNET_MTU_BITS) +#define MD_MAX_BRW_SIZE (1 << LNET_MTU_BITS) +#define MD_MAX_BRW_PAGES (MD_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT) +#define DT_MAX_BRW_SIZE PTLRPC_MAX_BRW_SIZE +#define DT_MAX_BRW_PAGES (DT_MAX_BRW_SIZE >> PAGE_CACHE_SHIFT) +#define OFD_MAX_BRW_SIZE (1 << LNET_MTU_BITS) + +/* When PAGE_SIZE is a constant, we can check our arithmetic here with cpp! */ +# if ((PTLRPC_MAX_BRW_PAGES & (PTLRPC_MAX_BRW_PAGES - 1)) != 0) +# error "PTLRPC_MAX_BRW_PAGES isn't a power of two" +# endif +# if (PTLRPC_MAX_BRW_SIZE != (PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE)) +# error "PTLRPC_MAX_BRW_SIZE isn't PTLRPC_MAX_BRW_PAGES * PAGE_CACHE_SIZE" +# endif +# if (PTLRPC_MAX_BRW_SIZE > LNET_MTU * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_SIZE too big" +# endif +# if (PTLRPC_MAX_BRW_PAGES > LNET_MAX_IOV * PTLRPC_BULK_OPS_COUNT) +# error "PTLRPC_MAX_BRW_PAGES too big" +# endif + +#define PTLRPC_NTHRS_INIT 2 + +/** + * Buffer Constants + * + * Constants determine how memory is used to buffer incoming service requests. + * + * ?_NBUFS # buffers to allocate when growing the pool + * ?_BUFSIZE # bytes in a single request buffer + * ?_MAXREQSIZE # maximum request service will receive + * + * When fewer than ?_NBUFS/2 buffers are posted for receive, another chunk + * of ?_NBUFS is added to the pool. + * + * Messages larger than ?_MAXREQSIZE are dropped. Request buffers are + * considered full when less than ?_MAXREQSIZE is left in them. + */ +/** + * Thread Constants + * + * Constants determine how threads are created for ptlrpc service. + * + * ?_NTHRS_INIT # threads to create for each service partition on + * initializing. If it's non-affinity service and + * there is only one partition, it's the overall # + * threads for the service while initializing. 
+ * ?_NTHRS_BASE # threads should be created at least for each + * ptlrpc partition to keep the service healthy. + * It's the low-water mark of threads upper-limit + * for each partition. + * ?_THR_FACTOR # threads can be added on threads upper-limit for + * each CPU core. This factor is only for reference, + * we might decrease value of factor if number of cores + * per CPT is above a limit. + * ?_NTHRS_MAX # overall threads can be created for a service, + * it's a soft limit because if service is running + * on machine with hundreds of cores and tens of + * CPU partitions, we need to guarantee each partition + * has ?_NTHRS_BASE threads, which means total threads + * will be ?_NTHRS_BASE * number_of_cpts which can + * exceed ?_NTHRS_MAX. + * + * Examples + * + * #define MDS_NTHRS_INIT 2 + * #define MDS_NTHRS_BASE 64 + * #define MDS_NTHRS_FACTOR 8 + * #define MDS_NTHRS_MAX 1024 + * + * Example 1): + * --------------------------------------------------------------------- + * Server(A) has 16 cores, user configured it to 4 partitions so each + * partition has 4 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(4) * MDS_NTHRS_FACTOR(8) = 96 + * + * Total number of threads for the service is: + * 96 * partitions(4) = 384 + * + * Example 2): + * --------------------------------------------------------------------- + * Server(B) has 32 cores, user configured it to 4 partitions so each + * partition has 8 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(8) * MDS_NTHRS_FACTOR(8) = 128 + * + * Total number of threads for the service is: + * 128 * partitions(4) = 512 + * + * Example 3): + * --------------------------------------------------------------------- + * Server(B) has 96 cores, user configured it to 8 partitions so each + * partition has 12 cores, then actual number of service threads on each + * partition is: + * MDS_NTHRS_BASE(64) + cores(12) * 
MDS_NTHRS_FACTOR(8) = 160 + * + * Total number of threads for the service is: + * 160 * partitions(8) = 1280 + * + * However, it's above the soft limit MDS_NTHRS_MAX, so we choose this number + * as upper limit of threads number for each partition: + * MDS_NTHRS_MAX(1024) / partitions(8) = 128 + * + * Example 4): + * --------------------------------------------------------------------- + * Server(C) has a thousand cores and user configured it to 32 partitions + * MDS_NTHRS_BASE(64) * 32 = 2048 + * + * which is already above soft limit MDS_NTHRS_MAX(1024), but we still need + * to guarantee that each partition has at least MDS_NTHRS_BASE(64) threads + * to keep service healthy, so total number of threads will just be 2048. + * + * NB: we don't recommend choosing a server with that many cores because backend + * filesystem itself, buffer cache, or underlying network stack might + * have some SMP scalability issues at that large scale. + * + * If user already has a fat machine with hundreds or thousands of cores, + * there are two choices for configuration: + * a) create CPU table from subset of all CPUs and run Lustre on + * top of this subset + * b) bind service threads on a few partitions, see modparameters of + * MDS and OSS for details + * + * NB: these calculations (and examples below) are simplified to help + * understanding, the real implementation is a little more complex, + * please see ptlrpc_server_nthreads_check() for details. + * + */ + + /* + * LDLM threads constants: + * + * Given 8 as factor and 24 as base threads number + * + * example 1) + * On 4-core machine we will have 24 + 8 * 4 = 56 threads. + * + * example 2) + * On 8-core machine with 2 partitions we will have 24 + 4 * 8 = 56 + * threads for each partition and total threads number will be 112. + * + * example 3) + * On 64-core machine with 8 partitions we will need LDLM_NTHRS_BASE(24) + * threads for each partition to keep service healthy, so total threads + * number should be 24 * 8 = 192.
+ * + * So with these constants, threads number will be at the similar level + * of old versions, unless target machine has over a hundred cores + */ +#define LDLM_THR_FACTOR 8 +#define LDLM_NTHRS_INIT PTLRPC_NTHRS_INIT +#define LDLM_NTHRS_BASE 24 +#define LDLM_NTHRS_MAX (num_online_cpus() == 1 ? 64 : 128) + +#define LDLM_BL_THREADS LDLM_NTHRS_AUTO_INIT +#define LDLM_CLIENT_NBUFS 1 +#define LDLM_SERVER_NBUFS 64 +#define LDLM_BUFSIZE (8 * 1024) +#define LDLM_MAXREQSIZE (5 * 1024) +#define LDLM_MAXREPSIZE (1024) + + /* + * MDS threads constants: + * + * Please see examples in "Thread Constants", MDS threads number will be at + * the comparable level of old versions, unless the server has many cores. + */ +#ifndef MDS_MAX_THREADS +#define MDS_MAX_THREADS 1024 +#define MDS_MAX_OTHR_THREADS 256 + +#else /* MDS_MAX_THREADS */ +#if MDS_MAX_THREADS < PTLRPC_NTHRS_INIT +#undef MDS_MAX_THREADS +#define MDS_MAX_THREADS PTLRPC_NTHRS_INIT +#endif +#define MDS_MAX_OTHR_THREADS max(PTLRPC_NTHRS_INIT, MDS_MAX_THREADS / 2) +#endif + +/* default service */ +#define MDS_THR_FACTOR 8 +#define MDS_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_NTHRS_MAX MDS_MAX_THREADS +#define MDS_NTHRS_BASE min(64, MDS_NTHRS_MAX) + +/* read-page service */ +#define MDS_RDPG_THR_FACTOR 4 +#define MDS_RDPG_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_RDPG_NTHRS_MAX MDS_MAX_OTHR_THREADS +#define MDS_RDPG_NTHRS_BASE min(48, MDS_RDPG_NTHRS_MAX) + +/* these should be removed when we remove setattr service in the future */ +#define MDS_SETA_THR_FACTOR 4 +#define MDS_SETA_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_SETA_NTHRS_MAX MDS_MAX_OTHR_THREADS +#define MDS_SETA_NTHRS_BASE min(48, MDS_SETA_NTHRS_MAX) + +/* non-affinity threads */ +#define MDS_OTHR_NTHRS_INIT PTLRPC_NTHRS_INIT +#define MDS_OTHR_NTHRS_MAX MDS_MAX_OTHR_THREADS + +#define MDS_NBUFS 64 + +/** + * Assume file name length = FNAME_MAX = 256 (true for ext3). 
+ * path name length = PATH_MAX = 4096 + * LOV MD size max = EA_MAX = 24 * 2000 + * (NB: 24 is size of lov_ost_data) + * LOV LOGCOOKIE size max = 32 * 2000 + * (NB: 32 is size of llog_cookie) + * symlink: FNAME_MAX + PATH_MAX <- largest + * link: FNAME_MAX + PATH_MAX (mds_rec_link < mds_rec_create) + * rename: FNAME_MAX + FNAME_MAX + * open: FNAME_MAX + EA_MAX + * + * MDS_MAXREQSIZE ~= 4736 bytes = + * lustre_msg + ldlm_request + mdt_body + mds_rec_create + FNAME_MAX + PATH_MAX + * MDS_MAXREPSIZE ~= 8300 bytes = lustre_msg + llog_header + * + * Realistic size is about 512 bytes (20 character name + 128 char symlink), + * except in the open case where there are a large number of OSTs in a LOV. + */ +#define MDS_MAXREQSIZE (5 * 1024) /* >= 4736 */ +#define MDS_MAXREPSIZE (9 * 1024) /* >= 8300 */ + +/** + * MDS incoming request with LOV EA + * 24 = sizeof(struct lov_ost_data), i.e: replay of opencreate + */ +#define MDS_LOV_MAXREQSIZE max(MDS_MAXREQSIZE, \ + 362 + LOV_MAX_STRIPE_COUNT * 24) +/** + * MDS outgoing reply with LOV EA + * + * NB: max reply size Lustre 2.4+ client can get from old MDS is: + * LOV_MAX_STRIPE_COUNT * (llog_cookie + lov_ost_data) + extra bytes + * + * but 2.4 or later MDS will never send reply with llog_cookie to any + * version client. This macro is defined for server side reply buffer size. + */ +#define MDS_LOV_MAXREPSIZE MDS_LOV_MAXREQSIZE + +/** + * This is the size of a maximum REINT_SETXATTR request: + * + * lustre_msg 56 (32 + 4 x 5 + 4) + * ptlrpc_body 184 + * mdt_rec_setxattr 136 + * lustre_capa 120 + * name 256 (XATTR_NAME_MAX) + * value 65536 (XATTR_SIZE_MAX) + */ +#define MDS_EA_MAXREQSIZE 66288 + +/** + * These are the maximum request and reply sizes (rounded up to 1 KB + * boundaries) for the "regular" MDS_REQUEST_PORTAL and MDS_REPLY_PORTAL. 
+ */ +#define MDS_REG_MAXREQSIZE (((max(MDS_EA_MAXREQSIZE, \ + MDS_LOV_MAXREQSIZE) + 1023) >> 10) << 10) +#define MDS_REG_MAXREPSIZE MDS_REG_MAXREQSIZE + +/** + * The update request includes all of the updates from the create, which might + * include linkea (4K maximum), together with other updates, we set it to 9K: + * lustre_msg + ptlrpc_body + UPDATE_BUF_SIZE (8K) + */ +#define MDS_OUT_MAXREQSIZE (9 * 1024) +#define MDS_OUT_MAXREPSIZE MDS_MAXREPSIZE + +/** MDS_BUFSIZE = max_reqsize (w/o LOV EA) + max sptlrpc payload size */ +#define MDS_BUFSIZE max(MDS_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \ + 8 * 1024) + +/** + * MDS_REG_BUFSIZE should at least be MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD. + * However, we need to allocate a much larger buffer for it because LNet + * requires each MD(rqbd) has at least MDS_REG_MAXREQSIZE bytes left to avoid + * dropping of maximum-sized incoming request. So if MDS_REG_BUFSIZE is only a + * little larger than MDS_REG_MAXREQSIZE, then it can only fit in one request + * even if there are about MDS_REG_MAXREQSIZE bytes left in a rqbd, and memory + * utilization is very low. + * + * Meanwhile, the size of a rqbd can't be too large, because a rqbd can't be + * reused until all requests that fit in it have been processed and released, + * which means one long blocked request can prevent the rqbd from being reused. + * Now we set request buffer size to 160 KB, so even if each rqbd is unlinked + * from LNet with unused 65 KB, buffer utilization will be about 59%. + * Please check LU-2432 for details. + */ +#define MDS_REG_BUFSIZE max(MDS_REG_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \ + 160 * 1024) + +/** + * MDS_OUT_BUFSIZE = max_out_reqsize + max sptlrpc payload (~1K) which is + * about 10K, for the same reason as MDS_REG_BUFSIZE, we also give some + * extra bytes to each request buffer to improve buffer utilization rate.
+ */ +#define MDS_OUT_BUFSIZE max(MDS_OUT_MAXREQSIZE + SPTLRPC_MAX_PAYLOAD, \ + 24 * 1024) + +/** FLD_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc */ +#define FLD_MAXREQSIZE (160) + +/** FLD_MAXREPSIZE == lustre_msg + ptlrpc_body */ +#define FLD_MAXREPSIZE (152) +#define FLD_BUFSIZE (1 << 12) + +/** + * SEQ_MAXREQSIZE == lustre_msg + __u32 padding + ptlrpc_body + opc + lu_range + + * __u32 padding */ +#define SEQ_MAXREQSIZE (160) + +/** SEQ_MAXREPSIZE == lustre_msg + ptlrpc_body + lu_range */ +#define SEQ_MAXREPSIZE (152) +#define SEQ_BUFSIZE (1 << 12) + +/** MGS threads must be >= 3, see bug 22458 comment #28 */ +#define MGS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1) +#define MGS_NTHRS_MAX 32 + +#define MGS_NBUFS 64 +#define MGS_BUFSIZE (8 * 1024) +#define MGS_MAXREQSIZE (7 * 1024) +#define MGS_MAXREPSIZE (9 * 1024) + + /* + * OSS threads constants: + * + * Given 8 as factor and 64 as base threads number + * + * example 1): + * On 8-core server configured to 2 partitions, we will have + * 64 + 8 * 4 = 96 threads for each partition, 192 total threads. + * + * example 2): + * On 32-core machine configured to 4 partitions, we will have + * 64 + 8 * 8 = 128 threads for each partition, so total threads number + * will be 128 * 4 = 512. + * + * example 3): + * On 64-core machine configured to 4 partitions, we will have + * 64 + 16 * 8 = 192 threads for each partition, so total threads number + * will be 192 * 4 = 768 which is above limit OSS_NTHRS_MAX(512), so we + * cut off the value to OSS_NTHRS_MAX(512) / 4 which is 128 threads + * for each partition. + * + * So we can see that with these constants, threads number will be at the + * similar level of old versions, unless the server has many cores.
+ */ + /* depress threads factor for VM with small memory size */ +#define OSS_THR_FACTOR min_t(int, 8, \ + NUM_CACHEPAGES >> (28 - PAGE_CACHE_SHIFT)) +#define OSS_NTHRS_INIT (PTLRPC_NTHRS_INIT + 1) +#define OSS_NTHRS_BASE 64 +#define OSS_NTHRS_MAX 512 + +/* threads for handling "create" request */ +#define OSS_CR_THR_FACTOR 1 +#define OSS_CR_NTHRS_INIT PTLRPC_NTHRS_INIT +#define OSS_CR_NTHRS_BASE 8 +#define OSS_CR_NTHRS_MAX 64 + +/** + * OST_IO_MAXREQSIZE ~= + * lustre_msg + ptlrpc_body + obdo + obd_ioobj + + * DT_MAX_BRW_PAGES * niobuf_remote + * + * - single object with 16 pages is 512 bytes + * - OST_IO_MAXREQSIZE must be at least 1 page of cookies plus some spillover + * - Must be a multiple of 1024 + * - actual size is about 18K + */ +#define _OST_MAXREQSIZE_SUM (sizeof(struct lustre_msg) + \ + sizeof(struct ptlrpc_body) + \ + sizeof(struct obdo) + \ + sizeof(struct obd_ioobj) + \ + sizeof(struct niobuf_remote) * DT_MAX_BRW_PAGES) +/** + * FIEMAP request can be 4K+ for now + */ +#define OST_MAXREQSIZE (5 * 1024) +#define OST_IO_MAXREQSIZE max_t(int, OST_MAXREQSIZE, \ + (((_OST_MAXREQSIZE_SUM - 1) | (1024 - 1)) + 1)) + +#define OST_MAXREPSIZE (9 * 1024) +#define OST_IO_MAXREPSIZE OST_MAXREPSIZE + +#define OST_NBUFS 64 +/** OST_BUFSIZE = max_reqsize + max sptlrpc payload size */ +#define OST_BUFSIZE max_t(int, OST_MAXREQSIZE + 1024, 16 * 1024) +/** + * OST_IO_MAXREQSIZE is 18K, giving extra 46K can increase buffer utilization + * rate of request buffer, please check comment of MDS_REG_BUFSIZE for details. + */ +#define OST_IO_BUFSIZE max_t(int, OST_IO_MAXREQSIZE + 1024, 64 * 1024) + +/* + * Macro to hide a typecast: yields the address of \a req's rq_async_args + * member as a void pointer. The argument is parenthesized so that any + * pointer-valued expression can be passed safely. + */ +#define ptlrpc_req_async_args(req) ((void *)&(req)->rq_async_args) + +/** + * Structure that defines a single portal connection.
+ */ +struct ptlrpc_connection { + /** linkage for connections hash table */ + struct hlist_node c_hash; + /** Our own lnet nid for this connection */ + lnet_nid_t c_self; + /** Remote side nid for this connection */ + lnet_process_id_t c_peer; + /** UUID of the other side */ + struct obd_uuid c_remote_uuid; + /** reference counter for this connection */ + atomic_t c_refcount; +}; + +/** Client definition for PortalRPC */ +struct ptlrpc_client { + /** What lnet portal does this client send messages to by default */ + __u32 cli_request_portal; + /** What portal do we expect replies on */ + __u32 cli_reply_portal; + /** Name of the client */ + char *cli_name; +}; + +/** state flags of requests */ +/* XXX only ones left are those used by the bulk descs as well! */ +#define PTL_RPC_FL_INTR (1 << 0) /* reply wait was interrupted by user */ +#define PTL_RPC_FL_TIMEOUT (1 << 7) /* request timed out waiting for reply */ + +#define REQ_MAX_ACK_LOCKS 8 + +union ptlrpc_async_args { + /** + * Scratchpad for passing args to completion interpreter. Users + * cast to the struct of their choosing, and CLASSERT that this is + * big enough. For _tons_ of context, OBD_ALLOC a struct and store + * a pointer to it here. The pointer_arg ensures this struct is at + * least big enough for that. + */ + void *pointer_arg[11]; + __u64 space[7]; +}; + +struct ptlrpc_request_set; +typedef int (*set_interpreter_func)(struct ptlrpc_request_set *, void *, int); +typedef int (*set_producer_func)(struct ptlrpc_request_set *, void *); + +/** + * Definition of request set structure. + * Request set is a list of requests (not necessary to the same target) that + * once populated with RPCs could be sent in parallel. + * There are two kinds of request sets. General purpose and with dedicated + * serving thread. Example of the latter is ptlrpcd set. + * For general purpose sets once request set started sending it is impossible + * to add new requests to such set. 
+ * Provides a way to call "completion callbacks" when all requests in the set + * returned. + */ +struct ptlrpc_request_set { + atomic_t set_refcount; + /** number of in-queue requests */ + atomic_t set_new_count; + /** number of uncompleted requests */ + atomic_t set_remaining; + /** wait queue to wait on for request events */ + wait_queue_head_t set_waitq; + wait_queue_head_t *set_wakeup_ptr; + /** List of requests in the set */ + struct list_head set_requests; + /** + * List of completion callbacks to be called when the set is completed. + * This is only used if \a set_interpret is NULL. + * Links struct ptlrpc_set_cbdata. + */ + struct list_head set_cblist; + /** Completion callback, if only one. */ + set_interpreter_func set_interpret; + /** opaque argument passed to completion \a set_interpret callback. */ + void *set_arg; + /** + * Lock for \a set_new_requests manipulations + * locked so that any old caller can communicate requests to + * the set holder who can then fold them into the lock-free set + */ + spinlock_t set_new_req_lock; + /** List of new yet unsent requests. Only used with ptlrpcd now.
*/ + struct list_head set_new_requests; + + /** rq_status of requests that have been freed already */ + int set_rc; + /** Additional fields used by the flow control extension */ + /** Maximum number of RPCs in flight */ + int set_max_inflight; + /** Callback function used to generate RPCs */ + set_producer_func set_producer; + /** opaque argument passed to the producer callback */ + void *set_producer_arg; +}; + +/** + * Description of a single ptlrpc_set callback + */ +struct ptlrpc_set_cbdata { + /** List linkage item */ + struct list_head psc_item; + /** Pointer to interpreting function */ + set_interpreter_func psc_interpret; + /** Opaque argument to pass to the callback */ + void *psc_data; +}; + +struct ptlrpc_bulk_desc; +struct ptlrpc_service_part; +struct ptlrpc_service; + +/** + * ptlrpc callback & work item stuff + */ +struct ptlrpc_cb_id { + void (*cbid_fn)(lnet_event_t *ev); /* specific callback fn */ + void *cbid_arg; /* additional arg */ +}; + +/** Maximum number of locks to fit into reply state */ +#define RS_MAX_LOCKS 8 +#define RS_DEBUG 0 + +/** + * Structure to define reply state on the server + * Reply state holds various reply message information. Also for "difficult" + * replies (rep-ack case) we store the state after sending reply and wait + * for the client to acknowledge the reception. In these cases locks could be + * added to the state for replay/failover consistency guarantees.
+ */ +struct ptlrpc_reply_state { + /** Callback description */ + struct ptlrpc_cb_id rs_cb_id; + /** Linkage for list of all reply states in a system */ + struct list_head rs_list; + /** Linkage for list of all reply states on same export */ + struct list_head rs_exp_list; + /** Linkage for list of all reply states for same obd */ + struct list_head rs_obd_list; +#if RS_DEBUG + struct list_head rs_debug_list; +#endif + /** A spinlock to protect the reply state flags */ + spinlock_t rs_lock; + /** Reply state flags */ + unsigned long rs_difficult:1; /* ACK/commit stuff */ + unsigned long rs_no_ack:1; /* no ACK, even for + difficult requests */ + unsigned long rs_scheduled:1; /* being handled? */ + unsigned long rs_scheduled_ever:1;/* any schedule attempts? */ + unsigned long rs_handled:1; /* been handled yet? */ + unsigned long rs_on_net:1; /* reply_out_callback pending? */ + unsigned long rs_prealloc:1; /* rs from prealloc list */ + unsigned long rs_committed:1;/* the transaction was committed + and the rs was dispatched + by ptlrpc_commit_replies */ + /** Size of the state */ + int rs_size; + /** opcode */ + __u32 rs_opc; + /** Transaction number */ + __u64 rs_transno; + /** xid */ + __u64 rs_xid; + struct obd_export *rs_export; + struct ptlrpc_service_part *rs_svcpt; + /** Lnet metadata handle for the reply */ + lnet_handle_md_t rs_md_h; + atomic_t rs_refcount; + + /** Context for the service thread */ + struct ptlrpc_svc_ctx *rs_svc_ctx; + /** Reply buffer (actually sent to the client), encoded if needed */ + struct lustre_msg *rs_repbuf; /* wrapper */ + /** Size of the reply buffer */ + int rs_repbuf_len; /* wrapper buf length */ + /** Size of the reply message */ + int rs_repdata_len; /* wrapper msg length */ + /** + * Actual reply message. Its content is encrypted (if needed) to + * produce reply buffer for actual sending.
In simple case + * of no network encryption we jus set \a rs_repbuf to \a rs_msg + */ + struct lustre_msg *rs_msg; /* reply message */ + + /** Number of locks awaiting client ACK */ + int rs_nlocks; + /** Handles of locks awaiting client reply ACK */ + struct lustre_handle rs_locks[RS_MAX_LOCKS]; + /** Lock modes of locks in \a rs_locks */ + ldlm_mode_t rs_modes[RS_MAX_LOCKS]; +}; + +struct ptlrpc_thread; + +/** RPC stages */ +enum rq_phase { + RQ_PHASE_NEW = 0xebc0de00, + RQ_PHASE_RPC = 0xebc0de01, + RQ_PHASE_BULK = 0xebc0de02, + RQ_PHASE_INTERPRET = 0xebc0de03, + RQ_PHASE_COMPLETE = 0xebc0de04, + RQ_PHASE_UNREGISTERING = 0xebc0de05, + RQ_PHASE_UNDEFINED = 0xebc0de06 +}; + +/** Type of request interpreter call-back */ +typedef int (*ptlrpc_interpterer_t)(const struct lu_env *env, + struct ptlrpc_request *req, + void *arg, int rc); + +/** + * Definition of request pool structure. + * The pool is used to store empty preallocated requests for the case + * when we would actually need to send something without performing + * any allocations (to avoid e.g. OOM). + */ +struct ptlrpc_request_pool { + /** Locks the list */ + spinlock_t prp_lock; + /** list of ptlrpc_request structs */ + struct list_head prp_req_list; + /** Maximum message size that would fit into a request from this pool */ + int prp_rq_size; + /** Function to allocate more requests for this pool */ + void (*prp_populate)(struct ptlrpc_request_pool *, int); +}; + +struct lu_context; +struct lu_env; + +struct ldlm_lock; + +/** + * \defgroup nrs Network Request Scheduler + * @{ + */ +struct ptlrpc_nrs_policy; +struct ptlrpc_nrs_resource; +struct ptlrpc_nrs_request; + +/** + * NRS control operations. + * + * These are common for all policies. + */ +enum ptlrpc_nrs_ctl { + /** + * Not a valid opcode. + */ + PTLRPC_NRS_CTL_INVALID, + /** + * Activate the policy. + */ + PTLRPC_NRS_CTL_START, + /** + * Reserved for multiple primary policies, which may be a possibility + * in the future.
+ */ + PTLRPC_NRS_CTL_STOP, + /** + * Policies can start using opcodes from this value and onwards for + * their own purposes; the assigned value itself is arbitrary. + */ + PTLRPC_NRS_CTL_1ST_POL_SPEC = 0x20, +}; + +/** + * ORR policy operations + */ +enum nrs_ctl_orr { + NRS_CTL_ORR_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC, + NRS_CTL_ORR_WR_QUANTUM, + NRS_CTL_ORR_RD_OFF_TYPE, + NRS_CTL_ORR_WR_OFF_TYPE, + NRS_CTL_ORR_RD_SUPP_REQ, + NRS_CTL_ORR_WR_SUPP_REQ, +}; + +/** + * NRS policy operations. + * + * These determine the behaviour of a policy, and are called in response to + * NRS core events. + */ +struct ptlrpc_nrs_pol_ops { + /** + * Called during policy registration; this operation is optional. + * + * \param[in,out] policy The policy being initialized + */ + int (*op_policy_init) (struct ptlrpc_nrs_policy *policy); + /** + * Called during policy unregistration; this operation is optional. + * + * \param[in,out] policy The policy being unregistered/finalized + */ + void (*op_policy_fini) (struct ptlrpc_nrs_policy *policy); + /** + * Called when activating a policy via lprocfs; policies allocate and + * initialize their resources here; this operation is optional. + * + * \param[in,out] policy The policy being started + * + * \see nrs_policy_start_locked() + */ + int (*op_policy_start) (struct ptlrpc_nrs_policy *policy); + /** + * Called when deactivating a policy via lprocfs; policies deallocate + * their resources here; this operation is optional + * + * \param[in,out] policy The policy being stopped + * + * \see nrs_policy_stop0() + */ + void (*op_policy_stop) (struct ptlrpc_nrs_policy *policy); + /** + * Used for policy-specific operations; i.e. not generic ones like + * \e PTLRPC_NRS_CTL_START and \e PTLRPC_NRS_CTL_GET_INFO; analogous + * to an ioctl; this operation is optional. 
+ * + * \param[in,out] policy The policy carrying out operation \a opc + * \param[in] opc The command operation being carried out + * \param[in,out] arg An generic buffer for communication between the + * user and the control operation + * + * \retval -ve error + * \retval 0 success + * + * \see ptlrpc_nrs_policy_control() + */ + int (*op_policy_ctl) (struct ptlrpc_nrs_policy *policy, + enum ptlrpc_nrs_ctl opc, void *arg); + + /** + * Called when obtaining references to the resources of the resource + * hierarchy for a request that has arrived for handling at the PTLRPC + * service. Policies should return -ve for requests they do not wish + * to handle. This operation is mandatory. + * + * \param[in,out] policy The policy we're getting resources for. + * \param[in,out] nrq The request we are getting resources for. + * \param[in] parent The parent resource of the resource being + * requested; set to NULL if none. + * \param[out] resp The resource is to be returned here; the + * fallback policy in an NRS head should + * \e always return a non-NULL pointer value. + * \param[in] moving_req When set, signifies that this is an attempt + * to obtain resources for a request being moved + * to the high-priority NRS head by + * ldlm_lock_reorder_req(). + * This implies two things: + * 1. We are under obd_export::exp_rpc_lock and + * so should not sleep. + * 2. We should not perform non-idempotent or can + * skip performing idempotent operations that + * were carried out when resources were first + * taken for the request when it was initialized + * in ptlrpc_nrs_req_initialize(). + * + * \retval 0, +ve The level of the returned resource in the resource + * hierarchy; currently only 0 (for a non-leaf resource) + * and 1 (for a leaf resource) are supported by the + * framework. 
+ * \retval -ve error + * + * \see ptlrpc_nrs_req_initialize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + int (*op_res_get) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq, + const struct ptlrpc_nrs_resource *parent, + struct ptlrpc_nrs_resource **resp, + bool moving_req); + /** + * Called when releasing references taken for resources in the resource + * hierarchy for the request; this operation is optional. + * + * \param[in,out] policy The policy the resource belongs to + * \param[in] res The resource to be freed + * + * \see ptlrpc_nrs_req_finalize() + * \see ptlrpc_nrs_hpreq_add_nolock() + * \see ptlrpc_nrs_req_hp_move() + */ + void (*op_res_put) (struct ptlrpc_nrs_policy *policy, + const struct ptlrpc_nrs_resource *res); + + /** + * Obtains a request for handling from the policy, and optionally + * removes the request from the policy; this operation is mandatory. + * + * \param[in,out] policy The policy to poll + * \param[in] peek When set, signifies that we just want to + * examine the request, and not handle it, so the + * request is not removed from the policy. + * \param[in] force When set, it will force a policy to return a + * request if it has one queued. + * + * \retval NULL No request available for handling + * \retval valid-pointer The request polled for handling + * + * \see ptlrpc_nrs_req_get_nolock() + */ + struct ptlrpc_nrs_request * + (*op_req_get) (struct ptlrpc_nrs_policy *policy, bool peek, + bool force); + /** + * Called when attempting to add a request to a policy for later + * handling; this operation is mandatory. + * + * \param[in,out] policy The policy on which to enqueue \a nrq + * \param[in,out] nrq The request to enqueue + * + * \retval 0 success + * \retval != 0 error + * + * \see ptlrpc_nrs_req_add_nolock() + */ + int (*op_req_enqueue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Removes a request from the policy's set of pending requests. 
Normally + * called after a request has been polled successfully from the policy + * for handling; this operation is mandatory. + * + * \param[in,out] policy The policy the request \a nrq belongs to + * \param[in,out] nrq The request to dequeue + * + * \see ptlrpc_nrs_req_del_nolock() + */ + void (*op_req_dequeue) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Called after the request has been carried out. Could be used for + * job/resource control; this operation is optional. + * + * \param[in,out] policy The policy which is stopping to handle request + * \a nrq + * \param[in,out] nrq The request + * + * \pre spin_is_locked(&svcpt->scp_req_lock) + * + * \see ptlrpc_nrs_req_stop_nolock() + */ + void (*op_req_stop) (struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_request *nrq); + /** + * Registers the policy's lprocfs interface with a PTLRPC service. + * + * \param[in] svc The service + * + * \retval 0 success + * \retval != 0 error + */ + int (*op_lprocfs_init) (struct ptlrpc_service *svc); + /** + * Unregisters the policy's lprocfs interface with a PTLRPC service. + * + * In cases of failed policy registration in + * \e ptlrpc_nrs_policy_register(), this function may be called for a + * service which has not registered the policy successfully, so + * implementations of this method should make sure their operations are + * safe in such cases. + * + * \param[in] svc The service + */ + void (*op_lprocfs_fini) (struct ptlrpc_service *svc); +}; + +/** + * Policy flags + */ +enum nrs_policy_flags { + /** + * Fallback policy, use this flag only on a single supported policy per + * service. The flag cannot be used on policies that use + * \e PTLRPC_NRS_FL_REG_EXTERN + */ + PTLRPC_NRS_FL_FALLBACK = (1 << 0), + /** + * Start policy immediately after registering. + */ + PTLRPC_NRS_FL_REG_START = (1 << 1), + /** + * This is a policy registering from a module different to the one NRS + * core ships in (currently ptlrpc).
+ */ + PTLRPC_NRS_FL_REG_EXTERN = (1 << 2), +}; + +/** + * NRS queue type. + * + * Denotes whether an NRS instance is for handling normal or high-priority + * RPCs, or whether an operation pertains to one or both of the NRS instances + * in a service. + */ +enum ptlrpc_nrs_queue_type { + PTLRPC_NRS_QUEUE_REG = (1 << 0), + PTLRPC_NRS_QUEUE_HP = (1 << 1), + PTLRPC_NRS_QUEUE_BOTH = (PTLRPC_NRS_QUEUE_REG | PTLRPC_NRS_QUEUE_HP) +}; + +/** + * NRS head + * + * A PTLRPC service has at least one NRS head instance for handling normal + * priority RPCs, and may optionally have a second NRS head instance for + * handling high-priority RPCs. Each NRS head maintains a list of available + * policies, of which one and only one policy is acting as the fallback policy, + * and optionally a different policy may be acting as the primary policy. For + * all RPCs handled by this NRS head instance, NRS core will first attempt to + * enqueue the RPC using the primary policy (if any). The fallback policy is + * used in the following cases: + * - when there was no primary policy in the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state at the time the request + * was initialized. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, denoted it did not wish, or for some other reason was + * not able to handle the request, by returning a non-valid NRS resource + * reference. + * - when the primary policy that was at the + * ptlrpc_nrs_pol_state::PTLRPC_NRS_POL_STATE_STARTED state at the time the + * RPC was initialized, fails later during the request enqueueing stage. + * + * \see nrs_resource_get_safe() + * \see nrs_request_enqueue() + */ +struct ptlrpc_nrs { + spinlock_t nrs_lock; + /** XXX Possibly replace svcpt->scp_req_lock with another lock here. */ + /** + * List of registered policies + */ + struct list_head nrs_policy_list; + /** + * List of policies with queued requests. 
Policies that have any + * outstanding requests are queued here, and this list is queried + * in a round-robin manner from NRS core when obtaining a request + * for handling. This ensures that requests from policies that at some + * point transition away from the + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED state are drained. + */ + struct list_head nrs_policy_queued; + /** + * Service partition for this NRS head + */ + struct ptlrpc_service_part *nrs_svcpt; + /** + * Primary policy, which is the preferred policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_primary; + /** + * Fallback policy, which is the backup policy for handling RPCs + */ + struct ptlrpc_nrs_policy *nrs_policy_fallback; + /** + * This NRS head handles either HP or regular requests + */ + enum ptlrpc_nrs_queue_type nrs_queue_type; + /** + * # queued requests from all policies in this NRS head + */ + unsigned long nrs_req_queued; + /** + * # scheduled requests from all policies in this NRS head + */ + unsigned long nrs_req_started; + /** + * # policies on this NRS + */ + unsigned nrs_num_pols; + /** + * This NRS head is in progress of starting a policy + */ + unsigned nrs_policy_starting:1; + /** + * In progress of shutting down the whole NRS head; used during + * unregistration + */ + unsigned nrs_stopping:1; +}; + +#define NRS_POL_NAME_MAX 16 + +struct ptlrpc_nrs_pol_desc; + +/** + * Service compatibility predicate; this determines whether a policy is adequate + * for handling RPCs of a particular PTLRPC service. + * + * XXX:This should give the same result during policy registration and + * unregistration, and for all partitions of a service; so the result should not + * depend on temporal service or other properties, that may influence the + * result. 
+ */ +typedef bool (*nrs_pol_desc_compat_t) (const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc); + +/** + * NRS policy configuration + * + * Describes a policy to ptlrpc_nrs_policy_register() and + * ptlrpc_nrs_policy_unregister(). + */ +struct ptlrpc_nrs_pol_conf { + /** + * Human-readable policy name + */ + char nc_name[NRS_POL_NAME_MAX]; + /** + * NRS operations for this policy + */ + const struct ptlrpc_nrs_pol_ops *nc_ops; + /** + * Service compatibility predicate + */ + nrs_pol_desc_compat_t nc_compat; + /** + * Set for policies that support a single ptlrpc service, i.e. ones that + * have \a pd_compat set to nrs_policy_compat_one(). The variable value + * depicts the name of the single service that such policies are + * compatible with. + */ + const char *nc_compat_svc_name; + /** + * Owner module for this policy descriptor; policies registering from a + * different module to the one the NRS framework is held within + * (currently ptlrpc), should set this field to THIS_MODULE. + */ + module_t *nc_owner; + /** + * Policy registration flags; a bitmask of \e nrs_policy_flags + */ + unsigned nc_flags; +}; + +/** + * NRS policy registering descriptor + * + * Is used to hold a description of a policy that can be passed to NRS core in + * order to register the policy with NRS heads in different PTLRPC services. + */ +struct ptlrpc_nrs_pol_desc { + /** + * Human-readable policy name + */ + char pd_name[NRS_POL_NAME_MAX]; + /** + * Link into nrs_core::nrs_policies + */ + struct list_head pd_list; + /** + * NRS operations for this policy + */ + const struct ptlrpc_nrs_pol_ops *pd_ops; + /** + * Service compatibility predicate + */ + nrs_pol_desc_compat_t pd_compat; + /** + * Set for policies that are compatible with only one PTLRPC service. + * + * \see ptlrpc_nrs_pol_conf::nc_compat_svc_name + */ + const char *pd_compat_svc_name; + /** + * Owner module for this policy descriptor. + * + * We need to hold a reference to the module whenever we might make use + * of any of the module's contents, i.e. 
+ * - If one or more instances of the policy are at a state where they + * might be handling a request, i.e. + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STARTED or + * ptlrpc_nrs_pol_state::NRS_POL_STATE_STOPPING as we will have to + * call into the policy's ptlrpc_nrs_pol_ops() handlers. A reference + * is taken on the module when + * \e ptlrpc_nrs_pol_desc::pd_refs becomes 1, and released when it + * becomes 0, so that we hold only one reference to the module maximum + * at any time. + * + * We do not need to hold a reference to the module, even though we + * might use code and data from the module, in the following cases: + * - During external policy registration, because this should happen in + * the module's init() function, in which case the module is safe from + * removal because a reference is being held on the module by the + * kernel, and iirc kmod (and I guess module-init-tools also) will + * serialize any racing processes properly anyway. + * - During external policy unregistration, because this should happen + * in a module's exit() function, and any attempts to start a policy + * instance would need to take a reference on the module, and this is + * not possible once we have reached the point where the exit() + * handler is called. + * - During service registration and unregistration, as service setup + * and cleanup, and policy registration, unregistration and policy + * instance starting, are serialized by \e nrs_core::nrs_mutex, so + * as long as users adhere to the convention of registering policies + * in init() and unregistering them in module exit() functions, there + * should not be a race between these operations. + * - During any policy-specific lprocfs operations, because a reference + * is held by the kernel on a proc entry that has been entered by a + * syscall, so as long as proc entries are removed during unregistration time, + * then unregistration and lprocfs operations will be properly + * serialized. 
+ */ + module_t *pd_owner; + /** + * Bitmask of \e nrs_policy_flags + */ + unsigned pd_flags; + /** + * # of references on this descriptor + */ + atomic_t pd_refs; +}; + +/** + * NRS policy state + * + * Policies transition from one state to the other during their lifetime + */ +enum ptlrpc_nrs_pol_state { + /** + * Not a valid policy state. + */ + NRS_POL_STATE_INVALID, + /** + * Policies are at this state either at the start of their life, or + * transition here when the user selects a different policy to act + * as the primary one. + */ + NRS_POL_STATE_STOPPED, + /** + * Policy is in progress of stopping + */ + NRS_POL_STATE_STOPPING, + /** + * Policy is in progress of starting + */ + NRS_POL_STATE_STARTING, + /** + * A policy is in this state in two cases: + * - it is the fallback policy, which is always in this state. + * - it has been activated by the user; i.e. it is the primary policy. + */ + NRS_POL_STATE_STARTED, +}; + +/** + * NRS policy information + * + * Used for obtaining information for the status of a policy via lprocfs + */ +struct ptlrpc_nrs_pol_info { + /** + * Policy name + */ + char pi_name[NRS_POL_NAME_MAX]; + /** + * Current policy state + */ + enum ptlrpc_nrs_pol_state pi_state; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pi_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pi_req_started; + /** + * Is this a fallback policy? + */ + unsigned pi_fallback:1; +}; + +/** + * NRS policy + * + * There is one instance of this for each policy in each NRS head of each + * PTLRPC service partition. 
+ */ +struct ptlrpc_nrs_policy { + /** + * Linkage into the NRS head's list of policies, + * ptlrpc_nrs::nrs_policy_list + */ + struct list_head pol_list; + /** + * Linkage into the NRS head's list of policies with enqueued + * requests, ptlrpc_nrs::nrs_policy_queued + */ + struct list_head pol_list_queued; + /** + * Current state of this policy + */ + enum ptlrpc_nrs_pol_state pol_state; + /** + * Bitmask of nrs_policy_flags + */ + unsigned pol_flags; + /** + * # RPCs enqueued for later dispatching by the policy + */ + long pol_req_queued; + /** + * # RPCs started for dispatch by the policy + */ + long pol_req_started; + /** + * Usage reference count taken on the policy instance + */ + long pol_ref; + /** + * The NRS head this policy has been created at + */ + struct ptlrpc_nrs *pol_nrs; + /** + * Private policy data; varies by policy type + */ + void *pol_private; + /** + * Policy descriptor for this policy instance. + */ + struct ptlrpc_nrs_pol_desc *pol_desc; +}; + +/** + * NRS resource + * + * Resources are embedded into two types of NRS entities: + * - Inside NRS policies, in the policy's private data in + * ptlrpc_nrs_policy::pol_private + * - In objects that act as prime-level scheduling entities in different NRS + * policies; e.g. on a policy that performs round robin or similar order + * scheduling across client NIDs, there would be one NRS resource per unique + * client NID. On a policy which performs round robin scheduling across + * backend filesystem objects, there would be one resource associated with + * each of the backend filesystem objects partaking in the scheduling + * performed by the policy. + * + * NRS resources share a parent-child relationship, in which resources embedded + * in policy instances are the parent entities, with all scheduling entities + * a policy schedules across being the children, thus forming a simple resource + * hierarchy. 
This hierarchy may be extended with one or more levels in the + * future if the ability to have more than one primary policy is added. + * + * Upon request initialization, references to the then active NRS policies are + * taken and used to later handle the dispatching of the request with one of + * these policies. + * + * \see nrs_resource_get_safe() + * \see ptlrpc_nrs_req_add() + */ +struct ptlrpc_nrs_resource { + /** + * This NRS resource's parent; is NULL for resources embedded in NRS + * policy instances; i.e. those are top-level ones. + */ + struct ptlrpc_nrs_resource *res_parent; + /** + * The policy associated with this resource. + */ + struct ptlrpc_nrs_policy *res_policy; +}; + +/** + * Indices into ptlrpc_nrs_request::nr_res_ptrs; NRS_RES_MAX is the size of + * that array. + */ +enum { + NRS_RES_FALLBACK, + NRS_RES_PRIMARY, + NRS_RES_MAX +}; + +/** + * \name fifo + * + * FIFO policy + * + * This policy is a logical wrapper around previous, non-NRS functionality. + * It dispatches RPCs in the same order as they arrive from the network. This + * policy is currently used as the fallback policy, and the only enabled policy + * on all NRS heads of all PTLRPC service partitions. + * @{ + */ + +/** + * Private data structure for the FIFO policy + */ +struct nrs_fifo_head { + /** + * Resource object for policy instance. + */ + struct ptlrpc_nrs_resource fh_res; + /** + * List of queued requests. + */ + struct list_head fh_list; + /** + * For debugging purposes. + */ + __u64 fh_sequence; +}; + +/** + * FIFO policy request definition; embedded in ptlrpc_nrs_request::nr_u. + * NOTE(review): fr_list presumably links into nrs_fifo_head::fh_list - confirm. + */ +struct nrs_fifo_req { + struct list_head fr_list; + __u64 fr_sequence; +}; + +/** @} fifo */ + +/** + * \name CRR-N + * + * CRR-N, Client Round Robin over NIDs + * @{ + */ + +/** + * private data structure for CRR-N NRS + */ +struct nrs_crrn_net { + struct ptlrpc_nrs_resource cn_res; + cfs_binheap_t *cn_binheap; + cfs_hash_t *cn_cli_hash; + /** + * Used when a new scheduling round commences, in order to synchronize + * all clients with the new round number. + */ + __u64 cn_round; + /** + * Determines the relevant ordering amongst request batches within a + * scheduling round. 
+ */ + __u64 cn_sequence; + /** + * Round Robin quantum; the maximum number of RPCs that each request + * batch for each client can have in a scheduling round. + */ + __u16 cn_quantum; +}; + +/** + * Object representing a client in CRR-N, as identified by its NID + */ +struct nrs_crrn_client { + struct ptlrpc_nrs_resource cc_res; + struct hlist_node cc_hnode; + lnet_nid_t cc_nid; + /** + * The round number against which this client is currently scheduling + * requests. + */ + __u64 cc_round; + /** + * The sequence number used for requests scheduled by this client during + * the current round number. + */ + __u64 cc_sequence; + atomic_t cc_ref; + /** + * Round Robin quantum; the maximum number of RPCs the client is allowed + * to schedule in a single batch of each round. + */ + __u16 cc_quantum; + /** + * # of pending requests for this client, on all existing rounds + */ + __u16 cc_active; +}; + +/** + * CRR-N NRS request definition + */ +struct nrs_crrn_req { + /** + * Round number for this request; shared with all other requests in the + * same batch. + */ + __u64 cr_round; + /** + * Sequence number for this request; shared with all other requests in + * the same batch. + */ + __u64 cr_sequence; +}; + +/** + * CRR-N policy operations. + */ +enum nrs_ctl_crr { + /** + * Read the RR quantum size of a CRR-N policy. + */ + NRS_CTL_CRRN_RD_QUANTUM = PTLRPC_NRS_CTL_1ST_POL_SPEC, + /** + * Write the RR quantum size of a CRR-N policy. + */ + NRS_CTL_CRRN_WR_QUANTUM, +}; + +/** @} CRR-N */ + +/** + * \name ORR/TRR + * + * ORR/TRR (Object-based Round Robin/Target-based Round Robin) NRS policies + * @{ + */ + +/** + * Lower and upper byte offsets of a brw RPC + */ +struct nrs_orr_req_range { + __u64 or_start; + __u64 or_end; +}; + +/** + * RPC types supported by the ORR/TRR policies + */ +enum nrs_orr_supp { + NOS_OST_READ = (1 << 0), + NOS_OST_WRITE = (1 << 1), + NOS_OST_RW = (NOS_OST_READ | NOS_OST_WRITE), + /** + * Default value for policies. 
+ */ + NOS_DFLT = NOS_OST_READ +}; + +/** + * As unique keys for grouping RPCs together, we use the object's OST FID for + * the ORR policy, and the OST index for the TRR policy. + * + * XXX: We waste some space for TRR policy instances by using a union, but it + * allows to consolidate some of the code between ORR and TRR, and these + * policies will probably eventually merge into one anyway. + */ +struct nrs_orr_key { + union { + /** object FID for ORR */ + struct lu_fid ok_fid; + /** OST index for TRR */ + __u32 ok_idx; + }; +}; + +/** + * The largest base string for unique hash/slab object names is + * "nrs_orr_reg_", so 13 bytes including the terminating NUL. We add 3 to this + * to be used for the CPT id number, so this _should_ be more than enough for + * the maximum number of CPTs on any system. If it does happen that this + * statement is incorrect, nrs_orr_genobjname() will inevitably yield a + * non-unique name and cause kmem_cache_create() to complain (on Linux), so the + * erroneous situation will hopefully not go unnoticed. + */ +#define NRS_ORR_OBJ_NAME_MAX (sizeof("nrs_orr_reg_") + 3) + +/** + * private data structure for ORR and TRR NRS + */ +struct nrs_orr_data { + struct ptlrpc_nrs_resource od_res; + cfs_binheap_t *od_binheap; + cfs_hash_t *od_obj_hash; + struct kmem_cache *od_cache; + /** + * Used when a new scheduling round commences, in order to synchronize + * all object or OST batches with the new round number. + */ + __u64 od_round; + /** + * Determines the relevant ordering amongst request batches within a + * scheduling round. + */ + __u64 od_sequence; + /** + * RPC types that are currently supported. + */ + enum nrs_orr_supp od_supp; + /** + * Round Robin quantum; the maximum number of RPCs that each request + * batch for each object or OST can have in a scheduling round. + */ + __u16 od_quantum; + /** + * Whether to use physical disk offsets or logical file offsets. 
+ */ + bool od_physical; + /** + * XXX: We need to provide a persistently allocated string to hold + * unique object names for this policy, since in currently supported + * versions of Linux by Lustre, kmem_cache_create() just sets a pointer + * to the name string provided. kstrdup() is used in the version of + * kmem_cache_create() in current Linux mainline, so we may be able to + * remove this in the future. + */ + char od_objname[NRS_ORR_OBJ_NAME_MAX]; +}; + +/** + * Represents a backend-fs object or OST in the ORR and TRR policies + * respectively + */ +struct nrs_orr_object { + struct ptlrpc_nrs_resource oo_res; + struct hlist_node oo_hnode; + /** + * The round number against which requests are being scheduled for this + * object or OST + */ + __u64 oo_round; + /** + * The sequence number used for requests scheduled for this object or + * OST during the current round number. + */ + __u64 oo_sequence; + /** + * The key of the object or OST for which this structure instance is + * scheduling RPCs + */ + struct nrs_orr_key oo_key; + atomic_t oo_ref; + /** + * Round Robin quantum; the maximum number of RPCs that are allowed to + * be scheduled for the object or OST in a single batch of each round. + */ + __u16 oo_quantum; + /** + * # of pending requests for this object or OST, on all existing rounds + */ + __u16 oo_active; +}; + +/** + * ORR/TRR NRS request definition + */ +struct nrs_orr_req { + /** + * The offset range this request covers + */ + struct nrs_orr_req_range or_range; + /** + * Round number for this request; shared with all other requests in the + * same batch. + */ + __u64 or_round; + /** + * Sequence number for this request; shared with all other requests in + * the same batch. + */ + __u64 or_sequence; + /** + * For debugging purposes. + */ + struct nrs_orr_key or_key; + /** + * An ORR policy instance has filled in request information while + * enqueueing the request on the service partition's regular NRS head. 
+ */ + unsigned int or_orr_set:1; + /** + * A TRR policy instance has filled in request information while + * enqueueing the request on the service partition's regular NRS head. + */ + unsigned int or_trr_set:1; + /** + * Request offset ranges have been filled in with logical offset + * values. + */ + unsigned int or_logical_set:1; + /** + * Request offset ranges have been filled in with physical offset + * values. + */ + unsigned int or_physical_set:1; +}; + +/** @} ORR/TRR */ + +/** + * NRS request + * + * Instances of this object exist embedded within ptlrpc_request; the main + * purpose of this object is to hold references to the request's resources + * for the lifetime of the request, and to hold properties that policies use + * for determining the request's scheduling priority. + * */ +struct ptlrpc_nrs_request { + /** + * The request's resource hierarchy. + */ + struct ptlrpc_nrs_resource *nr_res_ptrs[NRS_RES_MAX]; + /** + * Index into ptlrpc_nrs_request::nr_res_ptrs of the resource of the + * policy that was used to enqueue the request. + * + * \see nrs_request_enqueue() + */ + unsigned nr_res_idx; + unsigned nr_initialized:1; + unsigned nr_enqueued:1; + unsigned nr_started:1; + unsigned nr_finalized:1; + cfs_binheap_node_t nr_node; + + /** + * Policy-specific fields, used for determining a request's scheduling + * priority, and other supporting functionality. + */ + union { + /** + * Fields for the FIFO policy + */ + struct nrs_fifo_req fifo; + /** + * CRR-N request definition + */ + struct nrs_crrn_req crr; + /** ORR and TRR share the same request definition */ + struct nrs_orr_req orr; + } nr_u; + /** + * Externally-registering policies may want to use this to allocate + * their own request properties. + */ + void *ext; +}; + +/** @} nrs */ + +/** + * Basic request prioritization operations structure. + * The whole idea is centered around locks and RPCs that might affect locks. 
+ * When a lock is contended we try to give priority to RPCs that might lead + * to fastest release of that lock. + * Currently only implemented for OSTs only in a way that makes all + * IO and truncate RPCs that are coming from a locked region where a lock is + * contended a priority over other requests. + */ +struct ptlrpc_hpreq_ops { + /** + * Check if the lock handle of the given lock is the same as + * taken from the request. + */ + int (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *); + /** + * Check if the request is a high priority one. + */ + int (*hpreq_check)(struct ptlrpc_request *); + /** + * Called after the request has been handled. + */ + void (*hpreq_fini)(struct ptlrpc_request *); +}; + +/** + * Represents remote procedure call. + * + * This is a staple structure used by everybody wanting to send a request + * in Lustre. + */ +struct ptlrpc_request { + /** Request type: one of PTL_RPC_MSG_* */ + int rq_type; + /** Result of request processing */ + int rq_status; + /** + * Linkage item through which this request is included into + * sending/delayed lists on client and into rqbd list on server + */ + struct list_head rq_list; + /** + * Server side list of incoming unserved requests sorted by arrival + * time. Traversed from time to time to notice about to expire + * requests and send back "early replies" to clients to let them + * know server is alive and well, just very busy to service their + * requests in time + */ + struct list_head rq_timed_list; + /** server-side history, used for debugging purposes. 
*/ + struct list_head rq_history_list; + /** server-side per-export list */ + struct list_head rq_exp_list; + /** server-side hp handlers */ + struct ptlrpc_hpreq_ops *rq_ops; + + /** initial thread servicing this request */ + struct ptlrpc_thread *rq_svc_thread; + + /** history sequence # */ + __u64 rq_history_seq; + /** \addtogroup nrs + * @{ + */ + /** stub for NRS request */ + struct ptlrpc_nrs_request rq_nrq; + /** @} nrs */ + /** the index of service's srv_at_array into which request is linked */ + time_t rq_at_index; + /** Lock to protect request flags and some other important bits, like + * rq_list + */ + spinlock_t rq_lock; + /** client-side flags are serialized by rq_lock */ + unsigned int rq_intr:1, rq_replied:1, rq_err:1, + rq_timedout:1, rq_resend:1, rq_restart:1, + /** + * when ->rq_replay is set, request is kept by the client even + * after server commits corresponding transaction. This is + * used for operations that require sequence of multiple + * requests to be replayed. The only example currently is file + * open/close. When last request in such a sequence is + * committed, ->rq_replay is cleared on all requests in the + * sequence. 
+ */ + rq_replay:1, + rq_no_resend:1, rq_waiting:1, rq_receiving_reply:1, + rq_no_delay:1, rq_net_err:1, rq_wait_ctx:1, + rq_early:1, rq_must_unlink:1, + rq_memalloc:1, /* req originated from "kswapd" */ + /* server-side flags */ + rq_packed_final:1, /* packed final reply */ + rq_hp:1, /* high priority RPC */ + rq_at_linked:1, /* link into service's srv_at_array */ + rq_reply_truncate:1, + rq_committed:1, + /* whether the "rq_set" is a valid one */ + rq_invalid_rqset:1, + rq_generation_set:1, + /* do not resend request on -EINPROGRESS */ + rq_no_retry_einprogress:1, + /* allow the req to be sent if the import is in recovery + * status */ + rq_allow_replay:1; + + unsigned int rq_nr_resend; + + enum rq_phase rq_phase; /* one of RQ_PHASE_* */ + enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */ + atomic_t rq_refcount;/* client-side refcount for SENT race, + server-side refcount for multiple replies */ + + /** Portal to which this request would be sent */ + short rq_request_portal; /* XXX FIXME bug 249 */ + /** Portal where to wait for reply and where reply would be sent */ + short rq_reply_portal; /* XXX FIXME bug 249 */ + + /** + * client-side: + * !rq_reply_truncate : # reply bytes actually received, + * rq_reply_truncate : required repbuf_len for resend + */ + int rq_nob_received; + /** Request length */ + int rq_reqlen; + /** Reply length */ + int rq_replen; + /** Request message - what client sent */ + struct lustre_msg *rq_reqmsg; + /** Reply message - server response */ + struct lustre_msg *rq_repmsg; + /** Transaction number */ + __u64 rq_transno; + /** xid */ + __u64 rq_xid; + /** + * List item for replay list. Not yet committed requests get linked + * there. + * Also see \a rq_replay comment above. 
+ */ + struct list_head rq_replay_list; + + /** + * security and encryption data + * @{ */ + struct ptlrpc_cli_ctx *rq_cli_ctx; /**< client's half ctx */ + struct ptlrpc_svc_ctx *rq_svc_ctx; /**< server's half ctx */ + struct list_head rq_ctx_chain; /**< link to waited ctx */ + + struct sptlrpc_flavor rq_flvr; /**< for client & server */ + enum lustre_sec_part rq_sp_from; + + /* client/server security flags */ + unsigned int + rq_ctx_init:1, /* context initiation */ + rq_ctx_fini:1, /* context destroy */ + rq_bulk_read:1, /* request bulk read */ + rq_bulk_write:1, /* request bulk write */ + /* server authentication flags */ + rq_auth_gss:1, /* authenticated by gss */ + rq_auth_remote:1, /* authed as remote user */ + rq_auth_usr_root:1, /* authed as root */ + rq_auth_usr_mdt:1, /* authed as mdt */ + rq_auth_usr_ost:1, /* authed as ost */ + /* security tfm flags */ + rq_pack_udesc:1, + rq_pack_bulk:1, + /* doesn't expect reply FIXME */ + rq_no_reply:1, + rq_pill_init:1; /* pill initialized */ + + uid_t rq_auth_uid; /* authed uid */ + uid_t rq_auth_mapped_uid; /* authed uid mapped to */ + + /* (server side), pointed directly into req buffer */ + struct ptlrpc_user_desc *rq_user_desc; + + /* various buffer pointers */ + struct lustre_msg *rq_reqbuf; /* req wrapper */ + char *rq_repbuf; /* rep buffer */ + struct lustre_msg *rq_repdata; /* rep wrapper msg */ + struct lustre_msg *rq_clrbuf; /* only in priv mode */ + int rq_reqbuf_len; /* req wrapper buf len */ + int rq_reqdata_len; /* req wrapper msg len */ + int rq_repbuf_len; /* rep buffer len */ + int rq_repdata_len; /* rep wrapper msg len */ + int rq_clrbuf_len; /* only in priv mode */ + int rq_clrdata_len; /* only in priv mode */ + + /** early replies go to offset 0, regular replies go after that */ + unsigned int rq_reply_off; + + /** @} */ + + /** Fields that help to see if request and reply were swabbed or not */ + __u32 rq_req_swab_mask; + __u32 rq_rep_swab_mask; + + /** What was import generation when this 
request was sent */ + int rq_import_generation; + enum lustre_imp_state rq_send_state; + + /** how many early replies (for stats) */ + int rq_early_count; + + /** client+server request */ + lnet_handle_md_t rq_req_md_h; + struct ptlrpc_cb_id rq_req_cbid; + /** optional time limit for send attempts */ + cfs_duration_t rq_delay_limit; + /** time request was first queued */ + cfs_time_t rq_queued_time; + + /* server-side... */ + /** request arrival time */ + struct timeval rq_arrival_time; + /** separated reply state */ + struct ptlrpc_reply_state *rq_reply_state; + /** incoming request buffer */ + struct ptlrpc_request_buffer_desc *rq_rqbd; + + /** client-only incoming reply */ + lnet_handle_md_t rq_reply_md_h; + wait_queue_head_t rq_reply_waitq; + struct ptlrpc_cb_id rq_reply_cbid; + + /** our LNet NID */ + lnet_nid_t rq_self; + /** Peer description (the other side) */ + lnet_process_id_t rq_peer; + /** Server-side, export on which request was received */ + struct obd_export *rq_export; + /** Client side, import where request is being sent */ + struct obd_import *rq_import; + + /** Replay callback, called after request is replayed at recovery */ + void (*rq_replay_cb)(struct ptlrpc_request *); + /** + * Commit callback, called when request is committed and about to be + * freed. + */ + void (*rq_commit_cb)(struct ptlrpc_request *); + /** Opaque data for replay and commit callbacks. */ + void *rq_cb_data; + + /** For bulk requests on client only: bulk descriptor */ + struct ptlrpc_bulk_desc *rq_bulk; + + /** client outgoing req */ + /** + * when request/reply sent (secs), or time when request should be sent + */ + time_t rq_sent; + /** time for request really sent out */ + time_t rq_real_sent; + + /** when request must finish. volatile + * so that servers' early reply updates to the deadline aren't + * kept in per-cpu cache */ + volatile time_t rq_deadline; + /** when req reply unlink must finish. */ + time_t rq_reply_deadline; + /** when req bulk unlink must finish. 
*/ + time_t rq_bulk_deadline; + /** + * service time estimate (secs) + * If the request is not served by this time, it is marked as timed out. + */ + int rq_timeout; + + /** Multi-rpc bits */ + /** Per-request waitq introduced by bug 21938 for recovery waiting */ + wait_queue_head_t rq_set_waitq; + /** Link item for request set lists */ + struct list_head rq_set_chain; + /** Link back to the request set */ + struct ptlrpc_request_set *rq_set; + /** Async completion handler, called when reply is received */ + ptlrpc_interpterer_t rq_interpret_reply; + /** Async completion context */ + union ptlrpc_async_args rq_async_args; + + /** Pool if request is from preallocated list */ + struct ptlrpc_request_pool *rq_pool; + + struct lu_context rq_session; + struct lu_context rq_recov_session; + + /** request format description */ + struct req_capsule rq_pill; +}; + +/** + * Call completion handler for rpc if any, return its status or original + * rc if there was no handler defined for this request. + * + * When a handler is set, its return value is also stored in + * ptlrpc_request::rq_status. + */ +static inline int ptlrpc_req_interpret(const struct lu_env *env, + struct ptlrpc_request *req, int rc) +{ + if (req->rq_interpret_reply != NULL) { + req->rq_status = req->rq_interpret_reply(env, req, + &req->rq_async_args, + rc); + return req->rq_status; + } + return rc; +} + +/** \addtogroup nrs + * @{ + */ +int ptlrpc_nrs_policy_register(struct ptlrpc_nrs_pol_conf *conf); +int ptlrpc_nrs_policy_unregister(struct ptlrpc_nrs_pol_conf *conf); +void ptlrpc_nrs_req_hp_move(struct ptlrpc_request *req); +void nrs_policy_get_info_locked(struct ptlrpc_nrs_policy *policy, + struct ptlrpc_nrs_pol_info *info); + +/* + * Can the request be moved from the regular NRS head to the high-priority NRS + * head (of the same PTLRPC service partition), if any? + * + * For a reliable result, this should be checked under svcpt->scp_req lock. 
+ */ +static inline bool ptlrpc_nrs_req_can_move(struct ptlrpc_request *req) +{ + struct ptlrpc_nrs_request *nrq = &req->rq_nrq; + + /** + * LU-898: Check ptlrpc_nrs_request::nr_enqueued to make sure the + * request has been enqueued first, and ptlrpc_nrs_request::nr_started + * to make sure it has not been scheduled yet (analogous to previous + * (non-NRS) checking of !list_empty(&ptlrpc_request::rq_list)). + */ + return nrq->nr_enqueued && !nrq->nr_started && !req->rq_hp; +} +/** @} nrs */ + +/** + * Returns non-zero if request buffer at offset \a index was already swabbed + * + * The mask bit is built from an unsigned constant so that \a index 31, which + * the assertion allows, does not shift into the sign bit of an int. + */ +static inline int lustre_req_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); + return req->rq_req_swab_mask & (1U << index); +} + +/** + * Returns non-zero if request reply buffer at offset \a index was already + * swabbed + * + * The mask bit is built from an unsigned constant so that \a index 31, which + * the assertion allows, does not shift into the sign bit of an int. + */ +static inline int lustre_rep_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < sizeof(req->rq_rep_swab_mask) * 8); + return req->rq_rep_swab_mask & (1U << index); +} + +/** + * Returns non-zero if request needs to be swabbed into local cpu byteorder + */ +static inline int ptlrpc_req_need_swab(struct ptlrpc_request *req) +{ + return lustre_req_swabbed(req, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Returns non-zero if request reply needs to be swabbed into local cpu + * byteorder + */ +static inline int ptlrpc_rep_need_swab(struct ptlrpc_request *req) +{ + return lustre_rep_swabbed(req, MSG_PTLRPC_HEADER_OFF); +} + +/** + * Mark request buffer at offset \a index that it was already swabbed + * + * Asserts that the buffer had not been marked before. + */ +static inline void lustre_set_req_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < sizeof(req->rq_req_swab_mask) * 8); + LASSERT((req->rq_req_swab_mask & (1U << index)) == 0); + req->rq_req_swab_mask |= 1U << index; +} + +/** + * Mark request reply buffer at offset \a index that it was already swabbed + */ +static inline void lustre_set_rep_swabbed(struct ptlrpc_request *req, int index) +{ + LASSERT(index < 
sizeof(req->rq_rep_swab_mask) * 8); + /* unsigned constant: index 31 must not shift into the sign bit of an int */ + LASSERT((req->rq_rep_swab_mask & (1U << index)) == 0); + req->rq_rep_swab_mask |= 1U << index; +} + +/** + * Convert numerical request phase value \a phase into text string description + * + * Returns "?Phase?" for a value that matches none of the known phases. + */ +static inline const char * +ptlrpc_phase2str(enum rq_phase phase) +{ + switch (phase) { + case RQ_PHASE_NEW: + return "New"; + case RQ_PHASE_RPC: + return "Rpc"; + case RQ_PHASE_BULK: + return "Bulk"; + case RQ_PHASE_INTERPRET: + return "Interpret"; + case RQ_PHASE_COMPLETE: + return "Complete"; + case RQ_PHASE_UNREGISTERING: + return "Unregistering"; + default: + return "?Phase?"; + } +} + +/** + * Convert numerical request phase of the request \a req into text string + * description + */ +static inline const char * +ptlrpc_rqphase2str(struct ptlrpc_request *req) +{ + return ptlrpc_phase2str(req->rq_phase); +} + +/** + * Debugging functions and helpers to print request structure into debug log + * @{ + */ +/* Spare the preprocessor, spoil the bugs. */ +/* Arguments are parenthesized so that any expression can be passed safely. */ +#define FLAG(field, str) ((field) ? (str) : "") + +/** + * Convert bit flags into a string + * + * Expands to 13 comma-separated string arguments (the phase name followed by + * 12 flag letters), matching the conversions in REQ_FLAGS_FMT. + */ +#define DEBUG_REQ_FLAGS(req) \ + ptlrpc_rqphase2str(req), \ + FLAG((req)->rq_intr, "I"), FLAG((req)->rq_replied, "R"), \ + FLAG((req)->rq_err, "E"), \ + FLAG((req)->rq_timedout, "X") /* eXpired */, FLAG((req)->rq_resend, "S"), \ + FLAG((req)->rq_restart, "T"), FLAG((req)->rq_replay, "P"), \ + FLAG((req)->rq_no_resend, "N"), \ + FLAG((req)->rq_waiting, "W"), \ + FLAG((req)->rq_wait_ctx, "C"), FLAG((req)->rq_hp, "H"), \ + FLAG((req)->rq_committed, "M") + +#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s%s" + +void _debug_req(struct ptlrpc_request *req, + struct libcfs_debug_msg_data *data, const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); + +/** + * Helper that decides if we need to print request according to current debug + * level settings + */ +#define debug_req(msgdata, mask, cdls, req, fmt, a...) 
\ +do { \ + CFS_CHECK_STACK(msgdata, mask, cdls); \ + \ + if (((mask) & D_CANTMASK) != 0 || \ + ((libcfs_debug & (mask)) != 0 && \ + (libcfs_subsystem_debug & DEBUG_SUBSYSTEM) != 0)) \ + _debug_req((req), msgdata, fmt, ##a); \ +} while(0) + +/** + * This is the debug print function you need to use to print request structure + * content into lustre debug log. + * for most callers (level is a constant) this is resolved at compile time */ +#define DEBUG_REQ(level, req, fmt, args...) \ +do { \ + if ((level) & (D_ERROR | D_WARNING)) { \ + static cfs_debug_limit_state_t cdls; \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, &cdls); \ + debug_req(&msgdata, level, &cdls, req, "@@@ "fmt" ", ## args);\ + } else { \ + LIBCFS_DEBUG_MSG_DATA_DECL(msgdata, level, NULL); \ + debug_req(&msgdata, level, NULL, req, "@@@ "fmt" ", ## args); \ + } \ +} while (0) +/** @} */ + +/** + * Structure that defines a single page of a bulk transfer + */ +struct ptlrpc_bulk_page { + /** Linkage to list of pages in a bulk */ + struct list_head bp_link; + /** + * Number of bytes in a page to transfer starting from \a bp_pageoffset + */ + int bp_buflen; + /** offset within a page */ + int bp_pageoffset; + /** The page itself */ + struct page *bp_page; +}; + +#define BULK_GET_SOURCE 0 +#define BULK_PUT_SINK 1 +#define BULK_GET_SINK 2 +#define BULK_PUT_SOURCE 3 + +/** + * Definition of bulk descriptor. + * Bulks are special "Two phase" RPCs where initial request message + * is sent first and it is followed by a transfer (or receiving) of a large + * amount of data to be settled into pages referenced from the bulk descriptors. + * Bulk transfers (the actual data following the small requests) are done + * on separate LNet portals. + * In lustre we use bulk transfers for READ and WRITE transfers from/to OSTs. + * Another user is readpage for MDT. 
+ */ +struct ptlrpc_bulk_desc { + /** completed with failure */ + unsigned long bd_failure:1; + /** {put,get}{source,sink}; presumably one of the BULK_* values */ + unsigned long bd_type:2; + /** client side */ + unsigned long bd_registered:1; + /** For serialization with callback */ + spinlock_t bd_lock; + /** Import generation when request for this bulk was sent */ + int bd_import_generation; + /** LNet portal for this bulk */ + __u32 bd_portal; + /** Server side - export this bulk created for */ + struct obd_export *bd_export; + /** Client side - import this bulk was sent on */ + struct obd_import *bd_import; + /** Back pointer to the request */ + struct ptlrpc_request *bd_req; + wait_queue_head_t bd_waitq; /* server side only WQ */ + int bd_iov_count; /* # entries in bd_iov */ + int bd_max_iov; /* allocated size of bd_iov */ + int bd_nob; /* # bytes covered */ + int bd_nob_transferred; /* # bytes GOT/PUT */ + + __u64 bd_last_xid; + + struct ptlrpc_cb_id bd_cbid; /* network callback info */ + lnet_nid_t bd_sender; /* stash event::sender */ + int bd_md_count; /* # valid entries in bd_mds */ + int bd_md_max_brw; /* max entries in bd_mds */ + /** array of associated MDs */ + lnet_handle_md_t bd_mds[PTLRPC_BULK_OPS_COUNT]; + + /* + * encrypt iov, size is either 0 or bd_iov_count. 
+ */ + lnet_kiov_t *bd_enc_iov; + + lnet_kiov_t bd_iov[0]; +}; + +enum { + SVC_STOPPED = 1 << 0, + SVC_STOPPING = 1 << 1, + SVC_STARTING = 1 << 2, + SVC_RUNNING = 1 << 3, + SVC_EVENT = 1 << 4, + SVC_SIGNAL = 1 << 5, +}; + +#define PTLRPC_THR_NAME_LEN 32 +/** + * Definition of server service thread structure + */ +struct ptlrpc_thread { + /** + * List of active threads in svc->srv_threads + */ + struct list_head t_link; + /** + * thread-private data (preallocated memory) + */ + void *t_data; + __u32 t_flags; + /** + * service thread index, from ptlrpc_start_threads + */ + unsigned int t_id; + /** + * service thread pid + */ + pid_t t_pid; + /** + * put watchdog in the structure per thread b=14840 + */ + struct lc_watchdog *t_watchdog; + /** + * the svc this thread belonged to b=18582 + */ + struct ptlrpc_service_part *t_svcpt; + wait_queue_head_t t_ctl_waitq; + struct lu_env *t_env; + char t_name[PTLRPC_THR_NAME_LEN]; +}; + +static inline int thread_is_init(struct ptlrpc_thread *thread) +{ + return thread->t_flags == 0; +} + +static inline int thread_is_stopped(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPED); +} + +static inline int thread_is_stopping(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STOPPING); +} + +static inline int thread_is_starting(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_STARTING); +} + +static inline int thread_is_running(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_RUNNING); +} + +static inline int thread_is_event(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_EVENT); +} + +static inline int thread_is_signal(struct ptlrpc_thread *thread) +{ + return !!(thread->t_flags & SVC_SIGNAL); +} + +static inline void thread_clear_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags &= ~flags; +} + +static inline void thread_set_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags = flags; +} + +static 
inline void thread_add_flags(struct ptlrpc_thread *thread, __u32 flags) +{ + thread->t_flags |= flags; +} + +/** + * Clear \a flags in thread->t_flags. + * \retval 1 at least one of \a flags was set before clearing + * \retval 0 none of \a flags was set + */ +static inline int thread_test_and_clear_flags(struct ptlrpc_thread *thread, + __u32 flags) +{ + if (thread->t_flags & flags) { + thread->t_flags &= ~flags; + return 1; + } + return 0; +} + +/** + * Request buffer descriptor structure. + * This is a structure that contains one posted request buffer for service. + * Once data land into a buffer, event callback creates actual request and + * wakes one of the service threads to process new incoming request. + * More than one request can fit into the buffer. + */ +struct ptlrpc_request_buffer_desc { + /** Link item for rqbds on a service */ + struct list_head rqbd_list; + /** History of requests for this buffer */ + struct list_head rqbd_reqs; + /** Back pointer to service for which this buffer is registered */ + struct ptlrpc_service_part *rqbd_svcpt; + /** LNet descriptor */ + lnet_handle_md_t rqbd_md_h; + int rqbd_refcount; + /** The buffer itself */ + char *rqbd_buffer; + struct ptlrpc_cb_id rqbd_cbid; + /** + * This "embedded" request structure is only used for the + * last request to fit into the buffer + */ + struct ptlrpc_request rqbd_req; +}; + +typedef int (*svc_handler_t)(struct ptlrpc_request *req); + +struct ptlrpc_service_ops { + /** + * if non-NULL called during thread creation (ptlrpc_start_thread()) + * to initialize service specific per-thread state. + */ + int (*so_thr_init)(struct ptlrpc_thread *thr); + /** + * if non-NULL called during thread shutdown (ptlrpc_main()) to + * destruct state created by ->so_thr_init().
+ */ + void (*so_thr_done)(struct ptlrpc_thread *thr); + /** + * Handler function for incoming requests for this service + */ + int (*so_req_handler)(struct ptlrpc_request *req); + /** + * function to determine priority of the request, it's called + * on every new request + */ + int (*so_hpreq_handler)(struct ptlrpc_request *); + /** + * service-specific print fn + */ + void (*so_req_printer)(void *, struct ptlrpc_request *); +}; + +#ifndef __cfs_cacheline_aligned +/* NB: put it here for reducing patch dependence */ +# define __cfs_cacheline_aligned +#endif + +/** + * How many high priority requests to serve before serving one normal + * priority request + */ +#define PTLRPC_SVC_HP_RATIO 10 + +/** + * Definition of PortalRPC service. + * The service is listening on a particular portal (like tcp port) + * and performs actions for a specific server like IO service for OST + * or general metadata service for MDS. + */ +struct ptlrpc_service { + /** serialize /proc operations */ + spinlock_t srv_lock; + /** most often accessed fields */ + /** chain thru all services */ + struct list_head srv_list; + /** service operations table */ + struct ptlrpc_service_ops srv_ops; + /** only statically allocated strings here; we don't clean them */ + char *srv_name; + /** only statically allocated strings here; we don't clean them */ + char *srv_thread_name; + /** service thread list */ + struct list_head srv_threads; + /** threads # should be created for each partition on initializing */ + int srv_nthrs_cpt_init; + /** limit of threads number for each partition */ + int srv_nthrs_cpt_limit; + /** Root of /proc dir tree for this service */ + proc_dir_entry_t *srv_procroot; + /** Pointer to statistic data for this service */ + struct lprocfs_stats *srv_stats; + /** # hp per lp reqs to handle */ + int srv_hpreq_ratio; + /** biggest request to receive */ + int srv_max_req_size; + /** biggest reply to send */ + int srv_max_reply_size; + /** size of individual buffers */ + int
srv_buf_size; + /** # buffers to allocate in 1 group */ + int srv_nbuf_per_group; + /** Local portal on which to receive requests */ + __u32 srv_req_portal; + /** Portal on the client to send replies to */ + __u32 srv_rep_portal; + /** + * Tags for lu_context associated with this thread, see struct + * lu_context. + */ + __u32 srv_ctx_tags; + /** soft watchdog timeout multiplier */ + int srv_watchdog_factor; + /** under unregister_service */ + unsigned srv_is_stopping:1; + + /** max # request buffers in history per partition */ + int srv_hist_nrqbds_cpt_max; + /** number of CPTs this service bound on */ + int srv_ncpts; + /** CPTs array this service bound on */ + __u32 *srv_cpts; + /** 2^srv_cpt_bits >= cfs_cpt_number(srv_cptable) */ + int srv_cpt_bits; + /** CPT table this service is running over */ + struct cfs_cpt_table *srv_cptable; + /** + * partition data for ptlrpc service + */ + struct ptlrpc_service_part *srv_parts[0]; +}; + +/** + * Definition of PortalRPC service partition data. + * Although a service only has one instance of it right now, but we + * will have multiple instances very soon (instance per CPT). + * + * it has four locks: + * \a scp_lock + * serialize operations on rqbd and requests waiting for preprocess + * \a scp_req_lock + * serialize operations active requests sent to this portal + * \a scp_at_lock + * serialize adaptive timeout stuff + * \a scp_rep_lock + * serialize operations on RS list (reply states) + * + * We don't have any use-case to take two or more locks at the same time + * for now, so there is no lock order issue.
+ */ +struct ptlrpc_service_part { + /** back reference to owner */ + struct ptlrpc_service *scp_service __cfs_cacheline_aligned; + /* CPT id, reserved */ + int scp_cpt; + /** always increasing number */ + int scp_thr_nextid; + /** # of starting threads */ + int scp_nthrs_starting; + /** # of stopping threads, reserved for shrinking threads */ + int scp_nthrs_stopping; + /** # running threads */ + int scp_nthrs_running; + /** service threads list */ + struct list_head scp_threads; + + /** + * serialize the following fields, used for protecting + * rqbd list and incoming requests waiting for preprocess, + * threads starting & stopping are also protected by this lock. + */ + spinlock_t scp_lock __cfs_cacheline_aligned; + /** total # req buffer descs allocated */ + int scp_nrqbds_total; + /** # posted request buffers for receiving */ + int scp_nrqbds_posted; + /** in progress of allocating rqbd */ + int scp_rqbd_allocating; + /** # incoming reqs */ + int scp_nreqs_incoming; + /** request buffers to be reposted */ + struct list_head scp_rqbd_idle; + /** req buffers receiving */ + struct list_head scp_rqbd_posted; + /** incoming reqs */ + struct list_head scp_req_incoming; + /** timeout before re-posting reqs, in tick */ + cfs_duration_t scp_rqbd_timeout; + /** + * all threads sleep on this. This wait-queue is signalled when new + * incoming request arrives and when difficult reply has to be handled. 
+ */ + wait_queue_head_t scp_waitq; + + /** request history */ + struct list_head scp_hist_reqs; + /** request buffer history */ + struct list_head scp_hist_rqbds; + /** # request buffers in history */ + int scp_hist_nrqbds; + /** sequence number for request */ + __u64 scp_hist_seq; + /** highest seq culled from history */ + __u64 scp_hist_seq_culled; + + /** + * serialize the following fields, used for processing requests + * sent to this portal + */ + spinlock_t scp_req_lock __cfs_cacheline_aligned; + /** # reqs in either of the NRS heads below */ + /** # reqs being served */ + int scp_nreqs_active; + /** # HPreqs being served */ + int scp_nhreqs_active; + /** # hp requests handled */ + int scp_hreq_count; + + /** NRS head for regular requests */ + struct ptlrpc_nrs scp_nrs_reg; + /** NRS head for HP requests; this is only valid for services that can + * handle HP requests */ + struct ptlrpc_nrs *scp_nrs_hp; + + /** AT stuff */ + /** @{ */ + /** + * serialize the following fields, used for changes on + * adaptive timeout + */ + spinlock_t scp_at_lock __cfs_cacheline_aligned; + /** estimated rpc service time */ + struct adaptive_timeout scp_at_estimate; + /** reqs waiting for replies */ + struct ptlrpc_at_array scp_at_array; + /** early reply timer */ + timer_list_t scp_at_timer; + /** debug */ + cfs_time_t scp_at_checktime; + /** check early replies */ + unsigned scp_at_check; + /** @} */ + + /** + * serialize the following fields, used for processing + * replies for this portal + */ + spinlock_t scp_rep_lock __cfs_cacheline_aligned; + /** all the active replies */ + struct list_head scp_rep_active; + /** List of free reply_states */ + struct list_head scp_rep_idle; + /** waitq to run, when adding stuff to srv_free_rs_list */ + wait_queue_head_t scp_rep_waitq; + /** # 'difficult' replies */ + atomic_t scp_nreps_difficult; +}; + +#define ptlrpc_service_for_each_part(part, i, svc) \ + for (i = 0; \ + i < (svc)->srv_ncpts && \ + (svc)->srv_parts != NULL && \ + 
((part) = (svc)->srv_parts[i]) != NULL; i++) + +/** + * Declaration of ptlrpcd control structure + */ +struct ptlrpcd_ctl { + /** + * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE) + */ + unsigned long pc_flags; + /** + * Thread lock protecting structure fields. + */ + spinlock_t pc_lock; + /** + * Start completion. + */ + struct completion pc_starting; + /** + * Stop completion. + */ + struct completion pc_finishing; + /** + * Thread requests set. + */ + struct ptlrpc_request_set *pc_set; + /** + * Thread name used in cfs_daemonize() + */ + char pc_name[16]; + /** + * Environment for request interpreters to run in. + */ + struct lu_env pc_env; + /** + * Index of ptlrpcd thread in the array. + */ + int pc_index; + /** + * Number of the ptlrpcd's partners. + */ + int pc_npartners; + /** + * Pointer to the array of partners' ptlrpcd_ctl structure. + */ + struct ptlrpcd_ctl **pc_partners; + /** + * Record the partner index to be processed next. + */ + int pc_cursor; +}; + +/* Bits for pc_flags */ +enum ptlrpcd_ctl_flags { + /** + * Ptlrpc thread start flag. + */ + LIOD_START = 1 << 0, + /** + * Ptlrpc thread stop flag. + */ + LIOD_STOP = 1 << 1, + /** + * Ptlrpc thread force flag (only stop force so far). + * This will cause aborting any inflight rpcs handled + * by thread if LIOD_STOP is specified. + */ + LIOD_FORCE = 1 << 2, + /** + * This is a recovery ptlrpc thread. + */ + LIOD_RECOVERY = 1 << 3, + /** + * The ptlrpcd is bound to some CPU core. + */ + LIOD_BIND = 1 << 4, +}; + +/** + * \addtogroup nrs + * @{ + * + * Service compatibility function; the policy is compatible with all services. + * + * \param[in] svc The service the policy is attempting to register with. 
+ * \param[in] desc The policy descriptor + * + * \retval true The policy is compatible with the service + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool nrs_policy_compat_all(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + return true; +} + +/** + * Service compatibility function; the policy is compatible with only a specific + * service which is identified by its human-readable name at + * ptlrpc_service::srv_name. + * + * \param[in] svc The service the policy is attempting to register with. + * \param[in] desc The policy descriptor + * + * \retval false The policy is not compatible with the service + * \retval true The policy is compatible with the service + * + * \see ptlrpc_nrs_pol_desc::pd_compat() + */ +static inline bool nrs_policy_compat_one(const struct ptlrpc_service *svc, + const struct ptlrpc_nrs_pol_desc *desc) +{ + LASSERT(desc->pd_compat_svc_name != NULL); + return strcmp(svc->srv_name, desc->pd_compat_svc_name) == 0; +} + +/** @} nrs */ + +/* ptlrpc/events.c */ +extern lnet_handle_eq_t ptlrpc_eq_h; +extern int ptlrpc_uuid_to_peer(struct obd_uuid *uuid, + lnet_process_id_t *peer, lnet_nid_t *self); +/** + * These callbacks are invoked by LNet when something happened to + * underlying buffer + * @{ + */ +extern void request_out_callback(lnet_event_t *ev); +extern void reply_in_callback(lnet_event_t *ev); +extern void client_bulk_callback(lnet_event_t *ev); +extern void request_in_callback(lnet_event_t *ev); +extern void reply_out_callback(lnet_event_t *ev); +/** @} */ + +/* ptlrpc/connection.c */ +struct ptlrpc_connection *ptlrpc_connection_get(lnet_process_id_t peer, + lnet_nid_t self, + struct obd_uuid *uuid); +int ptlrpc_connection_put(struct ptlrpc_connection *c); +struct ptlrpc_connection *ptlrpc_connection_addref(struct ptlrpc_connection *); +int ptlrpc_connection_init(void); +void ptlrpc_connection_fini(void); +extern lnet_pid_t ptl_get_pid(void); + +/* ptlrpc/niobuf.c */ +/** + * Actual 
interfacing with LNet to put/get/register/unregister stuff + * @{ + */ + +int ptlrpc_register_bulk(struct ptlrpc_request *req); +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async); + +static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) +{ + struct ptlrpc_bulk_desc *desc; + int rc; + + LASSERT(req != NULL); + desc = req->rq_bulk; + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && + req->rq_bulk_deadline > cfs_time_current_sec()) + return 1; + + if (!desc) + return 0; + + spin_lock(&desc->bd_lock); + rc = desc->bd_md_count; + spin_unlock(&desc->bd_lock); + return rc; +} + +#define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01 +#define PTLRPC_REPLY_EARLY 0x02 +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags); +int ptlrpc_reply(struct ptlrpc_request *req); +int ptlrpc_send_error(struct ptlrpc_request *req, int difficult); +int ptlrpc_error(struct ptlrpc_request *req); +void ptlrpc_resend_req(struct ptlrpc_request *request); +int ptlrpc_at_get_net_latency(struct ptlrpc_request *req); +int ptl_send_rpc(struct ptlrpc_request *request, int noreply); +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd); +/** @} */ + +/* ptlrpc/client.c */ +/** + * Client-side portals API. Everything to send requests, receive replies, + * request queues, request management, etc. 
+ * @{ + */ +void ptlrpc_init_client(int req_portal, int rep_portal, char *name, + struct ptlrpc_client *); +void ptlrpc_cleanup_client(struct obd_import *imp); +struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid); + +int ptlrpc_queue_wait(struct ptlrpc_request *req); +int ptlrpc_replay_req(struct ptlrpc_request *req); +int ptlrpc_unregister_reply(struct ptlrpc_request *req, int async); +void ptlrpc_restart_req(struct ptlrpc_request *req); +void ptlrpc_abort_inflight(struct obd_import *imp); +void ptlrpc_cleanup_imp(struct obd_import *imp); +void ptlrpc_abort_set(struct ptlrpc_request_set *set); + +struct ptlrpc_request_set *ptlrpc_prep_set(void); +struct ptlrpc_request_set *ptlrpc_prep_fcset(int max, set_producer_func func, + void *arg); +int ptlrpc_set_add_cb(struct ptlrpc_request_set *set, + set_interpreter_func fn, void *data); +int ptlrpc_set_next_timeout(struct ptlrpc_request_set *); +int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set); +int ptlrpc_set_wait(struct ptlrpc_request_set *); +int ptlrpc_expired_set(void *data); +void ptlrpc_interrupted_set(void *data); +void ptlrpc_mark_interrupted(struct ptlrpc_request *req); +void ptlrpc_set_destroy(struct ptlrpc_request_set *); +void ptlrpc_set_add_req(struct ptlrpc_request_set *, struct ptlrpc_request *); +void ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc, + struct ptlrpc_request *req); + +void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool); +void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq); + +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int, int, + void (*populate_pool)(struct ptlrpc_request_pool *, int)); + +void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req); +struct ptlrpc_request *ptlrpc_request_alloc(struct obd_import *imp, + const struct req_format *format); +struct ptlrpc_request *ptlrpc_request_alloc_pool(struct obd_import *imp, + struct ptlrpc_request_pool *, + const struct req_format *format); +void 
ptlrpc_request_free(struct ptlrpc_request *request); +int ptlrpc_request_pack(struct ptlrpc_request *request, + __u32 version, int opcode); +struct ptlrpc_request *ptlrpc_request_alloc_pack(struct obd_import *imp, + const struct req_format *format, + __u32 version, int opcode); +int ptlrpc_request_bufs_pack(struct ptlrpc_request *request, + __u32 version, int opcode, char **bufs, + struct ptlrpc_cli_ctx *ctx); +struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version, + int opcode, int count, __u32 *lengths, + char **bufs); +struct ptlrpc_request *ptlrpc_prep_req_pool(struct obd_import *imp, + __u32 version, int opcode, + int count, __u32 *lengths, char **bufs, + struct ptlrpc_request_pool *pool); +void ptlrpc_req_finished(struct ptlrpc_request *request); +void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request); +struct ptlrpc_request *ptlrpc_request_addref(struct ptlrpc_request *req); +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, + unsigned npages, unsigned max_brw, + unsigned type, unsigned portal); +void __ptlrpc_free_bulk(struct ptlrpc_bulk_desc *bulk, int pin); +static inline void ptlrpc_free_bulk_pin(struct ptlrpc_bulk_desc *bulk) +{ + __ptlrpc_free_bulk(bulk, 1); +} +static inline void ptlrpc_free_bulk_nopin(struct ptlrpc_bulk_desc *bulk) +{ + __ptlrpc_free_bulk(bulk, 0); +} +void __ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, int len, int); +static inline void ptlrpc_prep_bulk_page_pin(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 1); +} + +static inline void ptlrpc_prep_bulk_page_nopin(struct ptlrpc_bulk_desc *desc, + struct page *page, int pageoffset, + int len) +{ + __ptlrpc_prep_bulk_page(desc, page, pageoffset, len, 0); +} + +void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, + struct obd_import *imp); +__u64 ptlrpc_next_xid(void); +__u64 
ptlrpc_sample_next_xid(void); +__u64 ptlrpc_req_xid(struct ptlrpc_request *request); + +/* Set of routines to run a function in ptlrpcd context */ +void *ptlrpcd_alloc_work(struct obd_import *imp, + int (*cb)(const struct lu_env *, void *), void *data); +void ptlrpcd_destroy_work(void *handler); +int ptlrpcd_queue_work(void *handler); + +/** @} */ +struct ptlrpc_service_buf_conf { + /* nbufs is buffers # to allocate when growing the pool */ + unsigned int bc_nbufs; + /* buffer size to post */ + unsigned int bc_buf_size; + /* portal to listen for requests on */ + unsigned int bc_req_portal; + /* portal to send replies to */ + unsigned int bc_rep_portal; + /* maximum request size to be accepted for this service */ + unsigned int bc_req_max_size; + /* maximum reply size this service can ever send */ + unsigned int bc_rep_max_size; +}; + +struct ptlrpc_service_thr_conf { + /* threadname should be 8 characters or less - 6 will be added on */ + char *tc_thr_name; + /* threads increasing factor for each CPU */ + unsigned int tc_thr_factor; + /* service threads # to start on each partition while initializing */ + unsigned int tc_nthrs_init; + /* + * low water of threads # upper-limit on each partition while running, + * service availability may be impacted if threads number is lower + * than this value. It can be ZERO if the service doesn't require + * CPU affinity or there is only one partition. + */ + unsigned int tc_nthrs_base; + /* "soft" limit for total threads number */ + unsigned int tc_nthrs_max; + /* user specified threads number, it will be validated due to + * other members of this structure.
*/ + unsigned int tc_nthrs_user; + /* set NUMA node affinity for service threads */ + unsigned int tc_cpu_affinity; + /* Tags for lu_context associated with service thread */ + __u32 tc_ctx_tags; +}; + +struct ptlrpc_service_cpt_conf { + struct cfs_cpt_table *cc_cptable; + /* string pattern to describe CPTs for a service */ + char *cc_pattern; +}; + +struct ptlrpc_service_conf { + /* service name */ + char *psc_name; + /* soft watchdog timeout multiplier to print stuck service traces */ + unsigned int psc_watchdog_factor; + /* buffer information */ + struct ptlrpc_service_buf_conf psc_buf; + /* thread information */ + struct ptlrpc_service_thr_conf psc_thr; + /* CPU partition information */ + struct ptlrpc_service_cpt_conf psc_cpt; + /* function table */ + struct ptlrpc_service_ops psc_ops; +}; + +/* ptlrpc/service.c */ +/** + * Server-side services API. Register/unregister service, request state + * management, service thread management + * + * @{ + */ +void ptlrpc_save_lock(struct ptlrpc_request *req, + struct lustre_handle *lock, int mode, int no_ack); +void ptlrpc_commit_replies(struct obd_export *exp); +void ptlrpc_dispatch_difficult_reply(struct ptlrpc_reply_state *rs); +void ptlrpc_schedule_difficult_reply(struct ptlrpc_reply_state *rs); +int ptlrpc_hpreq_handler(struct ptlrpc_request *req); +struct ptlrpc_service *ptlrpc_register_service( + struct ptlrpc_service_conf *conf, + struct proc_dir_entry *proc_entry); +void ptlrpc_stop_all_threads(struct ptlrpc_service *svc); + +int ptlrpc_start_threads(struct ptlrpc_service *svc); +int ptlrpc_unregister_service(struct ptlrpc_service *service); +int liblustre_check_services(void *arg); +void ptlrpc_daemonize(char *name); +int ptlrpc_service_health_check(struct ptlrpc_service *); +void ptlrpc_server_drop_request(struct ptlrpc_request *req); +void ptlrpc_request_change_export(struct ptlrpc_request *req, + struct obd_export *export); + +int ptlrpc_hr_init(void); +void ptlrpc_hr_fini(void); + +/** @} */ + +/*
ptlrpc/import.c */ +/** + * Import API + * @{ + */ +int ptlrpc_connect_import(struct obd_import *imp); +int ptlrpc_init_import(struct obd_import *imp); +int ptlrpc_disconnect_import(struct obd_import *imp, int noclose); +int ptlrpc_import_recovery_state_machine(struct obd_import *imp); +void deuuidify(char *uuid, const char *prefix, char **uuid_start, + int *uuid_len); + +/* ptlrpc/pack_generic.c */ +int ptlrpc_reconnect_import(struct obd_import *imp); +/** @} */ + +/** + * ptlrpc msg buffer and swab interface + * + * @{ + */ +int ptlrpc_buf_need_swab(struct ptlrpc_request *req, const int inout, + int index); +void ptlrpc_buf_set_swabbed(struct ptlrpc_request *req, const int inout, + int index); +int ptlrpc_unpack_rep_msg(struct ptlrpc_request *req, int len); +int ptlrpc_unpack_req_msg(struct ptlrpc_request *req, int len); + +int lustre_msg_check_version(struct lustre_msg *msg, __u32 version); +void lustre_init_msg_v2(struct lustre_msg_v2 *msg, int count, __u32 *lens, + char **bufs); +int lustre_pack_request(struct ptlrpc_request *, __u32 magic, int count, + __u32 *lens, char **bufs); +int lustre_pack_reply(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs); +int lustre_pack_reply_v2(struct ptlrpc_request *req, int count, + __u32 *lens, char **bufs, int flags); +#define LPRFL_EARLY_REPLY 1 +int lustre_pack_reply_flags(struct ptlrpc_request *, int count, __u32 *lens, + char **bufs, int flags); +int lustre_shrink_msg(struct lustre_msg *msg, int segment, + unsigned int newlen, int move_data); +void lustre_free_reply_state(struct ptlrpc_reply_state *rs); +int __lustre_unpack_msg(struct lustre_msg *m, int len); +int lustre_msg_hdr_size(__u32 magic, int count); +int lustre_msg_size(__u32 magic, int count, __u32 *lengths); +int lustre_msg_size_v2(int count, __u32 *lengths); +int lustre_packed_msg_size(struct lustre_msg *msg); +int lustre_msg_early_size(void); +void *lustre_msg_buf_v2(struct lustre_msg_v2 *m, int n, int min_size); +void *lustre_msg_buf(struct 
lustre_msg *m, int n, int minlen); +int lustre_msg_buflen(struct lustre_msg *m, int n); +void lustre_msg_set_buflen(struct lustre_msg *m, int n, int len); +int lustre_msg_bufcount(struct lustre_msg *m); +char *lustre_msg_string(struct lustre_msg *m, int n, int max_len); +__u32 lustre_msghdr_get_flags(struct lustre_msg *msg); +void lustre_msghdr_set_flags(struct lustre_msg *msg, __u32 flags); +__u32 lustre_msg_get_flags(struct lustre_msg *msg); +void lustre_msg_add_flags(struct lustre_msg *msg, int flags); +void lustre_msg_set_flags(struct lustre_msg *msg, int flags); +void lustre_msg_clear_flags(struct lustre_msg *msg, int flags); +__u32 lustre_msg_get_op_flags(struct lustre_msg *msg); +void lustre_msg_add_op_flags(struct lustre_msg *msg, int flags); +void lustre_msg_set_op_flags(struct lustre_msg *msg, int flags); +struct lustre_handle *lustre_msg_get_handle(struct lustre_msg *msg); +__u32 lustre_msg_get_type(struct lustre_msg *msg); +__u32 lustre_msg_get_version(struct lustre_msg *msg); +void lustre_msg_add_version(struct lustre_msg *msg, int version); +__u32 lustre_msg_get_opc(struct lustre_msg *msg); +__u64 lustre_msg_get_last_xid(struct lustre_msg *msg); +__u64 lustre_msg_get_last_committed(struct lustre_msg *msg); +__u64 *lustre_msg_get_versions(struct lustre_msg *msg); +__u64 lustre_msg_get_transno(struct lustre_msg *msg); +__u64 lustre_msg_get_slv(struct lustre_msg *msg); +__u32 lustre_msg_get_limit(struct lustre_msg *msg); +void lustre_msg_set_slv(struct lustre_msg *msg, __u64 slv); +void lustre_msg_set_limit(struct lustre_msg *msg, __u64 limit); +int lustre_msg_get_status(struct lustre_msg *msg); +__u32 lustre_msg_get_conn_cnt(struct lustre_msg *msg); +int lustre_msg_is_v1(struct lustre_msg *msg); +__u32 lustre_msg_get_magic(struct lustre_msg *msg); +__u32 lustre_msg_get_timeout(struct lustre_msg *msg); +__u32 lustre_msg_get_service_time(struct lustre_msg *msg); +char *lustre_msg_get_jobid(struct lustre_msg *msg); +__u32 lustre_msg_get_cksum(struct 
lustre_msg *msg); +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(2, 7, 50, 0) +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg, int compat18); +#else +# warning "remove checksum compatibility support for b1_8" +__u32 lustre_msg_calc_cksum(struct lustre_msg *msg); +#endif +void lustre_msg_set_handle(struct lustre_msg *msg,struct lustre_handle *handle); +void lustre_msg_set_type(struct lustre_msg *msg, __u32 type); +void lustre_msg_set_opc(struct lustre_msg *msg, __u32 opc); +void lustre_msg_set_last_xid(struct lustre_msg *msg, __u64 last_xid); +void lustre_msg_set_last_committed(struct lustre_msg *msg,__u64 last_committed); +void lustre_msg_set_versions(struct lustre_msg *msg, __u64 *versions); +void lustre_msg_set_transno(struct lustre_msg *msg, __u64 transno); +void lustre_msg_set_status(struct lustre_msg *msg, __u32 status); +void lustre_msg_set_conn_cnt(struct lustre_msg *msg, __u32 conn_cnt); +void ptlrpc_req_set_repsize(struct ptlrpc_request *req, int count, __u32 *sizes); +void ptlrpc_request_set_replen(struct ptlrpc_request *req); +void lustre_msg_set_timeout(struct lustre_msg *msg, __u32 timeout); +void lustre_msg_set_service_time(struct lustre_msg *msg, __u32 service_time); +void lustre_msg_set_jobid(struct lustre_msg *msg, char *jobid); +void lustre_msg_set_cksum(struct lustre_msg *msg, __u32 cksum); + +static inline void +lustre_shrink_reply(struct ptlrpc_request *req, int segment, + unsigned int newlen, int move_data) +{ + LASSERT(req->rq_reply_state); + LASSERT(req->rq_repmsg); + req->rq_replen = lustre_shrink_msg(req->rq_repmsg, segment, + newlen, move_data); +} +/** @} */ + +/** Change request phase of \a req to \a new_phase */ +static inline void +ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase) +{ + if (req->rq_phase == new_phase) + return; + + if (new_phase == RQ_PHASE_UNREGISTERING) { + req->rq_next_phase = req->rq_phase; + if (req->rq_import) + atomic_inc(&req->rq_import->imp_unregistering); + } + + if (req->rq_phase == 
RQ_PHASE_UNREGISTERING) { + if (req->rq_import) + atomic_dec(&req->rq_import->imp_unregistering); + } + + DEBUG_REQ(D_INFO, req, "move req \"%s\" -> \"%s\"", + ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase)); + + req->rq_phase = new_phase; +} + +/** + * Returns true if request \a req got early reply and hard deadline is not met + */ +static inline int +ptlrpc_client_early(struct ptlrpc_request *req) +{ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > cfs_time_current_sec()) + return 0; + return req->rq_early; +} + +/** + * Returns true if we got real reply from server for this request + */ +static inline int +ptlrpc_client_replied(struct ptlrpc_request *req) +{ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > cfs_time_current_sec()) + return 0; + return req->rq_replied; +} + +/** Returns true if request \a req is in process of receiving server reply */ +static inline int +ptlrpc_client_recv(struct ptlrpc_request *req) +{ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > cfs_time_current_sec()) + return 1; + return req->rq_receiving_reply; +} + +static inline int +ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req) +{ + int rc; + + spin_lock(&req->rq_lock); + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > cfs_time_current_sec()) { + spin_unlock(&req->rq_lock); + return 1; + } + rc = req->rq_receiving_reply || req->rq_must_unlink; + spin_unlock(&req->rq_lock); + return rc; +} + +static inline void +ptlrpc_client_wake_req(struct ptlrpc_request *req) +{ + if (req->rq_set == NULL) + wake_up(&req->rq_reply_waitq); + else + wake_up(&req->rq_set->set_waitq); +} + +static inline void +ptlrpc_rs_addref(struct ptlrpc_reply_state *rs) +{ + LASSERT(atomic_read(&rs->rs_refcount) > 0); + atomic_inc(&rs->rs_refcount); +} + +static inline void +ptlrpc_rs_decref(struct ptlrpc_reply_state *rs) +{ + 
LASSERT(atomic_read(&rs->rs_refcount) > 0); + if (atomic_dec_and_test(&rs->rs_refcount)) + lustre_free_reply_state(rs); +} + +/* Should only be called once per req */ +static inline void ptlrpc_req_drop_rs(struct ptlrpc_request *req) +{ + if (req->rq_reply_state == NULL) + return; /* shouldn't occur */ + ptlrpc_rs_decref(req->rq_reply_state); + req->rq_reply_state = NULL; + req->rq_repmsg = NULL; +} + +static inline __u32 lustre_request_magic(struct ptlrpc_request *req) +{ + return lustre_msg_get_magic(req->rq_reqmsg); +} + +static inline int ptlrpc_req_get_repsize(struct ptlrpc_request *req) +{ + switch (req->rq_reqmsg->lm_magic) { + case LUSTRE_MSG_MAGIC_V2: + return req->rq_reqmsg->lm_repsize; + default: + LASSERTF(0, "incorrect message magic: %08x\n", + req->rq_reqmsg->lm_magic); + return -EFAULT; + } +} + +static inline int ptlrpc_send_limit_expired(struct ptlrpc_request *req) +{ + if (req->rq_delay_limit != 0 && + cfs_time_before(cfs_time_add(req->rq_queued_time, + cfs_time_seconds(req->rq_delay_limit)), + cfs_time_current())) { + return 1; + } + return 0; +} + +static inline int ptlrpc_no_resend(struct ptlrpc_request *req) +{ + if (!req->rq_no_resend && ptlrpc_send_limit_expired(req)) { + spin_lock(&req->rq_lock); + req->rq_no_resend = 1; + spin_unlock(&req->rq_lock); + } + return req->rq_no_resend; +} + +static inline int +ptlrpc_server_get_timeout(struct ptlrpc_service_part *svcpt) +{ + int at = AT_OFF ? 
0 : at_get(&svcpt->scp_at_estimate); + + return svcpt->scp_service->srv_watchdog_factor * + max_t(int, at, obd_timeout); +} + +static inline struct ptlrpc_service * +ptlrpc_req2svc(struct ptlrpc_request *req) +{ + LASSERT(req->rq_rqbd != NULL); + return req->rq_rqbd->rqbd_svcpt->scp_service; +} + +/* ldlm/ldlm_lib.c */ +/** + * Target client logic + * @{ + */ +int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg); +int client_obd_cleanup(struct obd_device *obddev); +int client_connect_import(const struct lu_env *env, + struct obd_export **exp, struct obd_device *obd, + struct obd_uuid *cluuid, struct obd_connect_data *, + void *localdata); +int client_disconnect_export(struct obd_export *exp); +int client_import_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority); +int client_import_del_conn(struct obd_import *imp, struct obd_uuid *uuid); +int client_import_find_conn(struct obd_import *imp, lnet_nid_t peer, + struct obd_uuid *uuid); +int import_set_conn_priority(struct obd_import *imp, struct obd_uuid *uuid); +void client_destroy_import(struct obd_import *imp); +/** @} */ + + +/* ptlrpc/pinger.c */ +/** + * Pinger API (client side only) + * @{ + */ +enum timeout_event { + TIMEOUT_GRANT = 1 +}; +struct timeout_item; +typedef int (*timeout_cb_t)(struct timeout_item *, void *); +int ptlrpc_pinger_add_import(struct obd_import *imp); +int ptlrpc_pinger_del_import(struct obd_import *imp); +int ptlrpc_add_timeout_client(int time, enum timeout_event event, + timeout_cb_t cb, void *data, + struct list_head *obd_list); +int ptlrpc_del_timeout_client(struct list_head *obd_list, + enum timeout_event event); +struct ptlrpc_request * ptlrpc_prep_ping(struct obd_import *imp); +int ptlrpc_obd_ping(struct obd_device *obd); +cfs_time_t ptlrpc_suspend_wakeup_time(void); +void ping_evictor_start(void); +void ping_evictor_stop(void); +int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req); +void ptlrpc_pinger_ir_up(void); +void 
ptlrpc_pinger_ir_down(void); +/** @} */ +int ptlrpc_pinger_suppress_pings(void); + +/* ptlrpc daemon bind policy */ +typedef enum { + /* all ptlrpcd threads are free mode */ + PDB_POLICY_NONE = 1, + /* all ptlrpcd threads are bound mode */ + PDB_POLICY_FULL = 2, + /* <free1 bound1> <free2 bound2> ... <freeN boundN> */ + PDB_POLICY_PAIR = 3, + /* <free1 bound1> <bound1 free2> ... <freeN boundN> <boundN free1>, + * means each ptlrpcd[X] has two partners: thread[X-1] and thread[X+1]. + * If kernel supports NUMA, ptlrpcd threads are bound and + * grouped by NUMA node */ + PDB_POLICY_NEIGHBOR = 4, +} pdb_policy_t; + +/* ptlrpc daemon load policy + * It is caller's duty to specify how to push the async RPC into some ptlrpcd + * queue, but it is not enforced, affected by "ptlrpcd_bind_policy". If it is + * "PDB_POLICY_FULL", then the RPC will be processed by the selected ptlrpcd, + * Otherwise, the RPC may be processed by the selected ptlrpcd or its partner, + * depends on which is scheduled firstly, to accelerate the RPC processing. 
*/ +typedef enum { + /* on the same CPU core as the caller */ + PDL_POLICY_SAME = 1, + /* within the same CPU partition, but not the same core as the caller */ + PDL_POLICY_LOCAL = 2, + /* round-robin on all CPU cores, but not the same core as the caller */ + PDL_POLICY_ROUND = 3, + /* the specified CPU core is preferred, but not enforced */ + PDL_POLICY_PREFERRED = 4, +} pdl_policy_t; + +/* ptlrpc/ptlrpcd.c */ +void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force); +void ptlrpcd_free(struct ptlrpcd_ctl *pc); +void ptlrpcd_wake(struct ptlrpc_request *req); +void ptlrpcd_add_req(struct ptlrpc_request *req, pdl_policy_t policy, int idx); +void ptlrpcd_add_rqset(struct ptlrpc_request_set *set); +int ptlrpcd_addref(void); +void ptlrpcd_decref(void); + +/* ptlrpc/lproc_ptlrpc.c */ +/** + * procfs output related functions + * @{ + */ +const char* ll_opcode2str(__u32 opcode); +#ifdef LPROCFS +void ptlrpc_lprocfs_register_obd(struct obd_device *obd); +void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd); +void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes); +#else +static inline void ptlrpc_lprocfs_register_obd(struct obd_device *obd) {} +static inline void ptlrpc_lprocfs_unregister_obd(struct obd_device *obd) {} +static inline void ptlrpc_lprocfs_brw(struct ptlrpc_request *req, int bytes) {} +#endif +/** @} */ + +/* ptlrpc/llog_server.c */ +int llog_origin_handle_open(struct ptlrpc_request *req); +int llog_origin_handle_destroy(struct ptlrpc_request *req); +int llog_origin_handle_prev_block(struct ptlrpc_request *req); +int llog_origin_handle_next_block(struct ptlrpc_request *req); +int llog_origin_handle_read_header(struct ptlrpc_request *req); +int llog_origin_handle_close(struct ptlrpc_request *req); +int llog_origin_handle_cancel(struct ptlrpc_request *req); + +/* ptlrpc/llog_client.c */ +extern struct llog_operations llog_client_ops; + +/** @} net */ + +#endif +/** @} PtlRPC */ diff --git a/drivers/staging/lustre/lustre/include/lustre_param.h 
b/drivers/staging/lustre/lustre/include/lustre_param.h new file mode 100644 index 000000000000..ed654684cb64 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_param.h @@ -0,0 +1,121 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/include/lustre_param.h + * + * User-settable parameter keys + * + * Author: Nathan Rutman <nathan@clusterfs.com> + */ + +#ifndef _LUSTRE_PARAM_H +#define _LUSTRE_PARAM_H + +/** \defgroup param param + * + * @{ + */ + +/* For interoperability */ +struct cfg_interop_param { + char *old_param; + char *new_param; +}; + +/* obd_config.c */ +int class_find_param(char *buf, char *key, char **valp); +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr); +int class_get_next_param(char **params, char *copy); +int class_match_param(char *buf, char *key, char **valp); +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh); +int class_parse_net(char *buf, __u32 *net, char **endh); +int class_match_nid(char *buf, char *key, lnet_nid_t nid); +int class_match_net(char *buf, char *key, __u32 net); +/* obd_mount.c */ +int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, + char *s1, char *s2, char *s3, char *s4); + + + +/****************** User-settable parameter keys *********************/ +/* e.g. + tunefs.lustre --param="failover.node=192.168.0.13@tcp0" /dev/sda + lctl conf_param testfs-OST0000 failover.node=3@elan,192.168.0.3@tcp0 + ... testfs-MDT0000.lov.stripesize=4M + ... testfs-OST0000.ost.client_cache_seconds=15 + ... testfs.sys.timeout=<secs> + ... 
testfs.llite.max_read_ahead_mb=16 +*/ + +/* System global or special params not handled in obd's proc + * See mgs_write_log_sys() + */ +#define PARAM_TIMEOUT "timeout=" /* global */ +#define PARAM_LDLM_TIMEOUT "ldlm_timeout=" /* global */ +#define PARAM_AT_MIN "at_min=" /* global */ +#define PARAM_AT_MAX "at_max=" /* global */ +#define PARAM_AT_EXTRA "at_extra=" /* global */ +#define PARAM_AT_EARLY_MARGIN "at_early_margin=" /* global */ +#define PARAM_AT_HISTORY "at_history=" /* global */ +#define PARAM_JOBID_VAR "jobid_var=" /* global */ +#define PARAM_MGSNODE "mgsnode=" /* only at mounttime */ +#define PARAM_FAILNODE "failover.node=" /* add failover nid */ +#define PARAM_FAILMODE "failover.mode=" /* initial mount only */ +#define PARAM_ACTIVE "active=" /* activate/deactivate */ +#define PARAM_NETWORK "network=" /* bind on nid */ +#define PARAM_ID_UPCALL "identity_upcall=" /* identity upcall */ + +/* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */ +#define PARAM_OST "ost." +#define PARAM_OSC "osc." +#define PARAM_MDT "mdt." +#define PARAM_MDD "mdd." +#define PARAM_MDC "mdc." +#define PARAM_LLITE "llite." +#define PARAM_LOV "lov." +#define PARAM_LOD "lod." +#define PARAM_OSP "osp." +#define PARAM_SYS "sys." /* global */ +#define PARAM_SRPC "srpc." +#define PARAM_SRPC_FLVR "srpc.flavor." +#define PARAM_SRPC_UDESC "srpc.udesc.cli2mdt" +#define PARAM_SEC "security." +#define PARAM_QUOTA "quota." /* global */ + +/** @} param */ + +#endif /* _LUSTRE_PARAM_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre_quota.h b/drivers/staging/lustre/lustre/include/lustre_quota.h new file mode 100644 index 000000000000..1c3041f50049 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_quota.h @@ -0,0 +1,239 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011, 2012, Intel Corporation. + * Use is subject to license terms. + */ + +#ifndef _LUSTRE_QUOTA_H +#define _LUSTRE_QUOTA_H + +/** \defgroup quota quota + * + */ + +#include <linux/lustre_quota.h> + +#include <dt_object.h> +#include <lustre_fid.h> +#include <lustre_dlm.h> + +#ifndef MAX_IQ_TIME +#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ +#endif + +#ifndef MAX_DQ_TIME +#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ +#endif + +struct lquota_id_info; +struct lquota_trans; + +/* Gather all quota record types in a union that can be used to read any record + * from disk. All fields of these records must be 64-bit aligned, otherwise the + * OSD layer may swab them incorrectly. 
*/ +union lquota_rec { + struct lquota_glb_rec lqr_glb_rec; + struct lquota_slv_rec lqr_slv_rec; + struct lquota_acct_rec lqr_acct_rec; +}; + +/* Index features supported by the global index objects + * Only used for migration purpose and should be removed once on-disk migration + * is no longer needed */ +extern struct dt_index_features dt_quota_iusr_features; +extern struct dt_index_features dt_quota_busr_features; +extern struct dt_index_features dt_quota_igrp_features; +extern struct dt_index_features dt_quota_bgrp_features; + +/* Name used in the configuration logs to identify the default metadata pool + * (composed of all the MDTs, with pool ID 0) and the default data pool (all + * the OSTs, with pool ID 0 too). */ +#define QUOTA_METAPOOL_NAME "mdt=" +#define QUOTA_DATAPOOL_NAME "ost=" + +/* + * Quota Master Target support + */ + +/* Request handlers for quota master operations. + * This is used by the MDT to pass quota/lock requests to the quota master + * target. This won't be needed any more once the QMT is a real target and + * does not rely any more on the MDT service threads and namespace. */ +struct qmt_handlers { + /* Handle quotactl request from client. */ + int (*qmth_quotactl)(const struct lu_env *, struct lu_device *, + struct obd_quotactl *); + + /* Handle dqacq/dqrel request from slave. 
*/ + int (*qmth_dqacq)(const struct lu_env *, struct lu_device *, + struct ptlrpc_request *); + + /* LDLM intent policy associated with quota locks */ + int (*qmth_intent_policy)(const struct lu_env *, struct lu_device *, + struct ptlrpc_request *, struct ldlm_lock **, + int); + + /* Initialize LVB of ldlm resource associated with quota objects */ + int (*qmth_lvbo_init)(struct lu_device *, struct ldlm_resource *); + + /* Update LVB of ldlm resource associated with quota objects */ + int (*qmth_lvbo_update)(struct lu_device *, struct ldlm_resource *, + struct ptlrpc_request *, int); + + /* Return size of LVB to be packed in ldlm message */ + int (*qmth_lvbo_size)(struct lu_device *, struct ldlm_lock *); + + /* Fill request buffer with lvb */ + int (*qmth_lvbo_fill)(struct lu_device *, struct ldlm_lock *, void *, + int); + + /* Free lvb associated with ldlm resource */ + int (*qmth_lvbo_free)(struct lu_device *, struct ldlm_resource *); +}; + +/* actual handlers are defined in lustre/quota/qmt_handler.c */ +extern struct qmt_handlers qmt_hdls; + +/* + * Quota enforcement support on slaves + */ + +struct qsd_instance; + +/* The quota slave feature is implemented under the form of a library. + * The API is the following: + * + * - qsd_init(): the user (mostly the OSD layer) should first allocate a qsd + * instance via qsd_init(). This creates all required structures + * to manage quota enforcement for this target and performs all + * low-level initialization which does not involve any lustre + * object. qsd_init() should typically be called when the OSD + * is being set up. + * + * - qsd_prepare(): This sets up on-disk objects associated with the quota slave + * feature and initiates the quota reintegration procedure if + * needed. qsd_prepare() should typically be called when + * ->ldo_prepare is invoked. + * + * - qsd_start(): a qsd instance should be started once recovery is completed + * (i.e. when ->ldo_recovery_complete is called). 
This is used + * to notify the qsd layer that quota should now be enforced + * again via the qsd_op_begin/end functions. The last step of the + * reintegration procedure (namely usage reconciliation) will be + * completed during start. + * + * - qsd_fini(): is used to release a qsd_instance structure allocated with + * qsd_init(). This releases all quota slave objects and frees the + * structures associated with the qsd_instance. + * + * - qsd_op_begin(): is used to enforce quota; it must be called in the + * declaration of each operation. qsd_op_end() should then be + * invoked later once all operations have been completed in + * order to release/adjust the quota space. + * Running qsd_op_begin() before qsd_start() isn't fatal and + * will return success. + * Once qsd_start() has been run, qsd_op_begin() will block + * until the reintegration procedure is completed. + * + * - qsd_op_end(): performs the post operation quota processing. This must be + * called after the operation transaction stopped. + * While qsd_op_begin() must be invoked each time a new + * operation is declared, qsd_op_end() should be called only + * once for the whole transaction. + * + * - qsd_op_adjust(): triggers pre-acquire/release if necessary. + * + * Below are the function prototypes to be used by OSD layer to manage quota + * enforcement. Arguments are documented where each function is defined. 
*/ + +struct qsd_instance *qsd_init(const struct lu_env *, char *, struct dt_device *, + proc_dir_entry_t *); +int qsd_prepare(const struct lu_env *, struct qsd_instance *); +int qsd_start(const struct lu_env *, struct qsd_instance *); +void qsd_fini(const struct lu_env *, struct qsd_instance *); +int qsd_op_begin(const struct lu_env *, struct qsd_instance *, + struct lquota_trans *, struct lquota_id_info *, int *); +void qsd_op_end(const struct lu_env *, struct qsd_instance *, + struct lquota_trans *); +void qsd_op_adjust(const struct lu_env *, struct qsd_instance *, + union lquota_id *, int); +/* This is exported for the ldiskfs quota migration only, + * see convert_quota_file() */ +int lquota_disk_write_glb(const struct lu_env *, struct dt_object *, + __u64, struct lquota_glb_rec *); + +/* + * Quota information attached to a transaction + */ + +struct lquota_entry; + +struct lquota_id_info { + /* quota identifier */ + union lquota_id lqi_id; + + /* USRQUOTA or GRPQUOTA for now, could be expanded for + * directory quota or other types later. */ + int lqi_type; + + /* inodes or kbytes to be consumed or released, it could + * be negative when releasing space. */ + long long lqi_space; + + /* quota slave entry structure associated with this ID */ + struct lquota_entry *lqi_qentry; + + /* whether we are reporting blocks or inodes */ + bool lqi_is_blk; +}; + +/* Since we enforce only inode quota in meta pool (MDTs), and block quota in + * data pool (OSTs), there are at most 4 quota ids being enforced in a single + * transaction, which is chown transaction: + * original uid and gid, new uid and gid. + * + * This value might need to be revised when directory quota is added. 
*/ +#define QUOTA_MAX_TRANSIDS 4 + +/* all qids involved in a single transaction */ +struct lquota_trans { + unsigned short lqt_id_cnt; + struct lquota_id_info lqt_ids[QUOTA_MAX_TRANSIDS]; +}; + +/* flags for quota local enforcement */ +#define QUOTA_FL_OVER_USRQUOTA 0x01 +#define QUOTA_FL_OVER_GRPQUOTA 0x02 +#define QUOTA_FL_SYNC 0x04 + +#define IS_LQUOTA_RES(res) \ + (res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA || \ + res->lr_name.name[LUSTRE_RES_ID_SEQ_OFF] == FID_SEQ_QUOTA_GLB) + +/* helper function used by MDT & OFD to retrieve quota accounting information + * on slave */ +int lquotactl_slv(const struct lu_env *, struct dt_device *, + struct obd_quotactl *); +/** @} quota */ +#endif /* _LUSTRE_QUOTA_H */ diff --git a/drivers/staging/lustre/lustre/include/lustre_req_layout.h b/drivers/staging/lustre/lustre/include/lustre_req_layout.h new file mode 100644 index 000000000000..f4d3820865f1 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_req_layout.h @@ -0,0 +1,334 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lustre_req_layout.h + * + * Lustre Metadata Target (mdt) request handler + * + * Author: Nikita Danilov <nikita@clusterfs.com> + */ + +#ifndef _LUSTRE_REQ_LAYOUT_H__ +#define _LUSTRE_REQ_LAYOUT_H__ + +/** \defgroup req_layout req_layout + * + * @{ + */ + +struct req_msg_field; +struct req_format; +struct req_capsule; + +struct ptlrpc_request; + +enum req_location { + RCL_CLIENT, + RCL_SERVER, + RCL_NR +}; + +/* Maximal number of fields (buffers) in a request message. 
*/ +#define REQ_MAX_FIELD_NR 9 + +struct req_capsule { + struct ptlrpc_request *rc_req; + const struct req_format *rc_fmt; + enum req_location rc_loc; + __u32 rc_area[RCL_NR][REQ_MAX_FIELD_NR]; +}; + +#if !defined(__REQ_LAYOUT_USER__) + +/* struct ptlrpc_request, lustre_msg* */ +#include <lustre_net.h> + +void req_capsule_init(struct req_capsule *pill, struct ptlrpc_request *req, + enum req_location location); +void req_capsule_fini(struct req_capsule *pill); + +void req_capsule_set(struct req_capsule *pill, const struct req_format *fmt); +void req_capsule_client_dump(struct req_capsule *pill); +void req_capsule_server_dump(struct req_capsule *pill); +void req_capsule_init_area(struct req_capsule *pill); +int req_capsule_filled_sizes(struct req_capsule *pill, enum req_location loc); +int req_capsule_server_pack(struct req_capsule *pill); + +void *req_capsule_client_get(struct req_capsule *pill, + const struct req_msg_field *field); +void *req_capsule_client_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber); +void *req_capsule_client_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len); +void *req_capsule_server_get(struct req_capsule *pill, + const struct req_msg_field *field); +void *req_capsule_server_sized_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len); +void *req_capsule_server_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + void *swabber); +void *req_capsule_server_sized_swab_get(struct req_capsule *pill, + const struct req_msg_field *field, + int len, void *swabber); +const void *req_capsule_other_get(struct req_capsule *pill, + const struct req_msg_field *field); + +void req_capsule_set_size(struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc, int size); +int req_capsule_get_size(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +int 
req_capsule_msg_size(struct req_capsule *pill, enum req_location loc); +int req_capsule_fmt_size(__u32 magic, const struct req_format *fmt, + enum req_location loc); +void req_capsule_extend(struct req_capsule *pill, const struct req_format *fmt); + +int req_capsule_has_field(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +int req_capsule_field_present(const struct req_capsule *pill, + const struct req_msg_field *field, + enum req_location loc); +void req_capsule_shrink(struct req_capsule *pill, + const struct req_msg_field *field, + unsigned int newlen, + enum req_location loc); +int req_capsule_server_grow(struct req_capsule *pill, + const struct req_msg_field *field, + unsigned int newlen); +int req_layout_init(void); +void req_layout_fini(void); + +/* __REQ_LAYOUT_USER__ */ +#endif + +extern struct req_format RQF_OBD_PING; +extern struct req_format RQF_OBD_SET_INFO; +extern struct req_format RQF_SEC_CTX; +extern struct req_format RQF_OBD_IDX_READ; +/* MGS req_format */ +extern struct req_format RQF_MGS_TARGET_REG; +extern struct req_format RQF_MGS_SET_INFO; +extern struct req_format RQF_MGS_CONFIG_READ; +/* fid/fld req_format */ +extern struct req_format RQF_SEQ_QUERY; +extern struct req_format RQF_FLD_QUERY; +/* MDS req_format */ +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_STATFS; +extern struct req_format RQF_MDS_GETSTATUS; +extern struct req_format RQF_MDS_SYNC; +extern struct req_format RQF_MDS_GETXATTR; +extern struct req_format RQF_MDS_GETATTR; +extern struct req_format RQF_UPDATE_OBJ; + +/* + * This is format of direct (non-intent) MDS_GETATTR_NAME request. 
+ */ +extern struct req_format RQF_MDS_GETATTR_NAME; +extern struct req_format RQF_MDS_CLOSE; +extern struct req_format RQF_MDS_PIN; +extern struct req_format RQF_MDS_UNPIN; +extern struct req_format RQF_MDS_CONNECT; +extern struct req_format RQF_MDS_DISCONNECT; +extern struct req_format RQF_MDS_GET_INFO; +extern struct req_format RQF_MDS_READPAGE; +extern struct req_format RQF_MDS_WRITEPAGE; +extern struct req_format RQF_MDS_IS_SUBDIR; +extern struct req_format RQF_MDS_DONE_WRITING; +extern struct req_format RQF_MDS_REINT; +extern struct req_format RQF_MDS_REINT_CREATE; +extern struct req_format RQF_MDS_REINT_CREATE_RMT_ACL; +extern struct req_format RQF_MDS_REINT_CREATE_SLAVE; +extern struct req_format RQF_MDS_REINT_CREATE_SYM; +extern struct req_format RQF_MDS_REINT_OPEN; +extern struct req_format RQF_MDS_REINT_UNLINK; +extern struct req_format RQF_MDS_REINT_LINK; +extern struct req_format RQF_MDS_REINT_RENAME; +extern struct req_format RQF_MDS_REINT_SETATTR; +extern struct req_format RQF_MDS_REINT_SETXATTR; +extern struct req_format RQF_MDS_QUOTACHECK; +extern struct req_format RQF_MDS_QUOTACTL; +extern struct req_format RQF_QC_CALLBACK; +extern struct req_format RQF_QUOTA_DQACQ; +extern struct req_format RQF_MDS_SWAP_LAYOUTS; +/* MDS hsm formats */ +extern struct req_format RQF_MDS_HSM_STATE_GET; +extern struct req_format RQF_MDS_HSM_STATE_SET; +extern struct req_format RQF_MDS_HSM_ACTION; +extern struct req_format RQF_MDS_HSM_PROGRESS; +extern struct req_format RQF_MDS_HSM_CT_REGISTER; +extern struct req_format RQF_MDS_HSM_CT_UNREGISTER; +extern struct req_format RQF_MDS_HSM_REQUEST; +/* OST req_format */ +extern struct req_format RQF_OST_CONNECT; +extern struct req_format RQF_OST_DISCONNECT; +extern struct req_format RQF_OST_QUOTACHECK; +extern struct req_format RQF_OST_QUOTACTL; +extern struct req_format RQF_OST_GETATTR; +extern struct req_format RQF_OST_SETATTR; +extern struct req_format RQF_OST_CREATE; +extern struct req_format RQF_OST_PUNCH; +extern 
struct req_format RQF_OST_SYNC; +extern struct req_format RQF_OST_DESTROY; +extern struct req_format RQF_OST_BRW_READ; +extern struct req_format RQF_OST_BRW_WRITE; +extern struct req_format RQF_OST_STATFS; +extern struct req_format RQF_OST_SET_GRANT_INFO; +extern struct req_format RQF_OST_GET_INFO_GENERIC; +extern struct req_format RQF_OST_GET_INFO_LAST_ID; +extern struct req_format RQF_OST_GET_INFO_LAST_FID; +extern struct req_format RQF_OST_SET_INFO_LAST_FID; +extern struct req_format RQF_OST_GET_INFO_FIEMAP; + +/* LDLM req_format */ +extern struct req_format RQF_LDLM_ENQUEUE; +extern struct req_format RQF_LDLM_ENQUEUE_LVB; +extern struct req_format RQF_LDLM_CONVERT; +extern struct req_format RQF_LDLM_INTENT; +extern struct req_format RQF_LDLM_INTENT_BASIC; +extern struct req_format RQF_LDLM_INTENT_LAYOUT; +extern struct req_format RQF_LDLM_INTENT_GETATTR; +extern struct req_format RQF_LDLM_INTENT_OPEN; +extern struct req_format RQF_LDLM_INTENT_CREATE; +extern struct req_format RQF_LDLM_INTENT_UNLINK; +extern struct req_format RQF_LDLM_INTENT_QUOTA; +extern struct req_format RQF_LDLM_CANCEL; +extern struct req_format RQF_LDLM_CALLBACK; +extern struct req_format RQF_LDLM_CP_CALLBACK; +extern struct req_format RQF_LDLM_BL_CALLBACK; +extern struct req_format RQF_LDLM_GL_CALLBACK; +extern struct req_format RQF_LDLM_GL_DESC_CALLBACK; +/* LOG req_format */ +extern struct req_format RQF_LOG_CANCEL; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_CREATE; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_DESTROY; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_NEXT_BLOCK; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_PREV_BLOCK; +extern struct req_format RQF_LLOG_ORIGIN_HANDLE_READ_HEADER; +extern struct req_format RQF_LLOG_ORIGIN_CONNECT; + +extern struct req_msg_field RMF_GENERIC_DATA; +extern struct req_msg_field RMF_PTLRPC_BODY; +extern struct req_msg_field RMF_MDT_BODY; +extern struct req_msg_field RMF_MDT_EPOCH; +extern struct req_msg_field RMF_OBD_STATFS; 
+extern struct req_msg_field RMF_NAME; +extern struct req_msg_field RMF_SYMTGT; +extern struct req_msg_field RMF_TGTUUID; +extern struct req_msg_field RMF_CLUUID; +extern struct req_msg_field RMF_SETINFO_VAL; +extern struct req_msg_field RMF_SETINFO_KEY; +extern struct req_msg_field RMF_GETINFO_VAL; +extern struct req_msg_field RMF_GETINFO_VALLEN; +extern struct req_msg_field RMF_GETINFO_KEY; +extern struct req_msg_field RMF_IDX_INFO; + +/* + * connection handle received in MDS_CONNECT request. + */ +extern struct req_msg_field RMF_CONN; +extern struct req_msg_field RMF_CONNECT_DATA; +extern struct req_msg_field RMF_DLM_REQ; +extern struct req_msg_field RMF_DLM_REP; +extern struct req_msg_field RMF_DLM_LVB; +extern struct req_msg_field RMF_DLM_GL_DESC; +extern struct req_msg_field RMF_LDLM_INTENT; +extern struct req_msg_field RMF_LAYOUT_INTENT; +extern struct req_msg_field RMF_MDT_MD; +extern struct req_msg_field RMF_REC_REINT; +extern struct req_msg_field RMF_EADATA; +extern struct req_msg_field RMF_ACL; +extern struct req_msg_field RMF_LOGCOOKIES; +extern struct req_msg_field RMF_CAPA1; +extern struct req_msg_field RMF_CAPA2; +extern struct req_msg_field RMF_OBD_QUOTACHECK; +extern struct req_msg_field RMF_OBD_QUOTACTL; +extern struct req_msg_field RMF_QUOTA_BODY; +extern struct req_msg_field RMF_STRING; +extern struct req_msg_field RMF_SWAP_LAYOUTS; +extern struct req_msg_field RMF_MDS_HSM_PROGRESS; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; +extern struct req_msg_field RMF_MDS_HSM_USER_ITEM; +extern struct req_msg_field RMF_MDS_HSM_ARCHIVE; +extern struct req_msg_field RMF_HSM_USER_STATE; +extern struct req_msg_field RMF_HSM_STATE_SET; +extern struct req_msg_field RMF_MDS_HSM_CURRENT_ACTION; +extern struct req_msg_field RMF_MDS_HSM_REQUEST; + +/* seq-mgr fields */ +extern struct req_msg_field RMF_SEQ_OPC; +extern struct req_msg_field RMF_SEQ_RANGE; +extern struct req_msg_field RMF_FID_SPACE; + +/* FLD fields */ +extern struct req_msg_field RMF_FLD_OPC; 
+extern struct req_msg_field RMF_FLD_MDFLD; + +extern struct req_msg_field RMF_LLOGD_BODY; +extern struct req_msg_field RMF_LLOG_LOG_HDR; +extern struct req_msg_field RMF_LLOGD_CONN_BODY; + +extern struct req_msg_field RMF_MGS_TARGET_INFO; +extern struct req_msg_field RMF_MGS_SEND_PARAM; + +extern struct req_msg_field RMF_OST_BODY; +extern struct req_msg_field RMF_OBD_IOOBJ; +extern struct req_msg_field RMF_OBD_ID; +extern struct req_msg_field RMF_FID; +extern struct req_msg_field RMF_NIOBUF_REMOTE; +extern struct req_msg_field RMF_RCS; +extern struct req_msg_field RMF_FIEMAP_KEY; +extern struct req_msg_field RMF_FIEMAP_VAL; +extern struct req_msg_field RMF_OST_ID; + +/* MGS config read message format */ +extern struct req_msg_field RMF_MGS_CONFIG_BODY; +extern struct req_msg_field RMF_MGS_CONFIG_RES; + +/* generic uint32 */ +extern struct req_msg_field RMF_U32; + +/* OBJ update format */ +extern struct req_msg_field RMF_UPDATE; +extern struct req_msg_field RMF_UPDATE_REPLY; +/** @} req_layout */ + +#endif /* _LUSTRE_REQ_LAYOUT_H__ */ diff --git a/drivers/staging/lustre/lustre/include/lustre_sec.h b/drivers/staging/lustre/lustre/include/lustre_sec.h new file mode 100644 index 000000000000..9e0908e1c4d6 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_sec.h @@ -0,0 +1,1145 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#ifndef _LUSTRE_SEC_H_ +#define _LUSTRE_SEC_H_ + +/** \defgroup sptlrpc sptlrpc + * + * @{ + */ + +/* + * to avoid include + */ +struct obd_import; +struct obd_export; +struct ptlrpc_request; +struct ptlrpc_reply_state; +struct ptlrpc_bulk_desc; +struct brw_page; +/* Linux specific */ +struct key; +struct seq_file; + +/* + * forward declaration + */ +struct ptlrpc_sec_policy; +struct ptlrpc_sec_cops; +struct ptlrpc_sec_sops; +struct ptlrpc_sec; +struct ptlrpc_svc_ctx; +struct ptlrpc_cli_ctx; +struct ptlrpc_ctx_ops; + +/** + * \addtogroup flavor flavor + * + * RPC flavor is represented by a 32 bits integer. Currently the high 12 bits + * are unused, must be set to 0 for future expansion. 
+ * <pre> + * ------------------------------------------------------------------------ + * | 4b (bulk svc) | 4b (bulk type) | 4b (svc) | 4b (mech) | 4b (policy) | + * ------------------------------------------------------------------------ + * </pre> + * + * @{ + */ + +/* + * flavor constants + */ +enum sptlrpc_policy { + SPTLRPC_POLICY_NULL = 0, + SPTLRPC_POLICY_PLAIN = 1, + SPTLRPC_POLICY_GSS = 2, + SPTLRPC_POLICY_MAX, +}; + +enum sptlrpc_mech_null { + SPTLRPC_MECH_NULL = 0, + SPTLRPC_MECH_NULL_MAX, +}; + +enum sptlrpc_mech_plain { + SPTLRPC_MECH_PLAIN = 0, + SPTLRPC_MECH_PLAIN_MAX, +}; + +enum sptlrpc_mech_gss { + SPTLRPC_MECH_GSS_NULL = 0, + SPTLRPC_MECH_GSS_KRB5 = 1, + SPTLRPC_MECH_GSS_MAX, +}; + +enum sptlrpc_service_type { + SPTLRPC_SVC_NULL = 0, /**< no security */ + SPTLRPC_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_SVC_INTG = 2, /**< integrity */ + SPTLRPC_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_SVC_MAX, +}; + +enum sptlrpc_bulk_type { + SPTLRPC_BULK_DEFAULT = 0, /**< follow rpc flavor */ + SPTLRPC_BULK_HASH = 1, /**< hash integrity */ + SPTLRPC_BULK_MAX, +}; + +enum sptlrpc_bulk_service { + SPTLRPC_BULK_SVC_NULL = 0, /**< no security */ + SPTLRPC_BULK_SVC_AUTH = 1, /**< authentication only */ + SPTLRPC_BULK_SVC_INTG = 2, /**< integrity */ + SPTLRPC_BULK_SVC_PRIV = 3, /**< privacy */ + SPTLRPC_BULK_SVC_MAX, +}; + +/* + * compose/extract macros + */ +#define FLVR_POLICY_OFFSET (0) +#define FLVR_MECH_OFFSET (4) +#define FLVR_SVC_OFFSET (8) +#define FLVR_BULK_TYPE_OFFSET (12) +#define FLVR_BULK_SVC_OFFSET (16) + +#define MAKE_FLVR(policy, mech, svc, btype, bsvc) \ + (((__u32)(policy) << FLVR_POLICY_OFFSET) | \ + ((__u32)(mech) << FLVR_MECH_OFFSET) | \ + ((__u32)(svc) << FLVR_SVC_OFFSET) | \ + ((__u32)(btype) << FLVR_BULK_TYPE_OFFSET) | \ + ((__u32)(bsvc) << FLVR_BULK_SVC_OFFSET)) + +/* + * extraction + */ +#define SPTLRPC_FLVR_POLICY(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xF) +#define SPTLRPC_FLVR_MECH(flavor) \ + 
((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xF) +#define SPTLRPC_FLVR_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_SVC_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_TYPE(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_TYPE_OFFSET) & 0xF) +#define SPTLRPC_FLVR_BULK_SVC(flavor) \ + ((((__u32)(flavor)) >> FLVR_BULK_SVC_OFFSET) & 0xF) + +#define SPTLRPC_FLVR_BASE(flavor) \ + ((((__u32)(flavor)) >> FLVR_POLICY_OFFSET) & 0xFFF) +#define SPTLRPC_FLVR_BASE_SUB(flavor) \ + ((((__u32)(flavor)) >> FLVR_MECH_OFFSET) & 0xFF) + +/* + * gss subflavors + */ +#define MAKE_BASE_SUBFLVR(mech, svc) \ + ((__u32)(mech) | \ + ((__u32)(svc) << (FLVR_SVC_OFFSET - FLVR_MECH_OFFSET))) + +#define SPTLRPC_SUBFLVR_KRB5N \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_NULL) +#define SPTLRPC_SUBFLVR_KRB5A \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_AUTH) +#define SPTLRPC_SUBFLVR_KRB5I \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_INTG) +#define SPTLRPC_SUBFLVR_KRB5P \ + MAKE_BASE_SUBFLVR(SPTLRPC_MECH_GSS_KRB5, SPTLRPC_SVC_PRIV) + +/* + * "end user" flavors + */ +#define SPTLRPC_FLVR_NULL \ + MAKE_FLVR(SPTLRPC_POLICY_NULL, \ + SPTLRPC_MECH_NULL, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_PLAIN \ + MAKE_FLVR(SPTLRPC_POLICY_PLAIN, \ + SPTLRPC_MECH_PLAIN, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_HASH, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_KRB5N \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_NULL, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5A \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_AUTH, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_NULL) +#define SPTLRPC_FLVR_KRB5I \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_INTG, \ + SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_INTG) +#define SPTLRPC_FLVR_KRB5P \ + MAKE_FLVR(SPTLRPC_POLICY_GSS, \ + SPTLRPC_MECH_GSS_KRB5, \ + SPTLRPC_SVC_PRIV, \ + 
SPTLRPC_BULK_DEFAULT, \ + SPTLRPC_BULK_SVC_PRIV) + +#define SPTLRPC_FLVR_DEFAULT SPTLRPC_FLVR_NULL + +#define SPTLRPC_FLVR_INVALID ((__u32) 0xFFFFFFFF) +#define SPTLRPC_FLVR_ANY ((__u32) 0xFFF00000) + +/** + * extract the useful part from wire flavor + */ +#define WIRE_FLVR(wflvr) (((__u32) (wflvr)) & 0x000FFFFF) + +/** @} flavor */ + +static inline void flvr_set_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + svc, + SPTLRPC_FLVR_BULK_TYPE(*flvr), + SPTLRPC_FLVR_BULK_SVC(*flvr)); +} + +static inline void flvr_set_bulk_svc(__u32 *flvr, __u32 svc) +{ + LASSERT(svc < SPTLRPC_BULK_SVC_MAX); + *flvr = MAKE_FLVR(SPTLRPC_FLVR_POLICY(*flvr), + SPTLRPC_FLVR_MECH(*flvr), + SPTLRPC_FLVR_SVC(*flvr), + SPTLRPC_FLVR_BULK_TYPE(*flvr), + svc); +} + +struct bulk_spec_hash { + __u8 hash_alg; +}; + +/** + * Full description of flavors being used on a ptlrpc connection, include + * both regular RPC and bulk transfer parts. + */ +struct sptlrpc_flavor { + /** + * wire flavor, should be renamed to sf_wire. + */ + __u32 sf_rpc; + /** + * general flags of PTLRPC_SEC_FL_* + */ + __u32 sf_flags; + /** + * rpc flavor specification + */ + union { + /* nothing for now */ + } u_rpc; + /** + * bulk flavor specification + */ + union { + struct bulk_spec_hash hash; + } u_bulk; +}; + +/** + * identify the RPC is generated from what part of Lustre. It's encoded into + * RPC requests and to be checked by ptlrpc service. + */ +enum lustre_sec_part { + LUSTRE_SP_CLI = 0, + LUSTRE_SP_MDT, + LUSTRE_SP_OST, + LUSTRE_SP_MGC, + LUSTRE_SP_MGS, + LUSTRE_SP_ANY = 0xFF +}; + +const char *sptlrpc_part2name(enum lustre_sec_part sp); +enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd); + +/** + * A rule specifies a flavor to be used by a ptlrpc connection between + * two Lustre parts. 
+ */ +struct sptlrpc_rule { + __u32 sr_netid; /* LNET network ID */ + __u8 sr_from; /* sec_part */ + __u8 sr_to; /* sec_part */ + __u16 sr_padding; + struct sptlrpc_flavor sr_flvr; +}; + +/** + * A set of rules in memory. + * + * Rules are generated and stored on MGS, and propagated to MDT, OST, + * and client when needed. + */ +struct sptlrpc_rule_set { + int srs_nslot; + int srs_nrule; + struct sptlrpc_rule *srs_rules; +}; + +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr); +int sptlrpc_flavor_has_bulk(struct sptlrpc_flavor *flvr); + +static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set) +{ + memset(set, 0, sizeof(*set)); +} + +void sptlrpc_rule_set_free(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set); +int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set, + struct sptlrpc_rule *rule); +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf); +void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *set); + +int sptlrpc_process_config(struct lustre_cfg *lcfg); +void sptlrpc_conf_log_start(const char *logname); +void sptlrpc_conf_log_stop(const char *logname); +void sptlrpc_conf_log_update_begin(const char *logname); +void sptlrpc_conf_log_update_end(const char *logname); +void sptlrpc_conf_client_adapt(struct obd_device *obd); +int sptlrpc_conf_target_get_rules(struct obd_device *obd, + struct sptlrpc_rule_set *rset, + int initial); +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *flavor); + +/* The maximum length of security payload. 1024 is enough for Kerberos 5, + * and should be enough for other future mechanisms but not sure. + * Only used by pre-allocated request/reply pool. 
+ */ +#define SPTLRPC_MAX_PAYLOAD (1024) + + +struct vfs_cred { + uint32_t vc_uid; + uint32_t vc_gid; +}; + +struct ptlrpc_ctx_ops { + /** + * To determine whether it's suitable to use the \a ctx for \a vcred. + */ + int (*match) (struct ptlrpc_cli_ctx *ctx, + struct vfs_cred *vcred); + + /** + * To bring the \a ctx uptodate. + */ + int (*refresh) (struct ptlrpc_cli_ctx *ctx); + + /** + * Validate the \a ctx. + */ + int (*validate) (struct ptlrpc_cli_ctx *ctx); + + /** + * Force the \a ctx to die. + */ + void (*die) (struct ptlrpc_cli_ctx *ctx, + int grace); + int (*display) (struct ptlrpc_cli_ctx *ctx, + char *buf, int bufsize); + + /** + * Sign the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message. + * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message with signature. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see null_ctx_sign(), plain_ctx_sign(), gss_cli_ctx_sign(). + */ + int (*sign) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Verify the reply message using \a ctx. + * + * \pre req->rq_repdata point to reply message with signature. + * \pre req->rq_repdata_len is the total reply message length. + * \post req->rq_repmsg point to reply message without signature. + * \post req->rq_replen is the reply message length. + * + * \see null_ctx_verify(), plain_ctx_verify(), gss_cli_ctx_verify(). + */ + int (*verify) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Encrypt the request message using \a ctx. + * + * \pre req->rq_reqmsg point to request message in clear text. + * \pre req->rq_reqlen is the request message length. + * \post req->rq_reqbuf point to request message. + * \post req->rq_reqdata_len is set to the final request message size. + * + * \see gss_cli_ctx_seal(). 
+ */ + int (*seal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Decrypt the reply message using \a ctx. + * + * \pre req->rq_repdata point to encrypted reply message. + * \pre req->rq_repdata_len is the total cipher text length. + * \post req->rq_repmsg point to reply message in clear text. + * \post req->rq_replen is the reply message length in clear text. + * + * \see gss_cli_ctx_unseal(). + */ + int (*unseal) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req); + + /** + * Wrap bulk request data. This is called before wrapping RPC + * request message. + * + * \pre bulk buffer is described by desc->bd_iov and + * desc->bd_iov_count. note for read it's just buffer, no data + * need to be sent; for write it contains data in clear text. + * \post when necessary, ptlrpc_bulk_sec_desc was properly prepared + * (usually inside of RPC request message). + * - encryption: cipher text bulk buffer is described by + * desc->bd_enc_iov and desc->bd_iov_count (currently assume iov + * count remains the same). + * - otherwise: bulk buffer is still desc->bd_iov and + * desc->bd_iov_count. + * + * \return 0: success. + * \return -ve: error code. + * + * \see plain_cli_wrap_bulk(), gss_cli_ctx_wrap_bulk(). + */ + int (*wrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap bulk reply data. This is called after wrapping RPC + * reply message. + * + * \pre bulk buffer is described by desc->bd_iov/desc->bd_enc_iov and + * desc->bd_iov_count, according to wrap_bulk(). + * \post final bulk data in clear text is placed in buffer described + * by desc->bd_iov and desc->bd_iov_count. + * \return +ve nob of actual bulk data in clear text. + * \return -ve error code. + * + * \see plain_cli_unwrap_bulk(), gss_cli_ctx_unwrap_bulk().
+ */ + int (*unwrap_bulk) (struct ptlrpc_cli_ctx *ctx, + struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +#define PTLRPC_CTX_NEW_BIT (0) /* newly created */ +#define PTLRPC_CTX_UPTODATE_BIT (1) /* uptodate */ +#define PTLRPC_CTX_DEAD_BIT (2) /* mark expired gracefully */ +#define PTLRPC_CTX_ERROR_BIT (3) /* fatal error (refresh, etc.) */ +#define PTLRPC_CTX_CACHED_BIT (8) /* in ctx cache (hash etc.) */ +#define PTLRPC_CTX_ETERNAL_BIT (9) /* always valid */ + +#define PTLRPC_CTX_NEW (1 << PTLRPC_CTX_NEW_BIT) +#define PTLRPC_CTX_UPTODATE (1 << PTLRPC_CTX_UPTODATE_BIT) +#define PTLRPC_CTX_DEAD (1 << PTLRPC_CTX_DEAD_BIT) +#define PTLRPC_CTX_ERROR (1 << PTLRPC_CTX_ERROR_BIT) +#define PTLRPC_CTX_CACHED (1 << PTLRPC_CTX_CACHED_BIT) +#define PTLRPC_CTX_ETERNAL (1 << PTLRPC_CTX_ETERNAL_BIT) + +/* + * NOTE(review): the first operand below is PTLRPC_CTX_NEW_BIT (the bit + * index, 0), not PTLRPC_CTX_NEW (1 << 0), so the NEW flag is not covered + * by this mask. cli_ctx_is_refreshed() compares the masked status against + * 0 and may depend on that - confirm with callers before changing it. + */ +#define PTLRPC_CTX_STATUS_MASK (PTLRPC_CTX_NEW_BIT | \ + PTLRPC_CTX_UPTODATE | \ + PTLRPC_CTX_DEAD | \ + PTLRPC_CTX_ERROR) + +struct ptlrpc_cli_ctx { + struct hlist_node cc_cache; /* linked into ctx cache */ + atomic_t cc_refcount; + struct ptlrpc_sec *cc_sec; + struct ptlrpc_ctx_ops *cc_ops; + cfs_time_t cc_expire; /* in seconds */ + unsigned int cc_early_expire:1; + unsigned long cc_flags; + struct vfs_cred cc_vcred; + spinlock_t cc_lock; + struct list_head cc_req_list; /* waiting reqs linked here */ + struct list_head cc_gc_chain; /* linked to gc chain */ +}; + +/** + * client side policy operation vector. + */ +struct ptlrpc_sec_cops { + /** + * Given an \a imp, create and initialize a ptlrpc_sec structure. + * \param ctx service context: + * - regular import: \a ctx should be NULL; + * - reverse import: \a ctx is obtained from incoming request. + * \param flavor specify what flavor to use. + * + * When necessary, policy module is responsible for taking reference + * on the import. + * + * \see null_create_sec(), plain_create_sec(), gss_sec_create_kr().
+ */ + struct ptlrpc_sec * (*create_sec) (struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flavor); + + /** + * Destructor of ptlrpc_sec. When called, refcount has been dropped + * to 0 and all contexts have been destroyed. + * + * \see null_destroy_sec(), plain_destroy_sec(), gss_sec_destroy_kr(). + */ + void (*destroy_sec) (struct ptlrpc_sec *sec); + + /** + * Notify that this ptlrpc_sec is going to die. Optionally, policy + * module is supposed to set sec->ps_dying and whatever necessary + * actions. + * + * \see plain_kill_sec(), gss_sec_kill(). + */ + void (*kill_sec) (struct ptlrpc_sec *sec); + + /** + * Given \a vcred, lookup and/or create its context. The policy module + * is supposed to maintain its own context cache. + * XXX currently \a create and \a remove_dead are always 1, perhaps + * should be removed completely. + * + * \see null_lookup_ctx(), plain_lookup_ctx(), gss_sec_lookup_ctx_kr(). + */ + struct ptlrpc_cli_ctx * (*lookup_ctx) (struct ptlrpc_sec *sec, + struct vfs_cred *vcred, + int create, + int remove_dead); + + /** + * Called when the reference of \a ctx dropped to 0. The policy module + * is supposed to destroy this context or whatever else according to + * its cache maintenance mechanism. + * + * \param sync if zero, we shouldn't wait for the context being + * destroyed completely. + * + * \see plain_release_ctx(), gss_sec_release_ctx_kr(). + */ + void (*release_ctx) (struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx, + int sync); + + /** + * Flush the context cache. + * + * \param uid context of which user, -1 means all contexts. + * \param grace if zero, the PTLRPC_CTX_UPTODATE_BIT of affected + * contexts should be cleared immediately. + * \param force if zero, only idle contexts will be flushed. + * + * \see plain_flush_ctx_cache(), gss_sec_flush_ctx_cache_kr().
+ */ + int (*flush_ctx_cache) + (struct ptlrpc_sec *sec, + uid_t uid, + int grace, + int force); + + /** + * Called periodically by garbage collector to remove dead contexts + * from cache. + * + * \see gss_sec_gc_ctx_kr(). + */ + void (*gc_ctx) (struct ptlrpc_sec *sec); + + /** + * Given a context \a ctx, install a corresponding reverse service + * context on client side. + * XXX currently it's only used by GSS module, maybe we should remove + * this from general API. + */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_sec *sec, + struct ptlrpc_cli_ctx *ctx); + + /** + * To allocate request buffer for \a req. + * + * \pre req->rq_reqmsg == NULL. + * \pre req->rq_reqbuf == NULL, otherwise it must be pre-allocated, + * we are not supposed to free it. + * \post if success, req->rq_reqmsg points to a buffer with size + * at least \a lustre_msg_size. + * + * \see null_alloc_reqbuf(), plain_alloc_reqbuf(), gss_alloc_reqbuf(). + */ + int (*alloc_reqbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free request buffer for \a req. + * + * \pre req->rq_reqbuf != NULL. + * + * \see null_free_reqbuf(), plain_free_reqbuf(), gss_free_reqbuf(). + */ + void (*free_reqbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To allocate reply buffer for \a req. + * + * \pre req->rq_repbuf == NULL. + * \post if success, req->rq_repbuf points to a buffer with size + * req->rq_repbuf_len, the size should be large enough to receive + * reply which is transformed from \a lustre_msg_size of clear text. + * + * \see null_alloc_repbuf(), plain_alloc_repbuf(), gss_alloc_repbuf(). + */ + int (*alloc_repbuf)(struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int lustre_msg_size); + + /** + * To free reply buffer for \a req. + * + * \pre req->rq_repbuf != NULL. + * \post req->rq_repbuf == NULL. + * \post req->rq_repbuf_len == 0. + * + * \see null_free_repbuf(), plain_free_repbuf(), gss_free_repbuf().
+ */ + void (*free_repbuf) (struct ptlrpc_sec *sec, + struct ptlrpc_request *req); + + /** + * To expand the request buffer of \a req, thus the \a segment in + * the request message pointed by req->rq_reqmsg can accommodate + * at least \a newsize of data. + * + * \pre req->rq_reqmsg->lm_buflens[segment] < newsize. + * + * \see null_enlarge_reqbuf(), plain_enlarge_reqbuf(), + * gss_enlarge_reqbuf(). + */ + int (*enlarge_reqbuf) + (struct ptlrpc_sec *sec, + struct ptlrpc_request *req, + int segment, int newsize); + /* + * misc + */ + int (*display) (struct ptlrpc_sec *sec, + struct seq_file *seq); +}; + +/** + * server side policy operation vector. + */ +struct ptlrpc_sec_sops { + /** + * verify an incoming request. + * + * \pre request message is pointed by req->rq_reqbuf, size is + * req->rq_reqdata_len; and the message has been unpacked to + * host byte order. + * + * \retval SECSVC_OK success, req->rq_reqmsg point to request message + * in clear text, size is req->rq_reqlen; req->rq_svc_ctx is set; + * req->rq_sp_from is decoded from request. + * \retval SECSVC_COMPLETE success, the request has been fully + * processed, and reply message has been prepared; req->rq_sp_from is + * decoded from request. + * \retval SECSVC_DROP failed, this request should be dropped. + * + * \see null_accept(), plain_accept(), gss_svc_accept_kr(). + */ + int (*accept) (struct ptlrpc_request *req); + + /** + * Perform security transformation upon reply message. + * + * \pre reply message is pointed by req->rq_reply_state->rs_msg, size + * is req->rq_replen. + * \post req->rs_repdata_len is the final message size. + * \post req->rq_reply_off is set. + * + * \see null_authorize(), plain_authorize(), gss_svc_authorize(). + */ + int (*authorize) (struct ptlrpc_request *req); + + /** + * Invalidate server context \a ctx. + * + * \see gss_svc_invalidate_ctx(). + */ + void (*invalidate_ctx) + (struct ptlrpc_svc_ctx *ctx); + + /** + * Allocate a ptlrpc_reply_state. 
+ * + * \param msgsize size of the reply message in clear text. + * \pre if req->rq_reply_state != NULL, then it's pre-allocated, we + * should simply use it; otherwise we'll responsible for allocating + * a new one. + * \post req->rq_reply_state != NULL; + * \post req->rq_reply_state->rs_msg != NULL; + * + * \see null_alloc_rs(), plain_alloc_rs(), gss_svc_alloc_rs(). + */ + int (*alloc_rs) (struct ptlrpc_request *req, + int msgsize); + + /** + * Free a ptlrpc_reply_state. + */ + void (*free_rs) (struct ptlrpc_reply_state *rs); + + /** + * Release the server context \a ctx. + * + * \see gss_svc_free_ctx(). + */ + void (*free_ctx) (struct ptlrpc_svc_ctx *ctx); + + /** + * Install a reverse context based on the server context \a ctx. + * + * \see gss_svc_install_rctx_kr(). + */ + int (*install_rctx)(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); + + /** + * Prepare buffer for incoming bulk write. + * + * \pre desc->bd_iov and desc->bd_iov_count describes the buffer + * intended to receive the write. + * + * \see gss_svc_prep_bulk(). + */ + int (*prep_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Unwrap the bulk write data. + * + * \see plain_svc_unwrap_bulk(), gss_svc_unwrap_bulk(). + */ + int (*unwrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + + /** + * Wrap the bulk read data. + * + * \see plain_svc_wrap_bulk(), gss_svc_wrap_bulk(). 
+ */ + int (*wrap_bulk) (struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +}; + +struct ptlrpc_sec_policy { + module_t *sp_owner; + char *sp_name; + __u16 sp_policy; /* policy number */ + struct ptlrpc_sec_cops *sp_cops; /* client ops */ + struct ptlrpc_sec_sops *sp_sops; /* server ops */ +}; + +#define PTLRPC_SEC_FL_REVERSE 0x0001 /* reverse sec */ +#define PTLRPC_SEC_FL_ROOTONLY 0x0002 /* treat everyone as root */ +#define PTLRPC_SEC_FL_UDESC 0x0004 /* ship udesc */ +#define PTLRPC_SEC_FL_BULK 0x0008 /* intensive bulk i/o expected */ +#define PTLRPC_SEC_FL_PAG 0x0010 /* PAG mode */ + +/** + * The ptlrpc_sec represents the client side ptlrpc security facilities, + * each obd_import (both regular and reverse import) must associate with + * a ptlrpc_sec. + * + * \see sptlrpc_import_sec_adapt(). + */ +struct ptlrpc_sec { + struct ptlrpc_sec_policy *ps_policy; + atomic_t ps_refcount; + /** statistic only */ + atomic_t ps_nctx; + /** unique identifier */ + int ps_id; + struct sptlrpc_flavor ps_flvr; + enum lustre_sec_part ps_part; + /** after set, no more new context will be created */ + unsigned int ps_dying:1; + /** owning import */ + struct obd_import *ps_import; + spinlock_t ps_lock; + + /* + * garbage collection + */ + struct list_head ps_gc_list; + cfs_time_t ps_gc_interval; /* in seconds */ + cfs_time_t ps_gc_next; /* in seconds */ +}; + +static inline int sec_is_reverse(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_REVERSE); +} + +static inline int sec_is_rootonly(struct ptlrpc_sec *sec) +{ + return (sec->ps_flvr.sf_flags & PTLRPC_SEC_FL_ROOTONLY); +} + + +struct ptlrpc_svc_ctx { + atomic_t sc_refcount; + struct ptlrpc_sec_policy *sc_policy; +}; + +/* + * user identity descriptor + */ +#define LUSTRE_MAX_GROUPS (128) + +struct ptlrpc_user_desc { + __u32 pud_uid; + __u32 pud_gid; + __u32 pud_fsuid; + __u32 pud_fsgid; + __u32 pud_cap; + __u32 pud_ngroups; + __u32 pud_groups[0]; +}; + +/* + * bulk flavors + */ +enum 
sptlrpc_bulk_hash_alg { + BULK_HASH_ALG_NULL = 0, + BULK_HASH_ALG_ADLER32, + BULK_HASH_ALG_CRC32, + BULK_HASH_ALG_MD5, + BULK_HASH_ALG_SHA1, + BULK_HASH_ALG_SHA256, + BULK_HASH_ALG_SHA384, + BULK_HASH_ALG_SHA512, + BULK_HASH_ALG_MAX +}; + +const char * sptlrpc_get_hash_name(__u8 hash_alg); +__u8 sptlrpc_get_hash_alg(const char *algname); + +enum { + BSD_FL_ERR = 1, +}; + +struct ptlrpc_bulk_sec_desc { + __u8 bsd_version; /* 0 */ + __u8 bsd_type; /* SPTLRPC_BULK_XXX */ + __u8 bsd_svc; /* SPTLRPC_BULK_SVC_XXXX */ + __u8 bsd_flags; /* flags */ + __u32 bsd_nob; /* nob of bulk data */ + __u8 bsd_data[0]; /* policy-specific token */ +}; + + +/* + * lprocfs + */ +struct proc_dir_entry; +extern struct proc_dir_entry *sptlrpc_proc_root; + +/* + * round size up to next power of 2, for slab allocation. + * @size must be sane (can't overflow after round up) + */ +static inline int size_roundup_power2(int size) +{ + size--; + size |= size >> 1; + size |= size >> 2; + size |= size >> 4; + size |= size >> 8; + size |= size >> 16; + size++; + return size; +} + +/* + * internal support libraries + */ +void _sptlrpc_enlarge_msg_inplace(struct lustre_msg *msg, + int segment, int newsize); + +/* + * security policies + */ +int sptlrpc_register_policy(struct ptlrpc_sec_policy *policy); +int sptlrpc_unregister_policy(struct ptlrpc_sec_policy *policy); + +__u32 sptlrpc_name2flavor_base(const char *name); +const char *sptlrpc_flavor2name_base(__u32 flvr); +char *sptlrpc_flavor2name_bulk(struct sptlrpc_flavor *sf, + char *buf, int bufsize); +char *sptlrpc_flavor2name(struct sptlrpc_flavor *sf, char *buf, int bufsize); +char *sptlrpc_secflags2str(__u32 flags, char *buf, int bufsize); + +static inline +struct ptlrpc_sec_policy *sptlrpc_policy_get(struct ptlrpc_sec_policy *policy) +{ + __module_get(policy->sp_owner); + return policy; +} + +static inline +void sptlrpc_policy_put(struct ptlrpc_sec_policy *policy) +{ + module_put(policy->sp_owner); +} + +/* + * client credential + */ +static 
inline +unsigned long cli_ctx_status(struct ptlrpc_cli_ctx *ctx) +{ + return (ctx->cc_flags & PTLRPC_CTX_STATUS_MASK); +} + +/* true when the masked status equals PTLRPC_CTX_UPTODATE exactly */ +static inline +int cli_ctx_is_ready(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) == PTLRPC_CTX_UPTODATE); +} + +static inline +int cli_ctx_is_refreshed(struct ptlrpc_cli_ctx *ctx) +{ + return (cli_ctx_status(ctx) != 0); +} + +static inline +int cli_ctx_is_uptodate(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_UPTODATE) != 0); +} + +static inline +int cli_ctx_is_error(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ERROR) != 0); +} + +/* true when either PTLRPC_CTX_DEAD or PTLRPC_CTX_ERROR is set */ +static inline +int cli_ctx_is_dead(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & (PTLRPC_CTX_DEAD | PTLRPC_CTX_ERROR)) != 0); +} + +static inline +int cli_ctx_is_eternal(struct ptlrpc_cli_ctx *ctx) +{ + return ((ctx->cc_flags & PTLRPC_CTX_ETERNAL) != 0); +} + +/* + * sec get/put + */ +struct ptlrpc_sec *sptlrpc_sec_get(struct ptlrpc_sec *sec); +void sptlrpc_sec_put(struct ptlrpc_sec *sec); + +/* + * internal apis which are only used by policy implementation + */ +int sptlrpc_get_next_secid(void); +void sptlrpc_sec_destroy(struct ptlrpc_sec *sec); + +/* + * exported client context api + */ +struct ptlrpc_cli_ctx *sptlrpc_cli_ctx_get(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_put(struct ptlrpc_cli_ctx *ctx, int sync); +void sptlrpc_cli_ctx_expire(struct ptlrpc_cli_ctx *ctx); +void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx); +int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize); + +/* + * exported client context wrap/buffers + */ +int sptlrpc_cli_wrap_request(struct ptlrpc_request *req); +int sptlrpc_cli_unwrap_reply(struct ptlrpc_request *req); +int sptlrpc_cli_alloc_reqbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_reqbuf(struct ptlrpc_request *req); +int sptlrpc_cli_alloc_repbuf(struct ptlrpc_request *req, int msgsize); +void sptlrpc_cli_free_repbuf(struct ptlrpc_request *req); +int
sptlrpc_cli_enlarge_reqbuf(struct ptlrpc_request *req, + int segment, int newsize); +int sptlrpc_cli_unwrap_early_reply(struct ptlrpc_request *req, + struct ptlrpc_request **req_ret); +void sptlrpc_cli_finish_early_reply(struct ptlrpc_request *early_req); + +void sptlrpc_request_out_callback(struct ptlrpc_request *req); + +/* + * exported higher interface of import & request + */ +int sptlrpc_import_sec_adapt(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx, + struct sptlrpc_flavor *flvr); +struct ptlrpc_sec *sptlrpc_import_sec_ref(struct obd_import *imp); +void sptlrpc_import_sec_put(struct obd_import *imp); + +int sptlrpc_import_check_ctx(struct obd_import *imp); +void sptlrpc_import_flush_root_ctx(struct obd_import *imp); +void sptlrpc_import_flush_my_ctx(struct obd_import *imp); +void sptlrpc_import_flush_all_ctx(struct obd_import *imp); +int sptlrpc_req_get_ctx(struct ptlrpc_request *req); +void sptlrpc_req_put_ctx(struct ptlrpc_request *req, int sync); +int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout); +int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req); +void sptlrpc_req_set_flavor(struct ptlrpc_request *req, int opcode); + +int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule); + +/* gc */ +void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec); +void sptlrpc_gc_add_ctx(struct ptlrpc_cli_ctx *ctx); + +/* misc */ +const char * sec2target_str(struct ptlrpc_sec *sec); +int sptlrpc_lprocfs_cliobd_attach(struct obd_device *dev); + +/* + * server side + */ +enum secsvc_accept_res { + SECSVC_OK = 0, + SECSVC_COMPLETE, + SECSVC_DROP, +}; + +int sptlrpc_svc_unwrap_request(struct ptlrpc_request *req); +int sptlrpc_svc_alloc_rs(struct ptlrpc_request *req, int msglen); +int sptlrpc_svc_wrap_reply(struct ptlrpc_request *req); +void sptlrpc_svc_free_rs(struct ptlrpc_reply_state *rs); +void sptlrpc_svc_ctx_addref(struct ptlrpc_request *req); +void sptlrpc_svc_ctx_decref(struct 
ptlrpc_request *req); +void sptlrpc_svc_ctx_invalidate(struct ptlrpc_request *req); + +int sptlrpc_target_export_check(struct obd_export *exp, + struct ptlrpc_request *req); +void sptlrpc_target_update_exp_flavor(struct obd_device *obd, + struct sptlrpc_rule_set *rset); + +/* + * reverse context + */ +int sptlrpc_svc_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_svc_ctx *ctx); +int sptlrpc_cli_install_rvs_ctx(struct obd_import *imp, + struct ptlrpc_cli_ctx *ctx); + +/* bulk security api */ +int sptlrpc_enc_pool_add_user(void); +int sptlrpc_enc_pool_del_user(void); +int sptlrpc_enc_pool_get_pages(struct ptlrpc_bulk_desc *desc); +void sptlrpc_enc_pool_put_pages(struct ptlrpc_bulk_desc *desc); + +int sptlrpc_cli_wrap_bulk(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); +int sptlrpc_cli_unwrap_bulk_read(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc, + int nob); +int sptlrpc_cli_unwrap_bulk_write(struct ptlrpc_request *req, + struct ptlrpc_bulk_desc *desc); + +/* bulk helpers (internal use only by policies) */ +int sptlrpc_get_bulk_checksum(struct ptlrpc_bulk_desc *desc, __u8 alg, + void *buf, int buflen); + +int bulk_sec_desc_unpack(struct lustre_msg *msg, int offset, int swabbed); + +/* user descriptor helpers */ +static inline int sptlrpc_user_desc_size(int ngroups) +{ + return sizeof(struct ptlrpc_user_desc) + ngroups * sizeof(__u32); +} + +int sptlrpc_current_user_desc_size(void); +int sptlrpc_pack_user_desc(struct lustre_msg *msg, int offset); +int sptlrpc_unpack_user_desc(struct lustre_msg *req, int offset, int swabbed); + + +#define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN) +#define CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE) + +enum { + LUSTRE_SEC_NONE = 0, + LUSTRE_SEC_REMOTE = 1, + LUSTRE_SEC_SPECIFY = 2, + LUSTRE_SEC_ALL = 3 +}; + +/** @} sptlrpc */ + +#endif /* _LUSTRE_SEC_H_ */ diff --git a/drivers/staging/lustre/lustre/include/lustre_update.h b/drivers/staging/lustre/lustre/include/lustre_update.h new file 
mode 100644 index 000000000000..84defce0f623 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_update.h @@ -0,0 +1,189 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.gnu.org/licenses/gpl-2.0.htm + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, Intel Corporation. 
+ */ +/* + * lustre/include/lustre_update.h + * + * Author: Di Wang <di.wang@intel.com> + */ + +#ifndef _LUSTRE_UPDATE_H +#define _LUSTRE_UPDATE_H + +#define UPDATE_BUFFER_SIZE 8192 +struct update_request { + struct dt_device *ur_dt; + struct list_head ur_list; /* attached itself to thandle */ + int ur_flags; + int ur_rc; /* request result */ + int ur_batchid; /* Current batch(trans) id */ + struct update_buf *ur_buf; /* Holding the update req */ +}; + +/* Size of one update: the rounded header up to u_bufs[0] plus every rounded u_lens[] entry. */ static inline unsigned long update_size(struct update *update) +{ + unsigned long size; + int i; + + size = cfs_size_round(offsetof(struct update, u_bufs[0])); + for (i = 0; i < UPDATE_BUF_COUNT; i++) + size += cfs_size_round(update->u_lens[i]); + + return size; +} + +/* Return the start of parameter buffer \a index inside \a update, or NULL when \a index is outside [0, UPDATE_BUF_COUNT). The stored length is copied to *size when \a size is not NULL. */ static inline void *update_param_buf(struct update *update, int index, + int *size) +{ + int i; + char *ptr; /* char * keeps the byte-offset arithmetic below standard C */ + + if (index < 0 || index >= UPDATE_BUF_COUNT) /* a negative index would read u_lens[] out of bounds */ + return NULL; + + ptr = (char *)update + cfs_size_round(offsetof(struct update, + u_bufs[0])); + for (i = 0; i < index; i++) { + LASSERT(update->u_lens[i] > 0); + ptr += cfs_size_round(update->u_lens[i]); + } + + if (size != NULL) + *size = update->u_lens[index]; + + return ptr; +} + +/* Bytes used by \a buf: the rounded header up to ub_bufs[0] plus update_size() of each of its ub_count updates. Asserts that the total does not exceed UPDATE_BUFFER_SIZE. */ static inline unsigned long update_buf_size(struct update_buf *buf) +{ + unsigned long size; + int i = 0; + + size = cfs_size_round(offsetof(struct update_buf, ub_bufs[0])); + for (i = 0; i < buf->ub_count; i++) { + struct update *update; + + update = (struct update *)((char *)buf + size); + size += update_size(update); + } + LASSERT(size <= UPDATE_BUFFER_SIZE); + return size; +} + +/* Return update number \a index inside \a buf, or NULL when \a index is outside [0, ub_count). *size receives update_size() of that update when \a size is not NULL. */ static inline void *update_buf_get(struct update_buf *buf, int index, int *size) +{ + int count = buf->ub_count; + char *ptr; /* char * keeps the byte-offset arithmetic below standard C */ + int i = 0; + + if (index < 0 || index >= count) /* reject negative indexes as well as too-large ones */ + return NULL; + + ptr = (char *)buf + cfs_size_round(offsetof(struct update_buf, + ub_bufs[0])); + for (i = 0; i < index; i++) + ptr += update_size((struct update *)ptr); + + if (size != NULL) + *size = update_size((struct update *)ptr); + + return ptr; +} + +static inline void
update_init_reply_buf(struct update_reply *reply, int count) +{ + reply->ur_version = UPDATE_REPLY_V1; + reply->ur_count = count; +} + +/* Locate reply slot \a index. Slots start after the rounded header that ends behind ur_lens[ur_count], and each earlier slot occupies cfs_size_round(ur_lens[i]) bytes. Returns NULL when \a index is outside [0, ur_count); *size receives ur_lens[index] when \a size is not NULL. */ static inline void *update_get_buf_internal(struct update_reply *reply, + int index, int *size) +{ + char *ptr; + int count = reply->ur_count; + int i; + + if (index < 0 || index >= count) /* a negative index would read ur_lens[] out of bounds */ + return NULL; + + ptr = (char *)reply + cfs_size_round(offsetof(struct update_reply, + ur_lens[count])); + for (i = 0; i < index; i++) { + LASSERT(reply->ur_lens[i] > 0); + ptr += cfs_size_round(reply->ur_lens[i]); + } + + if (size != NULL) + *size = reply->ur_lens[index]; + + return ptr; +} + +/* Store \a rc (converted with cpu_to_le32) followed by \a data_len bytes of \a data in slot \a index and record the slot length. Every earlier slot must already have a non-zero length, which update_get_buf_internal() asserts. */ static inline void update_insert_reply(struct update_reply *reply, void *data, + int data_len, int index, int rc) +{ + char *ptr; + + ptr = update_get_buf_internal(reply, index, NULL); + LASSERT(ptr != NULL); + + *(int *)ptr = cpu_to_le32(rc); + ptr += sizeof(int); + if (data_len > 0) { + LASSERT(data != NULL); + memcpy(ptr, data, data_len); + } + reply->ur_lens[index] = data_len + sizeof(int); +} + +/* Fetch the payload of reply slot \a index. A negative stored result is returned unchanged; otherwise *buf is pointed at the bytes following the leading int and the payload length is returned. NOTE(review): the result was stored with cpu_to_le32() but is read back without le32_to_cpu() - confirm whether the conversion happens elsewhere. */ static inline int update_get_reply_buf(struct update_reply *reply, void **buf, + int index) +{ + char *ptr; + int size = 0; + int result; + + ptr = update_get_buf_internal(reply, index, &size); + LASSERT(ptr != NULL); /* must be checked before the slot is dereferenced */ + result = *(int *)ptr; + if (result < 0) + return result; + + LASSERT(size >= (int)sizeof(int)); /* signed compare so a negative length is caught */ + *buf = ptr + sizeof(int); + return size - sizeof(int); +} + +/* Return the int stored at the head of reply slot \a index; \a buf is not used. A slot holding only the result (length == sizeof(int)) is valid because update_insert_reply() creates one when data_len is 0. */ static inline int update_get_reply_result(struct update_reply *reply, + void **buf, int index) +{ + void *ptr; + int size; + + ptr = update_get_buf_internal(reply, index, &size); + LASSERT(ptr != NULL && size >= (int)sizeof(int)); + return *(int *)ptr; +} + +#endif diff --git a/drivers/staging/lustre/lustre/include/lustre_ver.h b/drivers/staging/lustre/lustre/include/lustre_ver.h new file mode 100644 index 000000000000..dc187b8f741f --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lustre_ver.h @@ -0,0 +1,24 @@ +#ifndef _LUSTRE_VER_H_ +#define _LUSTRE_VER_H_ +/* This file automatically generated from
lustre/include/lustre_ver.h.in, + * based on parameters in lustre/autoconf/lustre-version.ac. + * Changes made directly to this file will be lost. */ + +#define LUSTRE_MAJOR 2 +#define LUSTRE_MINOR 3 +#define LUSTRE_PATCH 64 +#define LUSTRE_FIX 0 +#define LUSTRE_VERSION_STRING "2.3.64" + +#define LUSTRE_VERSION_CODE OBD_OCD_VERSION(LUSTRE_MAJOR,LUSTRE_MINOR,LUSTRE_PATCH,LUSTRE_FIX) + +/* liblustre clients are only allowed to connect if their LUSTRE_FIX mismatches + * by this amount (set in lustre/autoconf/lustre-version.ac). */ +#define LUSTRE_VERSION_ALLOWED_OFFSET OBD_OCD_VERSION(0, 0, 1, 32) + +/* If lustre version of client and servers it connects to differs by more + * than this amount, client would issue a warning. + * (set in lustre/autoconf/lustre-version.ac) */ +#define LUSTRE_VERSION_OFFSET_WARN OBD_OCD_VERSION(0, 4, 0, 0) + +#endif diff --git a/drivers/staging/lustre/lustre/include/lvfs.h b/drivers/staging/lustre/lustre/include/lvfs.h new file mode 100644 index 000000000000..28f1a6b76f73 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/lvfs.h @@ -0,0 +1,57 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/lvfs.h + * + * lustre VFS/process permission interface + */ + +#ifndef __LVFS_H__ +#define __LVFS_H__ + +#define LL_FID_NAMELEN (16 + 1 + 8 + 1) + +#include <linux/libcfs/libcfs.h> +#include <linux/lvfs.h> + +#include <linux/libcfs/lucache.h> + + +/* lvfs_common.c */ +struct dentry *lvfs_fid2dentry(struct lvfs_run_ctxt *, __u64, __u32, __u64 ,void *data); + +void push_ctxt(struct lvfs_run_ctxt *save, struct lvfs_run_ctxt *new_ctx, + struct lvfs_ucred *cred); +void pop_ctxt(struct lvfs_run_ctxt *saved, struct lvfs_run_ctxt *new_ctx, + struct lvfs_ucred *cred); +#endif diff --git a/drivers/staging/lustre/lustre/include/md_object.h b/drivers/staging/lustre/lustre/include/md_object.h new file mode 100644 index 000000000000..92d6420b21da --- /dev/null +++ b/drivers/staging/lustre/lustre/include/md_object.h @@ -0,0 +1,908 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/md_object.h + * + * Extension of lu_object.h for metadata objects + */ + +#ifndef _LUSTRE_MD_OBJECT_H +#define _LUSTRE_MD_OBJECT_H + +/** \defgroup md md + * Sub-class of lu_object with methods common for "meta-data" objects in MDT + * stack. + * + * Meta-data objects implement namespace operations: you can link, unlink + * them, and treat them as directories. + * + * Examples: mdt and cmm are implementations of md interface. + * @{ + */ + + +/* + * super-class definitions. + */ +#include <dt_object.h> + +struct md_device; +struct md_device_operations; +struct md_object; +struct obd_export; + +enum { + UCRED_INVALID = -1, + UCRED_INIT = 0, + UCRED_OLD = 1, + UCRED_NEW = 2 +}; + +enum { + MD_CAPAINFO_MAX = 5 +}; + +/** there are at most 5 fids in one operation, see rename, NOTE the last one + * is a temporary one used for is_subdir() */ +struct md_capainfo { + __u32 mc_auth; + __u32 mc_padding; + struct lu_fid mc_fid[MD_CAPAINFO_MAX]; + struct lustre_capa *mc_capa[MD_CAPAINFO_MAX]; +}; + +struct md_quota { + struct obd_export *mq_exp; +}; + +/** + * Implemented in mdd/mdd_handler.c.
+ * + * XXX should be moved into separate .h/.c together with all md security + * related definitions. + */ +struct md_capainfo *md_capainfo(const struct lu_env *env); +struct md_quota *md_quota(const struct lu_env *env); + +/** metadata attributes */ +enum ma_valid { + MA_INODE = (1 << 0), + MA_LOV = (1 << 1), + MA_COOKIE = (1 << 2), + MA_FLAGS = (1 << 3), + MA_LMV = (1 << 4), + MA_ACL_DEF = (1 << 5), + MA_LOV_DEF = (1 << 6), + MA_LAY_GEN = (1 << 7), + MA_HSM = (1 << 8), + MA_SOM = (1 << 9), + MA_PFID = (1 << 10) +}; + +typedef enum { + MDL_MINMODE = 0, + MDL_EX = 1, + MDL_PW = 2, + MDL_PR = 4, + MDL_CW = 8, + MDL_CR = 16, + MDL_NL = 32, + MDL_GROUP = 64, + MDL_MAXMODE +} mdl_mode_t; + +typedef enum { + MDT_NUL_LOCK = 0, + MDT_REG_LOCK = (1 << 0), + MDT_PDO_LOCK = (1 << 1) +} mdl_type_t; + +/* memory structure for hsm attributes + * for fields description see the on disk structure hsm_attrs + * which is defined in lustre_idl.h + */ +struct md_hsm { + __u32 mh_compat; + __u32 mh_flags; + __u64 mh_arch_id; + __u64 mh_arch_ver; +}; + +#define IOEPOCH_INVAL 0 + +/* memory structure for som attributes + * for fields description see the on disk structure som_attrs + * which is defined in lustre_idl.h + */ +struct md_som_data { + __u32 msd_compat; + __u32 msd_incompat; + __u64 msd_ioepoch; + __u64 msd_size; + __u64 msd_blocks; + __u64 msd_mountid; +}; + +struct md_attr { + __u64 ma_valid; + __u64 ma_need; + __u64 ma_attr_flags; + struct lu_attr ma_attr; + struct lu_fid ma_pfid; + struct md_hsm ma_hsm; + struct lov_mds_md *ma_lmm; + struct lmv_stripe_md *ma_lmv; + void *ma_acl; + struct llog_cookie *ma_cookie; + struct lustre_capa *ma_capa; + struct md_som_data *ma_som; + int ma_lmm_size; + int ma_lmv_size; + int ma_acl_size; + int ma_cookie_size; + __u16 ma_layout_gen; +}; + +/** Additional parameters for create */ +struct md_op_spec { + union { + /** symlink target */ + const char *sp_symname; + /** parent FID for cross-ref mkdir */ + const struct lu_fid *sp_pfid; + /** 
eadata for regular files */ + struct md_spec_reg { + /** lov objs exist already */ + const struct lu_fid *fid; + const void *eadata; + int eadatalen; + } sp_ea; + } u; + + /** Create flag from client: such as MDS_OPEN_CREAT, and others. */ + __u64 sp_cr_flags; + + /** don't create lov objects or llog cookie - this replay */ + unsigned int no_create:1, + sp_cr_lookup:1, /* do lookup sanity check or not. */ + sp_rm_entry:1; /* only remove name entry */ + + /** Current lock mode for parent dir where create is performing. */ + mdl_mode_t sp_cr_mode; + + /** to create directory */ + const struct dt_index_features *sp_feat; +}; + +/** + * Operations implemented for each md object (both directory and leaf). + */ +struct md_object_operations { + int (*moo_permission)(const struct lu_env *env, + struct md_object *pobj, struct md_object *cobj, + struct md_attr *attr, int mask); + + int (*moo_attr_get)(const struct lu_env *env, struct md_object *obj, + struct md_attr *attr); + + int (*moo_attr_set)(const struct lu_env *env, struct md_object *obj, + const struct md_attr *attr); + + int (*moo_xattr_get)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf, const char *name); + + int (*moo_xattr_list)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); + + int (*moo_xattr_set)(const struct lu_env *env, struct md_object *obj, + const struct lu_buf *buf, const char *name, + int fl); + + int (*moo_xattr_del)(const struct lu_env *env, struct md_object *obj, + const char *name); + + /** This method is used to swap the layouts between 2 objects */ + int (*moo_swap_layouts)(const struct lu_env *env, + struct md_object *obj1, struct md_object *obj2, + __u64 flags); + + /** \retval number of bytes actually read upon success */ + int (*moo_readpage)(const struct lu_env *env, struct md_object *obj, + const struct lu_rdpg *rdpg); + + int (*moo_readlink)(const struct lu_env *env, struct md_object *obj, + struct lu_buf *buf); + int (*moo_changelog)(const 
struct lu_env *env, + enum changelog_rec_type type, int flags, + struct md_object *obj); + /** part of cross-ref operation */ + int (*moo_object_create)(const struct lu_env *env, + struct md_object *obj, + const struct md_op_spec *spec, + struct md_attr *ma); + + int (*moo_ref_add)(const struct lu_env *env, + struct md_object *obj, + const struct md_attr *ma); + + int (*moo_ref_del)(const struct lu_env *env, + struct md_object *obj, + struct md_attr *ma); + + int (*moo_open)(const struct lu_env *env, + struct md_object *obj, int flag); + + int (*moo_close)(const struct lu_env *env, struct md_object *obj, + struct md_attr *ma, int mode); + + int (*moo_capa_get)(const struct lu_env *, struct md_object *, + struct lustre_capa *, int renewal); + + int (*moo_object_sync)(const struct lu_env *, struct md_object *); + + int (*moo_file_lock)(const struct lu_env *env, struct md_object *obj, + struct lov_mds_md *lmm, struct ldlm_extent *extent, + struct lustre_handle *lockh); + int (*moo_file_unlock)(const struct lu_env *env, struct md_object *obj, + struct lov_mds_md *lmm, + struct lustre_handle *lockh); + int (*moo_object_lock)(const struct lu_env *env, struct md_object *obj, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + void *policy); +}; + +/** + * Operations implemented for each directory object. 
+ */ +struct md_dir_operations { + int (*mdo_is_subdir) (const struct lu_env *env, struct md_object *obj, + const struct lu_fid *fid, struct lu_fid *sfid); + + int (*mdo_lookup)(const struct lu_env *env, struct md_object *obj, + const struct lu_name *lname, struct lu_fid *fid, + struct md_op_spec *spec); + + mdl_mode_t (*mdo_lock_mode)(const struct lu_env *env, + struct md_object *obj, + mdl_mode_t mode); + + int (*mdo_create)(const struct lu_env *env, struct md_object *pobj, + const struct lu_name *lname, struct md_object *child, + struct md_op_spec *spec, + struct md_attr *ma); + + /** This method is used for creating data object for this meta object*/ + int (*mdo_create_data)(const struct lu_env *env, struct md_object *p, + struct md_object *o, + const struct md_op_spec *spec, + struct md_attr *ma); + + int (*mdo_rename)(const struct lu_env *env, struct md_object *spobj, + struct md_object *tpobj, const struct lu_fid *lf, + const struct lu_name *lsname, struct md_object *tobj, + const struct lu_name *ltname, struct md_attr *ma); + + int (*mdo_link)(const struct lu_env *env, struct md_object *tgt_obj, + struct md_object *src_obj, const struct lu_name *lname, + struct md_attr *ma); + + int (*mdo_unlink)(const struct lu_env *env, struct md_object *pobj, + struct md_object *cobj, const struct lu_name *lname, + struct md_attr *ma, int no_name); + + /** This method is used to compare a requested layout to an existing + * layout (struct lov_mds_md_v1/3 vs struct lov_mds_md_v1/3) */ + int (*mdo_lum_lmm_cmp)(const struct lu_env *env, + struct md_object *cobj, + const struct md_op_spec *spec, + struct md_attr *ma); + + /** partial ops for cross-ref case */ + int (*mdo_name_insert)(const struct lu_env *env, + struct md_object *obj, + const struct lu_name *lname, + const struct lu_fid *fid, + const struct md_attr *ma); + + int (*mdo_name_remove)(const struct lu_env *env, + struct md_object *obj, + const struct lu_name *lname, + const struct md_attr *ma); + + int 
(*mdo_rename_tgt)(const struct lu_env *env, struct md_object *pobj, + struct md_object *tobj, const struct lu_fid *fid, + const struct lu_name *lname, struct md_attr *ma); +}; + +struct md_device_operations { + /** meta-data device related handlers. */ + int (*mdo_root_get)(const struct lu_env *env, struct md_device *m, + struct lu_fid *f); + + int (*mdo_maxsize_get)(const struct lu_env *env, struct md_device *m, + int *md_size, int *cookie_size); + + int (*mdo_statfs)(const struct lu_env *env, struct md_device *m, + struct obd_statfs *sfs); + + int (*mdo_init_capa_ctxt)(const struct lu_env *env, struct md_device *m, + int mode, unsigned long timeout, __u32 alg, + struct lustre_capa_key *keys); + + int (*mdo_update_capa_key)(const struct lu_env *env, + struct md_device *m, + struct lustre_capa_key *key); + + int (*mdo_llog_ctxt_get)(const struct lu_env *env, + struct md_device *m, int idx, void **h); + + int (*mdo_iocontrol)(const struct lu_env *env, struct md_device *m, + unsigned int cmd, int len, void *data); +}; + +enum md_upcall_event { + /** Sync the md layer*/ + MD_LOV_SYNC = (1 << 0), + /** Just for split, no need trans, for replay */ + MD_NO_TRANS = (1 << 1), + MD_LOV_CONFIG = (1 << 2), + /** Trigger quota recovery */ + MD_LOV_QUOTA = (1 << 3) +}; + +struct md_upcall { + /** this lock protects upcall using against its removal + * read lock is for usage the upcall, write - for init/fini */ + struct rw_semaphore mu_upcall_sem; + /** device to call, upper layer normally */ + struct md_device *mu_upcall_dev; + /** upcall function */ + int (*mu_upcall)(const struct lu_env *env, struct md_device *md, + enum md_upcall_event ev, void *data); +}; + +struct md_device { + struct lu_device md_lu_dev; + const struct md_device_operations *md_ops; + struct md_upcall md_upcall; +}; + +static inline void md_upcall_init(struct md_device *m, void *upcl) +{ + init_rwsem(&m->md_upcall.mu_upcall_sem); + m->md_upcall.mu_upcall_dev = NULL; + m->md_upcall.mu_upcall = upcl; +} + 
+/* Register \a up as the upcall device of \a m; the pointer is replaced while holding mu_upcall_sem for write. */ static inline void md_upcall_dev_set(struct md_device *m, struct md_device *up) +{ + down_write(&m->md_upcall.mu_upcall_sem); + m->md_upcall.mu_upcall_dev = up; + up_write(&m->md_upcall.mu_upcall_sem); +} + +/* Clear both the upcall device and the upcall function of \a m while holding mu_upcall_sem for write. */ static inline void md_upcall_fini(struct md_device *m) +{ + down_write(&m->md_upcall.mu_upcall_sem); + m->md_upcall.mu_upcall_dev = NULL; + m->md_upcall.mu_upcall = NULL; + up_write(&m->md_upcall.mu_upcall_sem); +} + +/* Deliver event \a ev with \a data to the upcall device registered on \a m. mu_upcall_sem is held for read across the call. Returns the callback's result, or 0 when no upcall device or no callback is set. */ static inline int md_do_upcall(const struct lu_env *env, struct md_device *m, + enum md_upcall_event ev, void *data) +{ + int rc = 0; + down_read(&m->md_upcall.mu_upcall_sem); + if (m->md_upcall.mu_upcall_dev != NULL && + m->md_upcall.mu_upcall_dev->md_upcall.mu_upcall != NULL) { /* the callback stored on the upcall device is used, not the one on m */ + rc = m->md_upcall.mu_upcall_dev->md_upcall.mu_upcall(env, + m->md_upcall.mu_upcall_dev, + ev, data); + } + up_read(&m->md_upcall.mu_upcall_sem); + return rc; +} + +struct md_object { + struct lu_object mo_lu; + const struct md_object_operations *mo_ops; + const struct md_dir_operations *mo_dir_ops; +}; + +/** + * seq-server site. + */ +struct seq_server_site { + struct lu_site *ss_lu; + /** + * mds number of this site.
+ */ + mdsno_t ss_node_id; + /** + * Fid location database + */ + struct lu_server_fld *ss_server_fld; + struct lu_client_fld *ss_client_fld; + + /** + * Server Seq Manager + */ + struct lu_server_seq *ss_server_seq; + + /** + * Controller Seq Manager + */ + struct lu_server_seq *ss_control_seq; + struct obd_export *ss_control_exp; + + /** + * Client Seq Manager + */ + struct lu_client_seq *ss_client_seq; +}; + +static inline struct md_device *lu2md_dev(const struct lu_device *d) +{ + LASSERT(IS_ERR(d) || lu_device_is_md(d)); + return container_of0(d, struct md_device, md_lu_dev); +} + +static inline struct lu_device *md2lu_dev(struct md_device *d) +{ + return &d->md_lu_dev; +} + +static inline struct md_object *lu2md(const struct lu_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->lo_dev)); + return container_of0(o, struct md_object, mo_lu); +} + +static inline struct md_object *md_object_next(const struct md_object *obj) +{ + return (obj ? lu2md(lu_object_next(&obj->mo_lu)) : NULL); +} + +static inline struct md_device *md_obj2dev(const struct md_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_md(o->mo_lu.lo_dev)); + return container_of0(o->mo_lu.lo_dev, struct md_device, md_lu_dev); +} + +static inline struct seq_server_site *lu_site2seq(const struct lu_site *s) +{ + return s->ld_seq_site; +} + +static inline int md_device_init(struct md_device *md, struct lu_device_type *t) +{ + return lu_device_init(&md->md_lu_dev, t); +} + +static inline void md_device_fini(struct md_device *md) +{ + lu_device_fini(&md->md_lu_dev); +} + +static inline struct md_object *md_object_find_slice(const struct lu_env *env, + struct md_device *md, + const struct lu_fid *f) +{ + return lu2md(lu_object_find_slice(env, md2lu_dev(md), f, NULL)); +} + + +/** md operations */ +static inline int mo_permission(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + struct md_attr *at, + int mask) +{ + LASSERT(c->mo_ops->moo_permission); + 
return c->mo_ops->moo_permission(env, p, c, at, mask); +} + +static inline int mo_attr_get(const struct lu_env *env, + struct md_object *m, + struct md_attr *at) +{ + LASSERT(m->mo_ops->moo_attr_get); + return m->mo_ops->moo_attr_get(env, m, at); +} + +static inline int mo_readlink(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf) +{ + LASSERT(m->mo_ops->moo_readlink); + return m->mo_ops->moo_readlink(env, m, buf); +} + +static inline int mo_changelog(const struct lu_env *env, + enum changelog_rec_type type, + int flags, struct md_object *m) +{ + LASSERT(m->mo_ops->moo_changelog); + return m->mo_ops->moo_changelog(env, type, flags, m); +} + +static inline int mo_attr_set(const struct lu_env *env, + struct md_object *m, + const struct md_attr *at) +{ + LASSERT(m->mo_ops->moo_attr_set); + return m->mo_ops->moo_attr_set(env, m, at); +} + +static inline int mo_xattr_get(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf, + const char *name) +{ + LASSERT(m->mo_ops->moo_xattr_get); + return m->mo_ops->moo_xattr_get(env, m, buf, name); +} + +static inline int mo_xattr_del(const struct lu_env *env, + struct md_object *m, + const char *name) +{ + LASSERT(m->mo_ops->moo_xattr_del); + return m->mo_ops->moo_xattr_del(env, m, name); +} + +static inline int mo_xattr_set(const struct lu_env *env, + struct md_object *m, + const struct lu_buf *buf, + const char *name, + int flags) +{ + LASSERT(m->mo_ops->moo_xattr_set); + return m->mo_ops->moo_xattr_set(env, m, buf, name, flags); +} + +static inline int mo_xattr_list(const struct lu_env *env, + struct md_object *m, + struct lu_buf *buf) +{ + LASSERT(m->mo_ops->moo_xattr_list); + return m->mo_ops->moo_xattr_list(env, m, buf); +} + +static inline int mo_swap_layouts(const struct lu_env *env, + struct md_object *o1, + struct md_object *o2, __u64 flags) +{ + LASSERT(o1->mo_ops->moo_swap_layouts); + LASSERT(o2->mo_ops->moo_swap_layouts); + if (o1->mo_ops->moo_swap_layouts != 
o2->mo_ops->moo_swap_layouts) + return -EPERM; + return o1->mo_ops->moo_swap_layouts(env, o1, o2, flags); +} + +static inline int mo_open(const struct lu_env *env, + struct md_object *m, + int flags) +{ + LASSERT(m->mo_ops->moo_open); + return m->mo_ops->moo_open(env, m, flags); +} + +static inline int mo_close(const struct lu_env *env, + struct md_object *m, + struct md_attr *ma, + int mode) +{ + LASSERT(m->mo_ops->moo_close); + return m->mo_ops->moo_close(env, m, ma, mode); +} + +static inline int mo_readpage(const struct lu_env *env, + struct md_object *m, + const struct lu_rdpg *rdpg) +{ + LASSERT(m->mo_ops->moo_readpage); + return m->mo_ops->moo_readpage(env, m, rdpg); +} + +static inline int mo_object_create(const struct lu_env *env, + struct md_object *m, + const struct md_op_spec *spc, + struct md_attr *at) +{ + LASSERT(m->mo_ops->moo_object_create); + return m->mo_ops->moo_object_create(env, m, spc, at); +} + +static inline int mo_ref_add(const struct lu_env *env, + struct md_object *m, + const struct md_attr *ma) +{ + LASSERT(m->mo_ops->moo_ref_add); + return m->mo_ops->moo_ref_add(env, m, ma); +} + +static inline int mo_ref_del(const struct lu_env *env, + struct md_object *m, + struct md_attr *ma) +{ + LASSERT(m->mo_ops->moo_ref_del); + return m->mo_ops->moo_ref_del(env, m, ma); +} + +static inline int mo_capa_get(const struct lu_env *env, + struct md_object *m, + struct lustre_capa *c, + int renewal) +{ + LASSERT(m->mo_ops->moo_capa_get); + return m->mo_ops->moo_capa_get(env, m, c, renewal); +} + +static inline int mo_object_sync(const struct lu_env *env, struct md_object *m) +{ + LASSERT(m->mo_ops->moo_object_sync); + return m->mo_ops->moo_object_sync(env, m); +} + +static inline int mo_file_lock(const struct lu_env *env, struct md_object *m, + struct lov_mds_md *lmm, + struct ldlm_extent *extent, + struct lustre_handle *lockh) +{ + LASSERT(m->mo_ops->moo_file_lock); + return m->mo_ops->moo_file_lock(env, m, lmm, extent, lockh); +} + +static inline int 
mo_file_unlock(const struct lu_env *env, struct md_object *m, + struct lov_mds_md *lmm, + struct lustre_handle *lockh) +{ + LASSERT(m->mo_ops->moo_file_unlock); + return m->mo_ops->moo_file_unlock(env, m, lmm, lockh); +} + +static inline int mo_object_lock(const struct lu_env *env, + struct md_object *m, + struct lustre_handle *lh, + struct ldlm_enqueue_info *einfo, + void *policy) +{ + LASSERT(m->mo_ops->moo_object_lock); + return m->mo_ops->moo_object_lock(env, m, lh, einfo, policy); +} + +static inline int mdo_lookup(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lname, + struct lu_fid *f, + struct md_op_spec *spec) +{ + LASSERT(p->mo_dir_ops->mdo_lookup); + return p->mo_dir_ops->mdo_lookup(env, p, lname, f, spec); +} + +static inline mdl_mode_t mdo_lock_mode(const struct lu_env *env, + struct md_object *mo, + mdl_mode_t lm) +{ + if (mo->mo_dir_ops->mdo_lock_mode == NULL) + return MDL_MINMODE; + return mo->mo_dir_ops->mdo_lock_mode(env, mo, lm); +} + +static inline int mdo_create(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lchild_name, + struct md_object *c, + struct md_op_spec *spc, + struct md_attr *at) +{ + LASSERT(p->mo_dir_ops->mdo_create); + return p->mo_dir_ops->mdo_create(env, p, lchild_name, c, spc, at); +} + +static inline int mdo_create_data(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + const struct md_op_spec *spec, + struct md_attr *ma) +{ + LASSERT(c->mo_dir_ops->mdo_create_data); + return c->mo_dir_ops->mdo_create_data(env, p, c, spec, ma); +} + +static inline int mdo_rename(const struct lu_env *env, + struct md_object *sp, + struct md_object *tp, + const struct lu_fid *lf, + const struct lu_name *lsname, + struct md_object *t, + const struct lu_name *ltname, + struct md_attr *ma) +{ + LASSERT(tp->mo_dir_ops->mdo_rename); + return tp->mo_dir_ops->mdo_rename(env, sp, tp, lf, lsname, t, ltname, + ma); +} + +static inline int mdo_is_subdir(const struct lu_env *env, + 
struct md_object *mo, + const struct lu_fid *fid, + struct lu_fid *sfid) +{ + LASSERT(mo->mo_dir_ops->mdo_is_subdir); + return mo->mo_dir_ops->mdo_is_subdir(env, mo, fid, sfid); +} + +static inline int mdo_link(const struct lu_env *env, + struct md_object *p, + struct md_object *s, + const struct lu_name *lname, + struct md_attr *ma) +{ + LASSERT(s->mo_dir_ops->mdo_link); + return s->mo_dir_ops->mdo_link(env, p, s, lname, ma); +} + +static inline int mdo_unlink(const struct lu_env *env, + struct md_object *p, + struct md_object *c, + const struct lu_name *lname, + struct md_attr *ma, int no_name) +{ + LASSERT(p->mo_dir_ops->mdo_unlink); + return p->mo_dir_ops->mdo_unlink(env, p, c, lname, ma, no_name); +} + +static inline int mdo_lum_lmm_cmp(const struct lu_env *env, + struct md_object *c, + const struct md_op_spec *spec, + struct md_attr *ma) +{ + LASSERT(c->mo_dir_ops->mdo_lum_lmm_cmp); + return c->mo_dir_ops->mdo_lum_lmm_cmp(env, c, spec, ma); +} + +static inline int mdo_name_insert(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lname, + const struct lu_fid *f, + const struct md_attr *ma) +{ + LASSERT(p->mo_dir_ops->mdo_name_insert); + return p->mo_dir_ops->mdo_name_insert(env, p, lname, f, ma); +} + +static inline int mdo_name_remove(const struct lu_env *env, + struct md_object *p, + const struct lu_name *lname, + const struct md_attr *ma) +{ + LASSERT(p->mo_dir_ops->mdo_name_remove); + return p->mo_dir_ops->mdo_name_remove(env, p, lname, ma); +} + +static inline int mdo_rename_tgt(const struct lu_env *env, + struct md_object *p, + struct md_object *t, + const struct lu_fid *lf, + const struct lu_name *lname, + struct md_attr *ma) +{ + if (t) { + LASSERT(t->mo_dir_ops->mdo_rename_tgt); + return t->mo_dir_ops->mdo_rename_tgt(env, p, t, lf, lname, ma); + } else { + LASSERT(p->mo_dir_ops->mdo_rename_tgt); + return p->mo_dir_ops->mdo_rename_tgt(env, p, t, lf, lname, ma); + } +} + +/** + * Used in MDD/OUT layer for object lock rule + **/ 
+enum mdd_object_role { + MOR_SRC_PARENT, + MOR_SRC_CHILD, + MOR_TGT_PARENT, + MOR_TGT_CHILD, + MOR_TGT_ORPHAN +}; + +struct dt_device; +/** + * Structure to hold object information. This is used to create object + * \pre llod_dir exist + */ +struct lu_local_obj_desc { + const char *llod_dir; + const char *llod_name; + __u32 llod_oid; + int llod_is_index; + const struct dt_index_features *llod_feat; + struct list_head llod_linkage; +}; + +int lustre_buf2som(void *buf, int rc, struct md_som_data *msd); +int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh); +void lustre_hsm2buf(void *buf, struct md_hsm *mh); + +struct lu_ucred { + __u32 uc_valid; + __u32 uc_o_uid; + __u32 uc_o_gid; + __u32 uc_o_fsuid; + __u32 uc_o_fsgid; + __u32 uc_uid; + __u32 uc_gid; + __u32 uc_fsuid; + __u32 uc_fsgid; + __u32 uc_suppgids[2]; + cfs_cap_t uc_cap; + __u32 uc_umask; + group_info_t *uc_ginfo; + struct md_identity *uc_identity; +}; + +struct lu_ucred *lu_ucred(const struct lu_env *env); + +struct lu_ucred *lu_ucred_check(const struct lu_env *env); + +struct lu_ucred *lu_ucred_assert(const struct lu_env *env); + +int lu_ucred_global_init(void); + +void lu_ucred_global_fini(void); + +#define md_cap_t(x) (x) + +#define MD_CAP_TO_MASK(x) (1 << (x)) + +#define md_cap_raised(c, flag) (md_cap_t(c) & MD_CAP_TO_MASK(flag)) + +/* capable() is copied from linux kernel! */ +static inline int md_capable(struct lu_ucred *uc, cfs_cap_t cap) +{ + if (md_cap_raised(uc->uc_cap, cap)) + return 1; + return 0; +} + +/** @} md */ +#endif /* _LINUX_MD_OBJECT_H */ diff --git a/drivers/staging/lustre/lustre/include/obd.h b/drivers/staging/lustre/lustre/include/obd.h new file mode 100644 index 000000000000..0a251fdfe167 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/obd.h @@ -0,0 +1,1677 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef __OBD_H +#define __OBD_H + +#include <linux/obd.h> + +#define IOC_OSC_TYPE 'h' +#define IOC_OSC_MIN_NR 20 +#define IOC_OSC_SET_ACTIVE _IOWR(IOC_OSC_TYPE, 21, struct obd_device *) +#define IOC_OSC_MAX_NR 50 + +#define IOC_MDC_TYPE 'i' +#define IOC_MDC_MIN_NR 20 +#define IOC_MDC_MAX_NR 50 + +#include <lustre/lustre_idl.h> +#include <lu_ref.h> +#include <lustre_lib.h> +#include <lustre_export.h> +#include <lustre_fld.h> +#include <lustre_capa.h> + +#include <linux/libcfs/bitmap.h> + + +#define MAX_OBD_DEVICES 8192 + +struct osc_async_rc { + int ar_rc; + int ar_force_sync; + __u64 ar_min_xid; +}; + +struct lov_oinfo { /* per-stripe data structure */ + struct ost_id loi_oi; /* object ID/Sequence on the target OST */ + int loi_ost_idx; /* OST stripe index in lov_tgt_desc->tgts */ + int loi_ost_gen; /* generation of this loi_ost_idx */ + + unsigned long loi_kms_valid:1; + __u64 loi_kms; /* known minimum size */ + struct ost_lvb loi_lvb; + struct osc_async_rc loi_ar; +}; + +static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms) +{ + oinfo->loi_kms = kms; + oinfo->loi_kms_valid = 1; +} + +static inline void loi_init(struct lov_oinfo *loi) +{ +} + +struct lov_stripe_md { + atomic_t lsm_refc; + spinlock_t lsm_lock; + pid_t lsm_lock_owner; /* debugging */ + + /* maximum possible file size, might change as OSTs status changes, + * e.g. disconnected, deactivated */ + __u64 lsm_maxbytes; + struct { + /* Public members. */ + struct ost_id lw_object_oi; /* lov object id/seq */ + + /* LOV-private members start here -- only for use in lov/. 
*/ + __u32 lw_magic; + __u32 lw_stripe_size; /* size of the stripe */ + __u32 lw_pattern; /* striping pattern (RAID0, RAID1) */ + __u16 lw_stripe_count; /* number of objects being striped over */ + __u16 lw_layout_gen; /* generation of the layout */ + char lw_pool_name[LOV_MAXPOOLNAME]; /* pool name */ + } lsm_wire; + + struct lov_oinfo *lsm_oinfo[0]; +}; + +#define lsm_oi lsm_wire.lw_object_oi +#define lsm_magic lsm_wire.lw_magic +#define lsm_layout_gen lsm_wire.lw_layout_gen +#define lsm_stripe_size lsm_wire.lw_stripe_size +#define lsm_pattern lsm_wire.lw_pattern +#define lsm_stripe_count lsm_wire.lw_stripe_count +#define lsm_pool_name lsm_wire.lw_pool_name + +struct obd_info; + +typedef int (*obd_enqueue_update_f)(void *cookie, int rc); + +/* obd info for a particular level (lov, osc). */ +struct obd_info { + /* Lock policy. It keeps an extent which is specific for a particular + * OSC. (e.g. lov_prep_enqueue_set initialises the extent of the policy, + * and osc_enqueue passes it into ldlm_lock_match & ldlm_cli_enqueue.) */ + ldlm_policy_data_t oi_policy; + /* Flags used to set request-specific flags: + - while lock handling, the flags obtained on the enqueue + request are set here. + - while stats, the flags are used to control delay/resend. + - while setattr, the flags are used to distinguish a punch operation + */ + __u64 oi_flags; + /* Lock handle specific for every OSC lock. */ + struct lustre_handle *oi_lockh; + /* lsm data specific for every OSC. */ + struct lov_stripe_md *oi_md; + /* obdo data specific for every OSC, if needed at all. */ + struct obdo *oi_oa; + /* statfs data specific for every OSC, if needed at all. */ + struct obd_statfs *oi_osfs; + /* An update callback which is called to update some data on the upper + * level. E.g. it is used to update lsm->lsm_oinfo on every received + * request at the osc level for enqueue requests. It is also possible to + * update some caller data from LOV layer if needed. 
*/ + obd_enqueue_update_f oi_cb_up; + /* oss capability, its type is obd_capa in client to avoid copy. + * in contrary its type is lustre_capa in OSS. */ + void *oi_capa; + /* transfer jobid from ost_sync() to filter_sync()... */ + char *oi_jobid; +}; + +/* compare all relevant fields. */ +static inline int lov_stripe_md_cmp(struct lov_stripe_md *m1, + struct lov_stripe_md *m2) +{ + /* + * ->lsm_wire contains padding, but it should be zeroed out during + * allocation. + */ + return memcmp(&m1->lsm_wire, &m2->lsm_wire, sizeof m1->lsm_wire); +} + +static inline int lov_lum_lsm_cmp(struct lov_user_md *lum, + struct lov_stripe_md *lsm) +{ + if (lsm->lsm_magic != lum->lmm_magic) + return 1; + if ((lsm->lsm_stripe_count != 0) && (lum->lmm_stripe_count != 0) && + (lsm->lsm_stripe_count != lum->lmm_stripe_count)) + return 2; + if ((lsm->lsm_stripe_size != 0) && (lum->lmm_stripe_size != 0) && + (lsm->lsm_stripe_size != lum->lmm_stripe_size)) + return 3; + if ((lsm->lsm_pattern != 0) && (lum->lmm_pattern != 0) && + (lsm->lsm_pattern != lum->lmm_pattern)) + return 4; + if ((lsm->lsm_magic == LOV_MAGIC_V3) && + (strncmp(lsm->lsm_pool_name, + ((struct lov_user_md_v3 *)lum)->lmm_pool_name, + LOV_MAXPOOLNAME) != 0)) + return 5; + return 0; +} + +static inline int lov_lum_swab_if_needed(struct lov_user_md_v3 *lumv3, + int *lmm_magic, + struct lov_user_md *lum) +{ + if (lum && copy_from_user(lumv3, lum,sizeof(struct lov_user_md_v1))) + return -EFAULT; + + *lmm_magic = lumv3->lmm_magic; + + if (*lmm_magic == __swab32(LOV_USER_MAGIC_V1)) { + lustre_swab_lov_user_md_v1((struct lov_user_md_v1 *)lumv3); + *lmm_magic = LOV_USER_MAGIC_V1; + } else if (*lmm_magic == LOV_USER_MAGIC_V3) { + if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3))) + return -EFAULT; + } else if (*lmm_magic == __swab32(LOV_USER_MAGIC_V3)) { + if (lum && copy_from_user(lumv3, lum, sizeof(*lumv3))) + return -EFAULT; + lustre_swab_lov_user_md_v3(lumv3); + *lmm_magic = LOV_USER_MAGIC_V3; + } else if (*lmm_magic != 
LOV_USER_MAGIC_V1) { + CDEBUG(D_IOCTL, + "bad userland LOV MAGIC: %#08x != %#08x nor %#08x\n", + *lmm_magic, LOV_USER_MAGIC_V1, LOV_USER_MAGIC_V3); + return -EINVAL; + } + return 0; +} + +void lov_stripe_lock(struct lov_stripe_md *md); +void lov_stripe_unlock(struct lov_stripe_md *md); + +struct obd_type { + struct list_head typ_chain; + struct obd_ops *typ_dt_ops; + struct md_ops *typ_md_ops; + proc_dir_entry_t *typ_procroot; + char *typ_name; + int typ_refcnt; + struct lu_device_type *typ_lu; + spinlock_t obd_type_lock; +}; + +struct brw_page { + obd_off off; + struct page *pg; + int count; + obd_flag flag; +}; + +/* Individual type definitions */ + +struct ost_server_data; + +struct osd_properties { + size_t osd_max_ea_size; +}; + +#define OBT_MAGIC 0xBDDECEAE +/* hold common fields for "target" device */ +struct obd_device_target { + __u32 obt_magic; + __u32 obt_instance; + struct super_block *obt_sb; + /** last_rcvd file */ + struct file *obt_rcvd_filp; + __u64 obt_mount_count; + struct rw_semaphore obt_rwsem; + struct vfsmount *obt_vfsmnt; + struct file *obt_health_check_filp; + struct osd_properties obt_osd_properties; + struct obd_job_stats obt_jobstats; +}; + +/* llog contexts */ +enum llog_ctxt_id { + LLOG_CONFIG_ORIG_CTXT = 0, + LLOG_CONFIG_REPL_CTXT, + LLOG_MDS_OST_ORIG_CTXT, + LLOG_MDS_OST_REPL_CTXT, + LLOG_SIZE_ORIG_CTXT, + LLOG_SIZE_REPL_CTXT, + LLOG_RD1_ORIG_CTXT, + LLOG_RD1_REPL_CTXT, + LLOG_TEST_ORIG_CTXT, + LLOG_TEST_REPL_CTXT, + LLOG_LOVEA_ORIG_CTXT, + LLOG_LOVEA_REPL_CTXT, + LLOG_CHANGELOG_ORIG_CTXT, /**< changelog generation on mdd */ + LLOG_CHANGELOG_REPL_CTXT, /**< changelog access on clients */ + LLOG_CHANGELOG_USER_ORIG_CTXT, /**< for multiple changelog consumers */ + LLOG_MAX_CTXTS +}; + +#define FILTER_SUBDIR_COUNT 32 /* set to zero for no subdirs */ + +struct filter_subdirs { + struct dentry *dentry[FILTER_SUBDIR_COUNT]; +}; + + +struct filter_ext { + __u64 fe_start; + __u64 fe_end; +}; + +struct filter_obd { + /* NB this field MUST be 
first */ + struct obd_device_target fo_obt; + const char *fo_fstype; + + int fo_group_count; + struct dentry *fo_dentry_O; + struct dentry **fo_dentry_O_groups; + struct filter_subdirs *fo_dentry_O_sub; + struct mutex fo_init_lock; /* group initialization lock*/ + int fo_committed_group; + + spinlock_t fo_objidlock; /* protect fo_lastobjid */ + + unsigned long fo_destroys_in_progress; + struct mutex fo_create_locks[FILTER_SUBDIR_COUNT]; + + struct list_head fo_export_list; + int fo_subdir_count; + + obd_size fo_tot_dirty; /* protected by obd_osfs_lock */ + obd_size fo_tot_granted; /* all values in bytes */ + obd_size fo_tot_pending; + int fo_tot_granted_clients; + + obd_size fo_readcache_max_filesize; + spinlock_t fo_flags_lock; + unsigned int fo_read_cache:1, /**< enable read-only cache */ + fo_writethrough_cache:1,/**< read cache writes */ + fo_mds_ost_sync:1, /**< MDS-OST orphan recovery*/ + fo_raid_degraded:1;/**< RAID device degraded */ + + struct obd_import *fo_mdc_imp; + struct obd_uuid fo_mdc_uuid; + struct lustre_handle fo_mdc_conn; + struct file **fo_last_objid_files; + __u64 *fo_last_objids; /* last created objid for groups, + * protected by fo_objidlock */ + + struct mutex fo_alloc_lock; + + atomic_t fo_r_in_flight; + atomic_t fo_w_in_flight; + + /* + * per-filter pool of kiobuf's allocated by filter_common_setup() and + * torn down by filter_cleanup(). + * + * This pool contains kiobuf used by + * filter_{prep,commit}rw_{read,write}() and is shared by all OST + * threads. + * + * Locking: protected by internal lock of cfs_hash, pool can be + * found from this hash table by t_id of ptlrpc_thread. 
+ */ + struct cfs_hash *fo_iobuf_hash; + + struct brw_stats fo_filter_stats; + + int fo_fmd_max_num; /* per exp filter_mod_data */ + int fo_fmd_max_age; /* jiffies to fmd expiry */ + unsigned long fo_syncjournal:1, /* sync journal on writes */ + fo_sync_lock_cancel:2;/* sync on lock cancel */ + + + /* sptlrpc stuff */ + rwlock_t fo_sptlrpc_lock; + struct sptlrpc_rule_set fo_sptlrpc_rset; + + /* capability related */ + unsigned int fo_fl_oss_capa; + struct list_head fo_capa_keys; + struct hlist_head *fo_capa_hash; + int fo_sec_level; +}; + +struct timeout_item { + enum timeout_event ti_event; + cfs_time_t ti_timeout; + timeout_cb_t ti_cb; + void *ti_cb_data; + struct list_head ti_obd_list; + struct list_head ti_chain; +}; + +#define OSC_MAX_RIF_DEFAULT 8 +#define MDS_OSC_MAX_RIF_DEFAULT 50 +#define OSC_MAX_RIF_MAX 256 +#define OSC_MAX_DIRTY_DEFAULT (OSC_MAX_RIF_DEFAULT * 4) +#define OSC_MAX_DIRTY_MB_MAX 2048 /* arbitrary, but < MAX_LONG bytes */ +#define OSC_DEFAULT_RESENDS 10 + +/* possible values for fo_sync_lock_cancel */ +enum { + NEVER_SYNC_ON_CANCEL = 0, + BLOCKING_SYNC_ON_CANCEL = 1, + ALWAYS_SYNC_ON_CANCEL = 2, + NUM_SYNC_ON_CANCEL_STATES +}; + +#define MDC_MAX_RIF_DEFAULT 8 +#define MDC_MAX_RIF_MAX 512 + +struct mdc_rpc_lock; +struct obd_import; +struct client_obd { + struct rw_semaphore cl_sem; + struct obd_uuid cl_target_uuid; + struct obd_import *cl_import; /* ptlrpc connection state */ + int cl_conn_count; + /* max_mds_easize is purely a performance thing so we don't have to + * call obd_size_diskmd() all the time. 
*/ + int cl_default_mds_easize; + int cl_max_mds_easize; + int cl_max_mds_cookiesize; + + enum lustre_sec_part cl_sp_me; + enum lustre_sec_part cl_sp_to; + struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ + + /* the grant values are protected by loi_list_lock below */ + long cl_dirty; /* all _dirty_ in bytes */ + long cl_dirty_max; /* allowed w/o rpc */ + long cl_dirty_transit; /* dirty synchronous */ + long cl_avail_grant; /* bytes of credit for ost */ + long cl_lost_grant; /* lost credits (trunc) */ + + /* since we allocate grant by blocks, we don't know how many grant will + * be used to add a page into cache. As a solution, we reserve maximum + * grant before trying to dirty a page and unreserve the rest. + * See osc_{reserve|unreserve}_grant for details. */ + long cl_reserved_grant; + struct list_head cl_cache_waiters; /* waiting for cache/grant */ + cfs_time_t cl_next_shrink_grant; /* jiffies */ + struct list_head cl_grant_shrink_list; /* Timeout event list */ + int cl_grant_shrink_interval; /* seconds */ + + /* A chunk is an optimal size used by osc_extent to determine + * the extent size. A chunk is max(PAGE_CACHE_SIZE, OST block size) */ + int cl_chunkbits; + int cl_chunk; + int cl_extent_tax; /* extent overhead, by bytes */ + + /* keep track of objects that have lois that contain pages which + * have been queued for async brw. this lock also protects the + * lists of osc_client_pages that hang off of the loi */ + /* + * ->cl_loi_list_lock protects consistency of + * ->cl_loi_{ready,read,write}_list. ->ap_make_ready() and + * ->ap_completion() call-backs are executed under this lock. As we + * cannot guarantee that these call-backs never block on all platforms + * (as a matter of fact they do block on Mac OS X), type of + * ->cl_loi_list_lock is platform dependent: it's a spin-lock on Linux + * and blocking mutex on Mac OS X. 
(Alternative is to make this lock + * blocking everywhere, but we don't want to slow down fast-path of + * our main platform.) + * + * Exact type of ->cl_loi_list_lock is defined in arch/obd.h together + * with client_obd_list_{un,}lock() and + * client_obd_list_lock_{init,done}() functions. + * + * NB by Jinshan: though field names are still _loi_, but actually + * osc_object{}s are in the list. + */ + client_obd_lock_t cl_loi_list_lock; + struct list_head cl_loi_ready_list; + struct list_head cl_loi_hp_ready_list; + struct list_head cl_loi_write_list; + struct list_head cl_loi_read_list; + int cl_r_in_flight; + int cl_w_in_flight; + /* just a sum of the loi/lop pending numbers to be exported by /proc */ + atomic_t cl_pending_w_pages; + atomic_t cl_pending_r_pages; + __u32 cl_max_pages_per_rpc; + int cl_max_rpcs_in_flight; + struct obd_histogram cl_read_rpc_hist; + struct obd_histogram cl_write_rpc_hist; + struct obd_histogram cl_read_page_hist; + struct obd_histogram cl_write_page_hist; + struct obd_histogram cl_read_offset_hist; + struct obd_histogram cl_write_offset_hist; + + /* lru for osc caching pages */ + struct cl_client_cache *cl_cache; + struct list_head cl_lru_osc; /* member of cl_cache->ccc_lru */ + atomic_t *cl_lru_left; + atomic_t cl_lru_busy; + atomic_t cl_lru_shrinkers; + atomic_t cl_lru_in_list; + struct list_head cl_lru_list; /* lru page list */ + client_obd_lock_t cl_lru_list_lock; /* page list protector */ + + /* number of in flight destroy rpcs is limited to max_rpcs_in_flight */ + atomic_t cl_destroy_in_flight; + wait_queue_head_t cl_destroy_waitq; + + struct mdc_rpc_lock *cl_rpc_lock; + struct mdc_rpc_lock *cl_close_lock; + + /* mgc datastruct */ + struct semaphore cl_mgc_sem; + struct vfsmount *cl_mgc_vfsmnt; + struct dentry *cl_mgc_configs_dir; + atomic_t cl_mgc_refcount; + struct obd_export *cl_mgc_mgsexp; + + /* checksumming for data sent over the network */ + unsigned int cl_checksum:1; /* 0 = disabled, 1 = enabled */ + /* supported 
checksum types that are worked out at connect time */ + __u32 cl_supp_cksum_types; + /* checksum algorithm to be used */ + cksum_type_t cl_cksum_type; + + /* also protected by the poorly named _loi_list_lock lock above */ + struct osc_async_rc cl_ar; + + /* used by quotacheck when the servers are older than 2.4 */ + int cl_qchk_stat; /* quotacheck stat of the peer */ +#define CL_NOT_QUOTACHECKED 1 /* client->cl_qchk_stat init value */ +#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(2, 7, 50, 0) +#warning "please consider removing quotacheck compatibility code" +#endif + + /* sequence manager */ + struct lu_client_seq *cl_seq; + + atomic_t cl_resends; /* resend count */ + + /* ptlrpc work for writeback in ptlrpcd context */ + void *cl_writeback_work; + /* hash tables for osc_quota_info */ + cfs_hash_t *cl_quota_hash[MAXQUOTAS]; +}; +#define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid) + +struct obd_id_info { + __u32 idx; + obd_id *data; +}; + +/* */ + +struct echo_obd { + struct obd_device_target eo_obt; + struct obdo eo_oa; + spinlock_t eo_lock; + __u64 eo_lastino; + struct lustre_handle eo_nl_lock; + atomic_t eo_prep; +}; + +struct ost_obd { + struct ptlrpc_service *ost_service; + struct ptlrpc_service *ost_create_service; + struct ptlrpc_service *ost_io_service; + struct ptlrpc_service *ost_seq_service; + struct mutex ost_health_mutex; +}; + +struct echo_client_obd { + struct obd_export *ec_exp; /* the local connection to osc/lov */ + spinlock_t ec_lock; + struct list_head ec_objects; + struct list_head ec_locks; + int ec_nstripes; + __u64 ec_unique; +}; + +struct lov_qos_oss { + struct obd_uuid lqo_uuid; /* ptlrpc's c_remote_uuid */ + struct list_head lqo_oss_list; /* link to lov_qos */ + __u64 lqo_bavail; /* total bytes avail on OSS */ + __u64 lqo_penalty; /* current penalty */ + __u64 lqo_penalty_per_obj;/* penalty decrease every obj*/ + time_t lqo_used; /* last used time, seconds */ + __u32 lqo_ost_count; /* number of osts on this oss */ +}; + 
+struct ltd_qos { + struct lov_qos_oss *ltq_oss; /* oss info */ + __u64 ltq_penalty; /* current penalty */ + __u64 ltq_penalty_per_obj; /* penalty decrease every obj*/ + __u64 ltq_weight; /* net weighting */ + time_t ltq_used; /* last used time, seconds */ + unsigned int ltq_usable:1; /* usable for striping */ +}; + +/* Generic subset of OSTs */ +struct ost_pool { + __u32 *op_array; /* array of indices into + lov_obd->lov_tgts */ + unsigned int op_count; /* number of OSTs in the array */ + unsigned int op_size; /* allocated size of op_array */ + struct rw_semaphore op_rw_sem; /* to protect ost_pool use */ +}; + +/* Round-robin allocator data */ +struct lov_qos_rr { + __u32 lqr_start_idx; /* start index of new inode */ + __u32 lqr_offset_idx; /* aliasing for start_idx */ + int lqr_start_count; /* reseed counter */ + struct ost_pool lqr_pool; /* round-robin optimized list */ + unsigned long lqr_dirty:1; /* recalc round-robin list */ +}; + +/* allow statfs data caching for 1 second */ +#define OBD_STATFS_CACHE_SECONDS 1 + +struct lov_statfs_data { + struct obd_info lsd_oi; + struct obd_statfs lsd_statfs; +}; +/* Stripe placement optimization */ +struct lov_qos { + struct list_head lq_oss_list; /* list of OSSs that targets use */ + struct rw_semaphore lq_rw_sem; + __u32 lq_active_oss_count; + unsigned int lq_prio_free; /* priority for free space */ + unsigned int lq_threshold_rr;/* priority for rr */ + struct lov_qos_rr lq_rr; /* round robin qos data */ + unsigned long lq_dirty:1, /* recalc qos data */ + lq_same_space:1,/* the ost's all have approx. 
+ the same space avail */ + lq_reset:1, /* zero current penalties */ + lq_statfs_in_progress:1; /* statfs op in + progress */ + /* qos statfs data */ + struct lov_statfs_data *lq_statfs_data; + wait_queue_head_t lq_statfs_waitq; /* waitqueue to notify statfs + * requests completion */ +}; + +struct lov_tgt_desc { + struct list_head ltd_kill; + struct obd_uuid ltd_uuid; + struct obd_device *ltd_obd; + struct obd_export *ltd_exp; + struct ltd_qos ltd_qos; /* qos info per target */ + __u32 ltd_gen; + __u32 ltd_index; /* index in lov_obd->tgts */ + unsigned long ltd_active:1,/* is this target up for requests */ + ltd_activate:1,/* should target be activated */ + ltd_reap:1; /* should this target be deleted */ +}; + +/* Pool metadata: accessors for the ost_pool embedded in a pool_desc. + * The argument and the expansion are parenthesised so that any pointer + * expression can be passed and the result composes safely with + * surrounding operators; each expansion is still a valid lvalue. */ +#define pool_tgt_size(_p) ((_p)->pool_obds.op_size) +#define pool_tgt_count(_p) ((_p)->pool_obds.op_count) +#define pool_tgt_array(_p) ((_p)->pool_obds.op_array) +#define pool_tgt_rw_sem(_p) ((_p)->pool_obds.op_rw_sem) + +struct pool_desc { + char pool_name[LOV_MAXPOOLNAME + 1]; /* name of pool */ + struct ost_pool pool_obds; /* pool members */ + atomic_t pool_refcount; /* pool ref. 
counter */ + struct lov_qos_rr pool_rr; /* round robin qos */ + struct hlist_node pool_hash; /* access by poolname */ + struct list_head pool_list; /* serial access */ + proc_dir_entry_t *pool_proc_entry; /* file in /proc */ + struct obd_device *pool_lobd; /* obd of the lov/lod to which + * this pool belongs */ +}; + +struct lov_obd { + struct lov_desc desc; + struct lov_tgt_desc **lov_tgts; /* sparse array */ + struct ost_pool lov_packed; /* all OSTs in a packed + array */ + struct mutex lov_lock; + struct obd_connect_data lov_ocd; + atomic_t lov_refcount; + __u32 lov_tgt_count; /* how many OBD's */ + __u32 lov_active_tgt_count; /* how many active */ + __u32 lov_death_row;/* tgts scheduled to be deleted */ + __u32 lov_tgt_size; /* size of tgts array */ + int lov_connects; + int lov_pool_count; + cfs_hash_t *lov_pools_hash_body; /* used for key access */ + struct list_head lov_pool_list; /* used for sequential access */ + proc_dir_entry_t *lov_pool_proc_entry; + enum lustre_sec_part lov_sp_me; + + /* Cached LRU pages from upper layer */ + void *lov_cache; + + struct rw_semaphore lov_notify_lock; +}; + +struct lmv_tgt_desc { + struct obd_uuid ltd_uuid; + struct obd_export *ltd_exp; + int ltd_idx; + struct mutex ltd_fid_mutex; + unsigned long ltd_active:1; /* target up for requests */ +}; + +enum placement_policy { + PLACEMENT_CHAR_POLICY = 0, + PLACEMENT_NID_POLICY = 1, + PLACEMENT_INVAL_POLICY = 2, + PLACEMENT_MAX_POLICY +}; + +typedef enum placement_policy placement_policy_t; + +struct lmv_obd { + int refcount; + struct lu_client_fld lmv_fld; + spinlock_t lmv_lock; + placement_policy_t lmv_placement; + struct lmv_desc desc; + struct obd_uuid cluuid; + struct obd_export *exp; + + struct mutex init_mutex; + int connected; + int max_easize; + int max_def_easize; + int max_cookiesize; + int server_timeout; + + int tgts_size; /* size of tgts array */ + struct lmv_tgt_desc **tgts; + + struct obd_connect_data conn_data; +}; + +struct niobuf_local { + __u64 
lnb_file_offset; + __u32 lnb_page_offset; + __u32 len; + __u32 flags; + struct page *page; + struct dentry *dentry; + int lnb_grant_used; + int rc; +}; + +#define LUSTRE_FLD_NAME "fld" +#define LUSTRE_SEQ_NAME "seq" + +#define LUSTRE_MDD_NAME "mdd" +#define LUSTRE_OSD_LDISKFS_NAME "osd-ldiskfs" +#define LUSTRE_OSD_ZFS_NAME "osd-zfs" +#define LUSTRE_VVP_NAME "vvp" +#define LUSTRE_LMV_NAME "lmv" +#define LUSTRE_SLP_NAME "slp" +#define LUSTRE_LOD_NAME "lod" +#define LUSTRE_OSP_NAME "osp" +#define LUSTRE_LWP_NAME "lwp" + +/* obd device type names */ + /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */ +#define LUSTRE_MDS_NAME "mds" +#define LUSTRE_MDT_NAME "mdt" +#define LUSTRE_MDC_NAME "mdc" +#define LUSTRE_OSS_NAME "ost" /* FIXME change name to oss */ +#define LUSTRE_OST_NAME "obdfilter" /* FIXME change name to ost */ +#define LUSTRE_OSC_NAME "osc" +#define LUSTRE_LOV_NAME "lov" +#define LUSTRE_MGS_NAME "mgs" +#define LUSTRE_MGC_NAME "mgc" + +#define LUSTRE_ECHO_NAME "obdecho" +#define LUSTRE_ECHO_CLIENT_NAME "echo_client" +#define LUSTRE_QMT_NAME "qmt" + +/* Constant obd names (post-rename) */ +#define LUSTRE_MDS_OBDNAME "MDS" +#define LUSTRE_OSS_OBDNAME "OSS" +#define LUSTRE_MGS_OBDNAME "MGS" +#define LUSTRE_MGC_OBDNAME "MGC" + +/** + * Classify an obd device name by its dash-separated components. + * + * Returns 1 when the text after the last '-' starts with "osc", or when it + * starts with "MDT" and the component before it starts with + * LUSTRE_OSP_NAME or LUSTRE_OSC_NAME. Returns 0 otherwise, including when + * \a name has no '-' at all (reported through CERROR) or when there is no + * earlier '-' in front of the "MDT" component. + */ +static inline int is_osp_on_mdt(char *name) +{ + char *ptr; + + ptr = strrchr(name, '-'); + if (ptr == NULL) { + CERROR("%s is not a obdname\n", name); + return 0; + } + + /* 1.8 OSC/OSP name on MDT is fsname-OSTxxxx-osc */ + if (strncmp(ptr + 1, "osc", 3) == 0) + return 1; + + if (strncmp(ptr + 1, "MDT", 3) != 0) + return 0; + + /* Walk back to the previous '-'. The bound is tested before stepping + * so the scan never reads in front of \a name when the only '-' is + * name[0]. */ + while (ptr != name && *(--ptr) != '-') + ; + + if (ptr == name) + return 0; + + if (strncmp(ptr + 1, LUSTRE_OSP_NAME, strlen(LUSTRE_OSP_NAME)) != 0 && + strncmp(ptr + 1, LUSTRE_OSC_NAME, strlen(LUSTRE_OSC_NAME)) != 0) + return 0; + + return 1; +} + +/* Don't conflict with on-wire flags OBD_BRW_WRITE, etc */ +#define N_LOCAL_TEMP_PAGE 0x10000000 + +struct obd_trans_info { + __u64 
oti_transno; + __u64 oti_xid; + /* Only used on the server side for tracking acks. */ + struct oti_req_ack_lock { + struct lustre_handle lock; + __u32 mode; + } oti_ack_locks[4]; + void *oti_handle; + struct llog_cookie oti_onecookie; + struct llog_cookie *oti_logcookies; + int oti_numcookies; + /** synchronous write is needed */ + unsigned long oti_sync_write:1; + + /* initial thread handling transaction */ + struct ptlrpc_thread * oti_thread; + __u32 oti_conn_cnt; + /** VBR: versions */ + __u64 oti_pre_version; + /** JobID */ + char *oti_jobid; + + struct obd_uuid *oti_ost_uuid; +}; + +static inline void oti_init(struct obd_trans_info *oti, + struct ptlrpc_request *req) +{ + if (oti == NULL) + return; + memset(oti, 0, sizeof(*oti)); + + if (req == NULL) + return; + + oti->oti_xid = req->rq_xid; + /** VBR: take versions from request */ + if (req->rq_reqmsg != NULL && + lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) { + __u64 *pre_version = lustre_msg_get_versions(req->rq_reqmsg); + oti->oti_pre_version = pre_version ? 
pre_version[0] : 0; + oti->oti_transno = lustre_msg_get_transno(req->rq_reqmsg); + } + + /** called from mds_create_objects */ + if (req->rq_repmsg != NULL) + oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg); + oti->oti_thread = req->rq_svc_thread; + if (req->rq_reqmsg != NULL) + oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); +} + +static inline void oti_alloc_cookies(struct obd_trans_info *oti,int num_cookies) +{ + if (!oti) + return; + + if (num_cookies == 1) + oti->oti_logcookies = &oti->oti_onecookie; + else + OBD_ALLOC_LARGE(oti->oti_logcookies, + num_cookies * sizeof(oti->oti_onecookie)); + + oti->oti_numcookies = num_cookies; +} + +static inline void oti_free_cookies(struct obd_trans_info *oti) +{ + if (!oti || !oti->oti_logcookies) + return; + + if (oti->oti_logcookies == &oti->oti_onecookie) + LASSERT(oti->oti_numcookies == 1); + else + OBD_FREE_LARGE(oti->oti_logcookies, + oti->oti_numcookies*sizeof(oti->oti_onecookie)); + oti->oti_logcookies = NULL; + oti->oti_numcookies = 0; +} + +/* + * Events signalled through obd_notify() upcall-chain. + */ +enum obd_notify_event { + /* target added */ + OBD_NOTIFY_CREATE, + /* Device connect start */ + OBD_NOTIFY_CONNECT, + /* Device activated */ + OBD_NOTIFY_ACTIVE, + /* Device deactivated */ + OBD_NOTIFY_INACTIVE, + /* Device disconnected */ + OBD_NOTIFY_DISCON, + /* Connect data for import were changed */ + OBD_NOTIFY_OCD, + /* Sync request */ + OBD_NOTIFY_SYNC_NONBLOCK, + OBD_NOTIFY_SYNC, + /* Configuration event */ + OBD_NOTIFY_CONFIG, + /* Administratively deactivate/activate event */ + OBD_NOTIFY_DEACTIVATE, + OBD_NOTIFY_ACTIVATE +}; + +/* + * Data structure used to pass obd_notify()-event to non-obd listeners (llite + * and liblustre being main examples). 
+ */ +struct obd_notify_upcall { + int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, + enum obd_notify_event ev, void *owner, void *data); + /* Opaque datum supplied by upper layer listener */ + void *onu_owner; +}; + +struct target_recovery_data { + svc_handler_t trd_recovery_handler; + pid_t trd_processing_task; + struct completion trd_starting; + struct completion trd_finishing; +}; + +struct obd_llog_group { + int olg_seq; + struct llog_ctxt *olg_ctxts[LLOG_MAX_CTXTS]; + wait_queue_head_t olg_waitq; + spinlock_t olg_lock; + struct mutex olg_cat_processing; +}; + +/* corresponds to one of the obd's */ +#define OBD_DEVICE_MAGIC 0XAB5CD6EF +#define OBD_DEV_BY_DEVNAME 0xffffd0de + +struct obd_device { + struct obd_type *obd_type; + __u32 obd_magic; + + /* common and UUID name of this device */ + char obd_name[MAX_OBD_NAME]; + struct obd_uuid obd_uuid; + + struct lu_device *obd_lu_dev; + + int obd_minor; + /* bitfield modification is protected by obd_dev_lock */ + unsigned long obd_attached:1, /* finished attach */ + obd_set_up:1, /* finished setup */ + obd_recovering:1, /* there are recoverable clients */ + obd_abort_recovery:1,/* recovery expired */ + obd_version_recov:1, /* obd uses version checking */ + obd_replayable:1, /* recovery is enabled; inform clients */ + obd_no_transno:1, /* no committed-transno notification */ + obd_no_recov:1, /* fail instead of retry messages */ + obd_stopping:1, /* started cleanup */ + obd_starting:1, /* started setup */ + obd_force:1, /* cleanup with > 0 obd refcount */ + obd_fail:1, /* cleanup with failover */ + obd_async_recov:1, /* allow asynchronous orphan cleanup */ + obd_no_conn:1, /* deny new connections */ + obd_inactive:1, /* device active/inactive + * (for /proc/status only!!) */ + obd_no_ir:1, /* no imperative recovery. 
*/ + obd_process_conf:1; /* device is processing mgs config */ + /* use separate field as it is set in interrupt to don't mess with + * protection of other bits using _bh lock */ + unsigned long obd_recovery_expired:1; + /* uuid-export hash body */ + cfs_hash_t *obd_uuid_hash; + /* nid-export hash body */ + cfs_hash_t *obd_nid_hash; + /* nid stats body */ + cfs_hash_t *obd_nid_stats_hash; + struct list_head obd_nid_stats; + atomic_t obd_refcount; + wait_queue_head_t obd_refcount_waitq; + struct list_head obd_exports; + struct list_head obd_unlinked_exports; + struct list_head obd_delayed_exports; + int obd_num_exports; + spinlock_t obd_nid_lock; + struct ldlm_namespace *obd_namespace; + struct ptlrpc_client obd_ldlm_client; /* XXX OST/MDS only */ + /* a spinlock is OK for what we do now, may need a semaphore later */ + spinlock_t obd_dev_lock; /* protect OBD bitfield above */ + struct mutex obd_dev_mutex; + __u64 obd_last_committed; + struct fsfilt_operations *obd_fsops; + spinlock_t obd_osfs_lock; + struct obd_statfs obd_osfs; /* locked by obd_osfs_lock */ + __u64 obd_osfs_age; + struct lvfs_run_ctxt obd_lvfs_ctxt; + struct obd_llog_group obd_olg; /* default llog group */ + struct obd_device *obd_observer; + struct rw_semaphore obd_observer_link_sem; + struct obd_notify_upcall obd_upcall; + struct obd_export *obd_self_export; + /* list of exports in LRU order, for ping evictor, with obd_dev_lock */ + struct list_head obd_exports_timed; + time_t obd_eviction_timer; /* for ping evictor */ + + int obd_max_recoverable_clients; + atomic_t obd_connected_clients; + int obd_stale_clients; + int obd_delayed_clients; + /* this lock protects all recovery list_heads, timer and + * obd_next_recovery_transno value */ + spinlock_t obd_recovery_task_lock; + __u64 obd_next_recovery_transno; + int obd_replayed_requests; + int obd_requests_queued_for_recovery; + wait_queue_head_t obd_next_transno_waitq; + /* protected by obd_recovery_task_lock */ + timer_list_t obd_recovery_timer; + 
time_t obd_recovery_start; /* seconds */ + time_t obd_recovery_end; /* seconds, for lprocfs_status */ + int obd_recovery_time_hard; + int obd_recovery_timeout; + int obd_recovery_ir_factor; + + /* new recovery stuff from CMD2 */ + struct target_recovery_data obd_recovery_data; + int obd_replayed_locks; + atomic_t obd_req_replay_clients; + atomic_t obd_lock_replay_clients; + /* all lists are protected by obd_recovery_task_lock */ + struct list_head obd_req_replay_queue; + struct list_head obd_lock_replay_queue; + struct list_head obd_final_req_queue; + int obd_recovery_stage; + + union { + struct obd_device_target obt; + struct filter_obd filter; + struct client_obd cli; + struct ost_obd ost; + struct echo_client_obd echo_client; + struct echo_obd echo; + struct lov_obd lov; + struct lmv_obd lmv; + } u; + /* Fields used by LProcFS */ + unsigned int obd_cntr_base; + struct lprocfs_stats *obd_stats; + + unsigned int md_cntr_base; + struct lprocfs_stats *md_stats; + + proc_dir_entry_t *obd_proc_entry; + void *obd_proc_private; /* type private PDEs */ + proc_dir_entry_t *obd_proc_exports_entry; + proc_dir_entry_t *obd_svc_procroot; + struct lprocfs_stats *obd_svc_stats; + atomic_t obd_evict_inprogress; + wait_queue_head_t obd_evict_inprogress_waitq; + struct list_head obd_evict_list; /* protected with pet_lock */ + + /** + * Ldlm pool part. Save last calculated SLV and Limit. + */ + rwlock_t obd_pool_lock; + int obd_pool_limit; + __u64 obd_pool_slv; + + /** + * A list of outstanding class_incref()'s against this obd. For + * debugging. 
+ */ + struct lu_ref obd_reference; + + int obd_conn_inprogress; +}; + +#define OBD_LLOG_FL_SENDNOW 0x0001 +#define OBD_LLOG_FL_EXIT 0x0002 + +enum obd_cleanup_stage { +/* Special case hack for MDS LOVs */ + OBD_CLEANUP_EARLY, +/* can be directly mapped to .ldto_device_fini() */ + OBD_CLEANUP_EXPORTS, +}; + +/* get/set_info keys */ +#define KEY_ASYNC "async" +#define KEY_BLOCKSIZE_BITS "blocksize_bits" +#define KEY_BLOCKSIZE "blocksize" +#define KEY_CAPA_KEY "capa_key" +#define KEY_CHANGELOG_CLEAR "changelog_clear" +#define KEY_FID2PATH "fid2path" +#define KEY_CHECKSUM "checksum" +#define KEY_CLEAR_FS "clear_fs" +#define KEY_CONN_DATA "conn_data" +#define KEY_EVICT_BY_NID "evict_by_nid" +#define KEY_FIEMAP "fiemap" +#define KEY_FLUSH_CTX "flush_ctx" +#define KEY_GRANT_SHRINK "grant_shrink" +#define KEY_HSM_COPYTOOL_SEND "hsm_send" +#define KEY_INIT_RECOV_BACKUP "init_recov_bk" +#define KEY_INIT_RECOV "initial_recov" +#define KEY_INTERMDS "inter_mds" +#define KEY_LAST_ID "last_id" +#define KEY_LAST_FID "last_fid" +#define KEY_LOCK_TO_STRIPE "lock_to_stripe" +#define KEY_LOVDESC "lovdesc" +#define KEY_LOV_IDX "lov_idx" +#define KEY_MAX_EASIZE "max_easize" +#define KEY_MDS_CONN "mds_conn" +#define KEY_MGSSEC "mgssec" +#define KEY_NEXT_ID "next_id" +#define KEY_READ_ONLY "read-only" +#define KEY_REGISTER_TARGET "register_target" +#define KEY_SET_FS "set_fs" +#define KEY_TGT_COUNT "tgt_count" +/* KEY_SET_INFO in lustre_idl.h */ +#define KEY_SPTLRPC_CONF "sptlrpc_conf" +#define KEY_CONNECT_FLAG "connect_flags" +#define KEY_SYNC_LOCK_CANCEL "sync_lock_cancel" + +#define KEY_CACHE_SET "cache_set" +#define KEY_CACHE_LRU_SHRINK "cache_lru_shrink" +#define KEY_CHANGELOG_INDEX "changelog_index" + +struct lu_context; + +/* /!\ must be coherent with include/linux/namei.h on patched kernel */ +#define IT_OPEN (1 << 0) +#define IT_CREAT (1 << 1) +#define IT_READDIR (1 << 2) +#define IT_GETATTR (1 << 3) +#define IT_LOOKUP (1 << 4) +#define IT_UNLINK (1 << 5) +#define IT_TRUNC (1 << 
6) +#define IT_GETXATTR (1 << 7) +#define IT_EXEC (1 << 8) +#define IT_PIN (1 << 9) +#define IT_LAYOUT (1 << 10) +#define IT_QUOTA_DQACQ (1 << 11) +#define IT_QUOTA_CONN (1 << 12) + +static inline int it_to_lock_mode(struct lookup_intent *it) +{ + /* CREAT needs to be tested before open (both could be set) */ + if (it->it_op & IT_CREAT) + return LCK_CW; + else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP | + IT_LAYOUT)) + return LCK_CR; + + LASSERTF(0, "Invalid it_op: %d\n", it->it_op); + return -EINVAL; +} + +struct md_op_data { + struct lu_fid op_fid1; /* operation fid1 (usually parent) */ + struct lu_fid op_fid2; /* operation fid2 (usually child) */ + struct lu_fid op_fid3; /* 2 extra fids to find conflicting */ + struct lu_fid op_fid4; /* to the operation locks. */ + mdsno_t op_mds; /* what mds server open will go to */ + struct lustre_handle op_handle; + obd_time op_mod_time; + const char *op_name; + int op_namelen; + __u32 op_mode; + struct lmv_stripe_md *op_mea1; + struct lmv_stripe_md *op_mea2; + __u32 op_suppgids[2]; + __u32 op_fsuid; + __u32 op_fsgid; + cfs_cap_t op_cap; + void *op_data; + + /* iattr fields and blocks. */ + struct iattr op_attr; + unsigned int op_attr_flags; + __u64 op_valid; + loff_t op_attr_blocks; + + /* Size-on-MDS epoch and flags. */ + __u64 op_ioepoch; + __u32 op_flags; + + /* Capa fields */ + struct obd_capa *op_capa1; + struct obd_capa *op_capa2; + + /* Various operation flags.
*/ + __u32 op_bias; + + /* Operation type */ + __u32 op_opc; + + /* Used by readdir */ + __u64 op_offset; + + /* Used by readdir */ + __u32 op_npages; + + /* used to transfer info between the stacks of MD client + * see enum op_cli_flags */ + __u32 op_cli_flags; +}; + +enum op_cli_flags { + CLI_SET_MEA = 1 << 0, + CLI_RM_ENTRY = 1 << 1, +}; + +struct md_enqueue_info; +/* metadata stat-ahead */ +typedef int (* md_enqueue_cb_t)(struct ptlrpc_request *req, + struct md_enqueue_info *minfo, + int rc); + +/* seq client type */ +enum lu_cli_type { + LUSTRE_SEQ_METADATA = 1, + LUSTRE_SEQ_DATA +}; + +struct md_enqueue_info { + struct md_op_data mi_data; + struct lookup_intent mi_it; + struct lustre_handle mi_lockh; + struct inode *mi_dir; + md_enqueue_cb_t mi_cb; + __u64 mi_cbdata; + unsigned int mi_generation; +}; + +struct obd_ops { + module_t *o_owner; + int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len, + void *karg, void *uarg); + int (*o_get_info)(const struct lu_env *env, struct obd_export *, + __u32 keylen, void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm); + int (*o_set_info_async)(const struct lu_env *, struct obd_export *, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set); + int (*o_attach)(struct obd_device *dev, obd_count len, void *data); + int (*o_detach)(struct obd_device *dev); + int (*o_setup) (struct obd_device *dev, struct lustre_cfg *cfg); + int (*o_precleanup)(struct obd_device *dev, + enum obd_cleanup_stage cleanup_stage); + int (*o_cleanup)(struct obd_device *dev); + int (*o_process_config)(struct obd_device *dev, obd_count len, + void *data); + int (*o_postrecov)(struct obd_device *dev); + int (*o_add_conn)(struct obd_import *imp, struct obd_uuid *uuid, + int priority); + int (*o_del_conn)(struct obd_import *imp, struct obd_uuid *uuid); + /* connect to the target device with given connection + * data. 
@ocd->ocd_connect_flags is modified to reflect flags actually + * granted by the target, which are guaranteed to be a subset of flags + * asked for. If @ocd == NULL, use default parameters. */ + int (*o_connect)(const struct lu_env *env, + struct obd_export **exp, struct obd_device *src, + struct obd_uuid *cluuid, struct obd_connect_data *ocd, + void *localdata); + int (*o_reconnect)(const struct lu_env *env, + struct obd_export *exp, struct obd_device *src, + struct obd_uuid *cluuid, + struct obd_connect_data *ocd, + void *localdata); + int (*o_disconnect)(struct obd_export *exp); + + /* Initialize/finalize fids infrastructure. */ + int (*o_fid_init)(struct obd_device *obd, + struct obd_export *exp, enum lu_cli_type type); + int (*o_fid_fini)(struct obd_device *obd); + + /* Allocate new fid according to passed @hint. */ + int (*o_fid_alloc)(struct obd_export *exp, struct lu_fid *fid, + struct md_op_data *op_data); + + /* + * Object with @fid is getting deleted, we may want to do something + * about this. + */ + int (*o_statfs)(const struct lu_env *, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, __u32 flags); + int (*o_statfs_async)(struct obd_export *exp, struct obd_info *oinfo, + __u64 max_age, struct ptlrpc_request_set *set); + int (*o_packmd)(struct obd_export *exp, struct lov_mds_md **disk_tgt, + struct lov_stripe_md *mem_src); + int (*o_unpackmd)(struct obd_export *exp,struct lov_stripe_md **mem_tgt, + struct lov_mds_md *disk_src, int disk_len); + int (*o_preallocate)(struct lustre_handle *, obd_count *req, + obd_id *ids); + /* FIXME: add fid capability support for create & destroy! 
*/ + int (*o_precreate)(struct obd_export *exp); + int (*o_create)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md **ea, + struct obd_trans_info *oti); + int (*o_create_async)(struct obd_export *exp, struct obd_info *oinfo, + struct lov_stripe_md **ea, + struct obd_trans_info *oti); + int (*o_destroy)(const struct lu_env *env, struct obd_export *exp, + struct obdo *oa, struct lov_stripe_md *ea, + struct obd_trans_info *oti, struct obd_export *md_exp, + void *capa); + int (*o_setattr)(const struct lu_env *, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti); + int (*o_setattr_async)(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset); + int (*o_getattr)(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo); + int (*o_getattr_async)(struct obd_export *exp, struct obd_info *oinfo, + struct ptlrpc_request_set *set); + int (*o_brw)(int rw, struct obd_export *exp, struct obd_info *oinfo, + obd_count oa_bufs, struct brw_page *pgarr, + struct obd_trans_info *oti); + int (*o_merge_lvb)(struct obd_export *exp, struct lov_stripe_md *lsm, + struct ost_lvb *lvb, int kms_only); + int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm, + obd_off size, int shrink); + int (*o_punch)(const struct lu_env *, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset); + int (*o_sync)(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, obd_size start, obd_size end, + struct ptlrpc_request_set *set); + int (*o_migrate)(struct lustre_handle *conn, struct lov_stripe_md *dst, + struct lov_stripe_md *src, obd_size start, + obd_size end, struct obd_trans_info *oti); + int (*o_copy)(struct lustre_handle *dstconn, struct lov_stripe_md *dst, + struct lustre_handle *srconn, struct lov_stripe_md *src, + obd_size start, obd_size end, struct obd_trans_info 
*); + int (*o_iterate)(struct lustre_handle *conn, + int (*)(obd_id, obd_seq, void *), + obd_id *startid, obd_seq seq, void *data); + int (*o_preprw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, int objcount, + struct obd_ioobj *obj, struct niobuf_remote *remote, + int *nr_pages, struct niobuf_local *local, + struct obd_trans_info *oti, struct lustre_capa *capa); + int (*o_commitrw)(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int pages, + struct niobuf_local *local, + struct obd_trans_info *oti, int rc); + int (*o_enqueue)(struct obd_export *, struct obd_info *oinfo, + struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *rqset); + int (*o_change_cbdata)(struct obd_export *, struct lov_stripe_md *, + ldlm_iterator_t it, void *data); + int (*o_find_cbdata)(struct obd_export *, struct lov_stripe_md *, + ldlm_iterator_t it, void *data); + int (*o_cancel)(struct obd_export *, struct lov_stripe_md *md, + __u32 mode, struct lustre_handle *); + int (*o_cancel_unused)(struct obd_export *, struct lov_stripe_md *, + ldlm_cancel_flags_t flags, void *opaque); + int (*o_init_export)(struct obd_export *exp); + int (*o_destroy_export)(struct obd_export *exp); + int (*o_extent_calc)(struct obd_export *, struct lov_stripe_md *, + int cmd, obd_off *); + + /* llog related obd_methods */ + int (*o_llog_init)(struct obd_device *obd, struct obd_llog_group *grp, + struct obd_device *disk_obd, int *idx); + int (*o_llog_finish)(struct obd_device *obd, int count); + int (*o_llog_connect)(struct obd_export *, struct llogd_conn_body *); + + /* metadata-only methods */ + int (*o_pin)(struct obd_export *, const struct lu_fid *fid, + struct obd_capa *, struct obd_client_handle *, int flag); + int (*o_unpin)(struct obd_export *, struct obd_client_handle *, int); + + int (*o_import_event)(struct obd_device *, struct obd_import *, + enum 
obd_import_event); + + int (*o_notify)(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev, void *data); + + int (*o_health_check)(const struct lu_env *env, struct obd_device *); + struct obd_uuid *(*o_get_uuid) (struct obd_export *exp); + + /* quota methods */ + int (*o_quotacheck)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + int (*o_quotactl)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + + int (*o_ping)(const struct lu_env *, struct obd_export *exp); + + /* pools methods */ + int (*o_pool_new)(struct obd_device *obd, char *poolname); + int (*o_pool_del)(struct obd_device *obd, char *poolname); + int (*o_pool_add)(struct obd_device *obd, char *poolname, + char *ostname); + int (*o_pool_rem)(struct obd_device *obd, char *poolname, + char *ostname); + void (*o_getref)(struct obd_device *obd); + void (*o_putref)(struct obd_device *obd); + /* + * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line + * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c. + * Also, add a wrapper function in include/linux/obd_class.h. 
*/ +}; + +enum { + LUSTRE_OPC_MKDIR = (1 << 0), + LUSTRE_OPC_SYMLINK = (1 << 1), + LUSTRE_OPC_MKNOD = (1 << 2), + LUSTRE_OPC_CREATE = (1 << 3), + LUSTRE_OPC_ANY = (1 << 4) +}; + +/* lmv structures */ +#define MEA_MAGIC_LAST_CHAR 0xb2221ca1 +#define MEA_MAGIC_ALL_CHARS 0xb222a11c +#define MEA_MAGIC_HASH_SEGMENT 0xb222a11b + +#define MAX_HASH_SIZE_32 0x7fffffffUL +#define MAX_HASH_SIZE 0x7fffffffffffffffULL +#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL + +struct lustre_md { + struct mdt_body *body; + struct lov_stripe_md *lsm; + struct lmv_stripe_md *mea; +#ifdef CONFIG_FS_POSIX_ACL + struct posix_acl *posix_acl; +#endif + struct mdt_remote_perm *remote_perm; + struct obd_capa *mds_capa; + struct obd_capa *oss_capa; +}; + +struct md_open_data { + struct obd_client_handle *mod_och; + struct ptlrpc_request *mod_open_req; + struct ptlrpc_request *mod_close_req; + atomic_t mod_refcount; +}; + +struct lookup_intent; + +struct md_ops { + int (*m_getstatus)(struct obd_export *, struct lu_fid *, + struct obd_capa **); + int (*m_null_inode)(struct obd_export *, const struct lu_fid *); + int (*m_find_cbdata)(struct obd_export *, const struct lu_fid *, + ldlm_iterator_t, void *); + int (*m_close)(struct obd_export *, struct md_op_data *, + struct md_open_data *, struct ptlrpc_request **); + int (*m_create)(struct obd_export *, struct md_op_data *, + const void *, int, int, __u32, __u32, cfs_cap_t, + __u64, struct ptlrpc_request **); + int (*m_done_writing)(struct obd_export *, struct md_op_data *, + struct md_open_data *); + int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *, + struct lookup_intent *, struct md_op_data *, + struct lustre_handle *, void *, int, + struct ptlrpc_request **, __u64); + int (*m_getattr)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + int (*m_getattr_name)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + int (*m_intent_lock)(struct obd_export *, struct md_op_data *, + void *, int, 
struct lookup_intent *, int, + struct ptlrpc_request **, + ldlm_blocking_callback, __u64); + int (*m_link)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + int (*m_rename)(struct obd_export *, struct md_op_data *, + const char *, int, const char *, int, + struct ptlrpc_request **); + int (*m_is_subdir)(struct obd_export *, const struct lu_fid *, + const struct lu_fid *, + struct ptlrpc_request **); + int (*m_setattr)(struct obd_export *, struct md_op_data *, void *, + int , void *, int, struct ptlrpc_request **, + struct md_open_data **mod); + int (*m_sync)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, struct ptlrpc_request **); + int (*m_readpage)(struct obd_export *, struct md_op_data *, + struct page **, struct ptlrpc_request **); + + int (*m_unlink)(struct obd_export *, struct md_op_data *, + struct ptlrpc_request **); + + int (*m_setxattr)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, obd_valid, const char *, + const char *, int, int, int, __u32, + struct ptlrpc_request **); + + int (*m_getxattr)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, obd_valid, const char *, + const char *, int, int, int, + struct ptlrpc_request **); + + int (*m_init_ea_size)(struct obd_export *, int, int, int); + + int (*m_get_lustre_md)(struct obd_export *, struct ptlrpc_request *, + struct obd_export *, struct obd_export *, + struct lustre_md *); + + int (*m_free_lustre_md)(struct obd_export *, struct lustre_md *); + + int (*m_set_open_replay_data)(struct obd_export *, + struct obd_client_handle *, + struct ptlrpc_request *); + int (*m_clear_open_replay_data)(struct obd_export *, + struct obd_client_handle *); + int (*m_set_lock_data)(struct obd_export *, __u64 *, void *, __u64 *); + + ldlm_mode_t (*m_lock_match)(struct obd_export *, __u64, + const struct lu_fid *, ldlm_type_t, + ldlm_policy_data_t *, ldlm_mode_t, + struct lustre_handle *); + + int (*m_cancel_unused)(struct obd_export *, const struct 
lu_fid *, + ldlm_policy_data_t *, ldlm_mode_t, + ldlm_cancel_flags_t flags, void *opaque); + int (*m_renew_capa)(struct obd_export *, struct obd_capa *oc, + renew_capa_cb_t cb); + int (*m_unpack_capa)(struct obd_export *, struct ptlrpc_request *, + const struct req_msg_field *, struct obd_capa **); + + int (*m_get_remote_perm)(struct obd_export *, const struct lu_fid *, + struct obd_capa *, __u32, + struct ptlrpc_request **); + + int (*m_intent_getattr_async)(struct obd_export *, + struct md_enqueue_info *, + struct ldlm_enqueue_info *); + + int (*m_revalidate_lock)(struct obd_export *, struct lookup_intent *, + struct lu_fid *, __u64 *bits); + + /* + * NOTE: If adding ops, add another LPROCFS_MD_OP_INIT() line to + * lprocfs_alloc_md_stats() in obdclass/lprocfs_status.c. Also, add a + * wrapper function in include/linux/obd_class.h. + */ +}; + +struct lsm_operations { + void (*lsm_free)(struct lov_stripe_md *); + int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa, + struct obd_export *md_exp); + void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, obd_off *, + obd_off *); + void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, obd_off *, + obd_off *); + int (*lsm_lmm_verify) (struct lov_mds_md *lmm, int lmm_bytes, + __u16 *stripe_count); + int (*lsm_unpackmd) (struct lov_obd *lov, struct lov_stripe_md *lsm, + struct lov_mds_md *lmm); +}; + +extern const struct lsm_operations lsm_v1_ops; +extern const struct lsm_operations lsm_v3_ops; +static inline const struct lsm_operations *lsm_op_find(int magic) +{ + switch(magic) { + case LOV_MAGIC_V1: + return &lsm_v1_ops; + case LOV_MAGIC_V3: + return &lsm_v3_ops; + default: + CERROR("Cannot recognize lsm_magic %08x\n", magic); + return NULL; + } +} + +/* Requests for obd_extent_calc() */ +#define OBD_CALC_STRIPE_START 1 +#define OBD_CALC_STRIPE_END 2 + +static inline struct lustre_capa *oinfo_capa(struct obd_info *oinfo) +{ + return oinfo->oi_capa; +} + +static inline struct md_open_data 
*obd_mod_alloc(void) +{ + struct md_open_data *mod; + OBD_ALLOC_PTR(mod); + if (mod == NULL) + return NULL; + atomic_set(&mod->mod_refcount, 1); + return mod; +} + +#define obd_mod_get(mod) atomic_inc(&(mod)->mod_refcount) +#define obd_mod_put(mod) \ +({ \ + if (atomic_dec_and_test(&(mod)->mod_refcount)) { \ + if ((mod)->mod_open_req) \ + ptlrpc_req_finished((mod)->mod_open_req); \ + OBD_FREE_PTR(mod); \ + } \ +}) + +void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid); +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent); + +/* return 1 if client should be resend request */ +static inline int client_should_resend(int resend, struct client_obd *cli) +{ + return atomic_read(&cli->cl_resends) ? + atomic_read(&cli->cl_resends) > resend : 1; +} + +/** + * Return device name for this device + * + * XXX: lu_device is declared before obd_device, while a pointer pointing + * back to obd_device in lu_device, so this helper function defines here + * instead of in lu_object.h + */ +static inline const char *lu_dev_name(const struct lu_device *lu_dev) +{ + return lu_dev->ld_obd->obd_name; +} + +static inline bool filename_is_volatile(const char *name, int namelen, int *idx) +{ + const char *start; + char *end; + + if (strncmp(name, LUSTRE_VOLATILE_HDR, LUSTRE_VOLATILE_HDR_LEN) != 0) + return false; + + /* caller does not care of idx */ + if (idx == NULL) + return true; + + /* volatile file, the MDT can be set from name */ + /* name format is LUSTRE_VOLATILE_HDR:[idx]: */ + /* if no MDT is specified, use std way */ + if (namelen < LUSTRE_VOLATILE_HDR_LEN + 2) + goto bad_format; + /* test for no MDT idx case */ + if ((*(name + LUSTRE_VOLATILE_HDR_LEN) == ':') && + (*(name + LUSTRE_VOLATILE_HDR_LEN + 1) == ':')) { + *idx = -1; + return true; + } + /* we have an idx, read it */ + start = name + LUSTRE_VOLATILE_HDR_LEN + 1; + *idx = strtoul(start, &end, 0); + /* error cases: + * no digit, no trailing :, negative value + */ + if (((*idx == 0) 
&& (end == start)) || + (*end != ':') || (*idx < 0)) + goto bad_format; + + return true; +bad_format: + /* bad format of mdt idx, we cannot return an error + * to caller so we use hash algo */ + CERROR("Bad volatile file name format: %s\n", + name + LUSTRE_VOLATILE_HDR_LEN); + return false; +} + +static inline int cli_brw_size(struct obd_device *obd) +{ + LASSERT(obd != NULL); + return obd->u.cli.cl_max_pages_per_rpc << PAGE_CACHE_SHIFT; +} + +#endif /* __OBD_H */ diff --git a/drivers/staging/lustre/lustre/include/obd_cache.h b/drivers/staging/lustre/lustre/include/obd_cache.h new file mode 100644 index 000000000000..c8249fbb0d72 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/obd_cache.h @@ -0,0 +1,39 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _OBD_CACHE_H__ +#define _OBD_CACHE_H__ + + +#endif diff --git a/drivers/staging/lustre/lustre/include/obd_cksum.h b/drivers/staging/lustre/lustre/include/obd_cksum.h new file mode 100644 index 000000000000..5f740f1743ca --- /dev/null +++ b/drivers/staging/lustre/lustre/include/obd_cksum.h @@ -0,0 +1,176 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef __OBD_CKSUM +#define __OBD_CKSUM +#include <linux/libcfs/libcfs.h> +#include <lustre/lustre_idl.h> + +static inline unsigned char cksum_obd2cfs(cksum_type_t cksum_type) +{ + switch (cksum_type) { + case OBD_CKSUM_CRC32: + return CFS_HASH_ALG_CRC32; + case OBD_CKSUM_ADLER: + return CFS_HASH_ALG_ADLER32; + case OBD_CKSUM_CRC32C: + return CFS_HASH_ALG_CRC32C; + default: + CERROR("Unknown checksum type (%x)!!!\n", cksum_type); + LBUG(); + } + return 0; +} + +/* The OBD_FL_CKSUM_* flags is packed into 5 bits of o_flags, since there can + * only be a single checksum type per RPC. + * + * The OBD_CHECKSUM_* type bits passed in ocd_cksum_types are a 32-bit bitmask + * since they need to represent the full range of checksum algorithms that + * both the client and server can understand. + * + * In case of an unsupported types/flags we fall back to ADLER + * because that is supported by all clients since 1.8 + * + * In case multiple algorithms are supported the best one is used. */ +static inline obd_flag cksum_type_pack(cksum_type_t cksum_type) +{ + unsigned int performance = 0, tmp; + obd_flag flag = OBD_FL_CKSUM_ADLER; + + if (cksum_type & OBD_CKSUM_CRC32) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32; + } + } + if (cksum_type & OBD_CKSUM_CRC32C) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_CRC32C; + } + } + if (cksum_type & OBD_CKSUM_ADLER) { + tmp = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)); + if (tmp > performance) { + performance = tmp; + flag = OBD_FL_CKSUM_ADLER; + } + } + if (unlikely(cksum_type && !(cksum_type & (OBD_CKSUM_CRC32C | + OBD_CKSUM_CRC32 | + OBD_CKSUM_ADLER)))) + CWARN("unknown cksum type %x\n", cksum_type); + + return flag; +} + +static inline cksum_type_t cksum_type_unpack(obd_flag o_flags) +{ + switch (o_flags & OBD_FL_CKSUM_ALL) { + case 
OBD_FL_CKSUM_CRC32C: + return OBD_CKSUM_CRC32C; + case OBD_FL_CKSUM_CRC32: + return OBD_CKSUM_CRC32; + default: + break; + } + + return OBD_CKSUM_ADLER; +} + +/* Return a bitmask of the checksum types supported on this system. + * 1.8 supported ADLER it is base and not depend on hw + * Client uses all available local algos + */ +static inline cksum_type_t cksum_types_supported_client(void) +{ + cksum_type_t ret = OBD_CKSUM_ADLER; + + CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) > 0) + ret |= OBD_CKSUM_CRC32C; + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) > 0) + ret |= OBD_CKSUM_CRC32; + + return ret; +} + +/* Server uses algos that perform at 50% or better of the Adler */ +static inline cksum_type_t cksum_types_supported_server(void) +{ + int base_speed; + cksum_type_t ret = OBD_CKSUM_ADLER; + + CDEBUG(D_INFO, "Crypto hash speed: crc %d, crc32c %d, adler %d\n", + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)), + cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER))); + + base_speed = cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_ADLER)) / 2; + + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32C)) >= + base_speed) + ret |= OBD_CKSUM_CRC32C; + if (cfs_crypto_hash_speed(cksum_obd2cfs(OBD_CKSUM_CRC32)) >= + base_speed) + ret |= OBD_CKSUM_CRC32; + + return ret; +} + + +/* Select the best checksum algorithm among those supplied in the cksum_types + * input. + * + * Currently, calling cksum_type_pack() with a mask will return the fastest + * checksum type due to its benchmarking at libcfs module load. 
+ * Caution is advised, however, since what is fastest on a single client may + * not be the fastest or most efficient algorithm on the server. */ +static inline cksum_type_t cksum_type_select(cksum_type_t cksum_types) +{ + return cksum_type_unpack(cksum_type_pack(cksum_types)); +} + +/* Checksum algorithm names. Must be defined in the same order as the + * OBD_CKSUM_* flags. */ +#define DECLARE_CKSUM_NAME char *cksum_name[] = {"crc32", "adler", "crc32c"} + +#endif /* __OBD_CKSUM */ diff --git a/drivers/staging/lustre/lustre/include/obd_class.h b/drivers/staging/lustre/lustre/include/obd_class.h new file mode 100644 index 000000000000..de5c5853647f --- /dev/null +++ b/drivers/staging/lustre/lustre/include/obd_class.h @@ -0,0 +1,2281 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 */
#ifndef __CLASS_OBD_H
#define __CLASS_OBD_H


#include <obd_support.h>
#include <lustre_import.h>
#include <lustre_net.h>
#include <obd.h>
#include <lustre_lib.h>
#include <lustre/lustre_idl.h>
#include <lprocfs_status.h>

#include <linux/obd_class.h>

/* Flag bits for statfs requests (see oi_flags / the flags argument of the
 * obd_statfs*() helpers later in this header). */
#define OBD_STATFS_NODELAY      0x0001  /* requests should be sent without
					 * delay and resent, to avoid
					 * deadlocks */
#define OBD_STATFS_FROM_CACHE   0x0002  /* the statfs callback should not update
					 * obd_osfs_age */
#define OBD_STATFS_PTLRPCD      0x0004  /* requests will be sent via ptlrpcd
					 * instead of a specific set. This
					 * means that we cannot rely on the set
					 * interpret routine to be called.
					 * lov_statfs_fini() must thus be called
					 * by the request interpret routine */
#define OBD_STATFS_FOR_MDT0	0x0008	/* The statfs is only for retrieving
					 * information from MDT0. */
#define OBD_FL_PUNCH    0x00000001      /* To indicate it is punch operation */

/* OBD Device Declarations */
extern struct obd_device *obd_devs[MAX_OBD_DEVICES];
extern rwlock_t obd_dev_lock;

/* OBD Operations Declarations */
extern struct obd_device *class_conn2obd(struct lustre_handle *);
extern struct obd_device *class_exp2obd(struct obd_export *);
extern int class_handle_ioctl(unsigned int cmd, unsigned long arg);
extern int lustre_get_jobid(char *jobid);

struct lu_device_type;

/* genops.c */
struct obd_export *class_conn2export(struct lustre_handle *);
int class_register_type(struct obd_ops *, struct md_ops *,
			struct lprocfs_vars *, const char *nm,
			struct lu_device_type *ldt);
int class_unregister_type(const char *nm);

struct obd_device *class_newdev(const char *type_name, const char *name);
void class_release_dev(struct obd_device *obd);

/* Device lookup helpers, by name, uuid or index.  NOTE(review): the "2dev"
 * variants presumably return an index into obd_devs[] -- confirm in
 * genops.c. */
int class_name2dev(const char *name);
struct obd_device *class_name2obd(const char *name);
int class_uuid2dev(struct obd_uuid *uuid);
struct obd_device *class_uuid2obd(struct obd_uuid *uuid);
void class_obd_list(void);
struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid,
					  const char * typ_name,
					  struct obd_uuid *grp_uuid);
struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid,
					   int *next);
struct obd_device * class_num2obd(int num);
int get_devices_count(void);

int class_notify_sptlrpc_conf(const char *fsname, int namelen);

char *obd_export_nid2str(struct obd_export *exp);

int obd_export_evict_by_nid(struct obd_device *obd, const char *nid);
int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid);
int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep);

int obd_zombie_impexp_init(void);
void obd_zombie_impexp_stop(void);
void obd_zombie_impexp_cull(void);
void obd_zombie_barrier(void);
void obd_exports_barrier(struct obd_device *obd);
int kuc_len(int payload_len);
struct kuc_hdr * kuc_ptr(void *p);
int kuc_ispayload(void *p);
void *kuc_alloc(int payload_len, int transport, int type);
void kuc_free(void *p, int payload_len);

struct llog_handle;
struct llog_rec_hdr;
/* Callback signature used for llog record processing; it is the type of
 * config_llog_instance::cfg_callback below. */
typedef int (*llog_cb_t)(const struct lu_env *, struct llog_handle *,
			 struct llog_rec_hdr *, void *);
/* obd_config.c */
struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg,
				     const char *new_name);
int class_process_config(struct lustre_cfg *lcfg);
int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
			     struct lustre_cfg *lcfg, void *data);
int class_attach(struct lustre_cfg *lcfg);
int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg);
int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg);
int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg);
struct obd_device *class_incref(struct obd_device *obd,
				const char *scope, const void *source);
void class_decref(struct obd_device *obd,
		  const char *scope, const void *source);
void dump_exports(struct obd_device *obd, int locks);
int class_config_llog_handler(const struct lu_env *env,
			      struct llog_handle *handle,
			      struct llog_rec_hdr *rec, void *data);
int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg);
int class_add_uuid(const char *uuid, __u64 nid);

/*obdecho*/
#ifdef LPROCFS
extern void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars);
#else
/* Without LPROCFS there are no proc variables to publish: hand back an
 * all-zero lprocfs_static_vars. */
static inline void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars)
{
	memset(lvars, 0, sizeof(*lvars));
}
#endif

/* Bits stored in config_llog_instance::cfg_flags. */
#define CFG_F_START     0x01   /* Set when we start updating from a log */
#define CFG_F_MARKER    0x02   /* We are within a marker */
#define CFG_F_SKIP      0x04   /* We should ignore this cfg command */
#define CFG_F_COMPAT146 0x08   /* Allow old-style logs */
#define CFG_F_EXCLUDE   0x10   /* OST exclusion list */

/* Passed as data param to class_config_parse_llog */
struct config_llog_instance {
	char	       *cfg_obdname;
	void	       *cfg_instance;
	struct super_block *cfg_sb;
	struct obd_uuid     cfg_uuid;
	llog_cb_t	    cfg_callback;
	int		 cfg_last_idx; /* for partial llog processing */
	int		 cfg_flags;    /* CFG_F_* bits */
};
int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
			    char *name, struct config_llog_instance *cfg);
int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
			   char *name, struct config_llog_instance *cfg);

/* Kinds of configuration log; NOTE(review): presumably the values stored in
 * config_llog_data::cld_type -- confirm against the users. */
enum {
	CONFIG_T_CONFIG  = 0,
	CONFIG_T_SPTLRPC = 1,
	CONFIG_T_RECOVER = 2,
	CONFIG_T_MAX     = 3
};

/* list of active configuration logs */
struct config_llog_data {
	struct ldlm_res_id	  cld_resid;
	struct config_llog_instance cld_cfg;
	struct list_head		  cld_list_chain;
	atomic_t		cld_refcount;
	struct config_llog_data    *cld_sptlrpc;/* depended sptlrpc log */
	struct config_llog_data    *cld_recover;    /* imperative recover log */
	struct obd_export	  *cld_mgcexp;
	struct mutex		    cld_lock;
	int			 cld_type;
	unsigned int		cld_stopping:1, /* we were told to stop
						     * watching */
				    cld_lostlock:1; /* lock not requeued */
	char			cld_logname[0]; /* trailing variable-length
						 * log name */
};

+struct lustre_profile { + struct list_head lp_list; + char *lp_profile; + char *lp_dt; + char *lp_md; +}; + +struct lustre_profile *class_get_profile(const char * prof); +void class_del_profile(const char *prof); +void class_del_profiles(void); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS + +void __class_export_add_lock_ref(struct obd_export *, struct ldlm_lock *); +void __class_export_del_lock_ref(struct obd_export *, struct ldlm_lock *); +extern void (*class_export_dump_hook)(struct obd_export *); + +#else + +#define __class_export_add_lock_ref(exp, lock) do {} while(0) +#define __class_export_del_lock_ref(exp, lock) do {} while(0) + +#endif + +#define class_export_rpc_inc(exp) \ +({ \ + atomic_inc(&(exp)->exp_rpc_count); \ + CDEBUG(D_INFO, "RPC GETting export %p : new rpc_count %d\n", \ + (exp), atomic_read(&(exp)->exp_rpc_count)); \ +}) + +#define class_export_rpc_dec(exp) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_rpc_count); \ + atomic_dec(&(exp)->exp_rpc_count); \ + CDEBUG(D_INFO, "RPC PUTting export %p : new rpc_count %d\n", \ + (exp), atomic_read(&(exp)->exp_rpc_count)); \ +}) + +#define class_export_lock_get(exp, lock) \ +({ \ + atomic_inc(&(exp)->exp_locks_count); \ + __class_export_add_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock GETting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_get(exp); \ +}) + +#define class_export_lock_put(exp, lock) \ +({ \ + LASSERT_ATOMIC_POS(&exp->exp_locks_count); \ + atomic_dec(&(exp)->exp_locks_count); \ + __class_export_del_lock_ref(exp, lock); \ + CDEBUG(D_INFO, "lock PUTting export %p : new locks_count %d\n", \ + (exp), atomic_read(&(exp)->exp_locks_count)); \ + class_export_put(exp); \ +}) + +#define class_export_cb_get(exp) \ +({ \ + atomic_inc(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback GETting export %p : new cb_count %d\n",\ + (exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_get(exp); \ +}) + +#define class_export_cb_put(exp) \ +({ \ + 
LASSERT_ATOMIC_POS(&exp->exp_cb_count); \ + atomic_dec(&(exp)->exp_cb_count); \ + CDEBUG(D_INFO, "callback PUTting export %p : new cb_count %d\n",\ + (exp), atomic_read(&(exp)->exp_cb_count)); \ + class_export_put(exp); \ +}) + +/* genops.c */ +struct obd_export *class_export_get(struct obd_export *exp); +void class_export_put(struct obd_export *exp); +struct obd_export *class_new_export(struct obd_device *obddev, + struct obd_uuid *cluuid); +void class_unlink_export(struct obd_export *exp); + +struct obd_import *class_import_get(struct obd_import *); +void class_import_put(struct obd_import *); +struct obd_import *class_new_import(struct obd_device *obd); +void class_destroy_import(struct obd_import *exp); + +struct obd_type *class_search_type(const char *name); +struct obd_type *class_get_type(const char *name); +void class_put_type(struct obd_type *type); +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid); +int class_disconnect(struct obd_export *exp); +void class_fail_export(struct obd_export *exp); +int class_connected_export(struct obd_export *exp); +void class_disconnect_exports(struct obd_device *obddev); +int class_manual_cleanup(struct obd_device *obd); +void class_disconnect_stale_exports(struct obd_device *, + int (*test_export)(struct obd_export *)); +static inline enum obd_option exp_flags_from_obd(struct obd_device *obd) +{ + return ((obd->obd_fail ? OBD_OPT_FAILOVER : 0) | + (obd->obd_force ? OBD_OPT_FORCE : 0) | + (obd->obd_abort_recovery ? 
OBD_OPT_ABORT_RECOV : 0) | + 0); +} + + +void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid); +void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj); +void obdo_from_iattr(struct obdo *oa, struct iattr *attr, + unsigned int ia_valid); +void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid); +void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, obd_flag valid); +void obdo_from_md(struct obdo *oa, struct md_op_data *op_data, + unsigned int valid); + +void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo); +void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo); + +#define OBT(dev) (dev)->obd_type +#define OBP(dev, op) (dev)->obd_type->typ_dt_ops->o_ ## op +#define MDP(dev, op) (dev)->obd_type->typ_md_ops->m_ ## op +#define CTXTP(ctxt, op) (ctxt)->loc_logops->lop_##op + +/* Ensure obd_setup: used for cleanup which must be called + while obd is stopping */ +#define OBD_CHECK_DEV(obd) \ +do { \ + if (!(obd)) { \ + CERROR("NULL device\n"); \ + RETURN(-ENODEV); \ + } \ +} while (0) + +/* ensure obd_setup and !obd_stopping */ +#define OBD_CHECK_DEV_ACTIVE(obd) \ +do { \ + OBD_CHECK_DEV(obd); \ + if (!(obd)->obd_set_up || (obd)->obd_stopping) { \ + CERROR("Device %d not setup\n", \ + (obd)->obd_minor); \ + RETURN(-ENODEV); \ + } \ +} while (0) + + +#ifdef LPROCFS +#define OBD_COUNTER_OFFSET(op) \ + ((offsetof(struct obd_ops, o_ ## op) - \ + offsetof(struct obd_ops, o_iocontrol)) \ + / sizeof(((struct obd_ops *)(0))->o_iocontrol)) + +#define OBD_COUNTER_INCREMENT(obdx, op) \ + if ((obdx)->obd_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((obdx)->obd_cntr_base) + \ + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (obdx)->obd_stats->ls_num); \ + lprocfs_counter_incr((obdx)->obd_stats, coffset); \ + } + +#define EXP_COUNTER_INCREMENT(export, op) \ + if ((export)->exp_obd->obd_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((export)->exp_obd->obd_cntr_base) + \ + 
OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (export)->exp_obd->obd_stats->ls_num); \ + lprocfs_counter_incr((export)->exp_obd->obd_stats, coffset); \ + if ((export)->exp_nid_stats != NULL && \ + (export)->exp_nid_stats->nid_stats != NULL) \ + lprocfs_counter_incr( \ + (export)->exp_nid_stats->nid_stats, coffset);\ + } + +#define MD_COUNTER_OFFSET(op) \ + ((offsetof(struct md_ops, m_ ## op) - \ + offsetof(struct md_ops, m_getstatus)) \ + / sizeof(((struct md_ops *)(0))->m_getstatus)) + +#define MD_COUNTER_INCREMENT(obdx, op) \ + if ((obd)->md_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((obdx)->md_cntr_base) + \ + MD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (obdx)->md_stats->ls_num); \ + lprocfs_counter_incr((obdx)->md_stats, coffset); \ + } + +#define EXP_MD_COUNTER_INCREMENT(export, op) \ + if ((export)->exp_obd->obd_stats != NULL) { \ + unsigned int coffset; \ + coffset = (unsigned int)((export)->exp_obd->md_cntr_base) + \ + MD_COUNTER_OFFSET(op); \ + LASSERT(coffset < (export)->exp_obd->md_stats->ls_num); \ + lprocfs_counter_incr((export)->exp_obd->md_stats, coffset); \ + if ((export)->exp_md_stats != NULL) \ + lprocfs_counter_incr( \ + (export)->exp_md_stats, coffset); \ + } + +#else +#define OBD_COUNTER_OFFSET(op) +#define OBD_COUNTER_INCREMENT(obd, op) +#define EXP_COUNTER_INCREMENT(exp, op) +#define MD_COUNTER_INCREMENT(obd, op) +#define EXP_MD_COUNTER_INCREMENT(exp, op) +#endif + +static inline int lprocfs_nid_ldlm_stats_init(struct nid_stat* tmp) +{ + /* Always add in ldlm_stats */ + tmp->nid_ldlm_stats = lprocfs_alloc_stats(LDLM_LAST_OPC - LDLM_FIRST_OPC + ,LPROCFS_STATS_FLAG_NOPERCPU); + if (tmp->nid_ldlm_stats == NULL) + return -ENOMEM; + + lprocfs_init_ldlm_stats(tmp->nid_ldlm_stats); + + return lprocfs_register_stats(tmp->nid_proc, "ldlm_stats", + tmp->nid_ldlm_stats); +} + +#define OBD_CHECK_MD_OP(obd, op, err) \ +do { \ + if (!OBT(obd) || !MDP((obd), op)) { \ + if (err) \ + CERROR("md_" #op ": dev %s/%d no operation\n", 
\ + obd->obd_name, obd->obd_minor); \ + RETURN(err); \ + } \ +} while (0) + +#define EXP_CHECK_MD_OP(exp, op) \ +do { \ + if ((exp) == NULL) { \ + CERROR("obd_" #op ": NULL export\n"); \ + RETURN(-ENODEV); \ + } \ + if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ + CERROR("obd_" #op ": cleaned up obd\n"); \ + RETURN(-EOPNOTSUPP); \ + } \ + if (!OBT((exp)->exp_obd) || !MDP((exp)->exp_obd, op)) { \ + CERROR("obd_" #op ": dev %s/%d no operation\n", \ + (exp)->exp_obd->obd_name, \ + (exp)->exp_obd->obd_minor); \ + RETURN(-EOPNOTSUPP); \ + } \ +} while (0) + + +#define OBD_CHECK_DT_OP(obd, op, err) \ +do { \ + if (!OBT(obd) || !OBP((obd), op)) { \ + if (err) \ + CERROR("obd_" #op ": dev %d no operation\n", \ + obd->obd_minor); \ + RETURN(err); \ + } \ +} while (0) + +#define EXP_CHECK_DT_OP(exp, op) \ +do { \ + if ((exp) == NULL) { \ + CERROR("obd_" #op ": NULL export\n"); \ + RETURN(-ENODEV); \ + } \ + if ((exp)->exp_obd == NULL || !OBT((exp)->exp_obd)) { \ + CERROR("obd_" #op ": cleaned up obd\n"); \ + RETURN(-EOPNOTSUPP); \ + } \ + if (!OBT((exp)->exp_obd) || !OBP((exp)->exp_obd, op)) { \ + CERROR("obd_" #op ": dev %d no operation\n", \ + (exp)->exp_obd->obd_minor); \ + RETURN(-EOPNOTSUPP); \ + } \ +} while (0) + +#define CTXT_CHECK_OP(ctxt, op, err) \ +do { \ + if (!OBT(ctxt->loc_obd) || !CTXTP((ctxt), op)) { \ + if (err) \ + CERROR("lop_" #op ": dev %d no operation\n", \ + ctxt->loc_obd->obd_minor); \ + RETURN(err); \ + } \ +} while (0) + +static inline int class_devno_max(void) +{ + return MAX_OBD_DEVICES; +} + +static inline int obd_get_info(const struct lu_env *env, + struct obd_export *exp, __u32 keylen, + void *key, __u32 *vallen, void *val, + struct lov_stripe_md *lsm) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, get_info); + EXP_COUNTER_INCREMENT(exp, get_info); + + rc = OBP(exp->exp_obd, get_info)(env, exp, keylen, key, vallen, val, + lsm); + RETURN(rc); +} + +static inline int obd_set_info_async(const struct lu_env *env, + struct obd_export *exp, 
obd_count keylen, + void *key, obd_count vallen, void *val, + struct ptlrpc_request_set *set) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, set_info_async); + EXP_COUNTER_INCREMENT(exp, set_info_async); + + rc = OBP(exp->exp_obd, set_info_async)(env, exp, keylen, key, vallen, + val, set); + RETURN(rc); +} + +/* + * obd-lu integration. + * + * Functionality is being moved into new lu_device-based layering, but some + * pieces of configuration process are still based on obd devices. + * + * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully + * subsume ->o_setup() methods of obd devices they replace. The same for + * lu_device_operations::ldo_process_config() and ->o_process_config(). As a + * result, obd_setup() and obd_process_config() branch and call one XOR + * another. + * + * Yet neither lu_device_type_operations::ldto_device_fini() nor + * lu_device_type_operations::ldto_device_free() fully implement the + * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence, + * obd_precleanup() and obd_cleanup() call both lu_device and obd operations. 
+ */ + +#define DECLARE_LU_VARS(ldt, d) \ + struct lu_device_type *ldt; \ + struct lu_device *d + +static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + ENTRY; + + ldt = obd->obd_type->typ_lu; + if (ldt != NULL) { + struct lu_context session_ctx; + struct lu_env env; + lu_context_init(&session_ctx, LCT_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + env.le_ses = &session_ctx; + d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg); + lu_env_fini(&env); + if (!IS_ERR(d)) { + obd->obd_lu_dev = d; + d->ld_obd = obd; + rc = 0; + } else + rc = PTR_ERR(d); + } + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + + } else { + OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, setup); + rc = OBP(obd, setup)(obd, cfg); + } + RETURN(rc); +} + +static inline int obd_precleanup(struct obd_device *obd, + enum obd_cleanup_stage cleanup_stage) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + ENTRY; + + OBD_CHECK_DEV(obd); + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; + if (ldt != NULL && d != NULL) { + if (cleanup_stage == OBD_CLEANUP_EXPORTS) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + ldt->ldt_ops->ldto_device_fini(&env, d); + lu_env_fini(&env); + } + } + } + OBD_CHECK_DT_OP(obd, precleanup, 0); + OBD_COUNTER_INCREMENT(obd, precleanup); + + rc = OBP(obd, precleanup)(obd, cleanup_stage); + RETURN(rc); +} + +static inline int obd_cleanup(struct obd_device *obd) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + ENTRY; + + OBD_CHECK_DEV(obd); + + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + ldt->ldt_ops->ldto_device_free(&env, d); + lu_env_fini(&env); + obd->obd_lu_dev = NULL; + } + } + OBD_CHECK_DT_OP(obd, cleanup, 0); + 
OBD_COUNTER_INCREMENT(obd, cleanup); + + rc = OBP(obd, cleanup)(obd); + RETURN(rc); +} + +static inline void obd_cleanup_client_import(struct obd_device *obd) +{ + ENTRY; + + /* If we set up but never connected, the + client import will not have been cleaned. */ + down_write(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import) { + struct obd_import *imp; + imp = obd->u.cli.cl_import; + CDEBUG(D_CONFIG, "%s: client import never connected\n", + obd->obd_name); + ptlrpc_invalidate_import(imp); + if (imp->imp_rq_pool) { + ptlrpc_free_rq_pool(imp->imp_rq_pool); + imp->imp_rq_pool = NULL; + } + client_destroy_import(imp); + obd->u.cli.cl_import = NULL; + } + up_write(&obd->u.cli.cl_sem); + + EXIT; +} + +static inline int +obd_process_config(struct obd_device *obd, int datalen, void *data) +{ + int rc; + DECLARE_LU_VARS(ldt, d); + ENTRY; + + OBD_CHECK_DEV(obd); + + obd->obd_process_conf = 1; + ldt = obd->obd_type->typ_lu; + d = obd->obd_lu_dev; + if (ldt != NULL && d != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, ldt->ldt_ctx_tags); + if (rc == 0) { + rc = d->ld_ops->ldo_process_config(&env, d, data); + lu_env_fini(&env); + } + } else { + OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP); + rc = OBP(obd, process_config)(obd, datalen, data); + } + OBD_COUNTER_INCREMENT(obd, process_config); + obd->obd_process_conf = 0; + + RETURN(rc); +} + +/* Pack an in-memory MD struct for storage on disk. + * Returns +ve size of packed MD (0 for free), or -ve error. + * + * If @disk_tgt == NULL, MD size is returned (max size if @mem_src == NULL). + * If @*disk_tgt != NULL and @mem_src == NULL, @*disk_tgt will be freed. 
+ * If @*disk_tgt == NULL, it will be allocated + */ +static inline int obd_packmd(struct obd_export *exp, + struct lov_mds_md **disk_tgt, + struct lov_stripe_md *mem_src) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, packmd); + EXP_COUNTER_INCREMENT(exp, packmd); + + rc = OBP(exp->exp_obd, packmd)(exp, disk_tgt, mem_src); + RETURN(rc); +} + +static inline int obd_size_diskmd(struct obd_export *exp, + struct lov_stripe_md *mem_src) +{ + return obd_packmd(exp, NULL, mem_src); +} + +/* helper functions */ +static inline int obd_alloc_diskmd(struct obd_export *exp, + struct lov_mds_md **disk_tgt) +{ + LASSERT(disk_tgt); + LASSERT(*disk_tgt == NULL); + return obd_packmd(exp, disk_tgt, NULL); +} + +static inline int obd_free_diskmd(struct obd_export *exp, + struct lov_mds_md **disk_tgt) +{ + LASSERT(disk_tgt); + LASSERT(*disk_tgt); + /* + * LU-2590, for caller's convenience, *disk_tgt could be host + * endianness, it needs swab to LE if necessary, while just + * lov_mds_md header needs it for figuring out how much memory + * needs to be freed. + */ + if ((cpu_to_le32(LOV_MAGIC) != LOV_MAGIC) && + (((*disk_tgt)->lmm_magic == LOV_MAGIC_V1) || + ((*disk_tgt)->lmm_magic == LOV_MAGIC_V3))) + lustre_swab_lov_mds_md(*disk_tgt); + return obd_packmd(exp, disk_tgt, NULL); +} + +/* Unpack an MD struct from disk to in-memory format. + * Returns +ve size of unpacked MD (0 for free), or -ve error. + * + * If @mem_tgt == NULL, MD size is returned (max size if @disk_src == NULL). + * If @*mem_tgt != NULL and @disk_src == NULL, @*mem_tgt will be freed. 
+ * If @*mem_tgt == NULL, it will be allocated + */ +static inline int obd_unpackmd(struct obd_export *exp, + struct lov_stripe_md **mem_tgt, + struct lov_mds_md *disk_src, + int disk_len) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, unpackmd); + EXP_COUNTER_INCREMENT(exp, unpackmd); + + rc = OBP(exp->exp_obd, unpackmd)(exp, mem_tgt, disk_src, disk_len); + RETURN(rc); +} + +/* helper functions */ +static inline int obd_alloc_memmd(struct obd_export *exp, + struct lov_stripe_md **mem_tgt) +{ + LASSERT(mem_tgt); + LASSERT(*mem_tgt == NULL); + return obd_unpackmd(exp, mem_tgt, NULL, 0); +} + +static inline int obd_free_memmd(struct obd_export *exp, + struct lov_stripe_md **mem_tgt) +{ + int rc; + + LASSERT(mem_tgt); + LASSERT(*mem_tgt); + rc = obd_unpackmd(exp, mem_tgt, NULL, 0); + *mem_tgt = NULL; + return rc; +} + +static inline int obd_precreate(struct obd_export *exp) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, precreate); + OBD_COUNTER_INCREMENT(exp->exp_obd, precreate); + + rc = OBP(exp->exp_obd, precreate)(exp); + RETURN(rc); +} + +static inline int obd_create_async(struct obd_export *exp, + struct obd_info *oinfo, + struct lov_stripe_md **ea, + struct obd_trans_info *oti) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, create_async); + EXP_COUNTER_INCREMENT(exp, create_async); + + rc = OBP(exp->exp_obd, create_async)(exp, oinfo, ea, oti); + RETURN(rc); +} + +static inline int obd_create(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo, struct lov_stripe_md **ea, + struct obd_trans_info *oti) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, create); + EXP_COUNTER_INCREMENT(exp, create); + + rc = OBP(exp->exp_obd, create)(env, exp, obdo, ea, oti); + RETURN(rc); +} + +static inline int obd_destroy(const struct lu_env *env, struct obd_export *exp, + struct obdo *obdo, struct lov_stripe_md *ea, + struct obd_trans_info *oti, + struct obd_export *md_exp, void *capa) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, destroy); + 
EXP_COUNTER_INCREMENT(exp, destroy); + + rc = OBP(exp->exp_obd, destroy)(env, exp, obdo, ea, oti, md_exp, capa); + RETURN(rc); +} + +static inline int obd_getattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, getattr); + EXP_COUNTER_INCREMENT(exp, getattr); + + rc = OBP(exp->exp_obd, getattr)(env, exp, oinfo); + RETURN(rc); +} + +static inline int obd_getattr_async(struct obd_export *exp, + struct obd_info *oinfo, + struct ptlrpc_request_set *set) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, getattr_async); + EXP_COUNTER_INCREMENT(exp, getattr_async); + + rc = OBP(exp->exp_obd, getattr_async)(exp, oinfo, set); + RETURN(rc); +} + +static inline int obd_setattr(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, setattr); + EXP_COUNTER_INCREMENT(exp, setattr); + + rc = OBP(exp->exp_obd, setattr)(env, exp, oinfo, oti); + RETURN(rc); +} + +/* This performs all the requests set init/wait/destroy actions. */ +static inline int obd_setattr_rqset(struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti) +{ + struct ptlrpc_request_set *set = NULL; + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, setattr_async); + EXP_COUNTER_INCREMENT(exp, setattr_async); + + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + RETURN(rc); +} + +/* This adds all the requests into @set if @set != NULL, otherwise + all requests are sent asynchronously without waiting for response. 
*/ +static inline int obd_setattr_async(struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *set) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, setattr_async); + EXP_COUNTER_INCREMENT(exp, setattr_async); + + rc = OBP(exp->exp_obd, setattr_async)(exp, oinfo, oti, set); + RETURN(rc); +} + +static inline int obd_add_conn(struct obd_import *imp, struct obd_uuid *uuid, + int priority) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + OBD_CHECK_DT_OP(obd, add_conn, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, add_conn); + + rc = OBP(obd, add_conn)(imp, uuid, priority); + RETURN(rc); +} + +static inline int obd_del_conn(struct obd_import *imp, struct obd_uuid *uuid) +{ + struct obd_device *obd = imp->imp_obd; + int rc; + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + OBD_CHECK_DT_OP(obd, del_conn, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, del_conn); + + rc = OBP(obd, del_conn)(imp, uuid); + RETURN(rc); +} + +static inline struct obd_uuid *obd_get_uuid(struct obd_export *exp) +{ + struct obd_uuid *uuid; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, get_uuid, NULL); + EXP_COUNTER_INCREMENT(exp, get_uuid); + + uuid = OBP(exp->exp_obd, get_uuid)(exp); + RETURN(uuid); +} + +/** Create a new /a exp on device /a obd for the uuid /a cluuid + * @param exp New export handle + * @param d Connect data, supported flags are set, flags also understood + * by obd are returned. + */ +static inline int obd_connect(const struct lu_env *env, + struct obd_export **exp,struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *data, + void *localdata) +{ + int rc; + __u64 ocf = data ? 
data->ocd_connect_flags : 0; /* for post-condition + * check */ + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + OBD_CHECK_DT_OP(obd, connect, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, connect); + + rc = OBP(obd, connect)(env, exp, obd, cluuid, data, localdata); + /* check that only subset is granted */ + LASSERT(ergo(data != NULL, (data->ocd_connect_flags & ocf) == + data->ocd_connect_flags)); + RETURN(rc); +} + +static inline int obd_reconnect(const struct lu_env *env, + struct obd_export *exp, + struct obd_device *obd, + struct obd_uuid *cluuid, + struct obd_connect_data *d, + void *localdata) +{ + int rc; + __u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition + * check */ + + ENTRY; + + OBD_CHECK_DEV_ACTIVE(obd); + OBD_CHECK_DT_OP(obd, reconnect, 0); + OBD_COUNTER_INCREMENT(obd, reconnect); + + rc = OBP(obd, reconnect)(env, exp, obd, cluuid, d, localdata); + /* check that only subset is granted */ + LASSERT(ergo(d != NULL, + (d->ocd_connect_flags & ocf) == d->ocd_connect_flags)); + RETURN(rc); +} + +static inline int obd_disconnect(struct obd_export *exp) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, disconnect); + EXP_COUNTER_INCREMENT(exp, disconnect); + + rc = OBP(exp->exp_obd, disconnect)(exp); + RETURN(rc); +} + +static inline int obd_fid_init(struct obd_device *obd, struct obd_export *exp, + enum lu_cli_type type) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, fid_init, 0); + OBD_COUNTER_INCREMENT(obd, fid_init); + + rc = OBP(obd, fid_init)(obd, exp, type); + RETURN(rc); +} + +static inline int obd_fid_fini(struct obd_device *obd) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, fid_fini, 0); + OBD_COUNTER_INCREMENT(obd, fid_fini); + + rc = OBP(obd, fid_fini)(obd); + RETURN(rc); +} + +static inline int obd_fid_alloc(struct obd_export *exp, + struct lu_fid *fid, + struct md_op_data *op_data) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, fid_alloc); + EXP_COUNTER_INCREMENT(exp, fid_alloc); + + rc = OBP(exp->exp_obd, fid_alloc)(exp, fid, op_data); + 
RETURN(rc); +} + +static inline int obd_ping(const struct lu_env *env, struct obd_export *exp) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, ping, 0); + EXP_COUNTER_INCREMENT(exp, ping); + + rc = OBP(exp->exp_obd, ping)(env, exp); + RETURN(rc); +} + +static inline int obd_pool_new(struct obd_device *obd, char *poolname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_new, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_new); + + rc = OBP(obd, pool_new)(obd, poolname); + RETURN(rc); +} + +static inline int obd_pool_del(struct obd_device *obd, char *poolname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_del, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_del); + + rc = OBP(obd, pool_del)(obd, poolname); + RETURN(rc); +} + +static inline int obd_pool_add(struct obd_device *obd, char *poolname, char *ostname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_add, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_add); + + rc = OBP(obd, pool_add)(obd, poolname, ostname); + RETURN(rc); +} + +static inline int obd_pool_rem(struct obd_device *obd, char *poolname, char *ostname) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(obd, pool_rem, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, pool_rem); + + rc = OBP(obd, pool_rem)(obd, poolname, ostname); + RETURN(rc); +} + +static inline void obd_getref(struct obd_device *obd) +{ + ENTRY; + if (OBT(obd) && OBP(obd, getref)) { + OBD_COUNTER_INCREMENT(obd, getref); + OBP(obd, getref)(obd); + } + EXIT; +} + +static inline void obd_putref(struct obd_device *obd) +{ + ENTRY; + if (OBT(obd) && OBP(obd, putref)) { + OBD_COUNTER_INCREMENT(obd, putref); + OBP(obd, putref)(obd); + } + EXIT; +} + +static inline int obd_init_export(struct obd_export *exp) +{ + int rc = 0; + + ENTRY; + if ((exp)->exp_obd != NULL && OBT((exp)->exp_obd) && + OBP((exp)->exp_obd, init_export)) + rc = OBP(exp->exp_obd, init_export)(exp); + RETURN(rc); +} + +static inline int obd_destroy_export(struct obd_export *exp) +{ + ENTRY; + if ((exp)->exp_obd 
!= NULL && OBT((exp)->exp_obd) && + OBP((exp)->exp_obd, destroy_export)) + OBP(exp->exp_obd, destroy_export)(exp); + RETURN(0); +} + +/* Call the device's extent_calc method with @md, @cmd and @offset and return + * its result. */ +static inline int obd_extent_calc(struct obd_export *exp, + struct lov_stripe_md *md, + int cmd, obd_off *offset) +{ + int rc; + ENTRY; + EXP_CHECK_DT_OP(exp, extent_calc); + rc = OBP(exp->exp_obd, extent_calc)(exp, md, cmd, offset); + RETURN(rc); +} + +/* Look up a dentry through the l_fid2dentry callback of the device's lvfs run + * context, passing the id and sequence extracted from @oi plus @gen. */ +static inline struct dentry * +obd_lvfs_fid2dentry(struct obd_export *exp, struct ost_id *oi, __u32 gen) +{ + struct lvfs_run_ctxt *ctxt; + + /* assert on the device pointer before it is used to derive ctxt */ + LASSERT(exp->exp_obd); + ctxt = &exp->exp_obd->obd_lvfs_ctxt; + + return ctxt->cb_ops.l_fid2dentry(ostid_id(oi), gen, ostid_seq(oi), + exp->exp_obd); +} + +/* @max_age is the oldest time in jiffies for which we accept cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. + * + * Returns -EINVAL when @exp or its device is NULL and -EOPNOTSUPP when the + * device lacks the statfs_async method needed to refresh the data. When the + * cache is fresh enough it is copied into oinfo->oi_osfs under obd_osfs_lock, + * OBD_STATFS_FROM_CACHE is set and oinfo->oi_cb_up, if any, is called. */ +static inline int obd_statfs_async(struct obd_export *exp, + struct obd_info *oinfo, + __u64 max_age, + struct ptlrpc_request_set *rqset) +{ + int rc = 0; + struct obd_device *obd; + ENTRY; + + if (exp == NULL || exp->exp_obd == NULL) + RETURN(-EINVAL); + + obd = exp->exp_obd; + OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, statfs); + + CDEBUG(D_SUPER, "%s: osfs %p age "LPU64", max_age "LPU64"\n", + obd->obd_name, &obd->obd_osfs, obd->obd_osfs_age, max_age); + if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { + /* the check above only covers statfs; never call through a + * NULL statfs_async pointer */ + if (!OBP(obd, statfs_async)) + RETURN(-EOPNOTSUPP); + rc = OBP(obd, statfs_async)(exp, oinfo, max_age, rqset); + } else { + CDEBUG(D_SUPER,"%s: use %p cache blocks "LPU64"/"LPU64 + " objects "LPU64"/"LPU64"\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(oinfo->oi_osfs, &obd->obd_osfs, sizeof(*oinfo->oi_osfs)); + spin_unlock(&obd->obd_osfs_lock); + oinfo->oi_flags |= OBD_STATFS_FROM_CACHE; + if (oinfo->oi_cb_up) + oinfo->oi_cb_up(oinfo, 0); + } + RETURN(rc); 
+} + +/* Synchronous front end for obd_statfs_async(): allocates a private request + * set, issues the async statfs into @osfs, waits on the set if the call + * returned 0 and destroys the set. Returns -ENOMEM if the set cannot be + * allocated. */ +static inline int obd_statfs_rqset(struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, + __u32 flags) +{ + struct ptlrpc_request_set *set = NULL; + struct obd_info oinfo = { { { 0 } } }; + int rc = 0; + ENTRY; + + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + oinfo.oi_osfs = osfs; + oinfo.oi_flags = flags; + rc = obd_statfs_async(exp, &oinfo, max_age, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + RETURN(rc); +} + +/* @max_age is the oldest time in jiffies for which we accept cached data. + * If the cache is older than @max_age we will get a new value from the + * target. Use a value of "cfs_time_current() + HZ" to guarantee freshness. + * + * Returns -EINVAL when @exp or its device is NULL. A successful refresh is + * stored in obd->obd_osfs together with a new obd_osfs_age, under + * obd_osfs_lock; otherwise the cached copy is returned in @osfs. */ +static inline int obd_statfs(const struct lu_env *env, struct obd_export *exp, + struct obd_statfs *osfs, __u64 max_age, + __u32 flags) +{ + int rc = 0; + struct obd_device *obd; + ENTRY; + + /* same argument validation as obd_statfs_async() */ + if (exp == NULL || exp->exp_obd == NULL) + RETURN(-EINVAL); + + obd = exp->exp_obd; + OBD_CHECK_DT_OP(obd, statfs, -EOPNOTSUPP); + OBD_COUNTER_INCREMENT(obd, statfs); + + CDEBUG(D_SUPER, "osfs "LPU64", max_age "LPU64"\n", + obd->obd_osfs_age, max_age); + if (cfs_time_before_64(obd->obd_osfs_age, max_age)) { + rc = OBP(obd, statfs)(env, exp, osfs, max_age, flags); + if (rc == 0) { + spin_lock(&obd->obd_osfs_lock); + memcpy(&obd->obd_osfs, osfs, sizeof(obd->obd_osfs)); + obd->obd_osfs_age = cfs_time_current_64(); + spin_unlock(&obd->obd_osfs_lock); + } + } else { + CDEBUG(D_SUPER, "%s: use %p cache blocks "LPU64"/"LPU64 + " objects "LPU64"/"LPU64"\n", + obd->obd_name, &obd->obd_osfs, + obd->obd_osfs.os_bavail, obd->obd_osfs.os_blocks, + obd->obd_osfs.os_ffree, obd->obd_osfs.os_files); + spin_lock(&obd->obd_osfs_lock); + memcpy(osfs, &obd->obd_osfs, sizeof(*osfs)); + spin_unlock(&obd->obd_osfs_lock); + } + RETURN(rc); +} + +static inline int obd_sync_rqset(struct obd_export *exp, struct obd_info *oinfo, + obd_size start, obd_size end) +{ + struct ptlrpc_request_set *set = NULL; + int rc; + ENTRY; + + 
OBD_CHECK_DT_OP(exp->exp_obd, sync, -EOPNOTSUPP); + EXP_COUNTER_INCREMENT(exp, sync); + + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + /* this variant calls the sync method with a NULL env and a private set */ rc = OBP(exp->exp_obd, sync)(NULL, exp, oinfo, start, end, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + RETURN(rc); +} + +/* Call the device's sync method for [@start, @end] using the caller's @env and request @set; returns the method's result. Waiting on @set is left to the caller. */ static inline int obd_sync(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, obd_size start, obd_size end, + struct ptlrpc_request_set *set) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, sync, -EOPNOTSUPP); + EXP_COUNTER_INCREMENT(exp, sync); + + rc = OBP(exp->exp_obd, sync)(env, exp, oinfo, start, end, set); + RETURN(rc); +} + +/* Synchronous punch: allocates a private request set, calls the device's punch method with a NULL env, waits on the set if the call returned 0, then destroys the set. Returns -ENOMEM if the set cannot be allocated. */ static inline int obd_punch_rqset(struct obd_export *exp, + struct obd_info *oinfo, + struct obd_trans_info *oti) +{ + struct ptlrpc_request_set *set = NULL; + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, punch); + EXP_COUNTER_INCREMENT(exp, punch); + + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + rc = OBP(exp->exp_obd, punch)(NULL, exp, oinfo, oti, set); + if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + RETURN(rc); +} + +/* Call the device's punch method with the caller's @env and @rqset; returns the method's result. */ static inline int obd_punch(const struct lu_env *env, struct obd_export *exp, + struct obd_info *oinfo, struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, punch); + EXP_COUNTER_INCREMENT(exp, punch); + + rc = OBP(exp->exp_obd, punch)(env, exp, oinfo, oti, rqset); + RETURN(rc); +} + +/* Call the device's brw method. @cmd must have at least one bit of OBD_BRW_RWMASK or OBD_BRW_CHECK set; otherwise an error is logged and LBUG() is hit. */ static inline int obd_brw(int cmd, struct obd_export *exp, + struct obd_info *oinfo, obd_count oa_bufs, + struct brw_page *pg, struct obd_trans_info *oti) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, brw); + EXP_COUNTER_INCREMENT(exp, brw); + + if (!(cmd & (OBD_BRW_RWMASK | OBD_BRW_CHECK))) { + CERROR("obd_brw: cmd must be OBD_BRW_READ, OBD_BRW_WRITE, " + "or OBD_BRW_CHECK\n"); + LBUG(); + } + + rc = OBP(exp->exp_obd, brw)(cmd, exp, oinfo, oa_bufs, pg, oti); + RETURN(rc); +} + +static inline int 
obd_preprw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *remote, int *pages, + struct niobuf_local *local, + struct obd_trans_info *oti, + struct lustre_capa *capa) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, preprw); + EXP_COUNTER_INCREMENT(exp, preprw); + + /* all arguments are forwarded unchanged to the device's preprw method */ rc = OBP(exp->exp_obd, preprw)(env, cmd, exp, oa, objcount, obj, remote, + pages, local, oti, capa); + RETURN(rc); +} + +/* Call the device's commitrw method. The incoming @rc is passed through to the method as its last argument and is then overwritten with the method's result, which is returned. */ static inline int obd_commitrw(const struct lu_env *env, int cmd, + struct obd_export *exp, struct obdo *oa, + int objcount, struct obd_ioobj *obj, + struct niobuf_remote *rnb, int pages, + struct niobuf_local *local, + struct obd_trans_info *oti, int rc) +{ + ENTRY; + + EXP_CHECK_DT_OP(exp, commitrw); + EXP_COUNTER_INCREMENT(exp, commitrw); + + rc = OBP(exp->exp_obd, commitrw)(env, cmd, exp, oa, objcount, obj, + rnb, pages, local, oti, rc); + RETURN(rc); +} + +/* Call the device's merge_lvb method with @lsm, @lvb and @kms_only; returns the method's result. */ static inline int obd_merge_lvb(struct obd_export *exp, + struct lov_stripe_md *lsm, + struct ost_lvb *lvb, int kms_only) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, merge_lvb); + EXP_COUNTER_INCREMENT(exp, merge_lvb); + + rc = OBP(exp->exp_obd, merge_lvb)(exp, lsm, lvb, kms_only); + RETURN(rc); +} + +/* Call the device's adjust_kms method with @lsm, @size and @shrink; returns the method's result. */ static inline int obd_adjust_kms(struct obd_export *exp, + struct lov_stripe_md *lsm, obd_off size, + int shrink) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, adjust_kms); + EXP_COUNTER_INCREMENT(exp, adjust_kms); + + rc = OBP(exp->exp_obd, adjust_kms)(exp, lsm, size, shrink); + RETURN(rc); +} + +/* Call the device's iocontrol method with @cmd, @len, @karg and @uarg; returns the method's result. */ static inline int obd_iocontrol(unsigned int cmd, struct obd_export *exp, + int len, void *karg, void *uarg) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, iocontrol); + EXP_COUNTER_INCREMENT(exp, iocontrol); + + rc = OBP(exp->exp_obd, iocontrol)(cmd, exp, len, karg, uarg); + RETURN(rc); +} + +/* Synchronous enqueue: runs the device's enqueue method on a private request set and waits for it. */ static inline int obd_enqueue_rqset(struct obd_export *exp, + struct obd_info *oinfo, + struct ldlm_enqueue_info *einfo) +{ + struct ptlrpc_request_set 
*set = NULL; + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, enqueue); + EXP_COUNTER_INCREMENT(exp, enqueue); + + set = ptlrpc_prep_set(); + if (set == NULL) + RETURN(-ENOMEM); + + rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo, set); + /* wait only when the enqueue call itself succeeded; the set is destroyed either way */ if (rc == 0) + rc = ptlrpc_set_wait(set); + ptlrpc_set_destroy(set); + RETURN(rc); +} + +/* Call the device's enqueue method using the caller's request @set; returns the method's result without waiting on the set. */ static inline int obd_enqueue(struct obd_export *exp, + struct obd_info *oinfo, + struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *set) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, enqueue); + EXP_COUNTER_INCREMENT(exp, enqueue); + + rc = OBP(exp->exp_obd, enqueue)(exp, oinfo, einfo, set); + RETURN(rc); +} + +/* Call the device's change_cbdata method with @lsm, iterator @it and @data; returns the method's result. */ static inline int obd_change_cbdata(struct obd_export *exp, + struct lov_stripe_md *lsm, + ldlm_iterator_t it, void *data) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, change_cbdata); + EXP_COUNTER_INCREMENT(exp, change_cbdata); + + rc = OBP(exp->exp_obd, change_cbdata)(exp, lsm, it, data); + RETURN(rc); +} + +/* Call the device's find_cbdata method with @lsm, iterator @it and @data; returns the method's result. */ static inline int obd_find_cbdata(struct obd_export *exp, + struct lov_stripe_md *lsm, + ldlm_iterator_t it, void *data) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, find_cbdata); + EXP_COUNTER_INCREMENT(exp, find_cbdata); + + rc = OBP(exp->exp_obd, find_cbdata)(exp, lsm, it, data); + RETURN(rc); +} + +/* Call the device's cancel method for lock handle @lockh with @ea and @mode; returns the method's result. */ static inline int obd_cancel(struct obd_export *exp, + struct lov_stripe_md *ea, __u32 mode, + struct lustre_handle *lockh) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, cancel); + EXP_COUNTER_INCREMENT(exp, cancel); + + rc = OBP(exp->exp_obd, cancel)(exp, ea, mode, lockh); + RETURN(rc); +} + +/* Call the device's cancel_unused method with @ea, @flags and @opaque; returns the method's result. */ static inline int obd_cancel_unused(struct obd_export *exp, + struct lov_stripe_md *ea, + ldlm_cancel_flags_t flags, + void *opaque) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, cancel_unused); + EXP_COUNTER_INCREMENT(exp, cancel_unused); + + rc = OBP(exp->exp_obd, cancel_unused)(exp, ea, flags, opaque); + RETURN(rc); +} + +/* Call the device's pin method; returns the method's result. */ static inline int obd_pin(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, 
struct obd_client_handle *handle, + int flag) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, pin); + EXP_COUNTER_INCREMENT(exp, pin); + + rc = OBP(exp->exp_obd, pin)(exp, fid, oc, handle, flag); + RETURN(rc); +} + +/* Call the device's unpin method with @handle and @flag; returns the method's result. */ static inline int obd_unpin(struct obd_export *exp, + struct obd_client_handle *handle, int flag) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, unpin); + EXP_COUNTER_INCREMENT(exp, unpin); + + rc = OBP(exp->exp_obd, unpin)(exp, handle, flag); + RETURN(rc); +} + + +/* Deliver an import @event to @obd's import_event method. A NULL @obd is logged and ignored; the method is only called when the device is set up and provides one. */ static inline void obd_import_event(struct obd_device *obd, + struct obd_import *imp, + enum obd_import_event event) +{ + ENTRY; + if (!obd) { + CERROR("NULL device\n"); + EXIT; + return; + } + if (obd->obd_set_up && OBP(obd, import_event)) { + OBD_COUNTER_INCREMENT(obd, import_event); + OBP(obd, import_event)(obd, imp, event); + } + EXIT; +} + +/* Call the device's llog_connect method with @body; returns the method's result. The op check is given 0 as its error value (macro defined outside this view). */ static inline int obd_llog_connect(struct obd_export *exp, + struct llogd_conn_body *body) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, llog_connect, 0); + EXP_COUNTER_INCREMENT(exp, llog_connect); + + rc = OBP(exp->exp_obd, llog_connect)(exp, body); + RETURN(rc); +} + + +/* Send notification @ev about @watched to @obd's notify method. Returns -EINVAL when @obd is neither set up nor flagged obd_async_recov, and -ENOSYS when it has no notify method. */ static inline int obd_notify(struct obd_device *obd, + struct obd_device *watched, + enum obd_notify_event ev, + void *data) +{ + int rc; + ENTRY; + OBD_CHECK_DEV(obd); + + /* the check for async_recov is a complete hack - I'm hereby + overloading the meaning to also mean "this was called from + mds_postsetup". I know that my mds is able to handle notifies + by this point, and it needs to get them to execute mds_postrecov. 
*/ + if (!obd->obd_set_up && !obd->obd_async_recov) { + CDEBUG(D_HA, "obd %s not set up\n", obd->obd_name); + RETURN(-EINVAL); + } + + if (!OBP(obd, notify)) { + CDEBUG(D_HA, "obd %s has no notify handler\n", obd->obd_name); + RETURN(-ENOSYS); + } + + OBD_COUNTER_INCREMENT(obd, notify); + rc = OBP(obd, notify)(obd, watched, ev, data); + RETURN(rc); +} + +/* Notify both listeners of @observer: its obd_observer device (through obd_notify()) and its obd_upcall callback, each only if set. Both are always attempted; the obd_notify() result is returned when non-zero, otherwise the upcall result. */ static inline int obd_notify_observer(struct obd_device *observer, + struct obd_device *observed, + enum obd_notify_event ev, + void *data) +{ + int rc1; + int rc2; + + struct obd_notify_upcall *onu; + + if (observer->obd_observer) + rc1 = obd_notify(observer->obd_observer, observed, ev, data); + else + rc1 = 0; + /* + * Also, call non-obd listener, if any + */ + onu = &observer->obd_upcall; + if (onu->onu_upcall != NULL) + rc2 = onu->onu_upcall(observer, observed, ev, + onu->onu_owner, NULL); + else + rc2 = 0; + + return rc1 ? rc1 : rc2; +} + +/* Call the device's quotacheck method; the device itself is passed as the method's first argument. */ static inline int obd_quotacheck(struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, quotacheck); + EXP_COUNTER_INCREMENT(exp, quotacheck); + + rc = OBP(exp->exp_obd, quotacheck)(exp->exp_obd, exp, oqctl); + RETURN(rc); +} + +/* Call the device's quotactl method; the device itself is passed as the method's first argument. */ static inline int obd_quotactl(struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + int rc; + ENTRY; + + EXP_CHECK_DT_OP(exp, quotactl); + EXP_COUNTER_INCREMENT(exp, quotactl); + + rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl); + RETURN(rc); +} + +static inline int obd_health_check(const struct lu_env *env, + struct obd_device *obd) +{ + /* returns: 0 on healthy + * >0 on unhealthy + reason code/flag + * however the only supported reason == 1 right now + * We'll need to define some better reasons + * or flags in the future. 
+ * <0 on error + */ + int rc; + ENTRY; + + /* don't use EXP_CHECK_DT_OP, because NULL method is normal here */ + if (obd == NULL || !OBT(obd)) { + CERROR("cleaned up obd\n"); + RETURN(-EOPNOTSUPP); + } + /* a device that is not set up, is stopping, or has no health_check method is reported as healthy (0) */ if (!obd->obd_set_up || obd->obd_stopping) + RETURN(0); + if (!OBP(obd, health_check)) + RETURN(0); + + rc = OBP(obd, health_check)(env, obd); + RETURN(rc); +} + +/* Set obd->obd_observer to @observer under the write side of obd_observer_link_sem. Returns -EALREADY when an observer is already set and @observer is non-NULL; a NULL @observer always replaces (clears) the current one. */ static inline int obd_register_observer(struct obd_device *obd, + struct obd_device *observer) +{ + ENTRY; + OBD_CHECK_DEV(obd); + down_write(&obd->obd_observer_link_sem); + if (obd->obd_observer && observer) { + up_write(&obd->obd_observer_link_sem); + RETURN(-EALREADY); + } + obd->obd_observer = observer; + up_write(&obd->obd_observer_link_sem); + RETURN(0); +} + +/* Return the current observer in *@observer. On success (0) obd_observer_link_sem is left read-held and must be released with obd_unpin_observer(); on -ENOENT *@observer is NULL and the semaphore has already been released. */ static inline int obd_pin_observer(struct obd_device *obd, + struct obd_device **observer) +{ + ENTRY; + down_read(&obd->obd_observer_link_sem); + if (!obd->obd_observer) { + *observer = NULL; + up_read(&obd->obd_observer_link_sem); + RETURN(-ENOENT); + } + *observer = obd->obd_observer; + RETURN(0); +} + +/* Release the read hold on obd_observer_link_sem taken by a successful obd_pin_observer(); always returns 0. */ static inline int obd_unpin_observer(struct obd_device *obd) +{ + ENTRY; + up_read(&obd->obd_observer_link_sem); + RETURN(0); +} + +#if 0 /* the callback registration helpers below are compiled out */ +static inline int obd_register_page_removal_cb(struct obd_export *exp, + obd_page_removal_cb_t cb, + obd_pin_extent_cb pin_cb) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, register_page_removal_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, register_page_removal_cb); + + rc = OBP(exp->exp_obd, register_page_removal_cb)(exp, cb, pin_cb); + RETURN(rc); +} + +static inline int obd_unregister_page_removal_cb(struct obd_export *exp, + obd_page_removal_cb_t cb) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, unregister_page_removal_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_page_removal_cb); + + rc = OBP(exp->exp_obd, unregister_page_removal_cb)(exp, cb); + RETURN(rc); +} + +static inline int obd_register_lock_cancel_cb(struct obd_export *exp, + obd_lock_cancel_cb cb) +{ + int 
rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, register_lock_cancel_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, register_lock_cancel_cb); + + rc = OBP(exp->exp_obd, register_lock_cancel_cb)(exp, cb); + RETURN(rc); +} + +static inline int obd_unregister_lock_cancel_cb(struct obd_export *exp, + obd_lock_cancel_cb cb) +{ + int rc; + ENTRY; + + OBD_CHECK_DT_OP(exp->exp_obd, unregister_lock_cancel_cb, 0); + OBD_COUNTER_INCREMENT(exp->exp_obd, unregister_lock_cancel_cb); + + rc = OBP(exp->exp_obd, unregister_lock_cancel_cb)(exp, cb); + RETURN(rc); +} +#endif /* 0 */ + +/* metadata helpers: each md_*() wrapper below runs EXP_CHECK_MD_OP and EXP_MD_COUNTER_INCREMENT for its method, then calls that method through MDP() on the export's device and returns the result */ +static inline int md_getstatus(struct obd_export *exp, + struct lu_fid *fid, struct obd_capa **pc) +{ + int rc; + ENTRY; + + EXP_CHECK_MD_OP(exp, getstatus); + EXP_MD_COUNTER_INCREMENT(exp, getstatus); + rc = MDP(exp->exp_obd, getstatus)(exp, fid, pc); + RETURN(rc); +} + +/* Call the device's md getattr method with @op_data and @request. */ static inline int md_getattr(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, getattr); + EXP_MD_COUNTER_INCREMENT(exp, getattr); + rc = MDP(exp->exp_obd, getattr)(exp, op_data, request); + RETURN(rc); +} + +/* Call the device's md null_inode method for @fid. */ static inline int md_null_inode(struct obd_export *exp, + const struct lu_fid *fid) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, null_inode); + EXP_MD_COUNTER_INCREMENT(exp, null_inode); + rc = MDP(exp->exp_obd, null_inode)(exp, fid); + RETURN(rc); +} + +/* Call the device's md find_cbdata method with @fid, iterator @it and @data. */ static inline int md_find_cbdata(struct obd_export *exp, + const struct lu_fid *fid, + ldlm_iterator_t it, void *data) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, find_cbdata); + EXP_MD_COUNTER_INCREMENT(exp, find_cbdata); + rc = MDP(exp->exp_obd, find_cbdata)(exp, fid, it, data); + RETURN(rc); +} + +/* Call the device's md close method with @op_data, @mod and @request. */ static inline int md_close(struct obd_export *exp, struct md_op_data *op_data, + struct md_open_data *mod, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, close); + EXP_MD_COUNTER_INCREMENT(exp, close); + rc = MDP(exp->exp_obd, close)(exp, op_data, mod, 
request); + RETURN(rc); +} + +/* Call the device's md create method; every argument is forwarded unchanged and the method's result is returned. */ static inline int md_create(struct obd_export *exp, struct md_op_data *op_data, + const void *data, int datalen, int mode, __u32 uid, + __u32 gid, cfs_cap_t cap_effective, __u64 rdev, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, create); + EXP_MD_COUNTER_INCREMENT(exp, create); + rc = MDP(exp->exp_obd, create)(exp, op_data, data, datalen, mode, + uid, gid, cap_effective, rdev, request); + RETURN(rc); +} + +/* Call the device's md done_writing method with @op_data and @mod. */ static inline int md_done_writing(struct obd_export *exp, + struct md_op_data *op_data, + struct md_open_data *mod) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, done_writing); + EXP_MD_COUNTER_INCREMENT(exp, done_writing); + rc = MDP(exp->exp_obd, done_writing)(exp, op_data, mod); + RETURN(rc); +} + +/* Call the device's md enqueue method. NOTE(review): @extra_lock_flags is an int here but __u64 in md_intent_lock() - confirm which width the method expects. */ static inline int md_enqueue(struct obd_export *exp, + struct ldlm_enqueue_info *einfo, + struct lookup_intent *it, + struct md_op_data *op_data, + struct lustre_handle *lockh, + void *lmm, int lmmsize, + struct ptlrpc_request **req, + int extra_lock_flags) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, enqueue); + EXP_MD_COUNTER_INCREMENT(exp, enqueue); + rc = MDP(exp->exp_obd, enqueue)(exp, einfo, it, op_data, lockh, + lmm, lmmsize, req, extra_lock_flags); + RETURN(rc); +} + +/* Call the device's md getattr_name method with @op_data and @request. */ static inline int md_getattr_name(struct obd_export *exp, + struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, getattr_name); + EXP_MD_COUNTER_INCREMENT(exp, getattr_name); + rc = MDP(exp->exp_obd, getattr_name)(exp, op_data, request); + RETURN(rc); +} + +/* Call the device's md intent_lock method; every argument is forwarded unchanged and the method's result is returned. */ static inline int md_intent_lock(struct obd_export *exp, + struct md_op_data *op_data, void *lmm, + int lmmsize, struct lookup_intent *it, + int lookup_flags, struct ptlrpc_request **reqp, + ldlm_blocking_callback cb_blocking, + __u64 extra_lock_flags) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, intent_lock); + EXP_MD_COUNTER_INCREMENT(exp, intent_lock); + rc = MDP(exp->exp_obd, intent_lock)(exp, op_data, lmm, lmmsize, + 
it, lookup_flags, reqp, cb_blocking, + extra_lock_flags); + RETURN(rc); +} + +/* Call the device's md link method with @op_data and @request. */ static inline int md_link(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, link); + EXP_MD_COUNTER_INCREMENT(exp, link); + rc = MDP(exp->exp_obd, link)(exp, op_data, request); + RETURN(rc); +} + +/* Call the device's md rename method; the @old/@new names are passed together with their explicit lengths. */ static inline int md_rename(struct obd_export *exp, struct md_op_data *op_data, + const char *old, int oldlen, const char *new, + int newlen, struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, rename); + EXP_MD_COUNTER_INCREMENT(exp, rename); + rc = MDP(exp->exp_obd, rename)(exp, op_data, old, oldlen, new, + newlen, request); + RETURN(rc); +} + +/* Call the device's md is_subdir method with @pfid, @cfid and @request. */ static inline int md_is_subdir(struct obd_export *exp, + const struct lu_fid *pfid, + const struct lu_fid *cfid, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, is_subdir); + EXP_MD_COUNTER_INCREMENT(exp, is_subdir); + rc = MDP(exp->exp_obd, is_subdir)(exp, pfid, cfid, request); + RETURN(rc); +} + +/* Call the device's md setattr method; both (@ea, @ealen) and (@ea2, @ea2len) buffers are forwarded unchanged. */ static inline int md_setattr(struct obd_export *exp, struct md_op_data *op_data, + void *ea, int ealen, void *ea2, int ea2len, + struct ptlrpc_request **request, + struct md_open_data **mod) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, setattr); + EXP_MD_COUNTER_INCREMENT(exp, setattr); + rc = MDP(exp->exp_obd, setattr)(exp, op_data, ea, ealen, + ea2, ea2len, request, mod); + RETURN(rc); +} + +/* Call the device's md sync method with @fid, @oc and @request. */ static inline int md_sync(struct obd_export *exp, const struct lu_fid *fid, + struct obd_capa *oc, struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, sync); + EXP_MD_COUNTER_INCREMENT(exp, sync); + rc = MDP(exp->exp_obd, sync)(exp, fid, oc, request); + RETURN(rc); +} + +/* Call the device's md readpage method with @opdata, @pages and @request. */ static inline int md_readpage(struct obd_export *exp, struct md_op_data *opdata, + struct page **pages, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, readpage); + EXP_MD_COUNTER_INCREMENT(exp, readpage); + 
rc = MDP(exp->exp_obd, readpage)(exp, opdata, pages, request); + RETURN(rc); +} + +/* Call the device's md unlink method with @op_data and @request. */ static inline int md_unlink(struct obd_export *exp, struct md_op_data *op_data, + struct ptlrpc_request **request) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, unlink); + EXP_MD_COUNTER_INCREMENT(exp, unlink); + rc = MDP(exp->exp_obd, unlink)(exp, op_data, request); + RETURN(rc); +} + +/* Call the device's md get_lustre_md method; @exp selects the device while @dt_exp and @md_exp are only forwarded to the method. */ static inline int md_get_lustre_md(struct obd_export *exp, + struct ptlrpc_request *req, + struct obd_export *dt_exp, + struct obd_export *md_exp, + struct lustre_md *md) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, get_lustre_md); + EXP_MD_COUNTER_INCREMENT(exp, get_lustre_md); + RETURN(MDP(exp->exp_obd, get_lustre_md)(exp, req, dt_exp, md_exp, md)); +} + +/* Call the device's md free_lustre_md method for @md. */ static inline int md_free_lustre_md(struct obd_export *exp, + struct lustre_md *md) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, free_lustre_md); + EXP_MD_COUNTER_INCREMENT(exp, free_lustre_md); + RETURN(MDP(exp->exp_obd, free_lustre_md)(exp, md)); +} + +/* Call the device's md setxattr method; unlike md_getxattr() it also forwards @suppgid. */ static inline int md_setxattr(struct obd_export *exp, + const struct lu_fid *fid, struct obd_capa *oc, + obd_valid valid, const char *name, + const char *input, int input_size, + int output_size, int flags, __u32 suppgid, + struct ptlrpc_request **request) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, setxattr); + EXP_MD_COUNTER_INCREMENT(exp, setxattr); + RETURN(MDP(exp->exp_obd, setxattr)(exp, fid, oc, valid, name, input, + input_size, output_size, flags, + suppgid, request)); +} + +/* Call the device's md getxattr method; every argument is forwarded unchanged. */ static inline int md_getxattr(struct obd_export *exp, + const struct lu_fid *fid, struct obd_capa *oc, + obd_valid valid, const char *name, + const char *input, int input_size, + int output_size, int flags, + struct ptlrpc_request **request) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, getxattr); + EXP_MD_COUNTER_INCREMENT(exp, getxattr); + RETURN(MDP(exp->exp_obd, getxattr)(exp, fid, oc, valid, name, input, + input_size, output_size, flags, + request)); +} + +/* Call the device's md set_open_replay_data method with @och and @open_req. */ static inline int md_set_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och, + 
struct ptlrpc_request *open_req) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, set_open_replay_data); + EXP_MD_COUNTER_INCREMENT(exp, set_open_replay_data); + RETURN(MDP(exp->exp_obd, set_open_replay_data)(exp, och, open_req)); +} + +/* Call the device's md clear_open_replay_data method for @och. */ static inline int md_clear_open_replay_data(struct obd_export *exp, + struct obd_client_handle *och) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, clear_open_replay_data); + EXP_MD_COUNTER_INCREMENT(exp, clear_open_replay_data); + RETURN(MDP(exp->exp_obd, clear_open_replay_data)(exp, och)); +} + +/* Call the device's md set_lock_data method with @lockh, @data and @bits. */ static inline int md_set_lock_data(struct obd_export *exp, + __u64 *lockh, void *data, __u64 *bits) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, set_lock_data); + EXP_MD_COUNTER_INCREMENT(exp, set_lock_data); + RETURN(MDP(exp->exp_obd, set_lock_data)(exp, lockh, data, bits)); +} + +/* Call the device's md cancel_unused method with @fid, @policy, @mode, @flags and @opaque. */ static inline int md_cancel_unused(struct obd_export *exp, + const struct lu_fid *fid, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + ldlm_cancel_flags_t flags, + void *opaque) +{ + int rc; + ENTRY; + + EXP_CHECK_MD_OP(exp, cancel_unused); + EXP_MD_COUNTER_INCREMENT(exp, cancel_unused); + + rc = MDP(exp->exp_obd, cancel_unused)(exp, fid, policy, mode, + flags, opaque); + RETURN(rc); +} + +/* Call the device's md lock_match method and return its ldlm_mode_t result. NOTE(review): EXP_CHECK_MD_OP is defined outside this view; if it returns an errno on failure, that value would be returned as an ldlm_mode_t here - confirm. */ static inline ldlm_mode_t md_lock_match(struct obd_export *exp, __u64 flags, + const struct lu_fid *fid, + ldlm_type_t type, + ldlm_policy_data_t *policy, + ldlm_mode_t mode, + struct lustre_handle *lockh) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, lock_match); + EXP_MD_COUNTER_INCREMENT(exp, lock_match); + RETURN(MDP(exp->exp_obd, lock_match)(exp, flags, fid, type, + policy, mode, lockh)); +} + +/* Call the device's md init_ea_size method with @easize, @def_asize and @cookiesize. */ static inline int md_init_ea_size(struct obd_export *exp, int easize, + int def_asize, int cookiesize) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, init_ea_size); + EXP_MD_COUNTER_INCREMENT(exp, init_ea_size); + RETURN(MDP(exp->exp_obd, init_ea_size)(exp, easize, def_asize, + cookiesize)); +} + +/* Call the device's md get_remote_perm method. */ static inline int md_get_remote_perm(struct obd_export *exp, + const struct lu_fid *fid, + struct obd_capa *oc, __u32 suppgid, + struct 
ptlrpc_request **request) +{ + ENTRY; + EXP_CHECK_MD_OP(exp, get_remote_perm); + EXP_MD_COUNTER_INCREMENT(exp, get_remote_perm); + RETURN(MDP(exp->exp_obd, get_remote_perm)(exp, fid, oc, suppgid, + request)); +} + +/* Call the device's md renew_capa method with @ocapa and callback @cb. */ static inline int md_renew_capa(struct obd_export *exp, struct obd_capa *ocapa, + renew_capa_cb_t cb) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, renew_capa); + EXP_MD_COUNTER_INCREMENT(exp, renew_capa); + rc = MDP(exp->exp_obd, renew_capa)(exp, ocapa, cb); + RETURN(rc); +} + +/* Call the device's md unpack_capa method with @req, @field and @oc. */ static inline int md_unpack_capa(struct obd_export *exp, + struct ptlrpc_request *req, + const struct req_msg_field *field, + struct obd_capa **oc) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, unpack_capa); + EXP_MD_COUNTER_INCREMENT(exp, unpack_capa); + rc = MDP(exp->exp_obd, unpack_capa)(exp, req, field, oc); + RETURN(rc); +} + +/* Call the device's md intent_getattr_async method with @minfo and @einfo. */ static inline int md_intent_getattr_async(struct obd_export *exp, + struct md_enqueue_info *minfo, + struct ldlm_enqueue_info *einfo) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, intent_getattr_async); + EXP_MD_COUNTER_INCREMENT(exp, intent_getattr_async); + rc = MDP(exp->exp_obd, intent_getattr_async)(exp, minfo, einfo); + RETURN(rc); +} + +/* Call the device's md revalidate_lock method with @it, @fid and @bits. */ static inline int md_revalidate_lock(struct obd_export *exp, + struct lookup_intent *it, + struct lu_fid *fid, __u64 *bits) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, revalidate_lock); + EXP_MD_COUNTER_INCREMENT(exp, revalidate_lock); + rc = MDP(exp->exp_obd, revalidate_lock)(exp, it, fid, bits); + RETURN(rc); +} + + +/* OBD Metadata Support */ + +extern int obd_init_caches(void); +extern void obd_cleanup_caches(void); + +/* support routines: OBDO_ALLOC()/OBDO_FREE() allocate and free a struct obdo from the obdo_cachep slab cache; allocation passes __GFP_IO as the gfp mask */ +extern struct kmem_cache *obdo_cachep; + +#define OBDO_ALLOC(ptr) \ +do { \ + OBD_SLAB_ALLOC_PTR_GFP((ptr), obdo_cachep, __GFP_IO); \ +} while(0) + +#define OBDO_FREE(ptr) \ +do { \ + OBD_SLAB_FREE_PTR((ptr), obdo_cachep); \ +} while(0) + + +static inline void obdo2fid(struct obdo *oa, struct lu_fid *fid) +{ + /* empty stub: nothing is copied from @oa to @fid */ +} + +static inline void fid2obdo(struct 
lu_fid *fid, struct obdo *oa) +{ + /* empty stub: nothing is copied from @fid to @oa */ +} + +/* callback type stored in lwp_register_item::lri_cb_func; takes one opaque pointer and returns an int */ typedef int (*register_lwp_cb)(void *data); + +/* list node pairing an export pointer slot with a callback, its data and a name of at most MTI_NAME_MAXLEN bytes (presumably lri_cb_data is what lri_cb_func is called with - TODO confirm against the users) */ struct lwp_register_item { + struct obd_export **lri_exp; + register_lwp_cb lri_cb_func; + void *lri_cb_data; + struct list_head lri_list; + char lri_name[MTI_NAME_MAXLEN]; +}; + +/* I'm as embarrassed about this as you are. + * + * <shaver> // XXX do not look into _superhack with remaining eye + * <shaver> // XXX if this were any uglier, I'd get my own show on MTV */ +extern int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); + +/* obd_mount.c */ + +/* sysctl.c */ +extern void obd_sysctl_init (void); +extern void obd_sysctl_clean (void); + +/* uuid.c: class_uuid_t is a raw 16-byte UUID */ +typedef __u8 class_uuid_t[16]; +void class_uuid_unparse(class_uuid_t in, struct obd_uuid *out); + +/* lustre_peer.c */ +int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index); +int class_add_uuid(const char *uuid, __u64 nid); +int class_del_uuid (const char *uuid); +int class_check_uuid(struct obd_uuid *uuid, __u64 nid); +void class_init_uuidlist(void); +void class_exit_uuidlist(void); + +/* mea.c */ +int mea_name2idx(struct lmv_stripe_md *mea, const char *name, int namelen); +int raw_name2idx(int hashtype, int count, const char *name, int namelen); + +/* prng.c: fills uuid_out with sizeof(class_uuid_t) random bytes */ +#define ll_generate_random_uuid(uuid_out) cfs_get_random_bytes(uuid_out, sizeof(class_uuid_t)) + +#endif /* __LINUX_OBD_CLASS_H */ diff --git a/drivers/staging/lustre/lustre/include/obd_lov.h b/drivers/staging/lustre/lustre/include/obd_lov.h new file mode 100644 index 000000000000..d82f3341d0a8 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/obd_lov.h @@ -0,0 +1,126 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _OBD_LOV_H__ +#define _OBD_LOV_H__ + +#define LOV_DEFAULT_STRIPE_SIZE (1 << LNET_MTU_BITS) + +/* Size in bytes of a struct lov_stripe_md followed by @stripes pointers to + * struct lov_oinfo. */ +static inline int lov_stripe_md_size(__u16 stripes) +{ + return sizeof(struct lov_stripe_md) + stripes*sizeof(struct lov_oinfo*); +} + +/* Size in bytes of the lov_mds_md header selected by @lmm_magic (v3 for + * LOV_MAGIC_V3, v1 for anything else) followed by @stripes entries of + * struct lov_ost_data_v1. */ +static inline __u32 lov_mds_md_size(__u16 stripes, __u32 lmm_magic) +{ + if (lmm_magic == LOV_MAGIC_V3) + return sizeof(struct lov_mds_md_v3) + + stripes * sizeof(struct lov_ost_data_v1); + else + return sizeof(struct lov_mds_md_v1) + + stripes * sizeof(struct lov_ost_data_v1); +} + +/* Per-magic layout sizes: header size and per-stripe entry size. */ +struct lov_version_size { + __u32 lvs_magic; + size_t lvs_lmm_size; + size_t lvs_lod_size; +}; + +/* Number of whole stripe entries that fit in an EA of @ea_size bytes for the + * layout identified by @lmm_magic. Returns 0 when @ea_size is negative, when + * it does not exceed the header size, or when @lmm_magic is not a known + * magic. */ +static inline __u32 lov_mds_md_stripecnt(int ea_size, __u32 lmm_magic) +{ + static const struct lov_version_size lmm_ver_size[] = { + { .lvs_magic = LOV_MAGIC_V3, + .lvs_lmm_size = sizeof(struct lov_mds_md_v3), + .lvs_lod_size = sizeof(struct lov_ost_data_v1) }, + { .lvs_magic = LOV_MAGIC_V1, + .lvs_lmm_size = sizeof(struct lov_mds_md_v1), + .lvs_lod_size = sizeof(struct lov_ost_data_v1)} }; + int i; + + /* a negative size would be converted to a huge size_t by the + * comparison and subtraction below and yield a bogus stripe count */ + if (ea_size < 0) + return 0; + + for (i = 0; i < ARRAY_SIZE(lmm_ver_size); i++) { + if (lmm_magic == lmm_ver_size[i].lvs_magic) { + if ((size_t)ea_size <= lmm_ver_size[i].lvs_lmm_size) + return 0; + return ((size_t)ea_size - lmm_ver_size[i].lvs_lmm_size) / + lmm_ver_size[i].lvs_lod_size; + } + } + + /* Invalid LOV magic, so no stripes could fit */ + return 0; +} + +/* lov_do_div64(a, b) returns a % b, and a = a / b. + * The 32-bit code is LOV-specific due to knowing about stripe limits in + * order to reduce the divisor to a 32-bit number. If the divisor is + * already a 32-bit value the compiler handles this directly. 
*/ +#if BITS_PER_LONG > 32 +# define lov_do_div64(n,base) ({ \ + uint64_t __base = (base); \ + uint64_t __rem; \ + __rem = ((uint64_t)(n)) % __base; \ + (n) = ((uint64_t)(n)) / __base; \ + __rem; \ + }) +#else /* BITS_PER_LONG <= 32 */ +# define lov_do_div64(n,base) ({ \ + uint64_t __rem; \ + if ((sizeof(base) > 4) && (((base) & 0xffffffff00000000ULL) != 0)) { \ + /* divisor needs more than 32 bits: it is asserted to be a multiple of LOV_MIN_STRIPE_SIZE, then both operands are shifted right by LOV_MIN_STRIPE_BITS before do_div() and the remainder is rebuilt from the shifted remainder plus the low bits of n */ int __remainder; \ + LASSERTF(!((base) & (LOV_MIN_STRIPE_SIZE - 1)), "64 bit lov " \ + "division %llu / %llu\n", (n), (uint64_t)(base)); \ + __remainder = (n) & (LOV_MIN_STRIPE_SIZE - 1); \ + (n) >>= LOV_MIN_STRIPE_BITS; \ + __rem = do_div(n, (base) >> LOV_MIN_STRIPE_BITS); \ + __rem <<= LOV_MIN_STRIPE_BITS; \ + __rem += __remainder; \ + } else { \ + __rem = do_div(n, base); \ + } \ + __rem; \ + }) +#endif /* BITS_PER_LONG > 32 */ + +#define IOC_LOV_TYPE 'g' +#define IOC_LOV_MIN_NR 50 +#define IOC_LOV_SET_OSC_ACTIVE _IOWR('g', 50, long) /* same type character and number as IOC_LOV_TYPE and IOC_LOV_MIN_NR, spelled as literals */ +#define IOC_LOV_MAX_NR 50 + +#define QOS_DEFAULT_THRESHOLD 10 /* MB */ +#define QOS_DEFAULT_MAXAGE 5 /* Seconds */ + +#endif /* _OBD_LOV_H__ */ +diff --git a/drivers/staging/lustre/lustre/include/obd_ost.h b/drivers/staging/lustre/lustre/include/obd_ost.h new file mode 100644 index 000000000000..af89843c312b --- /dev/null +++ b/drivers/staging/lustre/lustre/include/obd_ost.h @@ -0,0 +1,96 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/include/obd_ost.h + * + * Data structures for object storage targets and client: OST & OSC's + * + * See also lustre_idl.h for wire formats of requests. + */ + +#ifndef _LUSTRE_OST_H +#define _LUSTRE_OST_H + +#include <obd_class.h> + +/* argument block for brw requests (presumably carried by the async request and read back in its completion handler - TODO confirm against osc code) */ struct osc_brw_async_args { + struct obdo *aa_oa; + int aa_requested_nob; + int aa_nio_count; + obd_count aa_page_count; + int aa_resends; + struct brw_page **aa_ppga; + struct client_obd *aa_cli; + struct list_head aa_oaps; + struct list_head aa_exts; + struct obd_capa *aa_ocapa; + struct cl_req *aa_clerq; +}; + +#define osc_grant_args osc_brw_async_args /* alias: the grant args use the same struct as the brw args */ +struct osc_async_args { + struct obd_info *aa_oi; +}; + +/* obdo plus an upcall and the opaque cookie kept alongside it */ struct osc_setattr_args { + struct obdo *sa_oa; + obd_enqueue_update_f sa_upcall; + void *sa_cookie; +}; + +/* obd_info plus an upcall and the opaque cookie kept alongside it */ struct osc_fsync_args { + struct obd_info *fa_oi; + obd_enqueue_update_f fa_upcall; + void *fa_cookie; +}; + +struct osc_enqueue_args { + struct obd_export *oa_exp; + __u64 *oa_flags; + obd_enqueue_update_f oa_upcall; + void *oa_cookie; + struct ost_lvb *oa_lvb; + struct lustre_handle *oa_lockh; + struct ldlm_enqueue_info *oa_ei; + unsigned int oa_agl:1; /* single-bit flag */ +}; + +#if 0 /* declaration compiled out */ +int osc_extent_blocking_cb(struct ldlm_lock *lock, + struct ldlm_lock_desc *new, void *data, + int flag); +#endif /* 0 */ + +#endif /* _LUSTRE_OST_H */ +diff --git 
a/drivers/staging/lustre/lustre/include/obd_support.h b/drivers/staging/lustre/lustre/include/obd_support.h new file mode 100644 index 000000000000..b5d40afc3599 --- /dev/null +++ b/drivers/staging/lustre/lustre/include/obd_support.h @@ -0,0 +1,851 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#ifndef _OBD_SUPPORT +#define _OBD_SUPPORT + +#include <linux/libcfs/libcfs.h> +#include <lvfs.h> +#include <lprocfs_status.h> + +#include <linux/obd_support.h> + +/* global variables */ +extern struct lprocfs_stats *obd_memory; +enum { + OBD_MEMORY_STAT = 0, + OBD_MEMORY_PAGES_STAT = 1, + OBD_STATS_NUM, +}; + +extern unsigned int obd_debug_peer_on_timeout; +extern unsigned int obd_dump_on_timeout; +extern unsigned int obd_dump_on_eviction; +/* obd_timeout should only be used for recovery, not for + networking / disk / timings affected by load (use Adaptive Timeouts) */ +extern unsigned int obd_timeout; /* seconds */ +extern unsigned int ldlm_timeout; /* seconds */ +extern unsigned int obd_timeout_set; +extern unsigned int ldlm_timeout_set; +extern unsigned int at_min; +extern unsigned int at_max; +extern unsigned int at_history; +extern int at_early_margin; +extern int at_extra; +extern unsigned int obd_sync_filter; +extern unsigned int obd_max_dirty_pages; +extern atomic_t obd_dirty_pages; +extern atomic_t obd_dirty_transit_pages; +extern unsigned int obd_alloc_fail_rate; +extern char obd_jobid_var[]; + +/* lvfs.c */ +int obd_alloc_fail(const void *ptr, const char *name, const char *type, + size_t size, const char *file, int line); + +/* Some hash init argument constants */ +#define HASH_POOLS_BKT_BITS 3 +#define HASH_POOLS_CUR_BITS 3 +#define HASH_POOLS_MAX_BITS 7 +#define HASH_UUID_BKT_BITS 5 +#define HASH_UUID_CUR_BITS 7 +#define HASH_UUID_MAX_BITS 12 +#define HASH_NID_BKT_BITS 5 +#define HASH_NID_CUR_BITS 7 +#define HASH_NID_MAX_BITS 12 +#define HASH_NID_STATS_BKT_BITS 5 +#define HASH_NID_STATS_CUR_BITS 7 +#define HASH_NID_STATS_MAX_BITS 12 +#define HASH_LQE_BKT_BITS 5 +#define HASH_LQE_CUR_BITS 7 +#define HASH_LQE_MAX_BITS 12 +#define HASH_CONN_BKT_BITS 5 +#define HASH_CONN_CUR_BITS 5 +#define HASH_CONN_MAX_BITS 15 +#define HASH_EXP_LOCK_BKT_BITS 5 +#define HASH_EXP_LOCK_CUR_BITS 7 +#define HASH_EXP_LOCK_MAX_BITS 16 +#define HASH_CL_ENV_BKT_BITS 5 
+#define HASH_CL_ENV_BITS 10 +#define HASH_JOB_STATS_BKT_BITS 5 +#define HASH_JOB_STATS_CUR_BITS 7 +#define HASH_JOB_STATS_MAX_BITS 12 + +/* Timeout definitions */ +#define OBD_TIMEOUT_DEFAULT 100 +#define LDLM_TIMEOUT_DEFAULT 20 +#define MDS_LDLM_TIMEOUT_DEFAULT 6 +/* Time to wait for all clients to reconnect during recovery (hard limit) */ +#define OBD_RECOVERY_TIME_HARD (obd_timeout * 9) +/* Time to wait for all clients to reconnect during recovery (soft limit) */ +/* Should be very conservative; must catch the first reconnect after reboot */ +#define OBD_RECOVERY_TIME_SOFT (obd_timeout * 3) +/* Change recovery-small 26b time if you change this */ +#define PING_INTERVAL max(obd_timeout / 4, 1U) +/* a bit more than maximal journal commit time in seconds */ +#define PING_INTERVAL_SHORT min(PING_INTERVAL, 7U) +/* Client may skip 1 ping; we must wait at least 2.5. But for multiple + * failover targets the client only pings one server at a time, and pings + * can be lost on a loaded network. Since eviction has serious consequences, + * and there's no urgent need to evict a client just because it's idle, we + * should be very conservative here. */ +#define PING_EVICT_TIMEOUT (PING_INTERVAL * 6) +#define DISK_TIMEOUT 50 /* Beyond this we warn about disk speed */ +#define CONNECTION_SWITCH_MIN 5U /* Connection switching rate limiter */ + /* Max connect interval for nonresponsive servers; ~50s to avoid building up + connect requests in the LND queues, but within obd_timeout so we don't + miss the recovery window */ +#define CONNECTION_SWITCH_MAX min(50U, max(CONNECTION_SWITCH_MIN,obd_timeout)) +#define CONNECTION_SWITCH_INC 5 /* Connection timeout backoff */ +/* In general this should be low to have quick detection of a system + running on a backup server. (If it's too low, import_select_connection + will increase the timeout anyhow.) 
*/ +#define INITIAL_CONNECT_TIMEOUT max(CONNECTION_SWITCH_MIN,obd_timeout/20) +/* The max delay between connects is SWITCH_MAX + SWITCH_INC + INITIAL */ +#define RECONNECT_DELAY_MAX (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + \ + INITIAL_CONNECT_TIMEOUT) +/* The min time a target should wait for clients to reconnect in recovery */ +#define OBD_RECOVERY_TIME_MIN (2*RECONNECT_DELAY_MAX) +#define OBD_IR_FACTOR_MIN 1 +#define OBD_IR_FACTOR_MAX 10 +#define OBD_IR_FACTOR_DEFAULT (OBD_IR_FACTOR_MAX/2) +/* default timeout for the MGS to become IR_FULL */ +#define OBD_IR_MGS_TIMEOUT (4*obd_timeout) +#define LONG_UNLINK 300 /* Unlink should happen before now */ + +/** + * Time interval of shrink, if the client is "idle" more than this interval, + * then the ll_grant thread will return the requested grant space to filter + */ +#define GRANT_SHRINK_INTERVAL 1200/*20 minutes*/ + +#define OBD_FAIL_MDS 0x100 +#define OBD_FAIL_MDS_HANDLE_UNPACK 0x101 +#define OBD_FAIL_MDS_GETATTR_NET 0x102 +#define OBD_FAIL_MDS_GETATTR_PACK 0x103 +#define OBD_FAIL_MDS_READPAGE_NET 0x104 +#define OBD_FAIL_MDS_READPAGE_PACK 0x105 +#define OBD_FAIL_MDS_SENDPAGE 0x106 +#define OBD_FAIL_MDS_REINT_NET 0x107 +#define OBD_FAIL_MDS_REINT_UNPACK 0x108 +#define OBD_FAIL_MDS_REINT_SETATTR 0x109 +#define OBD_FAIL_MDS_REINT_SETATTR_WRITE 0x10a +#define OBD_FAIL_MDS_REINT_CREATE 0x10b +#define OBD_FAIL_MDS_REINT_CREATE_WRITE 0x10c +#define OBD_FAIL_MDS_REINT_UNLINK 0x10d +#define OBD_FAIL_MDS_REINT_UNLINK_WRITE 0x10e +#define OBD_FAIL_MDS_REINT_LINK 0x10f +#define OBD_FAIL_MDS_REINT_LINK_WRITE 0x110 +#define OBD_FAIL_MDS_REINT_RENAME 0x111 +#define OBD_FAIL_MDS_REINT_RENAME_WRITE 0x112 +#define OBD_FAIL_MDS_OPEN_NET 0x113 +#define OBD_FAIL_MDS_OPEN_PACK 0x114 +#define OBD_FAIL_MDS_CLOSE_NET 0x115 +#define OBD_FAIL_MDS_CLOSE_PACK 0x116 +#define OBD_FAIL_MDS_CONNECT_NET 0x117 +#define OBD_FAIL_MDS_CONNECT_PACK 0x118 +#define OBD_FAIL_MDS_REINT_NET_REP 0x119 +#define OBD_FAIL_MDS_DISCONNECT_NET 0x11a +#define 
OBD_FAIL_MDS_GETSTATUS_NET 0x11b +#define OBD_FAIL_MDS_GETSTATUS_PACK 0x11c +#define OBD_FAIL_MDS_STATFS_PACK 0x11d +#define OBD_FAIL_MDS_STATFS_NET 0x11e +#define OBD_FAIL_MDS_GETATTR_NAME_NET 0x11f +#define OBD_FAIL_MDS_PIN_NET 0x120 +#define OBD_FAIL_MDS_UNPIN_NET 0x121 +#define OBD_FAIL_MDS_ALL_REPLY_NET 0x122 +#define OBD_FAIL_MDS_ALL_REQUEST_NET 0x123 +#define OBD_FAIL_MDS_SYNC_NET 0x124 +#define OBD_FAIL_MDS_SYNC_PACK 0x125 +#define OBD_FAIL_MDS_DONE_WRITING_NET 0x126 +#define OBD_FAIL_MDS_DONE_WRITING_PACK 0x127 +#define OBD_FAIL_MDS_ALLOC_OBDO 0x128 +#define OBD_FAIL_MDS_PAUSE_OPEN 0x129 +#define OBD_FAIL_MDS_STATFS_LCW_SLEEP 0x12a +#define OBD_FAIL_MDS_OPEN_CREATE 0x12b +#define OBD_FAIL_MDS_OST_SETATTR 0x12c +#define OBD_FAIL_MDS_QUOTACHECK_NET 0x12d +#define OBD_FAIL_MDS_QUOTACTL_NET 0x12e +#define OBD_FAIL_MDS_CLIENT_ADD 0x12f +#define OBD_FAIL_MDS_GETXATTR_NET 0x130 +#define OBD_FAIL_MDS_GETXATTR_PACK 0x131 +#define OBD_FAIL_MDS_SETXATTR_NET 0x132 +#define OBD_FAIL_MDS_SETXATTR 0x133 +#define OBD_FAIL_MDS_SETXATTR_WRITE 0x134 +#define OBD_FAIL_MDS_FS_SETUP 0x135 +#define OBD_FAIL_MDS_RESEND 0x136 +#define OBD_FAIL_MDS_LLOG_CREATE_FAILED 0x137 +#define OBD_FAIL_MDS_LOV_SYNC_RACE 0x138 +#define OBD_FAIL_MDS_OSC_PRECREATE 0x139 +#define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x13a +#define OBD_FAIL_MDS_CLOSE_NET_REP 0x13b +#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ 0x13c +#define OBD_FAIL_MDS_DROP_QUOTA_REQ 0x13d +#define OBD_FAIL_MDS_REMOVE_COMMON_EA 0x13e +#define OBD_FAIL_MDS_ALLOW_COMMON_EA_SETTING 0x13f +#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD 0x140 +#define OBD_FAIL_MDS_LOV_PREP_CREATE 0x141 +#define OBD_FAIL_MDS_REINT_DELAY 0x142 +#define OBD_FAIL_MDS_READLINK_EPROTO 0x143 +#define OBD_FAIL_MDS_OPEN_WAIT_CREATE 0x144 +#define OBD_FAIL_MDS_PDO_LOCK 0x145 +#define OBD_FAIL_MDS_PDO_LOCK2 0x146 +#define OBD_FAIL_MDS_OSC_CREATE_FAIL 0x147 +#define OBD_FAIL_MDS_NEGATIVE_POSITIVE 0x148 +#define OBD_FAIL_MDS_HSM_STATE_GET_NET 0x149 +#define OBD_FAIL_MDS_HSM_STATE_SET_NET 
0x14a +#define OBD_FAIL_MDS_HSM_PROGRESS_NET 0x14b +#define OBD_FAIL_MDS_HSM_REQUEST_NET 0x14c +#define OBD_FAIL_MDS_HSM_CT_REGISTER_NET 0x14d +#define OBD_FAIL_MDS_HSM_CT_UNREGISTER_NET 0x14e +#define OBD_FAIL_MDS_SWAP_LAYOUTS_NET 0x14f +#define OBD_FAIL_MDS_HSM_ACTION_NET 0x150 +#define OBD_FAIL_MDS_CHANGELOG_INIT 0x151 + +/* layout lock */ +#define OBD_FAIL_MDS_NO_LL_GETATTR 0x170 +#define OBD_FAIL_MDS_NO_LL_OPEN 0x171 +#define OBD_FAIL_MDS_LL_BLOCK 0x172 + +/* CMD */ +#define OBD_FAIL_MDS_IS_SUBDIR_NET 0x180 +#define OBD_FAIL_MDS_IS_SUBDIR_PACK 0x181 +#define OBD_FAIL_MDS_SET_INFO_NET 0x182 +#define OBD_FAIL_MDS_WRITEPAGE_NET 0x183 +#define OBD_FAIL_MDS_WRITEPAGE_PACK 0x184 +#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x185 +#define OBD_FAIL_MDS_GET_INFO_NET 0x186 +#define OBD_FAIL_MDS_DQACQ_NET 0x187 + +/* OI scrub */ +#define OBD_FAIL_OSD_SCRUB_DELAY 0x190 +#define OBD_FAIL_OSD_SCRUB_CRASH 0x191 +#define OBD_FAIL_OSD_SCRUB_FATAL 0x192 +#define OBD_FAIL_OSD_FID_MAPPING 0x193 +#define OBD_FAIL_OSD_LMA_INCOMPAT 0x194 + +#define OBD_FAIL_OST 0x200 +#define OBD_FAIL_OST_CONNECT_NET 0x201 +#define OBD_FAIL_OST_DISCONNECT_NET 0x202 +#define OBD_FAIL_OST_GET_INFO_NET 0x203 +#define OBD_FAIL_OST_CREATE_NET 0x204 +#define OBD_FAIL_OST_DESTROY_NET 0x205 +#define OBD_FAIL_OST_GETATTR_NET 0x206 +#define OBD_FAIL_OST_SETATTR_NET 0x207 +#define OBD_FAIL_OST_OPEN_NET 0x208 +#define OBD_FAIL_OST_CLOSE_NET 0x209 +#define OBD_FAIL_OST_BRW_NET 0x20a +#define OBD_FAIL_OST_PUNCH_NET 0x20b +#define OBD_FAIL_OST_STATFS_NET 0x20c +#define OBD_FAIL_OST_HANDLE_UNPACK 0x20d +#define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e +#define OBD_FAIL_OST_BRW_READ_BULK 0x20f +#define OBD_FAIL_OST_SYNC_NET 0x210 +#define OBD_FAIL_OST_ALL_REPLY_NET 0x211 +#define OBD_FAIL_OST_ALL_REQUEST_NET 0x212 +#define OBD_FAIL_OST_LDLM_REPLY_NET 0x213 +#define OBD_FAIL_OST_BRW_PAUSE_BULK 0x214 +#define OBD_FAIL_OST_ENOSPC 0x215 +#define OBD_FAIL_OST_EROFS 0x216 +#define OBD_FAIL_OST_ENOENT 0x217 +#define 
OBD_FAIL_OST_QUOTACHECK_NET 0x218 +#define OBD_FAIL_OST_QUOTACTL_NET 0x219 +#define OBD_FAIL_OST_CHECKSUM_RECEIVE 0x21a +#define OBD_FAIL_OST_CHECKSUM_SEND 0x21b +#define OBD_FAIL_OST_BRW_SIZE 0x21c +#define OBD_FAIL_OST_DROP_REQ 0x21d +#define OBD_FAIL_OST_SETATTR_CREDITS 0x21e +#define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f +#define OBD_FAIL_OST_BRW_WRITE_BULK2 0x220 +#define OBD_FAIL_OST_LLOG_RECOVERY_TIMEOUT 0x221 +#define OBD_FAIL_OST_CANCEL_COOKIE_TIMEOUT 0x222 +#define OBD_FAIL_OST_PAUSE_CREATE 0x223 +#define OBD_FAIL_OST_BRW_PAUSE_PACK 0x224 +#define OBD_FAIL_OST_CONNECT_NET2 0x225 +#define OBD_FAIL_OST_NOMEM 0x226 +#define OBD_FAIL_OST_BRW_PAUSE_BULK2 0x227 +#define OBD_FAIL_OST_MAPBLK_ENOSPC 0x228 +#define OBD_FAIL_OST_ENOINO 0x229 +#define OBD_FAIL_OST_DQACQ_NET 0x230 +#define OBD_FAIL_OST_STATFS_EINPROGRESS 0x231 + +#define OBD_FAIL_LDLM 0x300 +#define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 +#define OBD_FAIL_LDLM_ENQUEUE_NET 0x302 +#define OBD_FAIL_LDLM_CONVERT_NET 0x303 +#define OBD_FAIL_LDLM_CANCEL_NET 0x304 +#define OBD_FAIL_LDLM_BL_CALLBACK_NET 0x305 +#define OBD_FAIL_LDLM_CP_CALLBACK_NET 0x306 +#define OBD_FAIL_LDLM_GL_CALLBACK_NET 0x307 +#define OBD_FAIL_LDLM_ENQUEUE_EXTENT_ERR 0x308 +#define OBD_FAIL_LDLM_ENQUEUE_INTENT_ERR 0x309 +#define OBD_FAIL_LDLM_CREATE_RESOURCE 0x30a +#define OBD_FAIL_LDLM_ENQUEUE_BLOCKED 0x30b +#define OBD_FAIL_LDLM_REPLY 0x30c +#define OBD_FAIL_LDLM_RECOV_CLIENTS 0x30d +#define OBD_FAIL_LDLM_ENQUEUE_OLD_EXPORT 0x30e +#define OBD_FAIL_LDLM_GLIMPSE 0x30f +#define OBD_FAIL_LDLM_CANCEL_RACE 0x310 +#define OBD_FAIL_LDLM_CANCEL_EVICT_RACE 0x311 +#define OBD_FAIL_LDLM_PAUSE_CANCEL 0x312 +#define OBD_FAIL_LDLM_CLOSE_THREAD 0x313 +#define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314 +#define OBD_FAIL_LDLM_CP_CB_WAIT 0x315 +#define OBD_FAIL_LDLM_OST_FAIL_RACE 0x316 +#define OBD_FAIL_LDLM_INTR_CP_AST 0x317 +#define OBD_FAIL_LDLM_CP_BL_RACE 0x318 +#define OBD_FAIL_LDLM_NEW_LOCK 0x319 +#define OBD_FAIL_LDLM_AGL_DELAY 0x31a +#define 
OBD_FAIL_LDLM_AGL_NOLOCK 0x31b +#define OBD_FAIL_LDLM_OST_LVB 0x31c + +/* LOCKLESS IO */ +#define OBD_FAIL_LDLM_SET_CONTENTION 0x385 + +#define OBD_FAIL_OSC 0x400 +#define OBD_FAIL_OSC_BRW_READ_BULK 0x401 +#define OBD_FAIL_OSC_BRW_WRITE_BULK 0x402 +#define OBD_FAIL_OSC_LOCK_BL_AST 0x403 +#define OBD_FAIL_OSC_LOCK_CP_AST 0x404 +#define OBD_FAIL_OSC_MATCH 0x405 +#define OBD_FAIL_OSC_BRW_PREP_REQ 0x406 +#define OBD_FAIL_OSC_SHUTDOWN 0x407 +#define OBD_FAIL_OSC_CHECKSUM_RECEIVE 0x408 +#define OBD_FAIL_OSC_CHECKSUM_SEND 0x409 +#define OBD_FAIL_OSC_BRW_PREP_REQ2 0x40a +#define OBD_FAIL_OSC_CONNECT_CKSUM 0x40b +#define OBD_FAIL_OSC_CKSUM_ADLER_ONLY 0x40c +#define OBD_FAIL_OSC_DIO_PAUSE 0x40d +#define OBD_FAIL_OSC_OBJECT_CONTENTION 0x40e +#define OBD_FAIL_OSC_CP_CANCEL_RACE 0x40f +#define OBD_FAIL_OSC_CP_ENQ_RACE 0x410 +#define OBD_FAIL_OSC_NO_GRANT 0x411 +#define OBD_FAIL_OSC_DELAY_SETTIME 0x412 + +#define OBD_FAIL_PTLRPC 0x500 +#define OBD_FAIL_PTLRPC_ACK 0x501 +#define OBD_FAIL_PTLRPC_RQBD 0x502 +#define OBD_FAIL_PTLRPC_BULK_GET_NET 0x503 +#define OBD_FAIL_PTLRPC_BULK_PUT_NET 0x504 +#define OBD_FAIL_PTLRPC_DROP_RPC 0x505 +#define OBD_FAIL_PTLRPC_DELAY_SEND 0x506 +#define OBD_FAIL_PTLRPC_DELAY_RECOV 0x507 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB 0x508 +#define OBD_FAIL_PTLRPC_PAUSE_REQ 0x50a +#define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c +#define OBD_FAIL_PTLRPC_IMP_DEACTIVE 0x50d +#define OBD_FAIL_PTLRPC_DUMP_LOG 0x50e +#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f +#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510 +#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT 0x511 +#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT 0x512 +#define OBD_FAIL_PTLRPC_DROP_REQ_OPC 0x513 +#define OBD_FAIL_PTLRPC_FINISH_REPLAY 0x514 +#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2 0x515 +#define OBD_FAIL_PTLRPC_DELAY_IMP_FULL 0x516 +#define OBD_FAIL_PTLRPC_CANCEL_RESEND 0x517 + +#define OBD_FAIL_OBD_PING_NET 0x600 +#define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 +#define OBD_FAIL_OBD_LOGD_NET 0x602 +#define 
OBD_FAIL_OBD_QC_CALLBACK_NET 0x603 +#define OBD_FAIL_OBD_DQACQ 0x604 +#define OBD_FAIL_OBD_LLOG_SETUP 0x605 +#define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 +#define OBD_FAIL_OBD_IDX_READ_NET 0x607 +#define OBD_FAIL_OBD_IDX_READ_BREAK 0x608 +#define OBD_FAIL_OBD_NO_LRU 0x609 + +#define OBD_FAIL_TGT_REPLY_NET 0x700 +#define OBD_FAIL_TGT_CONN_RACE 0x701 +#define OBD_FAIL_TGT_FORCE_RECONNECT 0x702 +#define OBD_FAIL_TGT_DELAY_CONNECT 0x703 +#define OBD_FAIL_TGT_DELAY_RECONNECT 0x704 +#define OBD_FAIL_TGT_DELAY_PRECREATE 0x705 +#define OBD_FAIL_TGT_TOOMANY_THREADS 0x706 +#define OBD_FAIL_TGT_REPLAY_DROP 0x707 +#define OBD_FAIL_TGT_FAKE_EXP 0x708 +#define OBD_FAIL_TGT_REPLAY_DELAY 0x709 +#define OBD_FAIL_TGT_LAST_REPLAY 0x710 +#define OBD_FAIL_TGT_CLIENT_ADD 0x711 +#define OBD_FAIL_TGT_RCVG_FLAG 0x712 + +#define OBD_FAIL_MDC_REVALIDATE_PAUSE 0x800 +#define OBD_FAIL_MDC_ENQUEUE_PAUSE 0x801 +#define OBD_FAIL_MDC_OLD_EXT_FLAGS 0x802 +#define OBD_FAIL_MDC_GETATTR_ENQUEUE 0x803 +#define OBD_FAIL_MDC_RPCS_SEM 0x804 +#define OBD_FAIL_MDC_LIGHTWEIGHT 0x805 + +#define OBD_FAIL_MGS 0x900 +#define OBD_FAIL_MGS_ALL_REQUEST_NET 0x901 +#define OBD_FAIL_MGS_ALL_REPLY_NET 0x902 +#define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903 +#define OBD_FAIL_MGS_PAUSE_REQ 0x904 +#define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905 + +#define OBD_FAIL_QUOTA_DQACQ_NET 0xA01 +#define OBD_FAIL_QUOTA_EDQUOT 0xA02 +#define OBD_FAIL_QUOTA_DELAY_REINT 0xA03 +#define OBD_FAIL_QUOTA_RECOVERABLE_ERR 0xA04 + +#define OBD_FAIL_LPROC_REMOVE 0xB00 + +#define OBD_FAIL_GENERAL_ALLOC 0xC00 + +#define OBD_FAIL_SEQ 0x1000 +#define OBD_FAIL_SEQ_QUERY_NET 0x1001 +#define OBD_FAIL_SEQ_EXHAUST 0x1002 + +#define OBD_FAIL_FLD 0x1100 +#define OBD_FAIL_FLD_QUERY_NET 0x1101 + +#define OBD_FAIL_SEC_CTX 0x1200 +#define OBD_FAIL_SEC_CTX_INIT_NET 0x1201 +#define OBD_FAIL_SEC_CTX_INIT_CONT_NET 0x1202 +#define OBD_FAIL_SEC_CTX_FINI_NET 0x1203 +#define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 + +#define OBD_FAIL_LLOG 0x1300 +#define 
OBD_FAIL_LLOG_ORIGIN_CONNECT_NET 0x1301 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CREATE_NET 0x1302 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_DESTROY_NET 0x1303 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_READ_HEADER_NET 0x1304 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_NEXT_BLOCK_NET 0x1305 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_PREV_BLOCK_NET 0x1306 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_WRITE_REC_NET 0x1307 +#define OBD_FAIL_LLOG_ORIGIN_HANDLE_CLOSE_NET 0x1308 +#define OBD_FAIL_LLOG_CATINFO_NET 0x1309 +#define OBD_FAIL_MDS_SYNC_CAPA_SL 0x1310 +#define OBD_FAIL_SEQ_ALLOC 0x1311 + +#define OBD_FAIL_LLITE 0x1400 +#define OBD_FAIL_LLITE_FAULT_TRUNC_RACE 0x1401 +#define OBD_FAIL_LOCK_STATE_WAIT_INTR 0x1402 +#define OBD_FAIL_LOV_INIT 0x1403 +#define OBD_FAIL_GLIMPSE_DELAY 0x1404 + +#define OBD_FAIL_FID_INDIR 0x1501 +#define OBD_FAIL_FID_INLMA 0x1502 +#define OBD_FAIL_FID_IGIF 0x1504 +#define OBD_FAIL_FID_LOOKUP 0x1505 +#define OBD_FAIL_FID_NOLMA 0x1506 + +/* LFSCK */ +#define OBD_FAIL_LFSCK_DELAY1 0x1600 +#define OBD_FAIL_LFSCK_DELAY2 0x1601 +#define OBD_FAIL_LFSCK_DELAY3 0x1602 +#define OBD_FAIL_LFSCK_LINKEA_CRASH 0x1603 +#define OBD_FAIL_LFSCK_LINKEA_MORE 0x1604 +#define OBD_FAIL_LFSCK_FATAL1 0x1608 +#define OBD_FAIL_LFSCK_FATAL2 0x1609 +#define OBD_FAIL_LFSCK_CRASH 0x160a +#define OBD_FAIL_LFSCK_NO_AUTO 0x160b +#define OBD_FAIL_LFSCK_NO_DOUBLESCAN 0x160c + +/* UPDATE */ +#define OBD_FAIL_UPDATE_OBJ_NET 0x1700 +#define OBD_FAIL_UPDATE_OBJ_NET_REP 0x1701 + + +/* Assign references to moved code to reduce code changes */ +#define OBD_FAIL_PRECHECK(id) CFS_FAIL_PRECHECK(id) +#define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id) +#define OBD_FAIL_CHECK_VALUE(id, value) CFS_FAIL_CHECK_VALUE(id, value) +#define OBD_FAIL_CHECK_ORSET(id, value) CFS_FAIL_CHECK_ORSET(id, value) +#define OBD_FAIL_CHECK_RESET(id, value) CFS_FAIL_CHECK_RESET(id, value) +#define OBD_FAIL_RETURN(id, ret) CFS_FAIL_RETURN(id, ret) +#define OBD_FAIL_TIMEOUT(id, secs) CFS_FAIL_TIMEOUT(id, secs) +#define OBD_FAIL_TIMEOUT_MS(id, ms) 
CFS_FAIL_TIMEOUT_MS(id, ms) +#define OBD_FAIL_TIMEOUT_ORSET(id, value, secs) CFS_FAIL_TIMEOUT_ORSET(id, value, secs) +#define OBD_RACE(id) CFS_RACE(id) +#define OBD_FAIL_ONCE CFS_FAIL_ONCE +#define OBD_FAILED CFS_FAILED + +extern atomic_t libcfs_kmemory; + +#ifdef LPROCFS +#define obd_memory_add(size) \ + lprocfs_counter_add(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sub(size) \ + lprocfs_counter_sub(obd_memory, OBD_MEMORY_STAT, (long)(size)) +#define obd_memory_sum() \ + lprocfs_stats_collector(obd_memory, OBD_MEMORY_STAT, \ + LPROCFS_FIELDS_FLAGS_SUM) +#define obd_pages_add(order) \ + lprocfs_counter_add(obd_memory, OBD_MEMORY_PAGES_STAT, \ + (long)(1 << (order))) +#define obd_pages_sub(order) \ + lprocfs_counter_sub(obd_memory, OBD_MEMORY_PAGES_STAT, \ + (long)(1 << (order))) +#define obd_pages_sum() \ + lprocfs_stats_collector(obd_memory, OBD_MEMORY_PAGES_STAT, \ + LPROCFS_FIELDS_FLAGS_SUM) + +extern void obd_update_maxusage(void); +extern __u64 obd_memory_max(void); +extern __u64 obd_pages_max(void); + +#else + +extern __u64 obd_alloc; +extern __u64 obd_pages; + +extern __u64 obd_max_alloc; +extern __u64 obd_max_pages; + +static inline void obd_memory_add(long size) +{ + obd_alloc += size; + if (obd_alloc > obd_max_alloc) + obd_max_alloc = obd_alloc; +} + +static inline void obd_memory_sub(long size) +{ + obd_alloc -= size; +} + +static inline void obd_pages_add(int order) +{ + obd_pages += 1<< order; + if (obd_pages > obd_max_pages) + obd_max_pages = obd_pages; +} + +static inline void obd_pages_sub(int order) +{ + obd_pages -= 1<< order; +} + +#define obd_memory_sum() (obd_alloc) +#define obd_pages_sum() (obd_pages) + +#define obd_memory_max() (obd_max_alloc) +#define obd_pages_max() (obd_max_pages) + +#endif + +#define OBD_DEBUG_MEMUSAGE (1) + +#if OBD_DEBUG_MEMUSAGE +#define OBD_ALLOC_POST(ptr, size, name) \ + obd_memory_add(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr) + +#define OBD_FREE_PRE(ptr, 
size, name) \ + LASSERT(ptr); \ + obd_memory_sub(size); \ + CDEBUG(D_MALLOC, name " '" #ptr "': %d at %p.\n", \ + (int)(size), ptr); \ + POISON(ptr, 0x5a, size) + +#else /* !OBD_DEBUG_MEMUSAGE */ + +#define OBD_ALLOC_POST(ptr, size, name) ((void)0) +#define OBD_FREE_PRE(ptr, size, name) ((void)0) + +#endif /* !OBD_DEBUG_MEMUSAGE */ + +#define HAS_FAIL_ALLOC_FLAG OBD_FAIL_CHECK(OBD_FAIL_GENERAL_ALLOC) + +#define OBD_ALLOC_FAIL_BITS 24 +#define OBD_ALLOC_FAIL_MASK ((1 << OBD_ALLOC_FAIL_BITS) - 1) +#define OBD_ALLOC_FAIL_MULT (OBD_ALLOC_FAIL_MASK / 100) + +#if defined(LUSTRE_UTILS) /* this version is for utils only */ +#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \ +do { \ + (ptr) = (cptab) == NULL ? \ + kmalloc(size, flags) : \ + kmalloc_node(size, flags, cfs_cpt_spread_node(cptab, cpt)); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("kmalloc of '" #ptr "' (%d bytes) failed at %s:%d\n", \ + (int)(size), __FILE__, __LINE__); \ + } else { \ + memset(ptr, 0, size); \ + CDEBUG(D_MALLOC, "kmalloced '" #ptr "': %d at %p\n", \ + (int)(size), ptr); \ + } \ +} while (0) + +#else /* this version is for the kernel and liblustre */ +#define OBD_FREE_RTN0(ptr) \ +({ \ + kfree(ptr); \ + (ptr) = NULL; \ + 0; \ +}) + +#define __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, flags) \ +do { \ + (ptr) = (cptab) == NULL ? 
\ + kmalloc(size, flags | __GFP_ZERO) : \ + kmalloc_node(size, flags | __GFP_ZERO, \ + cfs_cpt_spread_node(cptab, cpt)); \ + if (likely((ptr) != NULL && \ + (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 || \ + !obd_alloc_fail(ptr, #ptr, "km", size, \ + __FILE__, __LINE__) || \ + OBD_FREE_RTN0(ptr)))){ \ + OBD_ALLOC_POST(ptr, size, "kmalloced"); \ + } \ +} while (0) +#endif + +#define OBD_ALLOC_GFP(ptr, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, NULL, 0, size, gfp_mask) + +#define OBD_ALLOC(ptr, size) OBD_ALLOC_GFP(ptr, size, __GFP_IO) +#define OBD_ALLOC_WAIT(ptr, size) OBD_ALLOC_GFP(ptr, size, GFP_IOFS) +#define OBD_ALLOC_PTR(ptr) OBD_ALLOC(ptr, sizeof *(ptr)) +#define OBD_ALLOC_PTR_WAIT(ptr) OBD_ALLOC_WAIT(ptr, sizeof *(ptr)) + +#define OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, gfp_mask) \ + __OBD_MALLOC_VERBOSE(ptr, cptab, cpt, size, gfp_mask) + +#define OBD_CPT_ALLOC(ptr, cptab, cpt, size) \ + OBD_CPT_ALLOC_GFP(ptr, cptab, cpt, size, __GFP_IO) + +#define OBD_CPT_ALLOC_PTR(ptr, cptab, cpt) \ + OBD_CPT_ALLOC(ptr, cptab, cpt, sizeof *(ptr)) + +/* Zero-filled vmalloc wrapper: uses vzalloc(), or vzalloc_node() on the node + * returned by cfs_cpt_spread_node() when cptab is non-NULL. Logs the failed + * size and the Lustre/LNET totals on failure; calls OBD_ALLOC_POST() on + * success. + */ +# define __OBD_VMALLOC_VEROBSE(ptr, cptab, cpt, size) \ +do { \ + (ptr) = (cptab) == NULL ? 
\ + vzalloc(size) : \ + vzalloc_node(size, cfs_cpt_spread_node(cptab, cpt)); \ + if (unlikely((ptr) == NULL)) { \ + CERROR("vmalloc of '" #ptr "' (%d bytes) failed\n", \ + (int)(size)); \ + CERROR(LPU64" total bytes allocated by Lustre, %d by LNET\n", \ + obd_memory_sum(), atomic_read(&libcfs_kmemory)); \ + } else { \ + OBD_ALLOC_POST(ptr, size, "vmalloced"); \ + } \ +} while(0) + +# define OBD_VMALLOC(ptr, size) \ + __OBD_VMALLOC_VEROBSE(ptr, NULL, 0, size) +# define OBD_CPT_VMALLOC(ptr, cptab, cpt, size) \ + __OBD_VMALLOC_VEROBSE(ptr, cptab, cpt, size) + + +/* Allocations above this size are considered too big and could not be done + * atomically. 
+ * + * Be very careful when changing this value, especially when decreasing it, + * since vmalloc in Linux doesn't perform well on multi-core systems; calling + * vmalloc in a critical path would hurt performance badly. See LU-66. + */ +#define OBD_ALLOC_BIG (4 * PAGE_CACHE_SIZE) + +/* Allocate through OBD_VMALLOC() when size exceeds OBD_ALLOC_BIG and through + * OBD_ALLOC() otherwise. OBD_FREE_LARGE() applies the same threshold test, so + * it must be given the same size to pick the matching free routine. + */ +#define OBD_ALLOC_LARGE(ptr, size) \ +do { \ + if ((size) > OBD_ALLOC_BIG) \ + OBD_VMALLOC(ptr, size); \ + else \ + OBD_ALLOC(ptr, size); \ +} while (0) + +#define OBD_CPT_ALLOC_LARGE(ptr, cptab, cpt, size) \ +do { \ + if ((size) > OBD_ALLOC_BIG) \ + OBD_CPT_VMALLOC(ptr, cptab, cpt, size); \ + else \ + OBD_CPT_ALLOC(ptr, cptab, cpt, size); \ +} while (0) + +#define OBD_FREE_LARGE(ptr, size) \ +do { \ + if ((size) > OBD_ALLOC_BIG) \ + OBD_VFREE(ptr, size); \ + else \ + OBD_FREE(ptr, size); \ +} while (0) + + +#ifdef CONFIG_DEBUG_SLAB +#define POISON(ptr, c, s) do {} while (0) +#define POISON_PTR(ptr) ((void)0) +#else +#define POISON(ptr, c, s) memset(ptr, c, s) +#define POISON_PTR(ptr) ((ptr) = (void *)0xdeadbeef) +#endif + +#ifdef POISON_BULK +#define POISON_PAGE(page, val) do { memset(kmap(page), val, PAGE_CACHE_SIZE); \ + kunmap(page); } while (0) +#else +#define POISON_PAGE(page, val) do { } while (0) +#endif + +#define OBD_FREE(ptr, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "kfreed"); \ + kfree(ptr); \ + POISON_PTR(ptr); \ +} while(0) + + +#define OBD_FREE_RCU(ptr, size, handle) \ +do { \ + struct portals_handle *__h = (handle); \ + \ + LASSERT(handle != NULL); \ + __h->h_cookie = (unsigned long)(ptr); \ + __h->h_size = (size); \ + call_rcu(&__h->h_rcu, class_handle_free_cb); \ + POISON_PTR(ptr); \ +} while(0) + + +#define OBD_VFREE(ptr, size) \ + do { \ + OBD_FREE_PRE(ptr, size, "vfreed"); \ + vfree(ptr); \ + POISON_PTR(ptr); \ + } while (0) + +/* we memset() the slab object to 0 when allocation succeeds, so DO NOT + * HAVE A CTOR THAT DOES ANYTHING. its work will be cleared here. we'd + * love to assert on that, but slab.c keeps kmem_cache_s all to itself. 
*/ +#define OBD_SLAB_FREE_RTN0(ptr, slab) \ +({ \ + kmem_cache_free((slab), (ptr)); \ + (ptr) = NULL; \ + 0; \ +}) + +/* Zeroed slab allocation: kmem_cache_alloc(), or kmem_cache_alloc_node() on + * the node returned by cfs_cpt_spread_node() when cptab is non-NULL, always + * with __GFP_ZERO added to the caller's flags. Asserts that a non-GFP_ATOMIC + * request is not made from interrupt context. When fault injection + * (obd_alloc_fail) fires, the object is freed again and ptr is set to NULL. + */ +#define __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, type) \ +do { \ + LASSERT(ergo((type) != GFP_ATOMIC, !in_interrupt())); \ + (ptr) = (cptab) == NULL ? \ + kmem_cache_alloc(slab, (type) | __GFP_ZERO) : \ + kmem_cache_alloc_node(slab, (type) | __GFP_ZERO, \ + cfs_cpt_spread_node(cptab, cpt)); \ + if (likely((ptr) != NULL && \ + (!HAS_FAIL_ALLOC_FLAG || obd_alloc_fail_rate == 0 || \ + !obd_alloc_fail(ptr, #ptr, "slab-", size, \ + __FILE__, __LINE__) || \ + OBD_SLAB_FREE_RTN0(ptr, slab)))) { \ + OBD_ALLOC_POST(ptr, size, "slab-alloced"); \ + } \ +} while(0) + +#define OBD_SLAB_ALLOC_GFP(ptr, slab, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, NULL, 0, size, flags) +#define OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, flags) \ + __OBD_SLAB_ALLOC_VERBOSE(ptr, slab, cptab, cpt, size, flags) + +#define OBD_FREE_PTR(ptr) OBD_FREE(ptr, sizeof *(ptr)) + +#define OBD_SLAB_FREE(ptr, slab, size) \ +do { \ + OBD_FREE_PRE(ptr, size, "slab-freed"); \ + kmem_cache_free(slab, ptr); \ + POISON_PTR(ptr); \ +} while(0) + +#define OBD_SLAB_ALLOC(ptr, slab, size) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, size, __GFP_IO) + +#define OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, size) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, size, __GFP_IO) + +#define OBD_SLAB_ALLOC_PTR(ptr, slab) \ + OBD_SLAB_ALLOC(ptr, slab, sizeof *(ptr)) + +#define OBD_SLAB_CPT_ALLOC_PTR(ptr, slab, cptab, cpt) \ + OBD_SLAB_CPT_ALLOC(ptr, slab, cptab, cpt, sizeof *(ptr)) + +#define OBD_SLAB_ALLOC_PTR_GFP(ptr, slab, flags) \ + OBD_SLAB_ALLOC_GFP(ptr, slab, sizeof *(ptr), flags) + +#define OBD_SLAB_CPT_ALLOC_PTR_GFP(ptr, slab, cptab, cpt, flags) \ + OBD_SLAB_CPT_ALLOC_GFP(ptr, slab, cptab, cpt, sizeof *(ptr), flags) + +#define OBD_SLAB_FREE_PTR(ptr, slab) \ + OBD_SLAB_FREE((ptr), (slab), sizeof *(ptr)) + +#define KEY_IS(str) \ + (keylen >= (sizeof(str)-1) && memcmp(key, str, 
(sizeof(str)-1)) == 0) + +/* Wrapper for contiguous page frame allocation. + * Allocates a single page with alloc_page(), or with alloc_pages_node() on the + * node returned by cfs_cpt_spread_node() when cptab is non-NULL. On failure it + * logs the request and the current Lustre/LNET totals; on success it accounts + * the page with obd_pages_add(0). + */ +#define __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask) \ +do { \ + (ptr) = (cptab) == NULL ? \ + alloc_page(gfp_mask) : \ + alloc_pages_node(cfs_cpt_spread_node(cptab, cpt), gfp_mask, 0);\ + if (unlikely((ptr) == NULL)) { \ + CERROR("alloc_pages of '" #ptr "' %d page(s) / "LPU64" bytes "\ + "failed\n", (int)1, \ + (__u64)(1 << PAGE_CACHE_SHIFT)); \ + /* argument order follows the format: bytes, pages, (bytes) */ \ + CERROR(LPU64" total bytes and "LPU64" total pages " \ + "("LPU64" bytes) allocated by Lustre, " \ + "%d total bytes by LNET\n", \ + obd_memory_sum(), \ + obd_pages_sum(), \ + obd_pages_sum() << PAGE_CACHE_SHIFT, \ + atomic_read(&libcfs_kmemory)); \ + } else { \ + obd_pages_add(0); \ + CDEBUG(D_MALLOC, "alloc_pages '" #ptr "': %d page(s) / " \ + LPU64" bytes at %p.\n", \ + (int)1, \ + (__u64)(1 << PAGE_CACHE_SHIFT), ptr); \ + } \ +} while (0) + +#define OBD_PAGE_ALLOC(ptr, gfp_mask) \ + __OBD_PAGE_ALLOC_VERBOSE(ptr, NULL, 0, gfp_mask) +#define OBD_PAGE_CPT_ALLOC(ptr, cptab, cpt, gfp_mask) \ + __OBD_PAGE_ALLOC_VERBOSE(ptr, cptab, cpt, gfp_mask) + +/* Free a single page: asserts ptr is non-NULL, drops the page accounting with + * obd_pages_sub(0), releases the page with __free_page() and overwrites the + * caller's pointer with 0xdeadbeef. + */ +#define OBD_PAGE_FREE(ptr) \ +do { \ + LASSERT(ptr); \ + obd_pages_sub(0); \ + CDEBUG(D_MALLOC, "free_pages '" #ptr "': %d page(s) / "LPU64" bytes " \ + "at %p.\n", \ + (int)1, (__u64)(1 << PAGE_CACHE_SHIFT), \ + ptr); \ + __free_page(ptr); \ + (ptr) = (void *)0xdeadbeef; \ +} while (0) + +#endif |