diff options
Diffstat (limited to 'drivers/staging/lustre/lustre/obdclass')
42 files changed, 30096 insertions, 0 deletions
diff --git a/drivers/staging/lustre/lustre/obdclass/Makefile b/drivers/staging/lustre/lustre/obdclass/Makefile new file mode 100644 index 000000000000..b80c13c6f5dd --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/Makefile @@ -0,0 +1,13 @@ +obj-$(CONFIG_LUSTRE_FS) += obdclass.o llog_test.o + +obdclass-y := linux/linux-module.o linux/linux-obdo.o linux/linux-sysctl.o \ + llog.o llog_cat.o llog_obd.o llog_swab.o class_obd.o debug.o \ + genops.o uuid.o llog_ioctl.o lprocfs_status.o \ + lprocfs_jobstats.o lustre_handles.o lustre_peer.o llog_osd.o \ + local_storage.o statfs_pack.o obdo.o obd_config.o obd_mount.o\ + mea.o lu_object.o dt_object.o capa.o cl_object.o \ + cl_page.o cl_lock.o cl_io.o lu_ref.o acl.o idmap.o \ + lu_ucred.o + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lustre/obdclass/acl.c b/drivers/staging/lustre/lustre/obdclass/acl.c new file mode 100644 index 000000000000..c2a6702c9f2c --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/acl.c @@ -0,0 +1,546 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/acl.c + * + * Lustre Access Control List. + * + * Author: Fan Yong <fanyong@clusterfs.com> + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include <lu_object.h> +#include <lustre_acl.h> +#include <lustre_eacl.h> +#include <obd_support.h> + +#ifdef CONFIG_FS_POSIX_ACL + +#define CFS_ACL_XATTR_VERSION POSIX_ACL_XATTR_VERSION + +enum { + ES_UNK = 0, /* unknown stat */ + ES_UNC = 1, /* ACL entry is not changed */ + ES_MOD = 2, /* ACL entry is modified */ + ES_ADD = 3, /* ACL entry is added */ + ES_DEL = 4 /* ACL entry is deleted */ +}; + +static inline void lustre_ext_acl_le_to_cpu(ext_acl_xattr_entry *d, + ext_acl_xattr_entry *s) +{ + d->e_tag = le16_to_cpu(s->e_tag); + d->e_perm = le16_to_cpu(s->e_perm); + d->e_id = le32_to_cpu(s->e_id); + d->e_stat = le32_to_cpu(s->e_stat); +} + +static inline void lustre_ext_acl_cpu_to_le(ext_acl_xattr_entry *d, + ext_acl_xattr_entry *s) +{ + d->e_tag = cpu_to_le16(s->e_tag); + d->e_perm = cpu_to_le16(s->e_perm); + d->e_id = cpu_to_le32(s->e_id); + d->e_stat = cpu_to_le32(s->e_stat); +} + +static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) +{ + d->e_tag = le16_to_cpu(s->e_tag); + d->e_perm = le16_to_cpu(s->e_perm); + d->e_id = le32_to_cpu(s->e_id); +} + +static inline void 
lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) +{ + d->e_tag = cpu_to_le16(s->e_tag); + d->e_perm = cpu_to_le16(s->e_perm); + d->e_id = cpu_to_le32(s->e_id); +} + + +/* if "new_count == 0", then "new = {a_version, NULL}", NOT NULL. */ +static int lustre_posix_acl_xattr_reduce_space(posix_acl_xattr_header **header, + int old_count, int new_count) +{ + int old_size = CFS_ACL_XATTR_SIZE(old_count, posix_acl_xattr); + int new_size = CFS_ACL_XATTR_SIZE(new_count, posix_acl_xattr); + posix_acl_xattr_header *new; + + if (unlikely(old_count <= new_count)) + return old_size; + + OBD_ALLOC(new, new_size); + if (unlikely(new == NULL)) + return -ENOMEM; + + memcpy(new, *header, new_size); + OBD_FREE(*header, old_size); + *header = new; + return new_size; +} + +/* if "new_count == 0", then "new = {0, NULL}", NOT NULL. */ +static int lustre_ext_acl_xattr_reduce_space(ext_acl_xattr_header **header, + int old_count) +{ + int ext_count = le32_to_cpu((*header)->a_count); + int ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr); + int old_size = CFS_ACL_XATTR_SIZE(old_count, ext_acl_xattr); + ext_acl_xattr_header *new; + + if (unlikely(old_count <= ext_count)) + return 0; + + OBD_ALLOC(new, ext_size); + if (unlikely(new == NULL)) + return -ENOMEM; + + memcpy(new, *header, ext_size); + OBD_FREE(*header, old_size); + *header = new; + return 0; +} + +/* + * Generate new extended ACL based on the posix ACL. 
+ */ +ext_acl_xattr_header * +lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size) +{ + int count, i, esize; + ext_acl_xattr_header *new; + ENTRY; + + if (unlikely(size < 0)) + RETURN(ERR_PTR(-EINVAL)); + else if (!size) + count = 0; + else + count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + esize = CFS_ACL_XATTR_SIZE(count, ext_acl_xattr); + OBD_ALLOC(new, esize); + if (unlikely(new == NULL)) + RETURN(ERR_PTR(-ENOMEM)); + + new->a_count = cpu_to_le32(count); + for (i = 0; i < count; i++) { + new->a_entries[i].e_tag = header->a_entries[i].e_tag; + new->a_entries[i].e_perm = header->a_entries[i].e_perm; + new->a_entries[i].e_id = header->a_entries[i].e_id; + new->a_entries[i].e_stat = cpu_to_le32(ES_UNK); + } + + RETURN(new); +} +EXPORT_SYMBOL(lustre_posix_acl_xattr_2ext); + +/* + * Filter out the "nobody" entries in the posix ACL. + */ +int lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, int size, + posix_acl_xattr_header **out) +{ + int count, i, j, rc = 0; + __u32 id; + posix_acl_xattr_header *new; + ENTRY; + + if (unlikely(size < 0)) + RETURN(-EINVAL); + else if (!size) + RETURN(0); + + OBD_ALLOC(new, size); + if (unlikely(new == NULL)) + RETURN(-ENOMEM); + + new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION); + count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + for (i = 0, j = 0; i < count; i++) { + id = le32_to_cpu(header->a_entries[i].e_id); + switch (le16_to_cpu(header->a_entries[i].e_tag)) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + if (id != ACL_UNDEFINED_ID) + GOTO(_out, rc = -EIO); + + memcpy(&new->a_entries[j++], &header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + break; + case ACL_USER: + if (id != NOBODY_UID) + memcpy(&new->a_entries[j++], + &header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + break; + case ACL_GROUP: + if (id != NOBODY_GID) + memcpy(&new->a_entries[j++], + &header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + break; + default: + GOTO(_out, 
rc = -EIO); + } + } + + /* free unused space. */ + rc = lustre_posix_acl_xattr_reduce_space(&new, count, j); + if (rc >= 0) { + size = rc; + *out = new; + rc = 0; + } + EXIT; + +_out: + if (rc) { + OBD_FREE(new, size); + size = rc; + } + return size; +} +EXPORT_SYMBOL(lustre_posix_acl_xattr_filter); + +/* + * Release the posix ACL space. + */ +void lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size) +{ + OBD_FREE(header, size); +} +EXPORT_SYMBOL(lustre_posix_acl_xattr_free); + +/* + * Release the extended ACL space. + */ +void lustre_ext_acl_xattr_free(ext_acl_xattr_header *header) +{ + OBD_FREE(header, CFS_ACL_XATTR_SIZE(le32_to_cpu(header->a_count), \ + ext_acl_xattr)); +} +EXPORT_SYMBOL(lustre_ext_acl_xattr_free); + +static ext_acl_xattr_entry * +lustre_ext_acl_xattr_search(ext_acl_xattr_header *header, + posix_acl_xattr_entry *entry, int *pos) +{ + int once, start, end, i, j, count = le32_to_cpu(header->a_count); + + once = 0; + start = *pos; + end = count; + +again: + for (i = start; i < end; i++) { + if (header->a_entries[i].e_tag == entry->e_tag && + header->a_entries[i].e_id == entry->e_id) { + j = i; + if (++i >= count) + i = 0; + *pos = i; + return &header->a_entries[j]; + } + } + + if (!once) { + once = 1; + start = 0; + end = *pos; + goto again; + } + + return NULL; +} + +/* + * Merge the posix ACL and the extended ACL into new posix ACL. + */ +int lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header, + posix_acl_xattr_header **out) +{ + int posix_count, posix_size, i, j; + int ext_count = le32_to_cpu(ext_header->a_count), pos = 0, rc = 0; + posix_acl_xattr_entry pe = {ACL_MASK, 0, ACL_UNDEFINED_ID}; + posix_acl_xattr_header *new; + ext_acl_xattr_entry *ee, ae; + ENTRY; + + lustre_posix_acl_cpu_to_le(&pe, &pe); + ee = lustre_ext_acl_xattr_search(ext_header, &pe, &pos); + if (ee == NULL || le32_to_cpu(ee->e_stat) == ES_DEL) { + /* there are only base ACL entries at most. 
*/ + posix_count = 3; + posix_size = CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr); + OBD_ALLOC(new, posix_size); + if (unlikely(new == NULL)) + RETURN(-ENOMEM); + + new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION); + for (i = 0, j = 0; i < ext_count; i++) { + lustre_ext_acl_le_to_cpu(&ae, + &ext_header->a_entries[i]); + switch (ae.e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_OTHER: + if (ae.e_id != ACL_UNDEFINED_ID) + GOTO(_out, rc = -EIO); + + if (ae.e_stat != ES_DEL) { + new->a_entries[j].e_tag = + ext_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + ext_header->a_entries[i].e_perm; + new->a_entries[j++].e_id = + ext_header->a_entries[i].e_id; + } + break; + case ACL_MASK: + case ACL_USER: + case ACL_GROUP: + if (ae.e_stat == ES_DEL) + break; + default: + GOTO(_out, rc = -EIO); + } + } + } else { + /* maybe there are valid ACL_USER or ACL_GROUP entries in the + * original server-side ACL, they are regarded as ES_UNC stat.*/ + int ori_posix_count; + + if (unlikely(size < 0)) + RETURN(-EINVAL); + else if (!size) + ori_posix_count = 0; + else + ori_posix_count = + CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + posix_count = ori_posix_count + ext_count; + posix_size = + CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr); + OBD_ALLOC(new, posix_size); + if (unlikely(new == NULL)) + RETURN(-ENOMEM); + + new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION); + /* 1. process the unchanged ACL entries + * in the original server-side ACL. */ + pos = 0; + for (i = 0, j = 0; i < ori_posix_count; i++) { + ee = lustre_ext_acl_xattr_search(ext_header, + &posix_header->a_entries[i], &pos); + if (ee == NULL) + memcpy(&new->a_entries[j++], + &posix_header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + } + + /* 2. process the non-deleted entries + * from client-side extended ACL. 
*/ + for (i = 0; i < ext_count; i++) { + if (le16_to_cpu(ext_header->a_entries[i].e_stat) != + ES_DEL) { + new->a_entries[j].e_tag = + ext_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + ext_header->a_entries[i].e_perm; + new->a_entries[j++].e_id = + ext_header->a_entries[i].e_id; + } + } + } + + /* free unused space. */ + rc = lustre_posix_acl_xattr_reduce_space(&new, posix_count, j); + if (rc >= 0) { + posix_size = rc; + *out = new; + rc = 0; + } + EXIT; + +_out: + if (rc) { + OBD_FREE(new, posix_size); + posix_size = rc; + } + return posix_size; +} +EXPORT_SYMBOL(lustre_acl_xattr_merge2posix); + +/* + * Merge the posix ACL and the extended ACL into new extended ACL. + */ +ext_acl_xattr_header * +lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header) +{ + int ori_ext_count, posix_count, ext_count, ext_size; + int i, j, pos = 0, rc = 0; + posix_acl_xattr_entry pae; + ext_acl_xattr_header *new; + ext_acl_xattr_entry *ee, eae; + ENTRY; + + if (unlikely(size < 0)) + RETURN(ERR_PTR(-EINVAL)); + else if (!size) + posix_count = 0; + else + posix_count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + ori_ext_count = le32_to_cpu(ext_header->a_count); + ext_count = posix_count + ori_ext_count; + ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr); + + OBD_ALLOC(new, ext_size); + if (unlikely(new == NULL)) + RETURN(ERR_PTR(-ENOMEM)); + + for (i = 0, j = 0; i < posix_count; i++) { + lustre_posix_acl_le_to_cpu(&pae, &posix_header->a_entries[i]); + switch (pae.e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + if (pae.e_id != ACL_UNDEFINED_ID) + GOTO(out, rc = -EIO); + case ACL_USER: + /* ignore "nobody" entry. 
*/ + if (pae.e_id == NOBODY_UID) + break; + + new->a_entries[j].e_tag = + posix_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + posix_header->a_entries[i].e_perm; + new->a_entries[j].e_id = + posix_header->a_entries[i].e_id; + ee = lustre_ext_acl_xattr_search(ext_header, + &posix_header->a_entries[i], &pos); + if (ee) { + if (posix_header->a_entries[i].e_perm != + ee->e_perm) + /* entry modified. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_MOD); + else + /* entry unchanged. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_UNC); + } else { + /* new entry. */ + new->a_entries[j++].e_stat = + cpu_to_le32(ES_ADD); + } + break; + case ACL_GROUP: + /* ignore "nobody" entry. */ + if (pae.e_id == NOBODY_GID) + break; + new->a_entries[j].e_tag = + posix_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + posix_header->a_entries[i].e_perm; + new->a_entries[j].e_id = + posix_header->a_entries[i].e_id; + ee = lustre_ext_acl_xattr_search(ext_header, + &posix_header->a_entries[i], &pos); + if (ee) { + if (posix_header->a_entries[i].e_perm != + ee->e_perm) + /* entry modified. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_MOD); + else + /* entry unchanged. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_UNC); + } else { + /* new entry. */ + new->a_entries[j++].e_stat = + cpu_to_le32(ES_ADD); + } + break; + default: + GOTO(out, rc = -EIO); + } + } + + /* process deleted entries. */ + for (i = 0; i < ori_ext_count; i++) { + lustre_ext_acl_le_to_cpu(&eae, &ext_header->a_entries[i]); + if (eae.e_stat == ES_UNK) { + /* ignore "nobody" entry. 
*/ + if ((eae.e_tag == ACL_USER && eae.e_id == NOBODY_UID) || + (eae.e_tag == ACL_GROUP && eae.e_id == NOBODY_GID)) + continue; + + new->a_entries[j].e_tag = + ext_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + ext_header->a_entries[i].e_perm; + new->a_entries[j].e_id = ext_header->a_entries[i].e_id; + new->a_entries[j++].e_stat = cpu_to_le32(ES_DEL); + } + } + + new->a_count = cpu_to_le32(j); + /* free unused space. */ + rc = lustre_ext_acl_xattr_reduce_space(&new, ext_count); + EXIT; + +out: + if (rc) { + OBD_FREE(new, ext_size); + new = ERR_PTR(rc); + } + return new; +} +EXPORT_SYMBOL(lustre_acl_xattr_merge2ext); + +#endif diff --git a/drivers/staging/lustre/lustre/obdclass/capa.c b/drivers/staging/lustre/lustre/obdclass/capa.c new file mode 100644 index 000000000000..3e532f5106e4 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/capa.c @@ -0,0 +1,401 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. 
All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/capa.c + * + * Lustre Capability Hash Management + * + * Author: Lai Siyao<lsy@clusterfs.com> + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include <linux/version.h> +#include <linux/fs.h> +#include <asm/unistd.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/init.h> + +#include <obd_class.h> +#include <lustre_debug.h> +#include <lustre/lustre_idl.h> + +#include <linux/list.h> +#include <lustre_capa.h> + +#define NR_CAPAHASH 32 +#define CAPA_HASH_SIZE 3000 /* for MDS & OSS */ + +struct kmem_cache *capa_cachep = NULL; + +/* lock for capa hash/capa_list/fo_capa_keys */ +DEFINE_SPINLOCK(capa_lock); + +struct list_head capa_list[CAPA_SITE_MAX]; + +static struct capa_hmac_alg capa_hmac_algs[] = { + DEF_CAPA_HMAC_ALG("sha1", SHA1, 20, 20), +}; +/* capa count */ +int capa_count[CAPA_SITE_MAX] = { 0, }; + +EXPORT_SYMBOL(capa_cachep); +EXPORT_SYMBOL(capa_list); +EXPORT_SYMBOL(capa_lock); +EXPORT_SYMBOL(capa_count); + +struct hlist_head *init_capa_hash(void) +{ + struct hlist_head *hash; + int nr_hash, i; + + OBD_ALLOC(hash, PAGE_CACHE_SIZE); + if (!hash) + return NULL; + + nr_hash = PAGE_CACHE_SIZE / sizeof(struct hlist_head); + LASSERT(nr_hash > NR_CAPAHASH); + + for (i = 0; i < NR_CAPAHASH; i++) + INIT_HLIST_HEAD(hash + i); + return hash; +} +EXPORT_SYMBOL(init_capa_hash); + +static inline int capa_on_server(struct obd_capa *ocapa) +{ + return ocapa->c_site == CAPA_SITE_SERVER; +} + +static inline void capa_delete(struct obd_capa *ocapa) +{ + LASSERT(capa_on_server(ocapa)); + hlist_del_init(&ocapa->u.tgt.c_hash); + list_del_init(&ocapa->c_list); + capa_count[ocapa->c_site]--; + /* release the ref when alloc */ + capa_put(ocapa); +} + +void cleanup_capa_hash(struct hlist_head *hash) +{ + int i; + struct 
hlist_node *next; + struct obd_capa *oc; + + spin_lock(&capa_lock); + for (i = 0; i < NR_CAPAHASH; i++) { + hlist_for_each_entry_safe(oc, next, hash + i, + u.tgt.c_hash) + capa_delete(oc); + } + spin_unlock(&capa_lock); + + OBD_FREE(hash, PAGE_CACHE_SIZE); +} +EXPORT_SYMBOL(cleanup_capa_hash); + +static inline int capa_hashfn(struct lu_fid *fid) +{ + return (fid_oid(fid) ^ fid_ver(fid)) * + (unsigned long)(fid_seq(fid) + 1) % NR_CAPAHASH; +} + +/* capa renewal time check is earlier than that on client, which is to prevent + * client renew right after obtaining it. */ +static inline int capa_is_to_expire(struct obd_capa *oc) +{ + return cfs_time_before(cfs_time_sub(oc->c_expiry, + cfs_time_seconds(oc->c_capa.lc_timeout)*2/3), + cfs_time_current()); +} + +static struct obd_capa *find_capa(struct lustre_capa *capa, + struct hlist_head *head, int alive) +{ + struct obd_capa *ocapa; + int len = alive ? offsetof(struct lustre_capa, lc_keyid):sizeof(*capa); + + hlist_for_each_entry(ocapa, head, u.tgt.c_hash) { + if (memcmp(&ocapa->c_capa, capa, len)) + continue; + /* don't return one that will expire soon in this case */ + if (alive && capa_is_to_expire(ocapa)) + continue; + + LASSERT(capa_on_server(ocapa)); + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found"); + return ocapa; + } + + return NULL; +} + +#define LRU_CAPA_DELETE_COUNT 12 +static inline void capa_delete_lru(struct list_head *head) +{ + struct obd_capa *ocapa; + struct list_head *node = head->next; + int count = 0; + + /* free LRU_CAPA_DELETE_COUNT unused capa from head */ + while (count++ < LRU_CAPA_DELETE_COUNT) { + ocapa = list_entry(node, struct obd_capa, c_list); + node = node->next; + if (atomic_read(&ocapa->c_refc)) + continue; + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free lru"); + capa_delete(ocapa); + } +} + +/* add or update */ +struct obd_capa *capa_add(struct hlist_head *hash, struct lustre_capa *capa) +{ + struct hlist_head *head = hash + capa_hashfn(&capa->lc_fid); + struct obd_capa *ocapa, *old = NULL; 
+ struct list_head *list = &capa_list[CAPA_SITE_SERVER]; + + ocapa = alloc_capa(CAPA_SITE_SERVER); + if (IS_ERR(ocapa)) + return NULL; + + spin_lock(&capa_lock); + old = find_capa(capa, head, 0); + if (!old) { + ocapa->c_capa = *capa; + set_capa_expiry(ocapa); + hlist_add_head(&ocapa->u.tgt.c_hash, head); + list_add_tail(&ocapa->c_list, list); + capa_get(ocapa); + capa_count[CAPA_SITE_SERVER]++; + if (capa_count[CAPA_SITE_SERVER] > CAPA_HASH_SIZE) + capa_delete_lru(list); + spin_unlock(&capa_lock); + return ocapa; + } else { + capa_get(old); + spin_unlock(&capa_lock); + capa_put(ocapa); + return old; + } +} +EXPORT_SYMBOL(capa_add); + +struct obd_capa *capa_lookup(struct hlist_head *hash, struct lustre_capa *capa, + int alive) +{ + struct obd_capa *ocapa; + + spin_lock(&capa_lock); + ocapa = find_capa(capa, hash + capa_hashfn(&capa->lc_fid), alive); + if (ocapa) { + list_move_tail(&ocapa->c_list, + &capa_list[CAPA_SITE_SERVER]); + capa_get(ocapa); + } + spin_unlock(&capa_lock); + + return ocapa; +} +EXPORT_SYMBOL(capa_lookup); + +int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key) +{ + struct ll_crypto_hash *tfm; + struct capa_hmac_alg *alg; + int keylen; + struct scatterlist sl; + + if (capa_alg(capa) != CAPA_HMAC_ALG_SHA1) { + CERROR("unknown capability hmac algorithm!\n"); + return -EFAULT; + } + + alg = &capa_hmac_algs[capa_alg(capa)]; + + tfm = ll_crypto_alloc_hash(alg->ha_name, 0, 0); + if (!tfm) { + CERROR("crypto_alloc_tfm failed, check whether your kernel" + "has crypto support!\n"); + return -ENOMEM; + } + keylen = alg->ha_keylen; + + sg_set_page(&sl, virt_to_page(capa), + offsetof(struct lustre_capa, lc_hmac), + (unsigned long)(capa) % PAGE_CACHE_SIZE); + + ll_crypto_hmac(tfm, key, &keylen, &sl, sl.length, hmac); + ll_crypto_free_hash(tfm); + + return 0; +} +EXPORT_SYMBOL(capa_hmac); + +int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen) +{ + struct ll_crypto_cipher *tfm; + struct scatterlist sd; + struct scatterlist ss; + struct 
blkcipher_desc desc; + unsigned int min; + int rc; + char alg[CRYPTO_MAX_ALG_NAME+1] = "aes"; + ENTRY; + + /* passing "aes" in a variable instead of a constant string keeps gcc + * 4.3.2 happy */ + tfm = ll_crypto_alloc_blkcipher(alg, 0, 0 ); + if (IS_ERR(tfm)) { + CERROR("failed to load transform for aes\n"); + RETURN(PTR_ERR(tfm)); + } + + min = ll_crypto_tfm_alg_min_keysize(tfm); + if (keylen < min) { + CERROR("keylen at least %d bits for aes\n", min * 8); + GOTO(out, rc = -EINVAL); + } + + rc = ll_crypto_blkcipher_setkey(tfm, key, min); + if (rc) { + CERROR("failed to setting key for aes\n"); + GOTO(out, rc); + } + + sg_set_page(&sd, virt_to_page(d), 16, + (unsigned long)(d) % PAGE_CACHE_SIZE); + + sg_set_page(&ss, virt_to_page(s), 16, + (unsigned long)(s) % PAGE_CACHE_SIZE); + desc.tfm = tfm; + desc.info = NULL; + desc.flags = 0; + rc = ll_crypto_blkcipher_encrypt(&desc, &sd, &ss, 16); + if (rc) { + CERROR("failed to encrypt for aes\n"); + GOTO(out, rc); + } + + EXIT; + +out: + ll_crypto_free_blkcipher(tfm); + return rc; +} +EXPORT_SYMBOL(capa_encrypt_id); + +int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen) +{ + struct ll_crypto_cipher *tfm; + struct scatterlist sd; + struct scatterlist ss; + struct blkcipher_desc desc; + unsigned int min; + int rc; + char alg[CRYPTO_MAX_ALG_NAME+1] = "aes"; + ENTRY; + + /* passing "aes" in a variable instead of a constant string keeps gcc + * 4.3.2 happy */ + tfm = ll_crypto_alloc_blkcipher(alg, 0, 0 ); + if (IS_ERR(tfm)) { + CERROR("failed to load transform for aes\n"); + RETURN(PTR_ERR(tfm)); + } + + min = ll_crypto_tfm_alg_min_keysize(tfm); + if (keylen < min) { + CERROR("keylen at least %d bits for aes\n", min * 8); + GOTO(out, rc = -EINVAL); + } + + rc = ll_crypto_blkcipher_setkey(tfm, key, min); + if (rc) { + CERROR("failed to setting key for aes\n"); + GOTO(out, rc); + } + + sg_set_page(&sd, virt_to_page(d), 16, + (unsigned long)(d) % PAGE_CACHE_SIZE); + + sg_set_page(&ss, virt_to_page(s), 16, + (unsigned 
long)(s) % PAGE_CACHE_SIZE); + + desc.tfm = tfm; + desc.info = NULL; + desc.flags = 0; + rc = ll_crypto_blkcipher_decrypt(&desc, &sd, &ss, 16); + if (rc) { + CERROR("failed to decrypt for aes\n"); + GOTO(out, rc); + } + + EXIT; + +out: + ll_crypto_free_blkcipher(tfm); + return rc; +} +EXPORT_SYMBOL(capa_decrypt_id); + +void capa_cpy(void *capa, struct obd_capa *ocapa) +{ + spin_lock(&ocapa->c_lock); + *(struct lustre_capa *)capa = ocapa->c_capa; + spin_unlock(&ocapa->c_lock); +} +EXPORT_SYMBOL(capa_cpy); + +void _debug_capa(struct lustre_capa *c, + struct libcfs_debug_msg_data *msgdata, + const char *fmt, ... ) +{ + va_list args; + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " capability@%p fid "DFID" opc "LPX64" uid "LPU64 + " gid "LPU64" flags %u alg %d keyid %u timeout %u " + "expiry %u\n", c, PFID(capa_fid(c)), capa_opc(c), + capa_uid(c), capa_gid(c), capa_flags(c), + capa_alg(c), capa_keyid(c), capa_timeout(c), + capa_expiry(c)); + va_end(args); +} +EXPORT_SYMBOL(_debug_capa); diff --git a/drivers/staging/lustre/lustre/obdclass/cl_internal.h b/drivers/staging/lustre/lustre/obdclass/cl_internal.h new file mode 100644 index 000000000000..7eb0ad7b3644 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/cl_internal.h @@ -0,0 +1,121 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal cl interfaces. + * + * Author: Nikita Danilov <nikita.danilov@sun.com> + */ +#ifndef _CL_INTERNAL_H +#define _CL_INTERNAL_H + +#define CLT_PVEC_SIZE (14) + +/** + * Possible levels of the nesting. Currently this is 2: there are "top" + * entities (files, extent locks), and "sub" entities (stripes and stripe + * locks). This is used only for debugging counters right now. + */ +enum clt_nesting_level { + CNL_TOP, + CNL_SUB, + CNL_NR +}; + +/** + * Counters used to check correctness of cl_lock interface usage. + */ +struct cl_thread_counters { + /** + * Number of outstanding calls to cl_lock_mutex_get() made by the + * current thread. For debugging. + */ + int ctc_nr_locks_locked; + /** List of locked locks. */ + struct lu_ref ctc_locks_locked; + /** Number of outstanding holds on locks. */ + int ctc_nr_held; + /** Number of outstanding uses on locks. */ + int ctc_nr_used; + /** Number of held extent locks. */ + int ctc_nr_locks_acquired; +}; + +/** + * Thread local state internal for generic cl-code. + */ +struct cl_thread_info { + /* + * Common fields. + */ + struct cl_io clt_io; + struct cl_2queue clt_queue; + + /* + * Fields used by cl_lock.c + */ + struct cl_lock_descr clt_descr; + struct cl_page_list clt_list; + /** + * Counters for every level of lock nesting. 
+ */ + struct cl_thread_counters clt_counters[CNL_NR]; + /** @} debugging */ + + /* + * Fields used by cl_page.c + */ + struct cl_page *clt_pvec[CLT_PVEC_SIZE]; + + /* + * Fields used by cl_io.c + */ + /** + * Pointer to the topmost ongoing IO in this thread. + */ + struct cl_io *clt_current_io; + /** + * Used for submitting a sync io. + */ + struct cl_sync_io clt_anchor; + /** + * Fields used by cl_lock_discard_pages(). + */ + pgoff_t clt_next_index; + pgoff_t clt_fn_index; /* first non-overlapped index */ +}; + +struct cl_thread_info *cl_env_info(const struct lu_env *env); + +#endif /* _CL_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/obdclass/cl_io.c b/drivers/staging/lustre/lustre/obdclass/cl_io.c new file mode 100644 index 000000000000..75c9be8875e0 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/cl_io.c @@ -0,0 +1,1753 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client IO. + * + * Author: Nikita Danilov <nikita.danilov@sun.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <obd_class.h> +#include <obd_support.h> +#include <lustre_fid.h> +#include <linux/list.h> +#include <cl_object.h> +#include "cl_internal.h" + +/***************************************************************************** + * + * cl_io interface. + * + */ + +#define cl_io_for_each(slice, io) \ + list_for_each_entry((slice), &io->ci_layers, cis_linkage) +#define cl_io_for_each_reverse(slice, io) \ + list_for_each_entry_reverse((slice), &io->ci_layers, cis_linkage) + +static inline int cl_io_type_is_valid(enum cl_io_type type) +{ + return CIT_READ <= type && type < CIT_OP_NR; +} + +static inline int cl_io_is_loopable(const struct cl_io *io) +{ + return cl_io_type_is_valid(io->ci_type) && io->ci_type != CIT_MISC; +} + +/** + * Returns true iff there is an IO ongoing in the given environment. + */ +int cl_io_is_going(const struct lu_env *env) +{ + return cl_env_info(env)->clt_current_io != NULL; +} +EXPORT_SYMBOL(cl_io_is_going); + +/** + * cl_io invariant that holds at all times when exported cl_io_*() functions + * are entered and left. + */ +static int cl_io_invariant(const struct cl_io *io) +{ + struct cl_io *up; + + up = io->ci_parent; + return + /* + * io can own pages only when it is ongoing. Sub-io might + * still be in CIS_LOCKED state when top-io is in + * CIS_IO_GOING. + */ + ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING || + (io->ci_state == CIS_LOCKED && up != NULL)); +} + +/** + * Finalize \a io, by calling cl_io_operations::cio_fini() bottom-to-top. 
+ */ +void cl_io_fini(const struct lu_env *env, struct cl_io *io) +{ + struct cl_io_slice *slice; + struct cl_thread_info *info; + + LINVRNT(cl_io_type_is_valid(io->ci_type)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + while (!list_empty(&io->ci_layers)) { + slice = container_of(io->ci_layers.prev, struct cl_io_slice, + cis_linkage); + list_del_init(&slice->cis_linkage); + if (slice->cis_iop->op[io->ci_type].cio_fini != NULL) + slice->cis_iop->op[io->ci_type].cio_fini(env, slice); + /* + * Invalidate slice to catch use after free. This assumes that + * slices are allocated within session and can be touched + * after ->cio_fini() returns. + */ + slice->cis_io = NULL; + } + io->ci_state = CIS_FINI; + info = cl_env_info(env); + if (info->clt_current_io == io) + info->clt_current_io = NULL; + + /* sanity check for layout change */ + switch(io->ci_type) { + case CIT_READ: + case CIT_WRITE: + break; + case CIT_FAULT: + case CIT_FSYNC: + LASSERT(!io->ci_need_restart); + break; + case CIT_SETATTR: + case CIT_MISC: + /* Check ignore layout change conf */ + LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout, + !io->ci_need_restart)); + break; + default: + LBUG(); + } + EXIT; +} +EXPORT_SYMBOL(cl_io_fini); + +static int cl_io_init0(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_object *scan; + int result; + + LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI); + LINVRNT(cl_io_type_is_valid(iot)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_type = iot; + INIT_LIST_HEAD(&io->ci_lockset.cls_todo); + INIT_LIST_HEAD(&io->ci_lockset.cls_curr); + INIT_LIST_HEAD(&io->ci_lockset.cls_done); + INIT_LIST_HEAD(&io->ci_layers); + + result = 0; + cl_object_for_each(scan, obj) { + if (scan->co_ops->coo_io_init != NULL) { + result = scan->co_ops->coo_io_init(env, scan, io); + if (result != 0) + break; + } + } + if (result == 0) + io->ci_state = CIS_INIT; + RETURN(result); +} + +/** + * Initialize sub-io, by 
calling cl_io_operations::cio_init() top-to-bottom. + * + * \pre obj != cl_object_top(obj) + */ +int cl_io_sub_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_thread_info *info = cl_env_info(env); + + LASSERT(obj != cl_object_top(obj)); + if (info->clt_current_io == NULL) + info->clt_current_io = io; + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_sub_init); + +/** + * Initialize \a io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * Caller has to call cl_io_fini() after a call to cl_io_init(), no matter + * what the latter returned. + * + * \pre obj == cl_object_top(obj) + * \pre cl_io_type_is_valid(iot) + * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot + */ +int cl_io_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_thread_info *info = cl_env_info(env); + + LASSERT(obj == cl_object_top(obj)); + LASSERT(info->clt_current_io == NULL); + + info->clt_current_io = io; + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_init); + +/** + * Initialize read or write io. 
+ * + * \pre iot == CIT_READ || iot == CIT_WRITE + */ +int cl_io_rw_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count) +{ + LINVRNT(iot == CIT_READ || iot == CIT_WRITE); + LINVRNT(io->ci_obj != NULL); + ENTRY; + + LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu, + "io range: %u ["LPU64", "LPU64") %u %u\n", + iot, (__u64)pos, (__u64)pos + count, + io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append); + io->u.ci_rw.crw_pos = pos; + io->u.ci_rw.crw_count = count; + RETURN(cl_io_init(env, io, iot, io->ci_obj)); +} +EXPORT_SYMBOL(cl_io_rw_init); + +static inline const struct lu_fid * +cl_lock_descr_fid(const struct cl_lock_descr *descr) +{ + return lu_object_fid(&descr->cld_obj->co_lu); +} + +static int cl_lock_descr_sort(const struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + return lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1)) ?: + __diff_normalize(d0->cld_start, d1->cld_start); +} + +static int cl_lock_descr_cmp(const struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + int ret; + + ret = lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1)); + if (ret) + return ret; + if (d0->cld_end < d1->cld_start) + return -1; + if (d0->cld_start > d0->cld_end) + return 1; + return 0; +} + +static void cl_lock_descr_merge(struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + d0->cld_start = min(d0->cld_start, d1->cld_start); + d0->cld_end = max(d0->cld_end, d1->cld_end); + + if (d1->cld_mode == CLM_WRITE && d0->cld_mode != CLM_WRITE) + d0->cld_mode = CLM_WRITE; + + if (d1->cld_mode == CLM_GROUP && d0->cld_mode != CLM_GROUP) + d0->cld_mode = CLM_GROUP; +} + +/* + * Sort locks in lexicographical order of their (fid, start-offset) pairs. + */ +static void cl_io_locks_sort(struct cl_io *io) +{ + int done = 0; + + ENTRY; + /* hidden treasure: bubble sort for now. 
*/ + do { + struct cl_io_lock_link *curr; + struct cl_io_lock_link *prev; + struct cl_io_lock_link *temp; + + done = 1; + prev = NULL; + + list_for_each_entry_safe(curr, temp, + &io->ci_lockset.cls_todo, + cill_linkage) { + if (prev != NULL) { + switch (cl_lock_descr_sort(&prev->cill_descr, + &curr->cill_descr)) { + case 0: + /* + * IMPOSSIBLE: Identical locks are + * already removed at + * this point. + */ + default: + LBUG(); + case +1: + list_move_tail(&curr->cill_linkage, + &prev->cill_linkage); + done = 0; + continue; /* don't change prev: it's + * still "previous" */ + case -1: /* already in order */ + break; + } + } + prev = curr; + } + } while (!done); + EXIT; +} + +/** + * Check whether \a queue contains locks matching \a need. + * + * \retval +ve there is a matching lock in the \a queue + * \retval 0 there are no matching locks in the \a queue + */ +int cl_queue_match(const struct list_head *queue, + const struct cl_lock_descr *need) +{ + struct cl_io_lock_link *scan; + + ENTRY; + list_for_each_entry(scan, queue, cill_linkage) { + if (cl_lock_descr_match(&scan->cill_descr, need)) + RETURN(+1); + } + RETURN(0); +} +EXPORT_SYMBOL(cl_queue_match); + +static int cl_queue_merge(const struct list_head *queue, + const struct cl_lock_descr *need) +{ + struct cl_io_lock_link *scan; + + ENTRY; + list_for_each_entry(scan, queue, cill_linkage) { + if (cl_lock_descr_cmp(&scan->cill_descr, need)) + continue; + cl_lock_descr_merge(&scan->cill_descr, need); + CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", + scan->cill_descr.cld_mode, scan->cill_descr.cld_start, + scan->cill_descr.cld_end); + RETURN(+1); + } + RETURN(0); + +} + +static int cl_lockset_match(const struct cl_lockset *set, + const struct cl_lock_descr *need) +{ + return cl_queue_match(&set->cls_curr, need) || + cl_queue_match(&set->cls_done, need); +} + +static int cl_lockset_merge(const struct cl_lockset *set, + const struct cl_lock_descr *need) +{ + return cl_queue_merge(&set->cls_todo, need) || + 
cl_lockset_match(set, need); +} + +static int cl_lockset_lock_one(const struct lu_env *env, + struct cl_io *io, struct cl_lockset *set, + struct cl_io_lock_link *link) +{ + struct cl_lock *lock; + int result; + + ENTRY; + + lock = cl_lock_request(env, io, &link->cill_descr, "io", io); + + if (!IS_ERR(lock)) { + link->cill_lock = lock; + list_move(&link->cill_linkage, &set->cls_curr); + if (!(link->cill_descr.cld_enq_flags & CEF_ASYNC)) { + result = cl_wait(env, lock); + if (result == 0) + list_move(&link->cill_linkage, + &set->cls_done); + } else + result = 0; + } else + result = PTR_ERR(lock); + RETURN(result); +} + +static void cl_lock_link_fini(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + struct cl_lock *lock = link->cill_lock; + + ENTRY; + list_del_init(&link->cill_linkage); + if (lock != NULL) { + cl_lock_release(env, lock, "io", io); + link->cill_lock = NULL; + } + if (link->cill_fini != NULL) + link->cill_fini(env, link); + EXIT; +} + +static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io, + struct cl_lockset *set) +{ + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + struct cl_lock *lock; + int result; + + ENTRY; + result = 0; + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) { + if (!cl_lockset_match(set, &link->cill_descr)) { + /* XXX some locking to guarantee that locks aren't + * expanded in between. */ + result = cl_lockset_lock_one(env, io, set, link); + if (result != 0) + break; + } else + cl_lock_link_fini(env, io, link); + } + if (result == 0) { + list_for_each_entry_safe(link, temp, + &set->cls_curr, cill_linkage) { + lock = link->cill_lock; + result = cl_wait(env, lock); + if (result == 0) + list_move(&link->cill_linkage, + &set->cls_done); + else + break; + } + } + RETURN(result); +} + +/** + * Takes locks necessary for the current iteration of io. 
+ * + * Calls cl_io_operations::cio_lock() top-to-bottom to collect locks required + * by layers for the current iteration. Then sort locks (to avoid dead-locks), + * and acquire them. + */ +int cl_io_lock(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IT_STARTED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_lock == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan); + if (result != 0) + break; + } + if (result == 0) { + cl_io_locks_sort(io); + result = cl_lockset_lock(env, io, &io->ci_lockset); + } + if (result != 0) + cl_io_unlock(env, io); + else + io->ci_state = CIS_LOCKED; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock); + +/** + * Release locks takes by io. + */ +void cl_io_unlock(const struct lu_env *env, struct cl_io *io) +{ + struct cl_lockset *set; + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + const struct cl_io_slice *scan; + + LASSERT(cl_io_is_loopable(io)); + LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + set = &io->ci_lockset; + + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) + cl_lock_link_fini(env, io, link); + + list_for_each_entry_safe(link, temp, &set->cls_curr, cill_linkage) + cl_lock_link_fini(env, io, link); + + list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) { + cl_unuse(env, link->cill_lock); + cl_lock_link_fini(env, io, link); + } + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_unlock != NULL) + scan->cis_iop->op[io->ci_type].cio_unlock(env, scan); + } + io->ci_state = CIS_UNLOCKED; + LASSERT(!cl_env_info(env)->clt_counters[CNL_TOP].ctc_nr_locks_acquired); + EXIT; +} +EXPORT_SYMBOL(cl_io_unlock); + +/** + * Prepares next iteration of io. 
+ * + * Calls cl_io_operations::cio_iter_init() top-to-bottom. This exists to give + * layers a chance to modify io parameters, e.g., so that lov can restrict io + * to a single stripe. + */ +int cl_io_iter_init(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + result = 0; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_iter_init(env, + scan); + if (result != 0) + break; + } + if (result == 0) + io->ci_state = CIS_IT_STARTED; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_iter_init); + +/** + * Finalizes io iteration. + * + * Calls cl_io_operations::cio_iter_fini() bottom-to-top. + */ +void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL) + scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan); + } + io->ci_state = CIS_IT_ENDED; + EXIT; +} +EXPORT_SYMBOL(cl_io_iter_fini); + +/** + * Records that read or write io progressed \a nob bytes forward. + */ +void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob) +{ + const struct cl_io_slice *scan; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + nob == 0); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + + io->u.ci_rw.crw_pos += nob; + io->u.ci_rw.crw_count -= nob; + + /* layers have to be notified. 
*/ + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) + scan->cis_iop->op[io->ci_type].cio_advance(env, scan, + nob); + } + EXIT; +} +EXPORT_SYMBOL(cl_io_rw_advance); + +/** + * Adds a lock to a lockset. + */ +int cl_io_lock_add(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + int result; + + ENTRY; + if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr)) + result = +1; + else { + list_add(&link->cill_linkage, &io->ci_lockset.cls_todo); + result = 0; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_add); + +static void cl_free_io_lock_link(const struct lu_env *env, + struct cl_io_lock_link *link) +{ + OBD_FREE_PTR(link); +} + +/** + * Allocates new lock link, and uses it to add a lock to a lockset. + */ +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr) +{ + struct cl_io_lock_link *link; + int result; + + ENTRY; + OBD_ALLOC_PTR(link); + if (link != NULL) { + link->cill_descr = *descr; + link->cill_fini = cl_free_io_lock_link; + result = cl_io_lock_add(env, io, link); + if (result) /* lock match */ + link->cill_fini(env, link); + } else + result = -ENOMEM; + + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_alloc_add); + +/** + * Starts io by calling cl_io_operations::cio_start() top-to-bottom. + */ +int cl_io_start(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_state = CIS_IO_GOING; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_start == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_start(env, scan); + if (result != 0) + break; + } + if (result >= 0) + result = 0; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_start); + +/** + * Wait until current io iteration is finished by calling + * cl_io_operations::cio_end() bottom-to-top. 
+ */ +void cl_io_end(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IO_GOING); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_end != NULL) + scan->cis_iop->op[io->ci_type].cio_end(env, scan); + /* TODO: error handling. */ + } + io->ci_state = CIS_IO_FINISHED; + EXIT; +} +EXPORT_SYMBOL(cl_io_end); + +static const struct cl_page_slice * +cl_io_slice_page(const struct cl_io_slice *ios, struct cl_page *page) +{ + const struct cl_page_slice *slice; + + slice = cl_page_at(page, ios->cis_obj->co_lu.lo_dev->ld_type); + LINVRNT(slice != NULL); + return slice; +} + +/** + * True iff \a page is within \a io range. + */ +static int cl_page_in_io(const struct cl_page *page, const struct cl_io *io) +{ + int result = 1; + loff_t start; + loff_t end; + pgoff_t idx; + + idx = page->cp_index; + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + /* + * check that [start, end) and [pos, pos + count) extents + * overlap. + */ + if (!cl_io_is_append(io)) { + const struct cl_io_rw_common *crw = &(io->u.ci_rw); + start = cl_offset(page->cp_obj, idx); + end = cl_offset(page->cp_obj, idx + 1); + result = crw->crw_pos < end && + start < crw->crw_pos + crw->crw_count; + } + break; + case CIT_FAULT: + result = io->u.ci_fault.ft_index == idx; + break; + default: + LBUG(); + } + return result; +} + +/** + * Called by read io, when page has to be read from the server. 
+ * + * \see cl_io_operations::cio_read_page() + */ +int cl_io_read_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + const struct cl_io_slice *scan; + struct cl_2queue *queue; + int result = 0; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT); + LINVRNT(cl_page_is_owned(page, io)); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_page_in_io(page, io)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + queue = &io->ci_queue; + + cl_2queue_init(queue); + /* + * ->cio_read_page() methods called in the loop below are supposed to + * never block waiting for network (the only subtle point is the + * creation of new pages for read-ahead that might result in cache + * shrinking, but currently only clean pages are shrunk and this + * requires no network io). + * + * Should this ever starts blocking, retry loop would be needed for + * "parallel io" (see CLO_REPEAT loops in cl_lock.c). + */ + cl_io_for_each(scan, io) { + if (scan->cis_iop->cio_read_page != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + LINVRNT(slice != NULL); + result = scan->cis_iop->cio_read_page(env, scan, slice); + if (result != 0) + break; + } + } + if (result == 0) + result = cl_io_submit_rw(env, io, CRT_READ, queue); + /* + * Unlock unsent pages in case of error. + */ + cl_page_list_disown(env, io, &queue->c2_qin); + cl_2queue_fini(env, queue); + RETURN(result); +} +EXPORT_SYMBOL(cl_io_read_page); + +/** + * Called by write io to prepare page to receive data from user buffer. 
+ * + * \see cl_io_operations::cio_prepare_write() + */ +int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_WRITE); + LINVRNT(cl_page_is_owned(page, io)); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + LASSERT(cl_page_in_io(page, io)); + ENTRY; + + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->cio_prepare_write != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + result = scan->cis_iop->cio_prepare_write(env, scan, + slice, + from, to); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_io_prepare_write); + +/** + * Called by write io after user data were copied into a page. + * + * \see cl_io_operations::cio_commit_write() + */ +int cl_io_commit_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_WRITE); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + /* + * XXX Uh... not nice. Top level cl_io_commit_write() call (vvp->lov) + * already called cl_page_cache_add(), moving page into CPS_CACHED + * state. Better (and more general) way of dealing with such situation + * is needed. + */ + LASSERT(cl_page_is_owned(page, io) || page->cp_parent != NULL); + LASSERT(cl_page_in_io(page, io)); + ENTRY; + + cl_io_for_each(scan, io) { + if (scan->cis_iop->cio_commit_write != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + result = scan->cis_iop->cio_commit_write(env, scan, + slice, + from, to); + if (result != 0) + break; + } + } + LINVRNT(result <= 0); + RETURN(result); +} +EXPORT_SYMBOL(cl_io_commit_write); + +/** + * Submits a list of pages for immediate io. 
+ * + * After the function gets returned, The submitted pages are moved to + * queue->c2_qout queue, and queue->c2_qin contain both the pages don't need + * to be submitted, and the pages are errant to submit. + * + * \returns 0 if at least one page was submitted, error code otherwise. + * \see cl_io_operations::cio_submit() + */ +int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io, + enum cl_req_type crt, struct cl_2queue *queue) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(crt < ARRAY_SIZE(scan->cis_iop->req_op)); + ENTRY; + + cl_io_for_each(scan, io) { + if (scan->cis_iop->req_op[crt].cio_submit == NULL) + continue; + result = scan->cis_iop->req_op[crt].cio_submit(env, scan, crt, + queue); + if (result != 0) + break; + } + /* + * If ->cio_submit() failed, no pages were sent. + */ + LASSERT(ergo(result != 0, list_empty(&queue->c2_qout.pl_pages))); + RETURN(result); +} +EXPORT_SYMBOL(cl_io_submit_rw); + +/** + * Submit a sync_io and wait for the IO to be finished, or error happens. + * If \a timeout is zero, it means to wait for the IO unconditionally. + */ +int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout) +{ + struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor; + struct cl_page *pg; + int rc; + + cl_page_list_for_each(pg, &queue->c2_qin) { + LASSERT(pg->cp_sync_io == NULL); + pg->cp_sync_io = anchor; + } + + cl_sync_io_init(anchor, queue->c2_qin.pl_nr); + rc = cl_io_submit_rw(env, io, iot, queue); + if (rc == 0) { + /* + * If some pages weren't sent for any reason (e.g., + * read found up-to-date pages in the cache, or write found + * clean pages), count them as completed to avoid infinite + * wait. + */ + cl_page_list_for_each(pg, &queue->c2_qin) { + pg->cp_sync_io = NULL; + cl_sync_io_note(anchor, +1); + } + + /* wait for the IO to be finished. 
*/ + rc = cl_sync_io_wait(env, io, &queue->c2_qout, + anchor, timeout); + } else { + LASSERT(list_empty(&queue->c2_qout.pl_pages)); + cl_page_list_for_each(pg, &queue->c2_qin) + pg->cp_sync_io = NULL; + } + return rc; +} +EXPORT_SYMBOL(cl_io_submit_sync); + +/** + * Cancel an IO which has been submitted by cl_io_submit_rw. + */ +int cl_io_cancel(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue) +{ + struct cl_page *page; + int result = 0; + + CERROR("Canceling ongoing page trasmission\n"); + cl_page_list_for_each(page, queue) { + int rc; + + LINVRNT(cl_page_in_io(page, io)); + rc = cl_page_cancel(env, page); + result = result ?: rc; + } + return result; +} +EXPORT_SYMBOL(cl_io_cancel); + +/** + * Main io loop. + * + * Pumps io through iterations calling + * + * - cl_io_iter_init() + * + * - cl_io_lock() + * + * - cl_io_start() + * + * - cl_io_end() + * + * - cl_io_unlock() + * + * - cl_io_iter_fini() + * + * repeatedly until there is no more io to do. + */ +int cl_io_loop(const struct lu_env *env, struct cl_io *io) +{ + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + ENTRY; + + do { + size_t nob; + + io->ci_continue = 0; + result = cl_io_iter_init(env, io); + if (result == 0) { + nob = io->ci_nob; + result = cl_io_lock(env, io); + if (result == 0) { + /* + * Notify layers that locks has been taken, + * and do actual i/o. + * + * - llite: kms, short read; + * - llite: generic_file_read(); + */ + result = cl_io_start(env, io); + /* + * Send any remaining pending + * io, etc. + * + * - llite: ll_rw_stats_tally. + */ + cl_io_end(env, io); + cl_io_unlock(env, io); + cl_io_rw_advance(env, io, io->ci_nob - nob); + } + } + cl_io_iter_fini(env, io); + } while (result == 0 && io->ci_continue); + if (result == 0) + result = io->ci_result; + RETURN(result < 0 ? result : 0); +} +EXPORT_SYMBOL(cl_io_loop); + +/** + * Adds io slice to the cl_io. 
+ * + * This is called by cl_object_operations::coo_io_init() methods to add a + * per-layer state to the io. New state is added at the end of + * cl_io::ci_layers list, that is, it is at the bottom of the stack. + * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add() + */ +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, + const struct cl_io_operations *ops) +{ + struct list_head *linkage = &slice->cis_linkage; + + LASSERT((linkage->prev == NULL && linkage->next == NULL) || + list_empty(linkage)); + ENTRY; + + list_add_tail(linkage, &io->ci_layers); + slice->cis_io = io; + slice->cis_obj = obj; + slice->cis_iop = ops; + EXIT; +} +EXPORT_SYMBOL(cl_io_slice_add); + + +/** + * Initializes page list. + */ +void cl_page_list_init(struct cl_page_list *plist) +{ + ENTRY; + plist->pl_nr = 0; + INIT_LIST_HEAD(&plist->pl_pages); + plist->pl_owner = current; + EXIT; +} +EXPORT_SYMBOL(cl_page_list_init); + +/** + * Adds a page to a page list. + */ +void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page) +{ + ENTRY; + /* it would be better to check that page is owned by "current" io, but + * it is not passed here. */ + LASSERT(page->cp_owner != NULL); + LINVRNT(plist->pl_owner == current); + + lockdep_off(); + mutex_lock(&page->cp_mutex); + lockdep_on(); + LASSERT(list_empty(&page->cp_batch)); + list_add_tail(&page->cp_batch, &plist->pl_pages); + ++plist->pl_nr; + page->cp_queue_ref = lu_ref_add(&page->cp_reference, "queue", plist); + cl_page_get(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_add); + +/** + * Removes a page from a page list. 
+ */ +void cl_page_list_del(const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page) +{ + LASSERT(plist->pl_nr > 0); + LINVRNT(plist->pl_owner == current); + + ENTRY; + list_del_init(&page->cp_batch); + lockdep_off(); + mutex_unlock(&page->cp_mutex); + lockdep_on(); + --plist->pl_nr; + lu_ref_del_at(&page->cp_reference, page->cp_queue_ref, "queue", plist); + cl_page_put(env, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_del); + +/** + * Moves a page from one page list to another. + */ +void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page) +{ + LASSERT(src->pl_nr > 0); + LINVRNT(dst->pl_owner == current); + LINVRNT(src->pl_owner == current); + + ENTRY; + list_move_tail(&page->cp_batch, &dst->pl_pages); + --src->pl_nr; + ++dst->pl_nr; + lu_ref_set_at(&page->cp_reference, + page->cp_queue_ref, "queue", src, dst); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_move); + +/** + * splice the cl_page_list, just as list head does + */ +void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head) +{ + struct cl_page *page; + struct cl_page *tmp; + + LINVRNT(list->pl_owner == current); + LINVRNT(head->pl_owner == current); + + ENTRY; + cl_page_list_for_each_safe(page, tmp, list) + cl_page_list_move(head, list, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_splice); + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); + +/** + * Disowns pages in a queue. 
+ */ +void cl_page_list_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == current); + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) { + LASSERT(plist->pl_nr > 0); + + list_del_init(&page->cp_batch); + lockdep_off(); + mutex_unlock(&page->cp_mutex); + lockdep_on(); + --plist->pl_nr; + /* + * cl_page_disown0 rather than usual cl_page_disown() is used, + * because pages are possibly in CPS_FREEING state already due + * to the call to cl_page_list_discard(). + */ + /* + * XXX cl_page_disown0() will fail if page is not locked. + */ + cl_page_disown0(env, io, page); + lu_ref_del(&page->cp_reference, "queue", plist); + cl_page_put(env, page); + } + EXIT; +} +EXPORT_SYMBOL(cl_page_list_disown); + +/** + * Releases pages from queue. + */ +void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == current); + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) + cl_page_list_del(env, plist, page); + LASSERT(plist->pl_nr == 0); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_fini); + +/** + * Owns all pages in a queue. + */ +int cl_page_list_own(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + pgoff_t index = 0; + int result; + + LINVRNT(plist->pl_owner == current); + + ENTRY; + result = 0; + cl_page_list_for_each_safe(page, temp, plist) { + LASSERT(index <= page->cp_index); + index = page->cp_index; + if (cl_page_own(env, io, page) == 0) + result = result ?: page->cp_error; + else + cl_page_list_del(env, plist, page); + } + RETURN(result); +} +EXPORT_SYMBOL(cl_page_list_own); + +/** + * Assumes all pages in a queue. 
+ */ +void cl_page_list_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == current); + + cl_page_list_for_each(page, plist) + cl_page_assume(env, io, page); +} +EXPORT_SYMBOL(cl_page_list_assume); + +/** + * Discards all pages in a queue. + */ +void cl_page_list_discard(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == current); + ENTRY; + cl_page_list_for_each(page, plist) + cl_page_discard(env, io, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_discard); + +/** + * Unmaps all pages in a queue from user virtual memory. + */ +int cl_page_list_unmap(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + int result; + + LINVRNT(plist->pl_owner == current); + ENTRY; + result = 0; + cl_page_list_for_each(page, plist) { + result = cl_page_unmap(env, io, page); + if (result != 0) + break; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_page_list_unmap); + +/** + * Initialize dual page queue. + */ +void cl_2queue_init(struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_init(&queue->c2_qin); + cl_page_list_init(&queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init); + +/** + * Add a page to the incoming page list of 2-queue. + */ +void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page) +{ + ENTRY; + cl_page_list_add(&queue->c2_qin, page); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_add); + +/** + * Disown pages in both lists of a 2-queue. + */ +void cl_2queue_disown(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_disown(env, io, &queue->c2_qin); + cl_page_list_disown(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_disown); + +/** + * Discard (truncate) pages in both lists of a 2-queue. 
+ */ +void cl_2queue_discard(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_discard(env, io, &queue->c2_qin); + cl_page_list_discard(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_discard); + +/** + * Assume to own the pages in cl_2queue + */ +void cl_2queue_assume(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + cl_page_list_assume(env, io, &queue->c2_qin); + cl_page_list_assume(env, io, &queue->c2_qout); +} +EXPORT_SYMBOL(cl_2queue_assume); + +/** + * Finalize both page lists of a 2-queue. + */ +void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_fini(env, &queue->c2_qout); + cl_page_list_fini(env, &queue->c2_qin); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_fini); + +/** + * Initialize a 2-queue to contain \a page in its incoming page list. + */ +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page) +{ + ENTRY; + cl_2queue_init(queue); + cl_2queue_add(queue, page); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init_page); + +/** + * Returns top-level io. + * + * \see cl_object_top(), cl_page_top(). + */ +struct cl_io *cl_io_top(struct cl_io *io) +{ + ENTRY; + while (io->ci_parent != NULL) + io = io->ci_parent; + RETURN(io); +} +EXPORT_SYMBOL(cl_io_top); + +/** + * Prints human readable representation of \a io to the \a f. + */ +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io) +{ +} + +/** + * Adds request slice to the compound request. + * + * This is called by cl_device_operations::cdo_req_init() methods to add a + * per-layer state to the request. New state is added at the end of + * cl_req::crq_layers list, that is, it is at the bottom of the stack. 
+ * + * \see cl_lock_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice, + struct cl_device *dev, + const struct cl_req_operations *ops) +{ + ENTRY; + list_add_tail(&slice->crs_linkage, &req->crq_layers); + slice->crs_dev = dev; + slice->crs_ops = ops; + slice->crs_req = req; + EXIT; +} +EXPORT_SYMBOL(cl_req_slice_add); + +static void cl_req_free(const struct lu_env *env, struct cl_req *req) +{ + unsigned i; + + LASSERT(list_empty(&req->crq_pages)); + LASSERT(req->crq_nrpages == 0); + LINVRNT(list_empty(&req->crq_layers)); + LINVRNT(equi(req->crq_nrobjs > 0, req->crq_o != NULL)); + ENTRY; + + if (req->crq_o != NULL) { + for (i = 0; i < req->crq_nrobjs; ++i) { + struct cl_object *obj = req->crq_o[i].ro_obj; + if (obj != NULL) { + lu_object_ref_del_at(&obj->co_lu, + req->crq_o[i].ro_obj_ref, + "cl_req", req); + cl_object_put(env, obj); + } + } + OBD_FREE(req->crq_o, req->crq_nrobjs * sizeof req->crq_o[0]); + } + OBD_FREE_PTR(req); + EXIT; +} + +static int cl_req_init(const struct lu_env *env, struct cl_req *req, + struct cl_page *page) +{ + struct cl_device *dev; + struct cl_page_slice *slice; + int result; + + ENTRY; + result = 0; + page = cl_page_top(page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + dev = lu2cl_dev(slice->cpl_obj->co_lu.lo_dev); + if (dev->cd_ops->cdo_req_init != NULL) { + result = dev->cd_ops->cdo_req_init(env, + dev, req); + if (result != 0) + break; + } + } + page = page->cp_child; + } while (page != NULL && result == 0); + RETURN(result); +} + +/** + * Invokes per-request transfer completion call-backs + * (cl_req_operations::cro_completion()) bottom-to-top. + */ +void cl_req_completion(const struct lu_env *env, struct cl_req *req, int rc) +{ + struct cl_req_slice *slice; + + ENTRY; + /* + * for the lack of list_for_each_entry_reverse_safe()... 
+ */ + while (!list_empty(&req->crq_layers)) { + slice = list_entry(req->crq_layers.prev, + struct cl_req_slice, crs_linkage); + list_del_init(&slice->crs_linkage); + if (slice->crs_ops->cro_completion != NULL) + slice->crs_ops->cro_completion(env, slice, rc); + } + cl_req_free(env, req); + EXIT; +} +EXPORT_SYMBOL(cl_req_completion); + +/** + * Allocates new transfer request. + */ +struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page, + enum cl_req_type crt, int nr_objects) +{ + struct cl_req *req; + + LINVRNT(nr_objects > 0); + ENTRY; + + OBD_ALLOC_PTR(req); + if (req != NULL) { + int result; + + OBD_ALLOC(req->crq_o, nr_objects * sizeof req->crq_o[0]); + if (req->crq_o != NULL) { + req->crq_nrobjs = nr_objects; + req->crq_type = crt; + INIT_LIST_HEAD(&req->crq_pages); + INIT_LIST_HEAD(&req->crq_layers); + result = cl_req_init(env, req, page); + } else + result = -ENOMEM; + if (result != 0) { + cl_req_completion(env, req, result); + req = ERR_PTR(result); + } + } else + req = ERR_PTR(-ENOMEM); + RETURN(req); +} +EXPORT_SYMBOL(cl_req_alloc); + +/** + * Adds a page to a request. + */ +void cl_req_page_add(const struct lu_env *env, + struct cl_req *req, struct cl_page *page) +{ + struct cl_object *obj; + struct cl_req_obj *rqo; + int i; + + ENTRY; + page = cl_page_top(page); + + LASSERT(list_empty(&page->cp_flight)); + LASSERT(page->cp_req == NULL); + + CL_PAGE_DEBUG(D_PAGE, env, page, "req %p, %d, %u\n", + req, req->crq_type, req->crq_nrpages); + + list_add_tail(&page->cp_flight, &req->crq_pages); + ++req->crq_nrpages; + page->cp_req = req; + obj = cl_object_top(page->cp_obj); + for (i = 0, rqo = req->crq_o; obj != rqo->ro_obj; ++i, ++rqo) { + if (rqo->ro_obj == NULL) { + rqo->ro_obj = obj; + cl_object_get(obj); + rqo->ro_obj_ref = lu_object_ref_add(&obj->co_lu, + "cl_req", req); + break; + } + } + LASSERT(i < req->crq_nrobjs); + EXIT; +} +EXPORT_SYMBOL(cl_req_page_add); + +/** + * Removes a page from a request. 
+ */ +void cl_req_page_done(const struct lu_env *env, struct cl_page *page) +{ + struct cl_req *req = page->cp_req; + + ENTRY; + page = cl_page_top(page); + + LASSERT(!list_empty(&page->cp_flight)); + LASSERT(req->crq_nrpages > 0); + + list_del_init(&page->cp_flight); + --req->crq_nrpages; + page->cp_req = NULL; + EXIT; +} +EXPORT_SYMBOL(cl_req_page_done); + +/** + * Notifies layers that request is about to depart by calling + * cl_req_operations::cro_prep() top-to-bottom. + */ +int cl_req_prep(const struct lu_env *env, struct cl_req *req) +{ + int i; + int result; + const struct cl_req_slice *slice; + + ENTRY; + /* + * Check that the caller of cl_req_alloc() didn't lie about the number + * of objects. + */ + for (i = 0; i < req->crq_nrobjs; ++i) + LASSERT(req->crq_o[i].ro_obj != NULL); + + result = 0; + list_for_each_entry(slice, &req->crq_layers, crs_linkage) { + if (slice->crs_ops->cro_prep != NULL) { + result = slice->crs_ops->cro_prep(env, slice); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_req_prep); + +/** + * Fills in attributes that are passed to server together with transfer. Only + * attributes from \a flags may be touched. This can be called multiple times + * for the same request. + */ +void cl_req_attr_set(const struct lu_env *env, struct cl_req *req, + struct cl_req_attr *attr, obd_valid flags) +{ + const struct cl_req_slice *slice; + struct cl_page *page; + int i; + + LASSERT(!list_empty(&req->crq_pages)); + ENTRY; + + /* Take any page to use as a model. 
*/ + page = list_entry(req->crq_pages.next, struct cl_page, cp_flight); + + for (i = 0; i < req->crq_nrobjs; ++i) { + list_for_each_entry(slice, &req->crq_layers, crs_linkage) { + const struct cl_page_slice *scan; + const struct cl_object *obj; + + scan = cl_page_at(page, + slice->crs_dev->cd_lu_dev.ld_type); + LASSERT(scan != NULL); + obj = scan->cpl_obj; + if (slice->crs_ops->cro_attr_set != NULL) + slice->crs_ops->cro_attr_set(env, slice, obj, + attr + i, flags); + } + } + EXIT; +} +EXPORT_SYMBOL(cl_req_attr_set); + +/* XXX complete(), init_completion(), and wait_for_completion(), until they are + * implemented in libcfs. */ +# include <linux/sched.h> + +/** + * Initialize synchronous io wait anchor, for transfer of \a nrpages pages. + */ +void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages) +{ + ENTRY; + init_waitqueue_head(&anchor->csi_waitq); + atomic_set(&anchor->csi_sync_nr, nrpages); + atomic_set(&anchor->csi_barrier, nrpages > 0); + anchor->csi_sync_rc = 0; + EXIT; +} +EXPORT_SYMBOL(cl_sync_io_init); + +/** + * Wait until all transfer completes. Transfer completion routine has to call + * cl_sync_io_note() for every page. 
+ */ +int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, struct cl_sync_io *anchor, + long timeout) +{ + struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout), + NULL, NULL, NULL); + int rc; + ENTRY; + + LASSERT(timeout >= 0); + + rc = l_wait_event(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + &lwi); + if (rc < 0) { + CERROR("SYNC IO failed with error: %d, try to cancel " + "%d remaining pages\n", + rc, atomic_read(&anchor->csi_sync_nr)); + + (void)cl_io_cancel(env, io, queue); + + lwi = (struct l_wait_info) { 0 }; + (void)l_wait_event(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + &lwi); + } else { + rc = anchor->csi_sync_rc; + } + LASSERT(atomic_read(&anchor->csi_sync_nr) == 0); + cl_page_list_assume(env, io, queue); + + /* wait until cl_sync_io_note() has done wakeup */ + while (unlikely(atomic_read(&anchor->csi_barrier) != 0)) { + cpu_relax(); + } + + POISON(anchor, 0x5a, sizeof *anchor); + RETURN(rc); +} +EXPORT_SYMBOL(cl_sync_io_wait); + +/** + * Indicate that transfer of a single page completed. + */ +void cl_sync_io_note(struct cl_sync_io *anchor, int ioret) +{ + ENTRY; + if (anchor->csi_sync_rc == 0 && ioret < 0) + anchor->csi_sync_rc = ioret; + /* + * Synchronous IO done without releasing page lock (e.g., as a part of + * ->{prepare,commit}_write(). Completion is used to signal the end of + * IO. 
+ */ + LASSERT(atomic_read(&anchor->csi_sync_nr) > 0); + if (atomic_dec_and_test(&anchor->csi_sync_nr)) { + wake_up_all(&anchor->csi_waitq); + /* it's safe to nuke or reuse anchor now */ + atomic_set(&anchor->csi_barrier, 0); + } + EXIT; +} +EXPORT_SYMBOL(cl_sync_io_note); diff --git a/drivers/staging/lustre/lustre/obdclass/cl_lock.c b/drivers/staging/lustre/lustre/obdclass/cl_lock.c new file mode 100644 index 000000000000..d34e044fc854 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/cl_lock.c @@ -0,0 +1,2304 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Extent Lock. 
 *
 * Author: Nikita Danilov <nikita.danilov@sun.com>
 */

#define DEBUG_SUBSYSTEM S_CLASS

#include <obd_class.h>
#include <obd_support.h>
#include <lustre_fid.h>
#include <linux/list.h>
#include <cl_object.h>
#include "cl_internal.h"

/** Lock class of cl_lock::cll_guard */
static struct lock_class_key cl_lock_guard_class;
static struct kmem_cache *cl_lock_kmem;

/* Slab cache descriptor for struct cl_lock allocations. */
static struct lu_kmem_descr cl_lock_caches[] = {
	{
		.ckd_cache = &cl_lock_kmem,
		.ckd_name = "cl_lock_kmem",
		.ckd_size = sizeof (struct cl_lock)
	},
	{
		.ckd_cache = NULL
	}
};

/* Lock statistics are compiled out: these macros are no-op stubs. */
#define CS_LOCK_INC(o, item)
#define CS_LOCK_DEC(o, item)
#define CS_LOCKSTATE_INC(o, state)
#define CS_LOCKSTATE_DEC(o, state)

/**
 * Basic lock invariant that is maintained at all times. Caller either has a
 * reference to \a lock, or somehow assures that \a lock cannot be freed.
 *
 * \see cl_lock_invariant()
 */
static int cl_lock_invariant_trusted(const struct lu_env *env,
				     const struct cl_lock *lock)
{
	return ergo(lock->cll_state == CLS_FREEING, lock->cll_holds == 0) &&
		atomic_read(&lock->cll_ref) >= lock->cll_holds &&
		lock->cll_holds >= lock->cll_users &&
		lock->cll_holds >= 0 &&
		lock->cll_users >= 0 &&
		lock->cll_depth >= 0;
}

/**
 * Stronger lock invariant, checking that caller has a reference on a lock.
 *
 * \see cl_lock_invariant_trusted()
 */
static int cl_lock_invariant(const struct lu_env *env,
			     const struct cl_lock *lock)
{
	int result;

	result = atomic_read(&lock->cll_ref) > 0 &&
		 cl_lock_invariant_trusted(env, lock);
	/* env may legitimately be NULL (\see cl_lock_get()): only log a
	 * broken invariant when an environment is available. */
	if (!result && env != NULL)
		CL_LOCK_DEBUG(D_ERROR, env, lock, "invariant broken");
	return result;
}

/**
 * Returns lock "nesting": 0 for a top-lock and 1 for a sub-lock.
 */
static enum clt_nesting_level cl_lock_nesting(const struct cl_lock *lock)
{
	return cl_object_header(lock->cll_descr.cld_obj)->coh_nesting;
}

/**
 * Returns a set of counters for this lock, depending on a lock nesting.
+ */ +static struct cl_thread_counters *cl_lock_counters(const struct lu_env *env, + const struct cl_lock *lock) +{ + struct cl_thread_info *info; + enum clt_nesting_level nesting; + + info = cl_env_info(env); + nesting = cl_lock_nesting(lock); + LASSERT(nesting < ARRAY_SIZE(info->clt_counters)); + return &info->clt_counters[nesting]; +} + +static void cl_lock_trace0(int level, const struct lu_env *env, + const char *prefix, const struct cl_lock *lock, + const char *func, const int line) +{ + struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj); + CDEBUG(level, "%s: %p@(%d %p %d %d %d %d %d %lx)" + "(%p/%d/%d) at %s():%d\n", + prefix, lock, atomic_read(&lock->cll_ref), + lock->cll_guarder, lock->cll_depth, + lock->cll_state, lock->cll_error, lock->cll_holds, + lock->cll_users, lock->cll_flags, + env, h->coh_nesting, cl_lock_nr_mutexed(env), + func, line); +} +#define cl_lock_trace(level, env, prefix, lock) \ + cl_lock_trace0(level, env, prefix, lock, __FUNCTION__, __LINE__) + +#define RETIP ((unsigned long)__builtin_return_address(0)) + +#ifdef CONFIG_LOCKDEP +static struct lock_class_key cl_lock_key; + +static void cl_lock_lockdep_init(struct cl_lock *lock) +{ + lockdep_set_class_and_name(lock, &cl_lock_key, "EXT"); +} + +static void cl_lock_lockdep_acquire(const struct lu_env *env, + struct cl_lock *lock, __u32 enqflags) +{ + cl_lock_counters(env, lock)->ctc_nr_locks_acquired++; + lock_map_acquire(&lock->dep_map); +} + +static void cl_lock_lockdep_release(const struct lu_env *env, + struct cl_lock *lock) +{ + cl_lock_counters(env, lock)->ctc_nr_locks_acquired--; + lock_release(&lock->dep_map, 0, RETIP); +} + +#else /* !CONFIG_LOCKDEP */ + +static void cl_lock_lockdep_init(struct cl_lock *lock) +{} +static void cl_lock_lockdep_acquire(const struct lu_env *env, + struct cl_lock *lock, __u32 enqflags) +{} +static void cl_lock_lockdep_release(const struct lu_env *env, + struct cl_lock *lock) +{} + +#endif /* !CONFIG_LOCKDEP */ + +/** + * Adds lock 
slice to the compound lock. + * + * This is called by cl_object_operations::coo_lock_init() methods to add a + * per-layer state to the lock. New state is added at the end of + * cl_lock::cll_layers list, that is, it is at the bottom of the stack. + * + * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops) +{ + ENTRY; + slice->cls_lock = lock; + list_add_tail(&slice->cls_linkage, &lock->cll_layers); + slice->cls_obj = obj; + slice->cls_ops = ops; + EXIT; +} +EXPORT_SYMBOL(cl_lock_slice_add); + +/** + * Returns true iff a lock with the mode \a has provides at least the same + * guarantees as a lock with the mode \a need. + */ +int cl_lock_mode_match(enum cl_lock_mode has, enum cl_lock_mode need) +{ + LINVRNT(need == CLM_READ || need == CLM_WRITE || + need == CLM_PHANTOM || need == CLM_GROUP); + LINVRNT(has == CLM_READ || has == CLM_WRITE || + has == CLM_PHANTOM || has == CLM_GROUP); + CLASSERT(CLM_PHANTOM < CLM_READ); + CLASSERT(CLM_READ < CLM_WRITE); + CLASSERT(CLM_WRITE < CLM_GROUP); + + if (has != CLM_GROUP) + return need <= has; + else + return need == has; +} +EXPORT_SYMBOL(cl_lock_mode_match); + +/** + * Returns true iff extent portions of lock descriptions match. + */ +int cl_lock_ext_match(const struct cl_lock_descr *has, + const struct cl_lock_descr *need) +{ + return + has->cld_start <= need->cld_start && + has->cld_end >= need->cld_end && + cl_lock_mode_match(has->cld_mode, need->cld_mode) && + (has->cld_mode != CLM_GROUP || has->cld_gid == need->cld_gid); +} +EXPORT_SYMBOL(cl_lock_ext_match); + +/** + * Returns true iff a lock with the description \a has provides at least the + * same guarantees as a lock with the description \a need. 
 */
int cl_lock_descr_match(const struct cl_lock_descr *has,
			const struct cl_lock_descr *need)
{
	return
		cl_object_same(has->cld_obj, need->cld_obj) &&
		cl_lock_ext_match(has, need);
}
EXPORT_SYMBOL(cl_lock_descr_match);

/**
 * Destroys \a lock: finalizes and unlinks every slice, drops the object
 * reference and releases the lock memory back to the slab cache.
 *
 * \pre lock mutex is not held; no references remain.
 */
static void cl_lock_free(const struct lu_env *env, struct cl_lock *lock)
{
	struct cl_object *obj = lock->cll_descr.cld_obj;

	LINVRNT(!cl_lock_is_mutexed(lock));

	ENTRY;
	cl_lock_trace(D_DLMTRACE, env, "free lock", lock);
	might_sleep();
	/* Unlink each slice before handing it to its layer's clo_fini(). */
	while (!list_empty(&lock->cll_layers)) {
		struct cl_lock_slice *slice;

		slice = list_entry(lock->cll_layers.next,
				   struct cl_lock_slice, cls_linkage);
		list_del_init(lock->cll_layers.next);
		slice->cls_ops->clo_fini(env, slice);
	}
	CS_LOCK_DEC(obj, total);
	CS_LOCKSTATE_DEC(obj, lock->cll_state);
	lu_object_ref_del_at(&obj->co_lu, lock->cll_obj_ref, "cl_lock", lock);
	cl_object_put(env, obj);
	lu_ref_fini(&lock->cll_reference);
	lu_ref_fini(&lock->cll_holders);
	mutex_destroy(&lock->cll_guard);
	OBD_SLAB_FREE_PTR(lock, cl_lock_kmem);
	EXIT;
}

/**
 * Releases a reference on a lock.
 *
 * When last reference is released, lock is returned to the cache, unless it
 * is in cl_lock_state::CLS_FREEING state, in which case it is destroyed
 * immediately.
 *
 * \see cl_object_put(), cl_page_put()
 */
void cl_lock_put(const struct lu_env *env, struct cl_lock *lock)
{
	struct cl_object *obj;

	LINVRNT(cl_lock_invariant(env, lock));
	ENTRY;
	obj = lock->cll_descr.cld_obj;
	LINVRNT(obj != NULL);

	CDEBUG(D_TRACE, "releasing reference: %d %p %lu\n",
	       atomic_read(&lock->cll_ref), lock, RETIP);

	if (atomic_dec_and_test(&lock->cll_ref)) {
		if (lock->cll_state == CLS_FREEING) {
			LASSERT(list_empty(&lock->cll_linkage));
			cl_lock_free(env, lock);
		}
		/* NOTE(review): CS_LOCK_DEC is currently a no-op stub; if the
		 * statistics macros are ever re-enabled, \a obj is used here
		 * after cl_lock_free() dropped its reference -- verify. */
		CS_LOCK_DEC(obj, busy);
	}
	EXIT;
}
EXPORT_SYMBOL(cl_lock_put);

/**
 * Acquires an additional reference to a lock.
 *
 * This can be called only by caller already possessing a reference to \a
 * lock.
 *
 * \see cl_object_get(), cl_page_get()
 */
void cl_lock_get(struct cl_lock *lock)
{
	/* NULL env: invariant is checked without debug-log access. */
	LINVRNT(cl_lock_invariant(NULL, lock));
	CDEBUG(D_TRACE, "acquiring reference: %d %p %lu\n",
	       atomic_read(&lock->cll_ref), lock, RETIP);
	atomic_inc(&lock->cll_ref);
}
EXPORT_SYMBOL(cl_lock_get);

/**
 * Acquires a reference to a lock.
 *
 * This is much like cl_lock_get(), except that this function can be used to
 * acquire initial reference to the cached lock. Caller has to deal with all
 * possible races. Use with care!
 *
 * \see cl_page_get_trust()
 */
void cl_lock_get_trust(struct cl_lock *lock)
{
	CDEBUG(D_TRACE, "acquiring trusted reference: %d %p %lu\n",
	       atomic_read(&lock->cll_ref), lock, RETIP);
	/* A 0 -> 1 transition resurrects a cached lock: account it busy. */
	if (atomic_inc_return(&lock->cll_ref) == 1)
		CS_LOCK_INC(lock->cll_descr.cld_obj, busy);
}
EXPORT_SYMBOL(cl_lock_get_trust);

/**
 * Helper function destroying the lock that wasn't completely initialized.
 *
 * Other threads can acquire references to the top-lock through its
 * sub-locks. Hence, it cannot be cl_lock_free()-ed immediately.
 */
static void cl_lock_finish(const struct lu_env *env, struct cl_lock *lock)
{
	/* Cancel and delete under the lock mutex, then drop the caller's
	 * reference; the lock is freed once all references are gone. */
	cl_lock_mutex_get(env, lock);
	cl_lock_cancel(env, lock);
	cl_lock_delete(env, lock);
	cl_lock_mutex_put(env, lock);
	cl_lock_put(env, lock);
}

/**
 * Allocates a new lock for \a descr, initializes common state, and calls
 * cl_object_operations::coo_lock_init() on every layer of \a obj.
 *
 * \retval new lock with a single reference on success,
 *	   ERR_PTR(-ENOMEM) or the first layer's error otherwise.
 */
static struct cl_lock *cl_lock_alloc(const struct lu_env *env,
				     struct cl_object *obj,
				     const struct cl_io *io,
				     const struct cl_lock_descr *descr)
{
	struct cl_lock *lock;
	struct lu_object_header *head;

	ENTRY;
	OBD_SLAB_ALLOC_PTR_GFP(lock, cl_lock_kmem, __GFP_IO);
	if (lock != NULL) {
		atomic_set(&lock->cll_ref, 1);
		lock->cll_descr = *descr;
		lock->cll_state = CLS_NEW;
		cl_object_get(obj);
		lock->cll_obj_ref = lu_object_ref_add(&obj->co_lu,
						      "cl_lock", lock);
		INIT_LIST_HEAD(&lock->cll_layers);
		INIT_LIST_HEAD(&lock->cll_linkage);
		INIT_LIST_HEAD(&lock->cll_inclosure);
		lu_ref_init(&lock->cll_reference);
		lu_ref_init(&lock->cll_holders);
		mutex_init(&lock->cll_guard);
		lockdep_set_class(&lock->cll_guard, &cl_lock_guard_class);
		init_waitqueue_head(&lock->cll_wq);
		head = obj->co_lu.lo_header;
		CS_LOCKSTATE_INC(obj, CLS_NEW);
		CS_LOCK_INC(obj, total);
		CS_LOCK_INC(obj, create);
		cl_lock_lockdep_init(lock);
		/* NOTE: \a obj is reused below as the per-layer iteration
		 * cursor; the top object is not needed past this point. */
		list_for_each_entry(obj, &head->loh_layers,
				    co_lu.lo_linkage) {
			int err;

			err = obj->co_ops->coo_lock_init(env, obj, lock, io);
			if (err != 0) {
				/* Partially initialized lock is torn down
				 * through the regular destruction path. */
				cl_lock_finish(env, lock);
				lock = ERR_PTR(err);
				break;
			}
		}
	} else
		lock = ERR_PTR(-ENOMEM);
	RETURN(lock);
}

/**
 * Transfer the lock into INTRANSIT state and return the original state.
 *
 * \pre state: CLS_CACHED, CLS_HELD or CLS_ENQUEUED
 * \post state: CLS_INTRANSIT
 * \see CLS_INTRANSIT
 */
enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
				     struct cl_lock *lock)
{
	enum cl_lock_state state = lock->cll_state;

	LASSERT(cl_lock_is_mutexed(lock));
	LASSERT(state != CLS_INTRANSIT);
	LASSERTF(state >= CLS_ENQUEUED && state <= CLS_CACHED,
		 "Malformed lock state %d.\n", state);

	cl_lock_state_set(env, lock, CLS_INTRANSIT);
	lock->cll_intransit_owner = current;
	/* Extra hold keeps the lock pinned for the duration of transit. */
	cl_lock_hold_add(env, lock, "intransit", current);
	return state;
}
EXPORT_SYMBOL(cl_lock_intransit);

/**
 * Exit the intransit state and restore the lock state to the original state
 */
void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
		       enum cl_lock_state state)
{
	LASSERT(cl_lock_is_mutexed(lock));
	LASSERT(lock->cll_state == CLS_INTRANSIT);
	LASSERT(state != CLS_INTRANSIT);
	LASSERT(lock->cll_intransit_owner == current);

	lock->cll_intransit_owner = NULL;
	cl_lock_state_set(env, lock, state);
	cl_lock_unhold(env, lock, "intransit", current);
}
EXPORT_SYMBOL(cl_lock_extransit);

/**
 * Checking whether the lock is intransit state
 *
 * Returns false for the transit owner itself, so the owning thread may keep
 * operating on the lock while it is in transit.
 */
int cl_lock_is_intransit(struct cl_lock *lock)
{
	LASSERT(cl_lock_is_mutexed(lock));
	return lock->cll_state == CLS_INTRANSIT &&
	       lock->cll_intransit_owner != current;
}
EXPORT_SYMBOL(cl_lock_is_intransit);
/**
 * Returns true iff lock is "suitable" for given io. E.g., locks acquired by
 * truncate and O_APPEND cannot be reused for read/non-append-write, as they
 * cover multiple stripes and can trigger cascading timeouts.
 */
static int cl_lock_fits_into(const struct lu_env *env,
			     const struct cl_lock *lock,
			     const struct cl_lock_descr *need,
			     const struct cl_io *io)
{
	const struct cl_lock_slice *slice;

	LINVRNT(cl_lock_invariant_trusted(env, lock));
	ENTRY;
	/* Every layer must agree; a layer without clo_fits_into() accepts. */
	list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
		if (slice->cls_ops->clo_fits_into != NULL &&
		    !slice->cls_ops->clo_fits_into(env, slice, need, io))
			RETURN(0);
	}
	RETURN(1);
}

/**
 * Scans the per-object lock list for a lock matching \a need.
 *
 * \pre caller holds coh_lock_guard
 * \retval matching lock with an extra trusted reference, or NULL
 */
static struct cl_lock *cl_lock_lookup(const struct lu_env *env,
				      struct cl_object *obj,
				      const struct cl_io *io,
				      const struct cl_lock_descr *need)
{
	struct cl_lock *lock;
	struct cl_object_header *head;

	ENTRY;

	head = cl_object_header(obj);
	LINVRNT(spin_is_locked(&head->coh_lock_guard));
	CS_LOCK_INC(obj, lookup);
	list_for_each_entry(lock, &head->coh_locks, cll_linkage) {
		int matched;

		/* Reject locks that are dying, errored or cancelled. */
		matched = cl_lock_ext_match(&lock->cll_descr, need) &&
			  lock->cll_state < CLS_FREEING &&
			  lock->cll_error == 0 &&
			  !(lock->cll_flags & CLF_CANCELLED) &&
			  cl_lock_fits_into(env, lock, need, io);
		CDEBUG(D_DLMTRACE, "has: "DDESCR"(%d) need: "DDESCR": %d\n",
		       PDESCR(&lock->cll_descr), lock->cll_state, PDESCR(need),
		       matched);
		if (matched) {
			cl_lock_get_trust(lock);
			CS_LOCK_INC(obj, hit);
			RETURN(lock);
		}
	}
	RETURN(NULL);
}

/**
 * Returns a lock matching description \a need.
 *
 * This is the main entry point into the cl_lock caching interface. First, a
 * cache (implemented as a per-object linked list) is consulted. If lock is
 * found there, it is returned immediately. Otherwise new lock is allocated
 * and returned. In any case, additional reference to lock is acquired.
+ * + * \see cl_object_find(), cl_page_find() + */ +static struct cl_lock *cl_lock_find(const struct lu_env *env, + const struct cl_io *io, + const struct cl_lock_descr *need) +{ + struct cl_object_header *head; + struct cl_object *obj; + struct cl_lock *lock; + + ENTRY; + + obj = need->cld_obj; + head = cl_object_header(obj); + + spin_lock(&head->coh_lock_guard); + lock = cl_lock_lookup(env, obj, io, need); + spin_unlock(&head->coh_lock_guard); + + if (lock == NULL) { + lock = cl_lock_alloc(env, obj, io, need); + if (!IS_ERR(lock)) { + struct cl_lock *ghost; + + spin_lock(&head->coh_lock_guard); + ghost = cl_lock_lookup(env, obj, io, need); + if (ghost == NULL) { + list_add_tail(&lock->cll_linkage, + &head->coh_locks); + spin_unlock(&head->coh_lock_guard); + CS_LOCK_INC(obj, busy); + } else { + spin_unlock(&head->coh_lock_guard); + /* + * Other threads can acquire references to the + * top-lock through its sub-locks. Hence, it + * cannot be cl_lock_free()-ed immediately. + */ + cl_lock_finish(env, lock); + lock = ghost; + } + } + } + RETURN(lock); +} + +/** + * Returns existing lock matching given description. This is similar to + * cl_lock_find() except that no new lock is created, and returned lock is + * guaranteed to be in enum cl_lock_state::CLS_HELD state. + */ +struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_object_header *head; + struct cl_object *obj; + struct cl_lock *lock; + + obj = need->cld_obj; + head = cl_object_header(obj); + + do { + spin_lock(&head->coh_lock_guard); + lock = cl_lock_lookup(env, obj, io, need); + spin_unlock(&head->coh_lock_guard); + if (lock == NULL) + return NULL; + + cl_lock_mutex_get(env, lock); + if (lock->cll_state == CLS_INTRANSIT) + /* Don't care return value. 
*/ + cl_lock_state_wait(env, lock); + if (lock->cll_state == CLS_FREEING) { + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + lock = NULL; + } + } while (lock == NULL); + + cl_lock_hold_add(env, lock, scope, source); + cl_lock_user_add(env, lock); + if (lock->cll_state == CLS_CACHED) + cl_use_try(env, lock, 1); + if (lock->cll_state == CLS_HELD) { + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_acquire(env, lock, 0); + cl_lock_put(env, lock); + } else { + cl_unuse_try(env, lock); + cl_lock_unhold(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + lock = NULL; + } + + return lock; +} +EXPORT_SYMBOL(cl_lock_peek); + +/** + * Returns a slice within a lock, corresponding to the given layer in the + * device stack. + * + * \see cl_page_at() + */ +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype) +{ + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_invariant_trusted(NULL, lock)); + ENTRY; + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_obj->co_lu.lo_dev->ld_type == dtype) + RETURN(slice); + } + RETURN(NULL); +} +EXPORT_SYMBOL(cl_lock_at); + +static void cl_lock_mutex_tail(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_counters *counters; + + counters = cl_lock_counters(env, lock); + lock->cll_depth++; + counters->ctc_nr_locks_locked++; + lu_ref_add(&counters->ctc_locks_locked, "cll_guard", lock); + cl_lock_trace(D_TRACE, env, "got mutex", lock); +} + +/** + * Locks cl_lock object. + * + * This is used to manipulate cl_lock fields, and to serialize state + * transitions in the lock state machine. 
 *
 * \post cl_lock_is_mutexed(lock)
 *
 * \see cl_lock_mutex_put()
 */
void cl_lock_mutex_get(const struct lu_env *env, struct cl_lock *lock)
{
	LINVRNT(cl_lock_invariant(env, lock));

	if (lock->cll_guarder == current) {
		/* Recursive acquisition by the current owner: depth is
		 * bumped in cl_lock_mutex_tail() below. */
		LINVRNT(cl_lock_is_mutexed(lock));
		LINVRNT(lock->cll_depth > 0);
	} else {
		struct cl_object_header *hdr;
		struct cl_thread_info *info;
		int i;

		LINVRNT(lock->cll_guarder != current);
		hdr = cl_object_header(lock->cll_descr.cld_obj);
		/*
		 * Check that mutices are taken in the bottom-to-top order.
		 */
		info = cl_env_info(env);
		for (i = 0; i < hdr->coh_nesting; ++i)
			LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0);
		mutex_lock_nested(&lock->cll_guard, hdr->coh_nesting);
		lock->cll_guarder = current;
		LINVRNT(lock->cll_depth == 0);
	}
	cl_lock_mutex_tail(env, lock);
}
EXPORT_SYMBOL(cl_lock_mutex_get);

/**
 * Try-locks cl_lock object.
 *
 * \retval 0 \a lock was successfully locked
 *
 * \retval -EBUSY \a lock cannot be locked right now
 *
 * \post ergo(result == 0, cl_lock_is_mutexed(lock))
 *
 * \see cl_lock_mutex_get()
 */
int cl_lock_mutex_try(const struct lu_env *env, struct cl_lock *lock)
{
	int result;

	LINVRNT(cl_lock_invariant_trusted(env, lock));
	ENTRY;

	result = 0;
	if (lock->cll_guarder == current) {
		/* Already owned: recursive try-lock always succeeds. */
		LINVRNT(lock->cll_depth > 0);
		cl_lock_mutex_tail(env, lock);
	} else if (mutex_trylock(&lock->cll_guard)) {
		LINVRNT(lock->cll_depth == 0);
		lock->cll_guarder = current;
		cl_lock_mutex_tail(env, lock);
	} else
		result = -EBUSY;
	RETURN(result);
}
EXPORT_SYMBOL(cl_lock_mutex_try);

/**
 * Unlocks cl_lock object.
+ * + * \pre cl_lock_is_mutexed(lock) + * + * \see cl_lock_mutex_get() + */ +void cl_lock_mutex_put(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_counters *counters; + + LINVRNT(cl_lock_invariant(env, lock)); + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(lock->cll_guarder == current); + LINVRNT(lock->cll_depth > 0); + + counters = cl_lock_counters(env, lock); + LINVRNT(counters->ctc_nr_locks_locked > 0); + + cl_lock_trace(D_TRACE, env, "put mutex", lock); + lu_ref_del(&counters->ctc_locks_locked, "cll_guard", lock); + counters->ctc_nr_locks_locked--; + if (--lock->cll_depth == 0) { + lock->cll_guarder = NULL; + mutex_unlock(&lock->cll_guard); + } +} +EXPORT_SYMBOL(cl_lock_mutex_put); + +/** + * Returns true iff lock's mutex is owned by the current thread. + */ +int cl_lock_is_mutexed(struct cl_lock *lock) +{ + return lock->cll_guarder == current; +} +EXPORT_SYMBOL(cl_lock_is_mutexed); + +/** + * Returns number of cl_lock mutices held by the current thread (environment). + */ +int cl_lock_nr_mutexed(const struct lu_env *env) +{ + struct cl_thread_info *info; + int i; + int locked; + + /* + * NOTE: if summation across all nesting levels (currently 2) proves + * too expensive, a summary counter can be added to + * struct cl_thread_info. 
+ */ + info = cl_env_info(env); + for (i = 0, locked = 0; i < ARRAY_SIZE(info->clt_counters); ++i) + locked += info->clt_counters[i].ctc_nr_locks_locked; + return locked; +} +EXPORT_SYMBOL(cl_lock_nr_mutexed); + +static void cl_lock_cancel0(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + if (!(lock->cll_flags & CLF_CANCELLED)) { + const struct cl_lock_slice *slice; + + lock->cll_flags |= CLF_CANCELLED; + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_cancel != NULL) + slice->cls_ops->clo_cancel(env, slice); + } + } + EXIT; +} + +static void cl_lock_delete0(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_object_header *head; + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + ENTRY; + if (lock->cll_state < CLS_FREEING) { + LASSERT(lock->cll_state != CLS_INTRANSIT); + cl_lock_state_set(env, lock, CLS_FREEING); + + head = cl_object_header(lock->cll_descr.cld_obj); + + spin_lock(&head->coh_lock_guard); + list_del_init(&lock->cll_linkage); + spin_unlock(&head->coh_lock_guard); + + /* + * From now on, no new references to this lock can be acquired + * by cl_lock_lookup(). + */ + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_delete != NULL) + slice->cls_ops->clo_delete(env, slice); + } + /* + * From now on, no new references to this lock can be acquired + * by layer-specific means (like a pointer from struct + * ldlm_lock in osc, or a pointer from top-lock to sub-lock in + * lov). + * + * Lock will be finally freed in cl_lock_put() when last of + * existing references goes away. + */ + } + EXIT; +} + +/** + * Mod(ifie)s cl_lock::cll_holds counter for a given lock. Also, for a + * top-lock (nesting == 0) accounts for this modification in the per-thread + * debugging counters. 
Sub-lock holds can be released by a thread different
+ * from one that acquired it.
+ *
+ * \param delta +1 when a hold is taken, -1 when it is dropped; the
+ * per-thread debugging counter is maintained for top-locks (CNL_TOP) only.
+ */
+static void cl_lock_hold_mod(const struct lu_env *env, struct cl_lock *lock,
+ int delta)
+{
+ struct cl_thread_counters *counters;
+ enum clt_nesting_level nesting;
+
+ lock->cll_holds += delta;
+ nesting = cl_lock_nesting(lock);
+ if (nesting == CNL_TOP) {
+ counters = &cl_env_info(env)->clt_counters[CNL_TOP];
+ counters->ctc_nr_held += delta;
+ LASSERT(counters->ctc_nr_held >= 0);
+ }
+}
+
+/**
+ * Mod(ifie)s cl_lock::cll_users counter for a given lock. See
+ * cl_lock_hold_mod() for the explanation of the debugging code.
+ */
+static void cl_lock_used_mod(const struct lu_env *env, struct cl_lock *lock,
+ int delta)
+{
+ struct cl_thread_counters *counters;
+ enum clt_nesting_level nesting;
+
+ lock->cll_users += delta;
+ nesting = cl_lock_nesting(lock);
+ if (nesting == CNL_TOP) {
+ counters = &cl_env_info(env)->clt_counters[CNL_TOP];
+ counters->ctc_nr_used += delta;
+ LASSERT(counters->ctc_nr_used >= 0);
+ }
+}
+
+/**
+ * Releases a hold on \a lock previously taken under \a scope/\a source.
+ *
+ * When the last hold goes away, a pending cancellation (CLF_CANCELPEND)
+ * and/or deletion (CLF_DOOMED) recorded while holds were outstanding is
+ * carried out here; a lock that is still PHANTOM/GROUP, or not CACHED,
+ * is marked for both.
+ *
+ * \pre cl_lock_is_mutexed(lock) && lock->cll_holds > 0
+ */
+void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_holds > 0);
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "hold release lock", lock);
+ lu_ref_del(&lock->cll_holders, scope, source);
+ cl_lock_hold_mod(env, lock, -1);
+ if (lock->cll_holds == 0) {
+ CL_LOCK_ASSERT(lock->cll_state != CLS_HELD, env, lock);
+ if (lock->cll_descr.cld_mode == CLM_PHANTOM ||
+ lock->cll_descr.cld_mode == CLM_GROUP ||
+ lock->cll_state != CLS_CACHED)
+ /*
+ * If lock is still phantom or grouplock when user is
+ * done with it---destroy the lock.
+ */
+ lock->cll_flags |= CLF_CANCELPEND|CLF_DOOMED;
+ if (lock->cll_flags & CLF_CANCELPEND) {
+ lock->cll_flags &= ~CLF_CANCELPEND;
+ cl_lock_cancel0(env, lock);
+ }
+ if (lock->cll_flags & CLF_DOOMED) {
+ /* no longer doomed: it's dead... Jim.
*/ + lock->cll_flags &= ~CLF_DOOMED; + cl_lock_delete0(env, lock); + } + } + EXIT; +} +EXPORT_SYMBOL(cl_lock_hold_release); + +/** + * Waits until lock state is changed. + * + * This function is called with cl_lock mutex locked, atomically releases + * mutex and goes to sleep, waiting for a lock state change (signaled by + * cl_lock_signal()), and re-acquires the mutex before return. + * + * This function is used to wait until lock state machine makes some progress + * and to emulate synchronous operations on top of asynchronous lock + * interface. + * + * \retval -EINTR wait was interrupted + * + * \retval 0 wait wasn't interrupted + * + * \pre cl_lock_is_mutexed(lock) + * + * \see cl_lock_signal() + */ +int cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock) +{ + wait_queue_t waiter; + sigset_t blocked; + int result; + + ENTRY; + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_depth == 1); + LASSERT(lock->cll_state != CLS_FREEING); /* too late to wait */ + + cl_lock_trace(D_DLMTRACE, env, "state wait lock", lock); + result = lock->cll_error; + if (result == 0) { + /* To avoid being interrupted by the 'non-fatal' signals + * (SIGCHLD, for instance), we'd block them temporarily. 
+ * LU-305 */ + blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS); + + init_waitqueue_entry_current(&waiter); + add_wait_queue(&lock->cll_wq, &waiter); + set_current_state(TASK_INTERRUPTIBLE); + cl_lock_mutex_put(env, lock); + + LASSERT(cl_lock_nr_mutexed(env) == 0); + + /* Returning ERESTARTSYS instead of EINTR so syscalls + * can be restarted if signals are pending here */ + result = -ERESTARTSYS; + if (likely(!OBD_FAIL_CHECK(OBD_FAIL_LOCK_STATE_WAIT_INTR))) { + waitq_wait(&waiter, TASK_INTERRUPTIBLE); + if (!cfs_signal_pending()) + result = 0; + } + + cl_lock_mutex_get(env, lock); + set_current_state(TASK_RUNNING); + remove_wait_queue(&lock->cll_wq, &waiter); + + /* Restore old blocked signals */ + cfs_restore_sigs(blocked); + } + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_state_wait); + +static void cl_lock_state_signal(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state) +{ + const struct cl_lock_slice *slice; + + ENTRY; + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) + if (slice->cls_ops->clo_state != NULL) + slice->cls_ops->clo_state(env, slice, state); + wake_up_all(&lock->cll_wq); + EXIT; +} + +/** + * Notifies waiters that lock state changed. + * + * Wakes up all waiters sleeping in cl_lock_state_wait(), also notifies all + * layers about state change by calling cl_lock_operations::clo_state() + * top-to-bottom. + */ +void cl_lock_signal(const struct lu_env *env, struct cl_lock *lock) +{ + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "state signal lock", lock); + cl_lock_state_signal(env, lock, lock->cll_state); + EXIT; +} +EXPORT_SYMBOL(cl_lock_signal); + +/** + * Changes lock state. + * + * This function is invoked to notify layers that lock state changed, possible + * as a result of an asynchronous event such as call-back reception. 
+ * + * \post lock->cll_state == state + * + * \see cl_lock_operations::clo_state() + */ +void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state) +{ + ENTRY; + LASSERT(lock->cll_state <= state || + (lock->cll_state == CLS_CACHED && + (state == CLS_HELD || /* lock found in cache */ + state == CLS_NEW || /* sub-lock canceled */ + state == CLS_INTRANSIT)) || + /* lock is in transit state */ + lock->cll_state == CLS_INTRANSIT); + + if (lock->cll_state != state) { + CS_LOCKSTATE_DEC(lock->cll_descr.cld_obj, lock->cll_state); + CS_LOCKSTATE_INC(lock->cll_descr.cld_obj, state); + + cl_lock_state_signal(env, lock, state); + lock->cll_state = state; + } + EXIT; +} +EXPORT_SYMBOL(cl_lock_state_set); + +static int cl_unuse_try_internal(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + int result; + + do { + result = 0; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_state == CLS_INTRANSIT); + + result = -ENOSYS; + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_unuse != NULL) { + result = slice->cls_ops->clo_unuse(env, slice); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + } while (result == CLO_REPEAT); + + return result; +} + +/** + * Yanks lock from the cache (cl_lock_state::CLS_CACHED state) by calling + * cl_lock_operations::clo_use() top-to-bottom to notify layers. 
+ * If @atomic = 1 and the use fails part-way, the lock is unused again so
+ * that the whole use operation stays atomic.
+ */
+int cl_use_try(const struct lu_env *env, struct cl_lock *lock, int atomic)
+{
+ const struct cl_lock_slice *slice;
+ int result;
+ enum cl_lock_state state;
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "use lock", lock);
+
+ LASSERT(lock->cll_state == CLS_CACHED);
+ if (lock->cll_error)
+ RETURN(lock->cll_error);
+
+ result = -ENOSYS;
+ /* Mark the lock in-transit; the returned value is the previous state,
+ * restored via cl_lock_extransit() if no layer changes it below. */
+ state = cl_lock_intransit(env, lock);
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_use != NULL) {
+ result = slice->cls_ops->clo_use(env, slice);
+ if (result != 0)
+ break;
+ }
+ }
+ /* at least one layer must implement ->clo_use() */
+ LASSERT(result != -ENOSYS);
+
+ LASSERTF(lock->cll_state == CLS_INTRANSIT, "Wrong state %d.\n",
+ lock->cll_state);
+
+ if (result == 0) {
+ state = CLS_HELD;
+ } else {
+ if (result == -ESTALE) {
+ /*
+ * ESTALE means sublock being cancelled
+ * at this time, and set lock state to
+ * be NEW here and ask the caller to repeat.
+ */
+ state = CLS_NEW;
+ result = CLO_REPEAT;
+ }
+
+ /* @atomic means back-off-on-failure. */
+ if (atomic) {
+ int rc;
+ rc = cl_unuse_try_internal(env, lock);
+ /* Vet the results. */
+ if (rc < 0 && result > 0)
+ result = rc;
+ }
+
+ }
+ cl_lock_extransit(env, lock, state);
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_use_try);
+
+/**
+ * Helper for cl_enqueue_try() that calls ->clo_enqueue() across all layers
+ * top-to-bottom.
+ */
+static int cl_enqueue_kick(const struct lu_env *env,
+ struct cl_lock *lock,
+ struct cl_io *io, __u32 flags)
+{
+ int result;
+ const struct cl_lock_slice *slice;
+
+ ENTRY;
+ result = -ENOSYS;
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_enqueue != NULL) {
+ result = slice->cls_ops->clo_enqueue(env,
+ slice, io, flags);
+ if (result != 0)
+ break;
+ }
+ }
+ /* at least one layer must implement ->clo_enqueue() */
+ LASSERT(result != -ENOSYS);
+ RETURN(result);
+}
+
+/**
+ * Tries to enqueue a lock.
+ *
+ * This function is called repeatedly by cl_enqueue() until either lock is
+ * enqueued, or error occurs. This function does not block waiting for
+ * networking communication to complete.
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ * lock->cll_state == CLS_HELD)
+ *
+ * \see cl_enqueue() cl_lock_operations::clo_enqueue()
+ * \see cl_lock_state::CLS_ENQUEUED
+ */
+int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_io *io, __u32 flags)
+{
+ int result;
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "enqueue lock", lock);
+ /* Drive the state machine until it settles; CLO_REPEAT from a layer
+ * (e.g., via cl_use_try() on a CACHED lock) requests another pass. */
+ do {
+ LINVRNT(cl_lock_is_mutexed(lock));
+
+ result = lock->cll_error;
+ if (result != 0)
+ break;
+
+ switch (lock->cll_state) {
+ case CLS_NEW:
+ cl_lock_state_set(env, lock, CLS_QUEUING);
+ /* fall-through */
+ case CLS_QUEUING:
+ /* kick layers. */
+ result = cl_enqueue_kick(env, lock, io, flags);
+ /* For AGL case, the cl_lock::cll_state may
+ * become CLS_HELD already. */
+ if (result == 0 && lock->cll_state == CLS_QUEUING)
+ cl_lock_state_set(env, lock, CLS_ENQUEUED);
+ break;
+ case CLS_INTRANSIT:
+ LASSERT(cl_lock_is_intransit(lock));
+ result = CLO_WAIT;
+ break;
+ case CLS_CACHED:
+ /* yank lock from the cache. */
+ result = cl_use_try(env, lock, 0);
+ break;
+ case CLS_ENQUEUED:
+ case CLS_HELD:
+ result = 0;
+ break;
+ default:
+ case CLS_FREEING:
+ /*
+ * impossible, only held locks with increased
+ * ->cll_holds can be enqueued, and they cannot be
+ * freed.
+ */
+ LBUG();
+ }
+ } while (result == CLO_REPEAT);
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_enqueue_try);
+
+/**
+ * Cancel the conflicting lock found during previous enqueue.
+ *
+ * \retval 0 conflicting lock has been canceled.
+ * \retval -ve error code.
+ */
+int cl_lock_enqueue_wait(const struct lu_env *env,
+ struct cl_lock *lock,
+ int keep_mutex)
+{
+ struct cl_lock *conflict;
+ int rc = 0;
+ ENTRY;
+
+ LASSERT(cl_lock_is_mutexed(lock));
+ LASSERT(lock->cll_state == CLS_QUEUING);
+ LASSERT(lock->cll_conflict != NULL);
+
+ /* Take ownership of the recorded conflict and clear it on the lock. */
+ conflict = lock->cll_conflict;
+ lock->cll_conflict = NULL;
+
+ /* Drop our mutex before taking the conflict's to avoid holding two
+ * lock mutices at once. */
+ cl_lock_mutex_put(env, lock);
+ LASSERT(cl_lock_nr_mutexed(env) == 0);
+
+ cl_lock_mutex_get(env, conflict);
+ cl_lock_trace(D_DLMTRACE, env, "enqueue wait", conflict);
+ cl_lock_cancel(env, conflict);
+ cl_lock_delete(env, conflict);
+
+ /* Wait until the conflicting lock actually reaches CLS_FREEING. */
+ while (conflict->cll_state != CLS_FREEING) {
+ rc = cl_lock_state_wait(env, conflict);
+ if (rc != 0)
+ break;
+ }
+ cl_lock_mutex_put(env, conflict);
+ lu_ref_del(&conflict->cll_reference, "cancel-wait", lock);
+ cl_lock_put(env, conflict);
+
+ if (keep_mutex)
+ cl_lock_mutex_get(env, lock);
+
+ LASSERT(rc <= 0);
+ RETURN(rc);
+}
+EXPORT_SYMBOL(cl_lock_enqueue_wait);
+
+/**
+ * Enqueue \a lock with its mutex already held: adds a user reference and
+ * loops over cl_enqueue_try(), sleeping on CLO_WAIT (either waiting for a
+ * conflicting lock to be cancelled, or for a state change).  On failure the
+ * lock is unused again.
+ *
+ * \pre cl_lock_is_mutexed(lock) && lock->cll_holds > 0
+ */
+static int cl_enqueue_locked(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_io *io, __u32 enqflags)
+{
+ int result;
+
+ ENTRY;
+
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_holds > 0);
+
+ cl_lock_user_add(env, lock);
+ do {
+ result = cl_enqueue_try(env, lock, io, enqflags);
+ if (result == CLO_WAIT) {
+ if (lock->cll_conflict != NULL)
+ result = cl_lock_enqueue_wait(env, lock, 1);
+ else
+ result = cl_lock_state_wait(env, lock);
+ if (result == 0)
+ continue;
+ }
+ break;
+ } while (1);
+ if (result != 0)
+ cl_unuse_try(env, lock);
+ LASSERT(ergo(result == 0 && !(enqflags & CEF_AGL),
+ lock->cll_state == CLS_ENQUEUED ||
+ lock->cll_state == CLS_HELD));
+ RETURN(result);
+}
+
+/**
+ * Enqueues a lock.
+ *
+ * \pre current thread or io owns a hold on lock.
+ * + * \post ergo(result == 0, lock->users increased) + * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + * lock->cll_state == CLS_HELD) + */ +int cl_enqueue(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 enqflags) +{ + int result; + + ENTRY; + + cl_lock_lockdep_acquire(env, lock, enqflags); + cl_lock_mutex_get(env, lock); + result = cl_enqueue_locked(env, lock, io, enqflags); + cl_lock_mutex_put(env, lock); + if (result != 0) + cl_lock_lockdep_release(env, lock); + LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + lock->cll_state == CLS_HELD)); + RETURN(result); +} +EXPORT_SYMBOL(cl_enqueue); + +/** + * Tries to unlock a lock. + * + * This function is called to release underlying resource: + * 1. for top lock, the resource is sublocks it held; + * 2. for sublock, the resource is the reference to dlmlock. + * + * cl_unuse_try is a one-shot operation, so it must NOT return CLO_WAIT. + * + * \see cl_unuse() cl_lock_operations::clo_unuse() + * \see cl_lock_state::CLS_CACHED + */ +int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + enum cl_lock_state state = CLS_NEW; + + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "unuse lock", lock); + + if (lock->cll_users > 1) { + cl_lock_user_del(env, lock); + RETURN(0); + } + + /* Only if the lock is in CLS_HELD or CLS_ENQUEUED state, it can hold + * underlying resources. */ + if (!(lock->cll_state == CLS_HELD || lock->cll_state == CLS_ENQUEUED)) { + cl_lock_user_del(env, lock); + RETURN(0); + } + + /* + * New lock users (->cll_users) are not protecting unlocking + * from proceeding. From this point, lock eventually reaches + * CLS_CACHED, is reinitialized to CLS_NEW or fails into + * CLS_FREEING. 
+ */ + state = cl_lock_intransit(env, lock); + + result = cl_unuse_try_internal(env, lock); + LASSERT(lock->cll_state == CLS_INTRANSIT); + LASSERT(result != CLO_WAIT); + cl_lock_user_del(env, lock); + if (result == 0 || result == -ESTALE) { + /* + * Return lock back to the cache. This is the only + * place where lock is moved into CLS_CACHED state. + * + * If one of ->clo_unuse() methods returned -ESTALE, lock + * cannot be placed into cache and has to be + * re-initialized. This happens e.g., when a sub-lock was + * canceled while unlocking was in progress. + */ + if (state == CLS_HELD && result == 0) + state = CLS_CACHED; + else + state = CLS_NEW; + cl_lock_extransit(env, lock, state); + + /* + * Hide -ESTALE error. + * If the lock is a glimpse lock, and it has multiple + * stripes. Assuming that one of its sublock returned -ENAVAIL, + * and other sublocks are matched write locks. In this case, + * we can't set this lock to error because otherwise some of + * its sublocks may not be canceled. This causes some dirty + * pages won't be written to OSTs. -jay + */ + result = 0; + } else { + CERROR("result = %d, this is unlikely!\n", result); + state = CLS_NEW; + cl_lock_extransit(env, lock, state); + } + RETURN(result ?: lock->cll_error); +} +EXPORT_SYMBOL(cl_unuse_try); + +static void cl_unuse_locked(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + ENTRY; + + result = cl_unuse_try(env, lock); + if (result) + CL_LOCK_DEBUG(D_ERROR, env, lock, "unuse return %d\n", result); + + EXIT; +} + +/** + * Unlocks a lock. + */ +void cl_unuse(const struct lu_env *env, struct cl_lock *lock) +{ + ENTRY; + cl_lock_mutex_get(env, lock); + cl_unuse_locked(env, lock); + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_release(env, lock); + EXIT; +} +EXPORT_SYMBOL(cl_unuse); + +/** + * Tries to wait for a lock. + * + * This function is called repeatedly by cl_wait() until either lock is + * granted, or error occurs. 
This function does not block waiting for network + * communication to complete. + * + * \see cl_wait() cl_lock_operations::clo_wait() + * \see cl_lock_state::CLS_HELD + */ +int cl_wait_try(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + int result; + + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "wait lock try", lock); + do { + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERTF(lock->cll_state == CLS_QUEUING || + lock->cll_state == CLS_ENQUEUED || + lock->cll_state == CLS_HELD || + lock->cll_state == CLS_INTRANSIT, + "lock state: %d\n", lock->cll_state); + LASSERT(lock->cll_users > 0); + LASSERT(lock->cll_holds > 0); + + result = lock->cll_error; + if (result != 0) + break; + + if (cl_lock_is_intransit(lock)) { + result = CLO_WAIT; + break; + } + + if (lock->cll_state == CLS_HELD) + /* nothing to do */ + break; + + result = -ENOSYS; + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_wait != NULL) { + result = slice->cls_ops->clo_wait(env, slice); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + if (result == 0) { + LASSERT(lock->cll_state != CLS_INTRANSIT); + cl_lock_state_set(env, lock, CLS_HELD); + } + } while (result == CLO_REPEAT); + RETURN(result); +} +EXPORT_SYMBOL(cl_wait_try); + +/** + * Waits until enqueued lock is granted. 
+ * + * \pre current thread or io owns a hold on the lock + * \pre ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + * lock->cll_state == CLS_HELD) + * + * \post ergo(result == 0, lock->cll_state == CLS_HELD) + */ +int cl_wait(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + + ENTRY; + cl_lock_mutex_get(env, lock); + + LINVRNT(cl_lock_invariant(env, lock)); + LASSERTF(lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD, + "Wrong state %d \n", lock->cll_state); + LASSERT(lock->cll_holds > 0); + + do { + result = cl_wait_try(env, lock); + if (result == CLO_WAIT) { + result = cl_lock_state_wait(env, lock); + if (result == 0) + continue; + } + break; + } while (1); + if (result < 0) { + cl_unuse_try(env, lock); + cl_lock_lockdep_release(env, lock); + } + cl_lock_trace(D_DLMTRACE, env, "wait lock", lock); + cl_lock_mutex_put(env, lock); + LASSERT(ergo(result == 0, lock->cll_state == CLS_HELD)); + RETURN(result); +} +EXPORT_SYMBOL(cl_wait); + +/** + * Executes cl_lock_operations::clo_weigh(), and sums results to estimate lock + * value. + */ +unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + unsigned long pound; + unsigned long ounce; + + ENTRY; + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + pound = 0; + list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_weigh != NULL) { + ounce = slice->cls_ops->clo_weigh(env, slice); + pound += ounce; + if (pound < ounce) /* over-weight^Wflow */ + pound = ~0UL; + } + } + RETURN(pound); +} +EXPORT_SYMBOL(cl_lock_weigh); + +/** + * Notifies layers that lock description changed. + * + * The server can grant client a lock different from one that was requested + * (e.g., larger in extent). This method is called when actually granted lock + * description becomes known to let layers to accommodate for changed lock + * description. 
+ *
+ * \see cl_lock_operations::clo_modify()
+ */
+int cl_lock_modify(const struct lu_env *env, struct cl_lock *lock,
+ const struct cl_lock_descr *desc)
+{
+ const struct cl_lock_slice *slice;
+ struct cl_object *obj = lock->cll_descr.cld_obj;
+ struct cl_object_header *hdr = cl_object_header(obj);
+ int result;
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "modify lock", lock);
+ /* don't allow object to change */
+ LASSERT(obj == desc->cld_obj);
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ /* notify layers bottom-to-top; first failure aborts the modification */
+ list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_modify != NULL) {
+ result = slice->cls_ops->clo_modify(env, slice, desc);
+ if (result != 0)
+ RETURN(result);
+ }
+ }
+ CL_LOCK_DEBUG(D_DLMTRACE, env, lock, " -> "DDESCR"@"DFID"\n",
+ PDESCR(desc), PFID(lu_object_fid(&desc->cld_obj->co_lu)));
+ /*
+ * Just replace description in place. Nothing more is needed for
+ * now. If locks were indexed according to their extent and/or mode,
+ * that index would have to be updated here.
+ */
+ /* coh_lock_guard serializes the update against concurrent readers of
+ * cll_descr (presumably cl_lock_lookup() — confirm against header). */
+ spin_lock(&hdr->coh_lock_guard);
+ lock->cll_descr = *desc;
+ spin_unlock(&hdr->coh_lock_guard);
+ RETURN(0);
+}
+EXPORT_SYMBOL(cl_lock_modify);
+
+/**
+ * Initializes lock closure with a given origin.
+ *
+ * \see cl_lock_closure
+ */
+void cl_lock_closure_init(const struct lu_env *env,
+ struct cl_lock_closure *closure,
+ struct cl_lock *origin, int wait)
+{
+ LINVRNT(cl_lock_is_mutexed(origin));
+ LINVRNT(cl_lock_invariant(env, origin));
+
+ INIT_LIST_HEAD(&closure->clc_list);
+ closure->clc_origin = origin;
+ closure->clc_wait = wait;
+ closure->clc_nr = 0;
+}
+EXPORT_SYMBOL(cl_lock_closure_init);
+
+/**
+ * Builds a closure of \a lock.
+ *
+ * Building of a closure consists of adding initial lock (\a lock) into it,
+ * and calling cl_lock_operations::clo_closure() methods of \a lock. These
+ * methods might call cl_lock_closure_build() recursively again, adding more
+ * locks to the closure, etc.
+ * + * \see cl_lock_closure + */ +int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure) +{ + const struct cl_lock_slice *slice; + int result; + + ENTRY; + LINVRNT(cl_lock_is_mutexed(closure->clc_origin)); + LINVRNT(cl_lock_invariant(env, closure->clc_origin)); + + result = cl_lock_enclosure(env, lock, closure); + if (result == 0) { + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_closure != NULL) { + result = slice->cls_ops->clo_closure(env, slice, + closure); + if (result != 0) + break; + } + } + } + if (result != 0) + cl_lock_disclosure(env, closure); + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_closure_build); + +/** + * Adds new lock to a closure. + * + * Try-locks \a lock and if succeeded, adds it to the closure (never more than + * once). If try-lock failed, returns CLO_REPEAT, after optionally waiting + * until next try-lock is likely to succeed. + */ +int cl_lock_enclosure(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure) +{ + int result = 0; + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "enclosure lock", lock); + if (!cl_lock_mutex_try(env, lock)) { + /* + * If lock->cll_inclosure is not empty, lock is already in + * this closure. 
+ */ + if (list_empty(&lock->cll_inclosure)) { + cl_lock_get_trust(lock); + lu_ref_add(&lock->cll_reference, "closure", closure); + list_add(&lock->cll_inclosure, &closure->clc_list); + closure->clc_nr++; + } else + cl_lock_mutex_put(env, lock); + result = 0; + } else { + cl_lock_disclosure(env, closure); + if (closure->clc_wait) { + cl_lock_get_trust(lock); + lu_ref_add(&lock->cll_reference, "closure-w", closure); + cl_lock_mutex_put(env, closure->clc_origin); + + LASSERT(cl_lock_nr_mutexed(env) == 0); + cl_lock_mutex_get(env, lock); + cl_lock_mutex_put(env, lock); + + cl_lock_mutex_get(env, closure->clc_origin); + lu_ref_del(&lock->cll_reference, "closure-w", closure); + cl_lock_put(env, lock); + } + result = CLO_REPEAT; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_enclosure); + +/** Releases mutices of enclosed locks. */ +void cl_lock_disclosure(const struct lu_env *env, + struct cl_lock_closure *closure) +{ + struct cl_lock *scan; + struct cl_lock *temp; + + cl_lock_trace(D_DLMTRACE, env, "disclosure lock", closure->clc_origin); + list_for_each_entry_safe(scan, temp, &closure->clc_list, + cll_inclosure){ + list_del_init(&scan->cll_inclosure); + cl_lock_mutex_put(env, scan); + lu_ref_del(&scan->cll_reference, "closure", closure); + cl_lock_put(env, scan); + closure->clc_nr--; + } + LASSERT(closure->clc_nr == 0); +} +EXPORT_SYMBOL(cl_lock_disclosure); + +/** Finalizes a closure. */ +void cl_lock_closure_fini(struct cl_lock_closure *closure) +{ + LASSERT(closure->clc_nr == 0); + LASSERT(list_empty(&closure->clc_list)); +} +EXPORT_SYMBOL(cl_lock_closure_fini); + +/** + * Destroys this lock. Notifies layers (bottom-to-top) that lock is being + * destroyed, then destroy the lock. If there are holds on the lock, postpone + * destruction until all holds are released. This is called when a decision is + * made to destroy the lock in the future. E.g., when a blocking AST is + * received on it, or fatal communication error happens. 
+ *
+ * Caller must have a reference on this lock to prevent a situation, when
+ * deleted lock lingers in memory for indefinite time, because nobody calls
+ * cl_lock_put() to finish it.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ * \pre ergo(cl_lock_nesting(lock) == CNL_TOP,
+ * cl_lock_nr_mutexed(env) == 1)
+ * [i.e., if a top-lock is deleted, mutices of no other locks can be
+ * held, as deletion of sub-locks might require releasing a top-lock
+ * mutex]
+ *
+ * \see cl_lock_operations::clo_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(ergo(cl_lock_nesting(lock) == CNL_TOP,
+ cl_lock_nr_mutexed(env) == 1));
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "delete lock", lock);
+ /* with holds outstanding, deletion is deferred to
+ * cl_lock_hold_release() via CLF_DOOMED */
+ if (lock->cll_holds == 0)
+ cl_lock_delete0(env, lock);
+ else
+ lock->cll_flags |= CLF_DOOMED;
+ EXIT;
+}
+EXPORT_SYMBOL(cl_lock_delete);
+
+/**
+ * Mark lock as irrecoverably failed, and mark it for destruction. This
+ * happens when, e.g., server fails to grant a lock to us, or networking
+ * time-out happens.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ *
+ * \see clo_lock_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_error(const struct lu_env *env, struct cl_lock *lock, int error)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ ENTRY;
+ /* only the first error is recorded; later errors are ignored */
+ if (lock->cll_error == 0 && error != 0) {
+ cl_lock_trace(D_DLMTRACE, env, "set lock error", lock);
+ lock->cll_error = error;
+ cl_lock_signal(env, lock);
+ cl_lock_cancel(env, lock);
+ cl_lock_delete(env, lock);
+ }
+ EXIT;
+}
+EXPORT_SYMBOL(cl_lock_error);
+
+/**
+ * Cancels this lock. Notifies layers
+ * (bottom-to-top) that lock is being cancelled, then destroy the lock. If
+ * there are holds on the lock, postpone cancellation until
+ * all holds are released.
+ *
+ * Cancellation notification is delivered to layers at most once.
+ * + * \see cl_lock_operations::clo_cancel() + * \see cl_lock::cll_holds + */ +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock); + if (lock->cll_holds == 0) + cl_lock_cancel0(env, lock); + else + lock->cll_flags |= CLF_CANCELPEND; + EXIT; +} +EXPORT_SYMBOL(cl_lock_cancel); + +/** + * Finds an existing lock covering given index and optionally different from a + * given \a except lock. + */ +struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env, + struct cl_object *obj, pgoff_t index, + struct cl_lock *except, + int pending, int canceld) +{ + struct cl_object_header *head; + struct cl_lock *scan; + struct cl_lock *lock; + struct cl_lock_descr *need; + + ENTRY; + + head = cl_object_header(obj); + need = &cl_env_info(env)->clt_descr; + lock = NULL; + + need->cld_mode = CLM_READ; /* CLM_READ matches both READ & WRITE, but + * not PHANTOM */ + need->cld_start = need->cld_end = index; + need->cld_enq_flags = 0; + + spin_lock(&head->coh_lock_guard); + /* It is fine to match any group lock since there could be only one + * with a uniq gid and it conflicts with all other lock modes too */ + list_for_each_entry(scan, &head->coh_locks, cll_linkage) { + if (scan != except && + (scan->cll_descr.cld_mode == CLM_GROUP || + cl_lock_ext_match(&scan->cll_descr, need)) && + scan->cll_state >= CLS_HELD && + scan->cll_state < CLS_FREEING && + /* + * This check is racy as the lock can be canceled right + * after it is done, but this is fine, because page exists + * already. + */ + (canceld || !(scan->cll_flags & CLF_CANCELLED)) && + (pending || !(scan->cll_flags & CLF_CANCELPEND))) { + /* Don't increase cs_hit here since this + * is just a helper function. 
*/ + cl_lock_get_trust(scan); + lock = scan; + break; + } + } + spin_unlock(&head->coh_lock_guard); + RETURN(lock); +} +EXPORT_SYMBOL(cl_lock_at_pgoff); + +/** + * Calculate the page offset at the layer of @lock. + * At the time of this writing, @page is top page and @lock is sub lock. + */ +static pgoff_t pgoff_at_lock(struct cl_page *page, struct cl_lock *lock) +{ + struct lu_device_type *dtype; + const struct cl_page_slice *slice; + + dtype = lock->cll_descr.cld_obj->co_lu.lo_dev->ld_type; + slice = cl_page_at(page, dtype); + LASSERT(slice != NULL); + return slice->cpl_page->cp_index; +} + +/** + * Check if page @page is covered by an extra lock or discard it. + */ +static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_lock *lock = cbdata; + pgoff_t index = pgoff_at_lock(page, lock); + + if (index >= info->clt_fn_index) { + struct cl_lock *tmp; + + /* refresh non-overlapped index */ + tmp = cl_lock_at_pgoff(env, lock->cll_descr.cld_obj, index, + lock, 1, 0); + if (tmp != NULL) { + /* Cache the first-non-overlapped index so as to skip + * all pages within [index, clt_fn_index). This + * is safe because if tmp lock is canceled, it will + * discard these pages. 
*/ + info->clt_fn_index = tmp->cll_descr.cld_end + 1; + if (tmp->cll_descr.cld_end == CL_PAGE_EOF) + info->clt_fn_index = CL_PAGE_EOF; + cl_lock_put(env, tmp); + } else if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + info->clt_next_index = index + 1; + return CLP_GANG_OKAY; +} + +static int discard_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_lock *lock = cbdata; + + LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE); + KLASSERT(ergo(page->cp_type == CPT_CACHEABLE, + !PageWriteback(cl_page_vmpage(env, page)))); + KLASSERT(ergo(page->cp_type == CPT_CACHEABLE, + !PageDirty(cl_page_vmpage(env, page)))); + + info->clt_next_index = pgoff_at_lock(page, lock) + 1; + if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + + return CLP_GANG_OKAY; +} + +/** + * Discard pages protected by the given lock. This function traverses radix + * tree to find all covering pages and discard them. If a page is being covered + * by other locks, it should remain in cache. + * + * If error happens on any step, the process continues anyway (the reasoning + * behind this being that lock cancellation cannot be delayed indefinitely). 
+ */ +int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_io *io = &info->clt_io; + struct cl_lock_descr *descr = &lock->cll_descr; + cl_page_gang_cb_t cb; + int res; + int result; + + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + + io->ci_obj = cl_object_top(descr->cld_obj); + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + GOTO(out, result); + + cb = descr->cld_mode == CLM_READ ? check_and_discard_cb : discard_cb; + info->clt_fn_index = info->clt_next_index = descr->cld_start; + do { + res = cl_page_gang_lookup(env, descr->cld_obj, io, + info->clt_next_index, descr->cld_end, + cb, (void *)lock); + if (info->clt_next_index > descr->cld_end) + break; + + if (res == CLP_GANG_RESCHED) + cond_resched(); + } while (res != CLP_GANG_OKAY); +out: + cl_io_fini(env, io); + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_discard_pages); + +/** + * Eliminate all locks for a given object. + * + * Caller has to guarantee that no lock is in active use. + * + * \param cancel when this is set, cl_locks_prune() cancels locks before + * destroying. + */ +void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int cancel) +{ + struct cl_object_header *head; + struct cl_lock *lock; + + ENTRY; + head = cl_object_header(obj); + /* + * If locks are destroyed without cancellation, all pages must be + * already destroyed (as otherwise they will be left unprotected). 
+ */ + LASSERT(ergo(!cancel, + head->coh_tree.rnode == NULL && head->coh_pages == 0)); + + spin_lock(&head->coh_lock_guard); + while (!list_empty(&head->coh_locks)) { + lock = container_of(head->coh_locks.next, + struct cl_lock, cll_linkage); + cl_lock_get_trust(lock); + spin_unlock(&head->coh_lock_guard); + lu_ref_add(&lock->cll_reference, "prune", current); + +again: + cl_lock_mutex_get(env, lock); + if (lock->cll_state < CLS_FREEING) { + LASSERT(lock->cll_users <= 1); + if (unlikely(lock->cll_users == 1)) { + struct l_wait_info lwi = { 0 }; + + cl_lock_mutex_put(env, lock); + l_wait_event(lock->cll_wq, + lock->cll_users == 0, + &lwi); + goto again; + } + + if (cancel) + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + } + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, "prune", current); + cl_lock_put(env, lock); + spin_lock(&head->coh_lock_guard); + } + spin_unlock(&head->coh_lock_guard); + EXIT; +} +EXPORT_SYMBOL(cl_locks_prune); + +static struct cl_lock *cl_lock_hold_mutex(const struct lu_env *env, + const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + + ENTRY; + + while (1) { + lock = cl_lock_find(env, io, need); + if (IS_ERR(lock)) + break; + cl_lock_mutex_get(env, lock); + if (lock->cll_state < CLS_FREEING && + !(lock->cll_flags & CLF_CANCELLED)) { + cl_lock_hold_mod(env, lock, +1); + lu_ref_add(&lock->cll_holders, scope, source); + lu_ref_add(&lock->cll_reference, scope, source); + break; + } + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + } + RETURN(lock); +} + +/** + * Returns a lock matching \a need description with a reference and a hold on + * it. + * + * This is much like cl_lock_find(), except that cl_lock_hold() additionally + * guarantees that lock is not in the CLS_FREEING state on return. 
+ */ +struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + + ENTRY; + + lock = cl_lock_hold_mutex(env, io, need, scope, source); + if (!IS_ERR(lock)) + cl_lock_mutex_put(env, lock); + RETURN(lock); +} +EXPORT_SYMBOL(cl_lock_hold); + +/** + * Main high-level entry point of cl_lock interface that finds existing or + * enqueues new lock matching given description. + */ +struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + int rc; + __u32 enqflags = need->cld_enq_flags; + + ENTRY; + do { + lock = cl_lock_hold_mutex(env, io, need, scope, source); + if (IS_ERR(lock)) + break; + + rc = cl_enqueue_locked(env, lock, io, enqflags); + if (rc == 0) { + if (cl_lock_fits_into(env, lock, need, io)) { + if (!(enqflags & CEF_AGL)) { + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_acquire(env, lock, + enqflags); + break; + } + rc = 1; + } + cl_unuse_locked(env, lock); + } + cl_lock_trace(D_DLMTRACE, env, + rc <= 0 ? "enqueue failed" : "agl succeed", lock); + cl_lock_hold_release(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); + if (rc > 0) { + LASSERT(enqflags & CEF_AGL); + lock = NULL; + } else if (rc != 0) { + lock = ERR_PTR(rc); + } + } while (rc == 0); + RETURN(lock); +} +EXPORT_SYMBOL(cl_lock_request); + +/** + * Adds a hold to a known lock. 
+ */ +void cl_lock_hold_add(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_state != CLS_FREEING); + + ENTRY; + cl_lock_hold_mod(env, lock, +1); + cl_lock_get(lock); + lu_ref_add(&lock->cll_holders, scope, source); + lu_ref_add(&lock->cll_reference, scope, source); + EXIT; +} +EXPORT_SYMBOL(cl_lock_hold_add); + +/** + * Releases a hold and a reference on a lock, on which caller acquired a + * mutex. + */ +void cl_lock_unhold(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + cl_lock_hold_release(env, lock, scope, source); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); + EXIT; +} +EXPORT_SYMBOL(cl_lock_unhold); + +/** + * Releases a hold and a reference on a lock, obtained by cl_lock_hold(). + */ +void cl_lock_release(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "release lock", lock); + cl_lock_mutex_get(env, lock); + cl_lock_hold_release(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); + EXIT; +} +EXPORT_SYMBOL(cl_lock_release); + +void cl_lock_user_add(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + ENTRY; + cl_lock_used_mod(env, lock, +1); + EXIT; +} +EXPORT_SYMBOL(cl_lock_user_add); + +void cl_lock_user_del(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_users > 0); + + ENTRY; + cl_lock_used_mod(env, lock, -1); + if (lock->cll_users == 0) + wake_up_all(&lock->cll_wq); + EXIT; +} 
+EXPORT_SYMBOL(cl_lock_user_del); + +const char *cl_lock_mode_name(const enum cl_lock_mode mode) +{ + static const char *names[] = { + [CLM_PHANTOM] = "P", + [CLM_READ] = "R", + [CLM_WRITE] = "W", + [CLM_GROUP] = "G" + }; + if (0 <= mode && mode < ARRAY_SIZE(names)) + return names[mode]; + else + return "U"; +} +EXPORT_SYMBOL(cl_lock_mode_name); + +/** + * Prints human readable representation of a lock description. + */ +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr) +{ + const struct lu_fid *fid; + + fid = lu_object_fid(&descr->cld_obj->co_lu); + (*printer)(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid)); +} +EXPORT_SYMBOL(cl_lock_descr_print); + +/** + * Prints human readable representation of \a lock to the \a f. + */ +void cl_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + (*printer)(env, cookie, "lock@%p[%d %d %d %d %d %08lx] ", + lock, atomic_read(&lock->cll_ref), + lock->cll_state, lock->cll_error, lock->cll_holds, + lock->cll_users, lock->cll_flags); + cl_lock_descr_print(env, cookie, printer, &lock->cll_descr); + (*printer)(env, cookie, " {\n"); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + (*printer)(env, cookie, " %s@%p: ", + slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name, + slice); + if (slice->cls_ops->clo_print != NULL) + slice->cls_ops->clo_print(env, cookie, printer, slice); + (*printer)(env, cookie, "\n"); + } + (*printer)(env, cookie, "} lock@%p\n", lock); +} +EXPORT_SYMBOL(cl_lock_print); + +int cl_lock_init(void) +{ + return lu_kmem_init(cl_lock_caches); +} + +void cl_lock_fini(void) +{ + lu_kmem_fini(cl_lock_caches); +} diff --git a/drivers/staging/lustre/lustre/obdclass/cl_object.c b/drivers/staging/lustre/lustre/obdclass/cl_object.c new file mode 100644 index 000000000000..cdb5fba04591 --- /dev/null +++ 
b/drivers/staging/lustre/lustre/obdclass/cl_object.c @@ -0,0 +1,1148 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Lustre Object. + * + * Author: Nikita Danilov <nikita.danilov@sun.com> + */ + +/* + * Locking. 
+ * + * i_mutex + * PG_locked + * ->coh_page_guard + * ->coh_lock_guard + * ->coh_attr_guard + * ->ls_guard + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <linux/libcfs/libcfs.h> +/* class_put_type() */ +#include <obd_class.h> +#include <obd_support.h> +#include <lustre_fid.h> +#include <linux/list.h> +#include <linux/libcfs/libcfs_hash.h> /* for cfs_hash stuff */ +#include <cl_object.h> +#include "cl_internal.h" + +static struct kmem_cache *cl_env_kmem; + +/** Lock class of cl_object_header::coh_page_guard */ +static struct lock_class_key cl_page_guard_class; +/** Lock class of cl_object_header::coh_lock_guard */ +static struct lock_class_key cl_lock_guard_class; +/** Lock class of cl_object_header::coh_attr_guard */ +static struct lock_class_key cl_attr_guard_class; + +extern __u32 lu_context_tags_default; +extern __u32 lu_session_tags_default; +/** + * Initialize cl_object_header. + */ +int cl_object_header_init(struct cl_object_header *h) +{ + int result; + + ENTRY; + result = lu_object_header_init(&h->coh_lu); + if (result == 0) { + spin_lock_init(&h->coh_page_guard); + spin_lock_init(&h->coh_lock_guard); + spin_lock_init(&h->coh_attr_guard); + lockdep_set_class(&h->coh_page_guard, &cl_page_guard_class); + lockdep_set_class(&h->coh_lock_guard, &cl_lock_guard_class); + lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class); + h->coh_pages = 0; + /* XXX hard coded GFP_* mask. */ + INIT_RADIX_TREE(&h->coh_tree, GFP_ATOMIC); + INIT_LIST_HEAD(&h->coh_locks); + h->coh_page_bufsize = ALIGN(sizeof(struct cl_page), 8); + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_header_init); + +/** + * Finalize cl_object_header. + */ +void cl_object_header_fini(struct cl_object_header *h) +{ + LASSERT(list_empty(&h->coh_locks)); + lu_object_header_fini(&h->coh_lu); +} +EXPORT_SYMBOL(cl_object_header_fini); + +/** + * Returns a cl_object with a given \a fid. + * + * Returns either cached or newly created object. 
Additional reference on the + * returned object is acquired. + * + * \see lu_object_find(), cl_page_find(), cl_lock_find() + */ +struct cl_object *cl_object_find(const struct lu_env *env, + struct cl_device *cd, const struct lu_fid *fid, + const struct cl_object_conf *c) +{ + might_sleep(); + return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu)); +} +EXPORT_SYMBOL(cl_object_find); + +/** + * Releases a reference on \a o. + * + * When last reference is released object is returned to the cache, unless + * lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in its header. + * + * \see cl_page_put(), cl_lock_put(). + */ +void cl_object_put(const struct lu_env *env, struct cl_object *o) +{ + lu_object_put(env, &o->co_lu); +} +EXPORT_SYMBOL(cl_object_put); + +/** + * Acquire an additional reference to the object \a o. + * + * This can only be used to acquire _additional_ reference, i.e., caller + * already has to possess at least one reference to \a o before calling this. + * + * \see cl_page_get(), cl_lock_get(). + */ +void cl_object_get(struct cl_object *o) +{ + lu_object_get(&o->co_lu); +} +EXPORT_SYMBOL(cl_object_get); + +/** + * Returns the top-object for a given \a o. + * + * \see cl_page_top(), cl_io_top() + */ +struct cl_object *cl_object_top(struct cl_object *o) +{ + struct cl_object_header *hdr = cl_object_header(o); + struct cl_object *top; + + while (hdr->coh_parent != NULL) + hdr = hdr->coh_parent; + + top = lu2cl(lu_object_top(&hdr->coh_lu)); + CDEBUG(D_TRACE, "%p -> %p\n", o, top); + return top; +} +EXPORT_SYMBOL(cl_object_top); + +/** + * Returns pointer to the lock protecting data-attributes for the given object + * \a o. + * + * Data-attributes are protected by the cl_object_header::coh_attr_guard + * spin-lock in the top-object. + * + * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get(). 
+ */ +static spinlock_t *cl_object_attr_guard(struct cl_object *o) +{ + return &cl_object_header(cl_object_top(o))->coh_attr_guard; +} + +/** + * Locks data-attributes. + * + * Prevents data-attributes from changing, until lock is released by + * cl_object_attr_unlock(). This has to be called before calls to + * cl_object_attr_get(), cl_object_attr_set(). + */ +void cl_object_attr_lock(struct cl_object *o) +{ + spin_lock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_lock); + +/** + * Releases data-attributes lock, acquired by cl_object_attr_lock(). + */ +void cl_object_attr_unlock(struct cl_object *o) +{ + spin_unlock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_unlock); + +/** + * Returns data-attributes of an object \a obj. + * + * Every layer is asked (by calling cl_object_operations::coo_attr_get()) + * top-to-bottom to fill in parts of \a attr that this layer is responsible + * for. + */ +int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lu_object_header *top; + int result; + + LASSERT(spin_is_locked(cl_object_attr_guard(obj))); + ENTRY; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_attr_get != NULL) { + result = obj->co_ops->coo_attr_get(env, obj, attr); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_attr_get); + +/** + * Updates data-attributes of an object \a obj. + * + * Only attributes, mentioned in a validness bit-mask \a v are + * updated. Calls cl_object_operations::coo_attr_set() on every layer, bottom + * to top. 
+ */ +int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned v) +{ + struct lu_object_header *top; + int result; + + LASSERT(spin_is_locked(cl_object_attr_guard(obj))); + ENTRY; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry_reverse(obj, &top->loh_layers, + co_lu.lo_linkage) { + if (obj->co_ops->coo_attr_set != NULL) { + result = obj->co_ops->coo_attr_set(env, obj, attr, v); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_attr_set); + +/** + * Notifies layers (bottom-to-top) that glimpse AST was received. + * + * Layers have to fill \a lvb fields with information that will be shipped + * back to glimpse issuer. + * + * \see cl_lock_operations::clo_glimpse() + */ +int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lu_object_header *top; + int result; + + ENTRY; + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry_reverse(obj, &top->loh_layers, + co_lu.lo_linkage) { + if (obj->co_ops->coo_glimpse != NULL) { + result = obj->co_ops->coo_glimpse(env, obj, lvb); + if (result != 0) + break; + } + } + LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top), + "size: "LPU64" mtime: "LPU64" atime: "LPU64" " + "ctime: "LPU64" blocks: "LPU64"\n", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); + RETURN(result); +} +EXPORT_SYMBOL(cl_object_glimpse); + +/** + * Updates a configuration of an object \a obj. 
+ */ +int cl_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct lu_object_header *top; + int result; + + ENTRY; + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_conf_set != NULL) { + result = obj->co_ops->coo_conf_set(env, obj, conf); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_conf_set); + +/** + * Helper function removing all object locks, and marking object for + * deletion. All object pages must have been deleted at this point. + * + * This is called by cl_inode_fini() and lov_object_delete() to destroy top- + * and sub- objects respectively. + */ +void cl_object_kill(const struct lu_env *env, struct cl_object *obj) +{ + struct cl_object_header *hdr; + + hdr = cl_object_header(obj); + LASSERT(hdr->coh_tree.rnode == NULL); + LASSERT(hdr->coh_pages == 0); + + set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags); + /* + * Destroy all locks. Object destruction (including cl_inode_fini()) + * cannot cancel the locks, because in the case of a local client, + * where client and server share the same thread running + * prune_icache(), this can dead-lock with ldlm_cancel_handler() + * waiting on __wait_on_freeing_inode(). + */ + cl_locks_prune(env, obj, 0); +} +EXPORT_SYMBOL(cl_object_kill); + +/** + * Prunes caches of pages and locks for this object. + */ +void cl_object_prune(const struct lu_env *env, struct cl_object *obj) +{ + ENTRY; + cl_pages_prune(env, obj); + cl_locks_prune(env, obj, 1); + EXIT; +} +EXPORT_SYMBOL(cl_object_prune); + +/** + * Check if the object has locks. 
+ */ +int cl_object_has_locks(struct cl_object *obj) +{ + struct cl_object_header *head = cl_object_header(obj); + int has; + + spin_lock(&head->coh_lock_guard); + has = list_empty(&head->coh_locks); + spin_unlock(&head->coh_lock_guard); + + return (has == 0); +} +EXPORT_SYMBOL(cl_object_has_locks); + +void cache_stats_init(struct cache_stats *cs, const char *name) +{ + int i; + + cs->cs_name = name; + for (i = 0; i < CS_NR; i++) + atomic_set(&cs->cs_stats[i], 0); +} + +int cache_stats_print(const struct cache_stats *cs, struct seq_file *m, int h) +{ + int i; + /* + * lookup hit total cached create + * env: ...... ...... ...... ...... ...... + */ + if (h) { + const char *names[CS_NR] = CS_NAMES; + + seq_printf(m, "%6s", " "); + for (i = 0; i < CS_NR; i++) + seq_printf(m, "%8s", names[i]); + seq_printf(m, "\n"); + } + + seq_printf(m, "%5.5s:", cs->cs_name); + for (i = 0; i < CS_NR; i++) + seq_printf(m, "%8u", atomic_read(&cs->cs_stats[i])); + return 0; +} + +/** + * Initialize client site. + * + * Perform common initialization (lu_site_init()), and initialize statistical + * counters. Also perform global initializations on the first call. + */ +int cl_site_init(struct cl_site *s, struct cl_device *d) +{ + int i; + int result; + + result = lu_site_init(&s->cs_lu, &d->cd_lu_dev); + if (result == 0) { + cache_stats_init(&s->cs_pages, "pages"); + cache_stats_init(&s->cs_locks, "locks"); + for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i) + atomic_set(&s->cs_pages_state[0], 0); + for (i = 0; i < ARRAY_SIZE(s->cs_locks_state); ++i) + atomic_set(&s->cs_locks_state[i], 0); + } + return result; +} +EXPORT_SYMBOL(cl_site_init); + +/** + * Finalize client site. Dual to cl_site_init(). + */ +void cl_site_fini(struct cl_site *s) +{ + lu_site_fini(&s->cs_lu); +} +EXPORT_SYMBOL(cl_site_fini); + +static struct cache_stats cl_env_stats = { + .cs_name = "envs", + .cs_stats = { ATOMIC_INIT(0), } +}; + +/** + * Outputs client site statistical counters into a buffer. 
Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *site, struct seq_file *m) +{ + int i; + static const char *pstate[] = { + [CPS_CACHED] = "c", + [CPS_OWNED] = "o", + [CPS_PAGEOUT] = "w", + [CPS_PAGEIN] = "r", + [CPS_FREEING] = "f" + }; + static const char *lstate[] = { + [CLS_NEW] = "n", + [CLS_QUEUING] = "q", + [CLS_ENQUEUED] = "e", + [CLS_HELD] = "h", + [CLS_INTRANSIT] = "t", + [CLS_CACHED] = "c", + [CLS_FREEING] = "f" + }; +/* + lookup hit total busy create +pages: ...... ...... ...... ...... ...... [...... ...... ...... ......] +locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......] + env: ...... ...... ...... ...... ...... + */ + lu_site_stats_print(&site->cs_lu, m); + cache_stats_print(&site->cs_pages, m, 1); + seq_printf(m, " ["); + for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i) + seq_printf(m, "%s: %u ", pstate[i], + atomic_read(&site->cs_pages_state[i])); + seq_printf(m, "]\n"); + cache_stats_print(&site->cs_locks, m, 0); + seq_printf(m, " ["); + for (i = 0; i < ARRAY_SIZE(site->cs_locks_state); ++i) + seq_printf(m, "%s: %u ", lstate[i], + atomic_read(&site->cs_locks_state[i])); + seq_printf(m, "]\n"); + cache_stats_print(&cl_env_stats, m, 0); + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(cl_site_stats_print); + +/***************************************************************************** + * + * lu_env handling on client. + * + */ + +/** + * The most efficient way is to store cl_env pointer in task specific + * structures. On Linux, it wont' be easy to use task_struct->journal_info + * because Lustre code may call into other fs which has certain assumptions + * about journal_info. Currently following fields in task_struct are identified + * can be used for this purpose: + * - cl_env: for liblustre. + * - tux_info: ony on RedHat kernel. + * - ... 
+ * \note As long as we use task_struct to store cl_env, we assume that once + * called into Lustre, we'll never call into the other part of the kernel + * which will use those fields in task_struct without explicitly exiting + * Lustre. + * + * If there's no space in task_struct is available, hash will be used. + * bz20044, bz22683. + */ + +struct cl_env { + void *ce_magic; + struct lu_env ce_lu; + struct lu_context ce_ses; + + /** + * This allows cl_env to be entered into cl_env_hash which implements + * the current thread -> client environment lookup. + */ + struct hlist_node ce_node; + /** + * Owner for the current cl_env. + * + * If LL_TASK_CL_ENV is defined, this point to the owning current, + * only for debugging purpose ; + * Otherwise hash is used, and this is the key for cfs_hash. + * Now current thread pid is stored. Note using thread pointer would + * lead to unbalanced hash because of its specific allocation locality + * and could be varied for different platforms and OSes, even different + * OS versions. + */ + void *ce_owner; + + /* + * Linkage into global list of all client environments. Used for + * garbage collection. + */ + struct list_head ce_linkage; + /* + * + */ + int ce_ref; + /* + * Debugging field: address of the caller who made original + * allocation. 
+ */ + void *ce_debug; +}; + +#define CL_ENV_INC(counter) +#define CL_ENV_DEC(counter) + +static void cl_env_init0(struct cl_env *cle, void *debug) +{ + LASSERT(cle->ce_ref == 0); + LASSERT(cle->ce_magic == &cl_env_init0); + LASSERT(cle->ce_debug == NULL && cle->ce_owner == NULL); + + cle->ce_ref = 1; + cle->ce_debug = debug; + CL_ENV_INC(busy); +} + + +/* + * The implementation of using hash table to connect cl_env and thread + */ + +static cfs_hash_t *cl_env_hash; + +static unsigned cl_env_hops_hash(cfs_hash_t *lh, + const void *key, unsigned mask) +{ +#if BITS_PER_LONG == 64 + return cfs_hash_u64_hash((__u64)key, mask); +#else + return cfs_hash_u32_hash((__u32)key, mask); +#endif +} + +static void *cl_env_hops_obj(struct hlist_node *hn) +{ + struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node); + LASSERT(cle->ce_magic == &cl_env_init0); + return (void *)cle; +} + +static int cl_env_hops_keycmp(const void *key, struct hlist_node *hn) +{ + struct cl_env *cle = cl_env_hops_obj(hn); + + LASSERT(cle->ce_owner != NULL); + return (key == cle->ce_owner); +} + +static void cl_env_hops_noop(cfs_hash_t *hs, struct hlist_node *hn) +{ + struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node); + LASSERT(cle->ce_magic == &cl_env_init0); +} + +static cfs_hash_ops_t cl_env_hops = { + .hs_hash = cl_env_hops_hash, + .hs_key = cl_env_hops_obj, + .hs_keycmp = cl_env_hops_keycmp, + .hs_object = cl_env_hops_obj, + .hs_get = cl_env_hops_noop, + .hs_put_locked = cl_env_hops_noop, +}; + +static inline struct cl_env *cl_env_fetch(void) +{ + struct cl_env *cle; + + cle = cfs_hash_lookup(cl_env_hash, (void *) (long) current->pid); + LASSERT(ergo(cle, cle->ce_magic == &cl_env_init0)); + return cle; +} + +static inline void cl_env_attach(struct cl_env *cle) +{ + if (cle) { + int rc; + + LASSERT(cle->ce_owner == NULL); + cle->ce_owner = (void *) (long) current->pid; + rc = cfs_hash_add_unique(cl_env_hash, cle->ce_owner, + &cle->ce_node); + LASSERT(rc == 0); + } +} + +static inline 
void cl_env_do_detach(struct cl_env *cle) +{ + void *cookie; + + LASSERT(cle->ce_owner == (void *) (long) current->pid); + cookie = cfs_hash_del(cl_env_hash, cle->ce_owner, + &cle->ce_node); + LASSERT(cookie == cle); + cle->ce_owner = NULL; +} + +static int cl_env_store_init(void) { + cl_env_hash = cfs_hash_create("cl_env", + HASH_CL_ENV_BITS, HASH_CL_ENV_BITS, + HASH_CL_ENV_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &cl_env_hops, + CFS_HASH_RW_BKTLOCK); + return cl_env_hash != NULL ? 0 :-ENOMEM; +} + +static void cl_env_store_fini(void) { + cfs_hash_putref(cl_env_hash); +} + + +static inline struct cl_env *cl_env_detach(struct cl_env *cle) +{ + if (cle == NULL) + cle = cl_env_fetch(); + + if (cle && cle->ce_owner) + cl_env_do_detach(cle); + + return cle; +} + +static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) +{ + struct lu_env *env; + struct cl_env *cle; + + OBD_SLAB_ALLOC_PTR_GFP(cle, cl_env_kmem, __GFP_IO); + if (cle != NULL) { + int rc; + + INIT_LIST_HEAD(&cle->ce_linkage); + cle->ce_magic = &cl_env_init0; + env = &cle->ce_lu; + rc = lu_env_init(env, LCT_CL_THREAD|ctx_tags); + if (rc == 0) { + rc = lu_context_init(&cle->ce_ses, + LCT_SESSION | ses_tags); + if (rc == 0) { + lu_context_enter(&cle->ce_ses); + env->le_ses = &cle->ce_ses; + cl_env_init0(cle, debug); + } else + lu_env_fini(env); + } + if (rc != 0) { + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); + env = ERR_PTR(rc); + } else { + CL_ENV_INC(create); + CL_ENV_INC(total); + } + } else + env = ERR_PTR(-ENOMEM); + return env; +} + +static void cl_env_fini(struct cl_env *cle) +{ + CL_ENV_DEC(total); + lu_context_fini(&cle->ce_lu.le_ctx); + lu_context_fini(&cle->ce_ses); + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); +} + +static inline struct cl_env *cl_env_container(struct lu_env *env) +{ + return container_of(env, struct cl_env, ce_lu); +} + +struct lu_env *cl_env_peek(int *refcheck) +{ + struct lu_env *env; + struct cl_env *cle; + + CL_ENV_INC(lookup); + + /* check that we 
don't go far from untrusted pointer */ + CLASSERT(offsetof(struct cl_env, ce_magic) == 0); + + env = NULL; + cle = cl_env_fetch(); + if (cle != NULL) { + CL_ENV_INC(hit); + env = &cle->ce_lu; + *refcheck = ++cle->ce_ref; + } + CDEBUG(D_OTHER, "%d@%p\n", cle ? cle->ce_ref : 0, cle); + return env; +} +EXPORT_SYMBOL(cl_env_peek); + +/** + * Returns lu_env: if there already is an environment associated with the + * current thread, it is returned, otherwise, new environment is allocated. + * + * \param refcheck pointer to a counter used to detect environment leaks. In + * the usual case cl_env_get() and cl_env_put() are called in the same lexical + * scope and pointer to the same integer is passed as \a refcheck. This is + * used to detect missed cl_env_put(). + * + * \see cl_env_put() + */ +struct lu_env *cl_env_get(int *refcheck) +{ + struct lu_env *env; + + env = cl_env_peek(refcheck); + if (env == NULL) { + env = cl_env_new(lu_context_tags_default, + lu_session_tags_default, + __builtin_return_address(0)); + + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + cl_env_attach(cle); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + } + return env; +} +EXPORT_SYMBOL(cl_env_get); + +/** + * Forces an allocation of a fresh environment with given tags. + * + * \see cl_env_get() + */ +struct lu_env *cl_env_alloc(int *refcheck, __u32 tags) +{ + struct lu_env *env; + + LASSERT(cl_env_peek(refcheck) == NULL); + env = cl_env_new(tags, tags, __builtin_return_address(0)); + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + return env; +} +EXPORT_SYMBOL(cl_env_alloc); + +static void cl_env_exit(struct cl_env *cle) +{ + LASSERT(cle->ce_owner == NULL); + lu_context_exit(&cle->ce_lu.le_ctx); + lu_context_exit(&cle->ce_ses); +} + +/** + * Release an environment. + * + * Decrement \a env reference counter. 
When counter drops to 0, nothing in + * this thread is using environment and it is returned to the allocation + * cache, or freed straight away, if cache is large enough. + */ +void cl_env_put(struct lu_env *env, int *refcheck) +{ + struct cl_env *cle; + + cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 0); + LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck)); + + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + if (--cle->ce_ref == 0) { + CL_ENV_DEC(busy); + cl_env_detach(cle); + cle->ce_debug = NULL; + cl_env_exit(cle); + cl_env_fini(cle); + } +} +EXPORT_SYMBOL(cl_env_put); + +/** + * Declares a point of re-entrancy. + * + * \see cl_env_reexit() + */ +void *cl_env_reenter(void) +{ + return cl_env_detach(NULL); +} +EXPORT_SYMBOL(cl_env_reenter); + +/** + * Exits re-entrancy. + */ +void cl_env_reexit(void *cookie) +{ + cl_env_detach(NULL); + cl_env_attach(cookie); +} +EXPORT_SYMBOL(cl_env_reexit); + +/** + * Setup user-supplied \a env as a current environment. This is to be used to + * guaranteed that environment exists even when cl_env_get() fails. It is up + * to user to ensure proper concurrency control. + * + * \see cl_env_unplant() + */ +void cl_env_implant(struct lu_env *env, int *refcheck) +{ + struct cl_env *cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 0); + + cl_env_attach(cle); + cl_env_get(refcheck); + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); +} +EXPORT_SYMBOL(cl_env_implant); + +/** + * Detach environment installed earlier by cl_env_implant(). 
+ */ +void cl_env_unplant(struct lu_env *env, int *refcheck) +{ + struct cl_env *cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 1); + + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + + cl_env_detach(cle); + cl_env_put(env, refcheck); +} +EXPORT_SYMBOL(cl_env_unplant); + +struct lu_env *cl_env_nested_get(struct cl_env_nest *nest) +{ + struct lu_env *env; + + nest->cen_cookie = NULL; + env = cl_env_peek(&nest->cen_refcheck); + if (env != NULL) { + if (!cl_io_is_going(env)) + return env; + else { + cl_env_put(env, &nest->cen_refcheck); + nest->cen_cookie = cl_env_reenter(); + } + } + env = cl_env_get(&nest->cen_refcheck); + if (IS_ERR(env)) { + cl_env_reexit(nest->cen_cookie); + return env; + } + + LASSERT(!cl_io_is_going(env)); + return env; +} +EXPORT_SYMBOL(cl_env_nested_get); + +void cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env) +{ + cl_env_put(env, &nest->cen_refcheck); + cl_env_reexit(nest->cen_cookie); +} +EXPORT_SYMBOL(cl_env_nested_put); + +/** + * Converts struct cl_attr to struct ost_lvb. + * + * \see cl_lvb2attr + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) +{ + ENTRY; + lvb->lvb_size = attr->cat_size; + lvb->lvb_mtime = attr->cat_mtime; + lvb->lvb_atime = attr->cat_atime; + lvb->lvb_ctime = attr->cat_ctime; + lvb->lvb_blocks = attr->cat_blocks; + EXIT; +} +EXPORT_SYMBOL(cl_attr2lvb); + +/** + * Converts struct ost_lvb to struct cl_attr. + * + * \see cl_attr2lvb + */ +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb) +{ + ENTRY; + attr->cat_size = lvb->lvb_size; + attr->cat_mtime = lvb->lvb_mtime; + attr->cat_atime = lvb->lvb_atime; + attr->cat_ctime = lvb->lvb_ctime; + attr->cat_blocks = lvb->lvb_blocks; + EXIT; +} +EXPORT_SYMBOL(cl_lvb2attr); + +/***************************************************************************** + * + * Temporary prototype thing: mirror obd-devices into cl devices. 
+ * + */ + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next) +{ + const char *typename; + struct lu_device *d; + + LASSERT(ldt != NULL); + + typename = ldt->ldt_name; + d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL); + if (!IS_ERR(d)) { + int rc; + + if (site != NULL) + d->ld_site = site; + rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next); + if (rc == 0) { + lu_device_get(d); + lu_ref_add(&d->ld_reference, + "lu-stack", &lu_site_init); + } else { + ldt->ldt_ops->ldto_device_free(env, d); + CERROR("can't init device '%s', %d\n", typename, rc); + d = ERR_PTR(rc); + } + } else + CERROR("Cannot allocate device: '%s'\n", typename); + return lu2cl_dev(d); +} +EXPORT_SYMBOL(cl_type_setup); + +/** + * Finalize device stack by calling lu_stack_fini(). + */ +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl) +{ + lu_stack_fini(env, cl2lu_dev(cl)); +} +EXPORT_SYMBOL(cl_stack_fini); + +int cl_lock_init(void); +void cl_lock_fini(void); + +int cl_page_init(void); +void cl_page_fini(void); + +static struct lu_context_key cl_key; + +struct cl_thread_info *cl_env_info(const struct lu_env *env) +{ + return lu_context_key_get(&env->le_ctx, &cl_key); +} + +/* defines cl0_key_{init,fini}() */ +LU_KEY_INIT_FINI(cl0, struct cl_thread_info); + +static void *cl_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct cl_thread_info *info; + + info = cl0_key_init(ctx, key); + if (!IS_ERR(info)) { + int i; + + for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) + lu_ref_init(&info->clt_counters[i].ctc_locks_locked); + } + return info; +} + +static void cl_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct cl_thread_info *info; + int i; + + info = data; + for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) + lu_ref_fini(&info->clt_counters[i].ctc_locks_locked); + cl0_key_fini(ctx, key, data); +} + +static 
void cl_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct cl_thread_info *info = data; + int i; + + for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) { + LASSERT(info->clt_counters[i].ctc_nr_held == 0); + LASSERT(info->clt_counters[i].ctc_nr_used == 0); + LASSERT(info->clt_counters[i].ctc_nr_locks_acquired == 0); + LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0); + lu_ref_fini(&info->clt_counters[i].ctc_locks_locked); + lu_ref_init(&info->clt_counters[i].ctc_locks_locked); + } +} + +static struct lu_context_key cl_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = cl_key_init, + .lct_fini = cl_key_fini, + .lct_exit = cl_key_exit +}; + +static struct lu_kmem_descr cl_object_caches[] = { + { + .ckd_cache = &cl_env_kmem, + .ckd_name = "cl_env_kmem", + .ckd_size = sizeof (struct cl_env) + }, + { + .ckd_cache = NULL + } +}; + +/** + * Global initialization of cl-data. Create kmem caches, register + * lu_context_key's, etc. + * + * \see cl_global_fini() + */ +int cl_global_init(void) +{ + int result; + + result = cl_env_store_init(); + if (result) + return result; + + result = lu_kmem_init(cl_object_caches); + if (result) + goto out_store; + + LU_CONTEXT_KEY_INIT(&cl_key); + result = lu_context_key_register(&cl_key); + if (result) + goto out_kmem; + + result = cl_lock_init(); + if (result) + goto out_context; + + result = cl_page_init(); + if (result) + goto out_lock; + + return 0; +out_lock: + cl_lock_fini(); +out_context: + lu_context_key_degister(&cl_key); +out_kmem: + lu_kmem_fini(cl_object_caches); +out_store: + cl_env_store_fini(); + return result; +} + +/** + * Finalization of global cl-data. Dual to cl_global_init(). 
+ */ +void cl_global_fini(void) +{ + cl_lock_fini(); + cl_page_fini(); + lu_context_key_degister(&cl_key); + lu_kmem_fini(cl_object_caches); + cl_env_store_fini(); +} diff --git a/drivers/staging/lustre/lustre/obdclass/cl_page.c b/drivers/staging/lustre/lustre/obdclass/cl_page.c new file mode 100644 index 000000000000..bb9335911c34 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/cl_page.c @@ -0,0 +1,1605 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Lustre Page. 
+ * + * Author: Nikita Danilov <nikita.danilov@sun.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <linux/libcfs/libcfs.h> +#include <obd_class.h> +#include <obd_support.h> +#include <linux/list.h> + +#include <cl_object.h> +#include "cl_internal.h" + +static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg, + int radix); + +# define PASSERT(env, page, expr) \ + do { \ + if (unlikely(!(expr))) { \ + CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \ + LASSERT(0); \ + } \ + } while (0) + +# define PINVRNT(env, page, exp) \ + ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp)) + +/* Disable page statistics by default due to huge performance penalty. */ +#define CS_PAGE_INC(o, item) +#define CS_PAGE_DEC(o, item) +#define CS_PAGESTATE_INC(o, state) +#define CS_PAGESTATE_DEC(o, state) + +/** + * Internal version of cl_page_top, it should be called if the page is + * known to be not freed, say, with page referenced, or radix tree lock held, + * or page owned. + */ +static struct cl_page *cl_page_top_trusted(struct cl_page *page) +{ + while (page->cp_parent != NULL) + page = page->cp_parent; + return page; +} + +/** + * Internal version of cl_page_get(). + * + * This function can be used to obtain initial reference to previously + * unreferenced cached object. It can be called only if concurrent page + * reclamation is somehow prevented, e.g., by locking page radix-tree + * (cl_object_header::hdr->coh_page_guard), or by keeping a lock on a VM page, + * associated with \a page. + * + * Use with care! Not exported. + */ +static void cl_page_get_trust(struct cl_page *page) +{ + LASSERT(atomic_read(&page->cp_ref) > 0); + atomic_inc(&page->cp_ref); +} + +/** + * Returns a slice within a page, corresponding to the given layer in the + * device stack. 
+ * + * \see cl_lock_at() + */ +static const struct cl_page_slice * +cl_page_at_trusted(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + const struct cl_page_slice *slice; + ENTRY; + + page = cl_page_top_trusted((struct cl_page *)page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype) + RETURN(slice); + } + page = page->cp_child; + } while (page != NULL); + RETURN(NULL); +} + +/** + * Returns a page with given index in the given object, or NULL if no page is + * found. Acquires a reference on \a page. + * + * Locking: called under cl_object_header::coh_page_guard spin-lock. + */ +struct cl_page *cl_page_lookup(struct cl_object_header *hdr, pgoff_t index) +{ + struct cl_page *page; + + LASSERT(spin_is_locked(&hdr->coh_page_guard)); + + page = radix_tree_lookup(&hdr->coh_tree, index); + if (page != NULL) + cl_page_get_trust(page); + return page; +} +EXPORT_SYMBOL(cl_page_lookup); + +/** + * Returns a list of pages by a given [start, end] of \a obj. + * + * \param resched If not NULL, then we give up before hogging CPU for too + * long and set *resched = 1, in that case caller should implement a retry + * logic. + * + * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely + * crucial in the face of [offset, EOF] locks. + * + * Return at least one page in @queue unless there is no covered page. 
+ */ +int cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, pgoff_t start, pgoff_t end, + cl_page_gang_cb_t cb, void *cbdata) +{ + struct cl_object_header *hdr; + struct cl_page *page; + struct cl_page **pvec; + const struct cl_page_slice *slice; + const struct lu_device_type *dtype; + pgoff_t idx; + unsigned int nr; + unsigned int i; + unsigned int j; + int res = CLP_GANG_OKAY; + int tree_lock = 1; + ENTRY; + + idx = start; + hdr = cl_object_header(obj); + pvec = cl_env_info(env)->clt_pvec; + dtype = cl_object_top(obj)->co_lu.lo_dev->ld_type; + spin_lock(&hdr->coh_page_guard); + while ((nr = radix_tree_gang_lookup(&hdr->coh_tree, (void **)pvec, + idx, CLT_PVEC_SIZE)) > 0) { + int end_of_region = 0; + idx = pvec[nr - 1]->cp_index + 1; + for (i = 0, j = 0; i < nr; ++i) { + page = pvec[i]; + pvec[i] = NULL; + + LASSERT(page->cp_type == CPT_CACHEABLE); + if (page->cp_index > end) { + end_of_region = 1; + break; + } + if (page->cp_state == CPS_FREEING) + continue; + + slice = cl_page_at_trusted(page, dtype); + /* + * Pages for lsm-less file has no underneath sub-page + * for osc, in case of ... + */ + PASSERT(env, page, slice != NULL); + + page = slice->cpl_page; + /* + * Can safely call cl_page_get_trust() under + * radix-tree spin-lock. + * + * XXX not true, because @page is from object another + * than @hdr and protected by different tree lock. + */ + cl_page_get_trust(page); + lu_ref_add_atomic(&page->cp_reference, + "gang_lookup", current); + pvec[j++] = page; + } + + /* + * Here a delicate locking dance is performed. Current thread + * holds a reference to a page, but has to own it before it + * can be placed into queue. Owning implies waiting, so + * radix-tree lock is to be released. After a wait one has to + * check that pages weren't truncated (cl_page_own() returns + * error in the latter case). 
+ */ + spin_unlock(&hdr->coh_page_guard); + tree_lock = 0; + + for (i = 0; i < j; ++i) { + page = pvec[i]; + if (res == CLP_GANG_OKAY) + res = (*cb)(env, io, page, cbdata); + lu_ref_del(&page->cp_reference, + "gang_lookup", current); + cl_page_put(env, page); + } + if (nr < CLT_PVEC_SIZE || end_of_region) + break; + + if (res == CLP_GANG_OKAY && need_resched()) + res = CLP_GANG_RESCHED; + if (res != CLP_GANG_OKAY) + break; + + spin_lock(&hdr->coh_page_guard); + tree_lock = 1; + } + if (tree_lock) + spin_unlock(&hdr->coh_page_guard); + RETURN(res); +} +EXPORT_SYMBOL(cl_page_gang_lookup); + +static void cl_page_free(const struct lu_env *env, struct cl_page *page) +{ + struct cl_object *obj = page->cp_obj; + int pagesize = cl_object_header(obj)->coh_page_bufsize; + + PASSERT(env, page, list_empty(&page->cp_batch)); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, page->cp_req == NULL); + PASSERT(env, page, page->cp_parent == NULL); + PASSERT(env, page, page->cp_state == CPS_FREEING); + + ENTRY; + might_sleep(); + while (!list_empty(&page->cp_layers)) { + struct cl_page_slice *slice; + + slice = list_entry(page->cp_layers.next, + struct cl_page_slice, cpl_linkage); + list_del_init(page->cp_layers.next); + slice->cpl_ops->cpo_fini(env, slice); + } + CS_PAGE_DEC(obj, total); + CS_PAGESTATE_DEC(obj, page->cp_state); + lu_object_ref_del_at(&obj->co_lu, page->cp_obj_ref, "cl_page", page); + cl_object_put(env, obj); + lu_ref_fini(&page->cp_reference); + OBD_FREE(page, pagesize); + EXIT; +} + +/** + * Helper function updating page state. This is the only place in the code + * where cl_page::cp_state field is mutated. + */ +static inline void cl_page_state_set_trust(struct cl_page *page, + enum cl_page_state state) +{ + /* bypass const. 
*/ + *(enum cl_page_state *)&page->cp_state = state; +} + +static struct cl_page *cl_page_alloc(const struct lu_env *env, + struct cl_object *o, pgoff_t ind, struct page *vmpage, + enum cl_page_type type) +{ + struct cl_page *page; + struct lu_object_header *head; + + ENTRY; + OBD_ALLOC_GFP(page, cl_object_header(o)->coh_page_bufsize, + __GFP_IO); + if (page != NULL) { + int result = 0; + atomic_set(&page->cp_ref, 1); + if (type == CPT_CACHEABLE) /* for radix tree */ + atomic_inc(&page->cp_ref); + page->cp_obj = o; + cl_object_get(o); + page->cp_obj_ref = lu_object_ref_add(&o->co_lu, "cl_page",page); + page->cp_index = ind; + cl_page_state_set_trust(page, CPS_CACHED); + page->cp_type = type; + INIT_LIST_HEAD(&page->cp_layers); + INIT_LIST_HEAD(&page->cp_batch); + INIT_LIST_HEAD(&page->cp_flight); + mutex_init(&page->cp_mutex); + lu_ref_init(&page->cp_reference); + head = o->co_lu.lo_header; + list_for_each_entry(o, &head->loh_layers, + co_lu.lo_linkage) { + if (o->co_ops->coo_page_init != NULL) { + result = o->co_ops->coo_page_init(env, o, + page, vmpage); + if (result != 0) { + cl_page_delete0(env, page, 0); + cl_page_free(env, page); + page = ERR_PTR(result); + break; + } + } + } + if (result == 0) { + CS_PAGE_INC(o, total); + CS_PAGE_INC(o, create); + CS_PAGESTATE_DEC(o, CPS_CACHED); + } + } else { + page = ERR_PTR(-ENOMEM); + } + RETURN(page); +} + +/** + * Returns a cl_page with index \a idx at the object \a o, and associated with + * the VM page \a vmpage. + * + * This is the main entry point into the cl_page caching interface. First, a + * cache (implemented as a per-object radix tree) is consulted. If page is + * found there, it is returned immediately. Otherwise new page is allocated + * and returned. In any case, additional reference to page is acquired. 
+ * + * \see cl_object_find(), cl_lock_find() + */ +static struct cl_page *cl_page_find0(const struct lu_env *env, + struct cl_object *o, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type, + struct cl_page *parent) +{ + struct cl_page *page = NULL; + struct cl_page *ghost = NULL; + struct cl_object_header *hdr; + int err; + + LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT); + might_sleep(); + + ENTRY; + + hdr = cl_object_header(o); + CS_PAGE_INC(o, lookup); + + CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n", + idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type); + /* fast path. */ + if (type == CPT_CACHEABLE) { + /* vmpage lock is used to protect the child/parent + * relationship */ + KLASSERT(PageLocked(vmpage)); + /* + * cl_vmpage_page() can be called here without any locks as + * + * - "vmpage" is locked (which prevents ->private from + * concurrent updates), and + * + * - "o" cannot be destroyed while current thread holds a + * reference on it. + */ + page = cl_vmpage_page(vmpage, o); + PINVRNT(env, page, + ergo(page != NULL, + cl_page_vmpage(env, page) == vmpage && + (void *)radix_tree_lookup(&hdr->coh_tree, + idx) == page)); + } + + if (page != NULL) { + CS_PAGE_INC(o, hit); + RETURN(page); + } + + /* allocate and initialize cl_page */ + page = cl_page_alloc(env, o, idx, vmpage, type); + if (IS_ERR(page)) + RETURN(page); + + if (type == CPT_TRANSIENT) { + if (parent) { + LASSERT(page->cp_parent == NULL); + page->cp_parent = parent; + parent->cp_child = page; + } + RETURN(page); + } + + /* + * XXX optimization: use radix_tree_preload() here, and change tree + * gfp mask to GFP_KERNEL in cl_object_header_init(). + */ + spin_lock(&hdr->coh_page_guard); + err = radix_tree_insert(&hdr->coh_tree, idx, page); + if (err != 0) { + ghost = page; + /* + * Noted by Jay: a lock on \a vmpage protects cl_page_find() + * from this race, but + * + * 0. 
it's better to have cl_page interface "locally + * consistent" so that its correctness can be reasoned + * about without appealing to the (obscure world of) VM + * locking. + * + * 1. handling this race allows ->coh_tree to remain + * consistent even when VM locking is somehow busted, + * which is very useful during diagnosing and debugging. + */ + page = ERR_PTR(err); + CL_PAGE_DEBUG(D_ERROR, env, ghost, + "fail to insert into radix tree: %d\n", err); + } else { + if (parent) { + LASSERT(page->cp_parent == NULL); + page->cp_parent = parent; + parent->cp_child = page; + } + hdr->coh_pages++; + } + spin_unlock(&hdr->coh_page_guard); + + if (unlikely(ghost != NULL)) { + cl_page_delete0(env, ghost, 0); + cl_page_free(env, ghost); + } + RETURN(page); +} + +struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *o, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type) +{ + return cl_page_find0(env, o, idx, vmpage, type, NULL); +} +EXPORT_SYMBOL(cl_page_find); + + +struct cl_page *cl_page_find_sub(const struct lu_env *env, struct cl_object *o, + pgoff_t idx, struct page *vmpage, + struct cl_page *parent) +{ + return cl_page_find0(env, o, idx, vmpage, parent->cp_type, parent); +} +EXPORT_SYMBOL(cl_page_find_sub); + +static inline int cl_page_invariant(const struct cl_page *pg) +{ + struct cl_object_header *header; + struct cl_page *parent; + struct cl_page *child; + struct cl_io *owner; + + /* + * Page invariant is protected by a VM lock. 
+ */ + LINVRNT(cl_page_is_vmlocked(NULL, pg)); + + header = cl_object_header(pg->cp_obj); + parent = pg->cp_parent; + child = pg->cp_child; + owner = pg->cp_owner; + + return cl_page_in_use(pg) && + ergo(parent != NULL, parent->cp_child == pg) && + ergo(child != NULL, child->cp_parent == pg) && + ergo(child != NULL, pg->cp_obj != child->cp_obj) && + ergo(parent != NULL, pg->cp_obj != parent->cp_obj) && + ergo(owner != NULL && parent != NULL, + parent->cp_owner == pg->cp_owner->ci_parent) && + ergo(owner != NULL && child != NULL, + child->cp_owner->ci_parent == owner) && + /* + * Either page is early in initialization (has neither child + * nor parent yet), or it is in the object radix tree. + */ + ergo(pg->cp_state < CPS_FREEING && pg->cp_type == CPT_CACHEABLE, + (void *)radix_tree_lookup(&header->coh_tree, + pg->cp_index) == pg || + (child == NULL && parent == NULL)); +} + +static void cl_page_state_set0(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + enum cl_page_state old; + + /* + * Matrix of allowed state transitions [old][new], for sanity + * checking. 
+ */ + static const int allowed_transitions[CPS_NR][CPS_NR] = { + [CPS_CACHED] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 1, /* io finds existing cached page */ + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 1, /* write-out from the cache */ + [CPS_FREEING] = 1, /* eviction on the memory pressure */ + }, + [CPS_OWNED] = { + [CPS_CACHED] = 1, /* release to the cache */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 1, /* start read immediately */ + [CPS_PAGEOUT] = 1, /* start write immediately */ + [CPS_FREEING] = 1, /* lock invalidation or truncate */ + }, + [CPS_PAGEIN] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_PAGEOUT] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_FREEING] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + } + }; + + ENTRY; + old = page->cp_state; + PASSERT(env, page, allowed_transitions[old][state]); + CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state); + for (; page != NULL; page = page->cp_child) { + PASSERT(env, page, page->cp_state == old); + PASSERT(env, page, + equi(state == CPS_OWNED, page->cp_owner != NULL)); + + CS_PAGESTATE_DEC(page->cp_obj, page->cp_state); + CS_PAGESTATE_INC(page->cp_obj, state); + cl_page_state_set_trust(page, state); + } + EXIT; +} + +static void cl_page_state_set(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + cl_page_state_set0(env, page, state); +} + +/** + * Acquires an additional reference to a page. + * + * This can be called only by caller already possessing a reference to \a + * page. + * + * \see cl_object_get(), cl_lock_get(). + */ +void cl_page_get(struct cl_page *page) +{ + ENTRY; + cl_page_get_trust(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_get); + +/** + * Releases a reference to a page. 
+ * + * When last reference is released, page is returned to the cache, unless it + * is in cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * \see cl_object_put(), cl_lock_put(). + */ +void cl_page_put(const struct lu_env *env, struct cl_page *page) +{ + PASSERT(env, page, atomic_read(&page->cp_ref) > !!page->cp_parent); + + ENTRY; + CL_PAGE_HEADER(D_TRACE, env, page, "%d\n", + atomic_read(&page->cp_ref)); + + if (atomic_dec_and_test(&page->cp_ref)) { + LASSERT(page->cp_state == CPS_FREEING); + + LASSERT(atomic_read(&page->cp_ref) == 0); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, list_empty(&page->cp_batch)); + /* + * Page is no longer reachable by other threads. Tear + * it down. + */ + cl_page_free(env, page); + } + + EXIT; +} +EXPORT_SYMBOL(cl_page_put); + +/** + * Returns a VM page associated with a given cl_page. + */ +struct page *cl_page_vmpage(const struct lu_env *env, struct cl_page *page) +{ + const struct cl_page_slice *slice; + + /* + * Find uppermost layer with ->cpo_vmpage() method, and return its + * result. + */ + page = cl_page_top(page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_vmpage != NULL) + RETURN(slice->cpl_ops->cpo_vmpage(env, slice)); + } + page = page->cp_child; + } while (page != NULL); + LBUG(); /* ->cpo_vmpage() has to be defined somewhere in the stack */ +} +EXPORT_SYMBOL(cl_page_vmpage); + +/** + * Returns a cl_page associated with a VM page, and given cl_object. + */ +struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj) +{ + struct cl_page *top; + struct cl_page *page; + + ENTRY; + KLASSERT(PageLocked(vmpage)); + + /* + * NOTE: absence of races and liveness of data are guaranteed by page + * lock on a "vmpage". That works because object destruction has + * bottom-to-top pass. + */ + + /* + * This loop assumes that ->private points to the top-most page. This + * can be rectified easily. 
+ */ + top = (struct cl_page *)vmpage->private; + if (top == NULL) + RETURN(NULL); + + for (page = top; page != NULL; page = page->cp_child) { + if (cl_object_same(page->cp_obj, obj)) { + cl_page_get_trust(page); + break; + } + } + LASSERT(ergo(page, page->cp_type == CPT_CACHEABLE)); + RETURN(page); +} +EXPORT_SYMBOL(cl_vmpage_page); + +/** + * Returns the top-page for a given page. + * + * \see cl_object_top(), cl_io_top() + */ +struct cl_page *cl_page_top(struct cl_page *page) +{ + return cl_page_top_trusted(page); +} +EXPORT_SYMBOL(cl_page_top); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + return cl_page_at_trusted(page, dtype); +} +EXPORT_SYMBOL(cl_page_at); + +#define CL_PAGE_OP(opname) offsetof(struct cl_page_operations, opname) + +#define CL_PAGE_INVOKE(_env, _page, _op, _proto, ...) \ +({ \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + int __result; \ + ptrdiff_t __op = (_op); \ + int (*__method)_proto; \ + \ + __result = 0; \ + __page = cl_page_top(__page); \ + do { \ + list_for_each_entry(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) { \ + __result = (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + if (__result != 0) \ + break; \ + } \ + } \ + __page = __page->cp_child; \ + } while (__page != NULL && __result == 0); \ + if (__result > 0) \ + __result = 0; \ + __result; \ +}) + +#define CL_PAGE_INVOID(_env, _page, _op, _proto, ...) 
\ +do { \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + ptrdiff_t __op = (_op); \ + void (*__method)_proto; \ + \ + __page = cl_page_top(__page); \ + do { \ + list_for_each_entry(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) \ + (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + } \ + __page = __page->cp_child; \ + } while (__page != NULL); \ +} while (0) + +#define CL_PAGE_INVOID_REVERSE(_env, _page, _op, _proto, ...) \ +do { \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + ptrdiff_t __op = (_op); \ + void (*__method)_proto; \ + \ + /* get to the bottom page. */ \ + while (__page->cp_child != NULL) \ + __page = __page->cp_child; \ + do { \ + list_for_each_entry_reverse(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) \ + (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + } \ + __page = __page->cp_parent; \ + } while (__page != NULL); \ +} while (0) + +static int cl_page_invoke(const struct lu_env *env, + struct cl_io *io, struct cl_page *page, ptrdiff_t op) + +{ + PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj)); + ENTRY; + RETURN(CL_PAGE_INVOKE(env, page, op, + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io)); +} + +static void cl_page_invoid(const struct lu_env *env, + struct cl_io *io, struct cl_page *page, ptrdiff_t op) + +{ + PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj)); + ENTRY; + CL_PAGE_INVOID(env, page, op, + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), io); + EXIT; +} + +static void cl_page_owner_clear(struct cl_page *page) +{ + ENTRY; + for (page = cl_page_top(page); page != NULL; page = page->cp_child) { + if (page->cp_owner != NULL) { + 
LASSERT(page->cp_owner->ci_owned_nr > 0); + page->cp_owner->ci_owned_nr--; + page->cp_owner = NULL; + page->cp_task = NULL; + } + } + EXIT; +} + +static void cl_page_owner_set(struct cl_page *page) +{ + ENTRY; + for (page = cl_page_top(page); page != NULL; page = page->cp_child) { + LASSERT(page->cp_owner != NULL); + page->cp_owner->ci_owned_nr++; + } + EXIT; +} + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + enum cl_page_state state; + + ENTRY; + state = pg->cp_state; + PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING); + PINVRNT(env, pg, cl_page_invariant(pg)); + cl_page_owner_clear(pg); + + if (state == CPS_OWNED) + cl_page_state_set(env, pg, CPS_CACHED); + /* + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for VFS/VM interaction runs + * last and can release locks safely. + */ + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_disown), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); + EXIT; +} + +/** + * returns true, iff page is owned by the given io. + */ +int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io) +{ + LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj)); + ENTRY; + RETURN(pg->cp_state == CPS_OWNED && pg->cp_owner == io); +} +EXPORT_SYMBOL(cl_page_is_owned); + +/** + * Try to own a page by IO. + * + * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it + * into cl_page_state::CPS_OWNED state. + * + * \pre !cl_page_is_owned(pg, io) + * \post result == 0 iff cl_page_is_owned(pg, io) + * + * \retval 0 success + * + * \retval -ve failure, e.g., page was destroyed (and landed in + * cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED). + * or, page was owned by another thread, or in IO. 
+ * + * \see cl_page_disown() + * \see cl_page_operations::cpo_own() + * \see cl_page_own_try() + * \see cl_page_own + */ +static int cl_page_own0(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, int nonblock) +{ + int result; + + PINVRNT(env, pg, !cl_page_is_owned(pg, io)); + + ENTRY; + pg = cl_page_top(pg); + io = cl_io_top(io); + + if (pg->cp_state == CPS_FREEING) { + result = -ENOENT; + } else { + result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(cpo_own), + (const struct lu_env *, + const struct cl_page_slice *, + struct cl_io *, int), + io, nonblock); + if (result == 0) { + PASSERT(env, pg, pg->cp_owner == NULL); + PASSERT(env, pg, pg->cp_req == NULL); + pg->cp_owner = io; + pg->cp_task = current; + cl_page_owner_set(pg); + if (pg->cp_state != CPS_FREEING) { + cl_page_state_set(env, pg, CPS_OWNED); + } else { + cl_page_disown0(env, io, pg); + result = -ENOENT; + } + } + } + PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg))); + RETURN(result); +} + +/** + * Own a page, might be blocked. + * + * \see cl_page_own0() + */ +int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 0); +} +EXPORT_SYMBOL(cl_page_own); + +/** + * Nonblock version of cl_page_own(). + * + * \see cl_page_own0() + */ +int cl_page_own_try(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 1); +} +EXPORT_SYMBOL(cl_page_own_try); + + +/** + * Assume page ownership. + * + * Called when page is already locked by the hosting VM. 
+ * + * \pre !cl_page_is_owned(pg, io) + * \post cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_assume() + */ +void cl_page_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj)); + + ENTRY; + pg = cl_page_top(pg); + io = cl_io_top(io); + + cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume)); + PASSERT(env, pg, pg->cp_owner == NULL); + pg->cp_owner = io; + pg->cp_task = current; + cl_page_owner_set(pg); + cl_page_state_set(env, pg, CPS_OWNED); + EXIT; +} +EXPORT_SYMBOL(cl_page_assume); + +/** + * Releases page ownership without unlocking the page. + * + * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the + * underlying VM page (as VM is supposed to do this itself). + * + * \pre cl_page_is_owned(pg, io) + * \post !cl_page_is_owned(pg, io) + * + * \see cl_page_assume() + */ +void cl_page_unassume(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + ENTRY; + pg = cl_page_top(pg); + io = cl_io_top(io); + cl_page_owner_clear(pg); + cl_page_state_set(env, pg, CPS_CACHED); + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_unassume), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); + EXIT; +} +EXPORT_SYMBOL(cl_page_unassume); + +/** + * Releases page ownership. + * + * Moves page into cl_page_state::CPS_CACHED. + * + * \pre cl_page_is_owned(pg, io) + * \post !cl_page_is_owned(pg, io) + * + * \see cl_page_own() + * \see cl_page_operations::cpo_disown() + */ +void cl_page_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + + ENTRY; + pg = cl_page_top(pg); + io = cl_io_top(io); + cl_page_disown0(env, io, pg); + EXIT; +} +EXPORT_SYMBOL(cl_page_disown); + +/** + * Called when page is to be removed from the object, e.g., as a result of + * truncate. 
+ * + * Calls cl_page_operations::cpo_discard() top-to-bottom. + * + * \pre cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_discard() + */ +void cl_page_discard(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_discard)); +} +EXPORT_SYMBOL(cl_page_discard); + +/** + * Version of cl_page_delete() that can be called for not fully constructed + * pages, e.g., in an error handling cl_page_find()->cl_page_delete0() + * path. Doesn't check page invariant. + */ +static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg, + int radix) +{ + struct cl_page *tmp = pg; + ENTRY; + + PASSERT(env, pg, pg == cl_page_top(pg)); + PASSERT(env, pg, pg->cp_state != CPS_FREEING); + + /* + * Sever all ways to obtain new pointers to @pg. + */ + cl_page_owner_clear(pg); + + /* + * unexport the page firstly before freeing it so that + * the page content is considered to be invalid. + * We have to do this because a CPS_FREEING cl_page may + * be NOT under the protection of a cl_lock. + * Afterwards, if this page is found by other threads, then this + * page will be forced to reread. + */ + cl_page_export(env, pg, 0); + cl_page_state_set0(env, pg, CPS_FREEING); + + CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_delete), + (const struct lu_env *, const struct cl_page_slice *)); + + if (tmp->cp_type == CPT_CACHEABLE) { + if (!radix) + /* !radix means that @pg is not yet in the radix tree, + * skip removing it. 
+ */ + tmp = pg->cp_child; + for (; tmp != NULL; tmp = tmp->cp_child) { + void *value; + struct cl_object_header *hdr; + + hdr = cl_object_header(tmp->cp_obj); + spin_lock(&hdr->coh_page_guard); + value = radix_tree_delete(&hdr->coh_tree, + tmp->cp_index); + PASSERT(env, tmp, value == tmp); + PASSERT(env, tmp, hdr->coh_pages > 0); + hdr->coh_pages--; + spin_unlock(&hdr->coh_page_guard); + cl_page_put(env, tmp); + } + } + + EXIT; +} + +/** + * Called when a decision is made to throw page out of memory. + * + * Notifies all layers about page destruction by calling + * cl_page_operations::cpo_delete() method top-to-bottom. + * + * Moves page into cl_page_state::CPS_FREEING state (this is the only place + * where transition to this state happens). + * + * Eliminates all venues through which new references to the page can be + * obtained: + * + * - removes page from the radix trees, + * + * - breaks linkage from VM page to cl_page. + * + * Once page reaches cl_page_state::CPS_FREEING, all remaining references will + * drain after some time, at which point page will be recycled. + * + * \pre pg == cl_page_top(pg) + * \pre VM page is locked + * \post pg->cp_state == CPS_FREEING + * + * \see cl_page_operations::cpo_delete() + */ +void cl_page_delete(const struct lu_env *env, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + ENTRY; + cl_page_delete0(env, pg, 1); + EXIT; +} +EXPORT_SYMBOL(cl_page_delete); + +/** + * Unmaps page from user virtual memory. + * + * Calls cl_page_operations::cpo_unmap() through all layers top-to-bottom. The + * layer responsible for VM interaction has to unmap page from user space + * virtual memory. 
+ * + * \see cl_page_operations::cpo_unmap() + */ +int cl_page_unmap(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + return cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_unmap)); +} +EXPORT_SYMBOL(cl_page_unmap); + +/** + * Marks page up-to-date. + * + * Call cl_page_operations::cpo_export() through all layers top-to-bottom. The + * layer responsible for VM interaction has to mark/clear page as up-to-date + * by the \a uptodate argument. + * + * \see cl_page_operations::cpo_export() + */ +void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_export), + (const struct lu_env *, + const struct cl_page_slice *, int), uptodate); +} +EXPORT_SYMBOL(cl_page_export); + +/** + * Returns true, iff \a pg is VM locked in a suitable sense by the calling + * thread. + */ +int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg) +{ + int result; + const struct cl_page_slice *slice; + + ENTRY; + pg = cl_page_top_trusted((struct cl_page *)pg); + slice = container_of(pg->cp_layers.next, + const struct cl_page_slice, cpl_linkage); + PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked != NULL); + /* + * Call ->cpo_is_vmlocked() directly instead of going through + * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by + * cl_page_invariant(). + */ + result = slice->cpl_ops->cpo_is_vmlocked(env, slice); + PASSERT(env, pg, result == -EBUSY || result == -ENODATA); + RETURN(result == -EBUSY); +} +EXPORT_SYMBOL(cl_page_is_vmlocked); + +static enum cl_page_state cl_req_type_state(enum cl_req_type crt) +{ + ENTRY; + RETURN(crt == CRT_WRITE ? CPS_PAGEOUT : CPS_PAGEIN); +} + +static void cl_page_io_start(const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt) +{ + /* + * Page is queued for IO, change its state. 
+ */ + ENTRY; + cl_page_owner_clear(pg); + cl_page_state_set(env, pg, cl_req_type_state(crt)); + EXIT; +} + +/** + * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is + * called top-to-bottom. Every layer either agrees to submit this page (by + * returning 0), or requests to omit this page (by returning -EALREADY). Layer + * handling interactions with the VM also has to inform VM that page is under + * transfer now. + */ +int cl_page_prep(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt) +{ + int result; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + PINVRNT(env, pg, crt < CRT_NR); + + /* + * XXX this has to be called bottom-to-top, so that llite can set up + * PG_writeback without risking other layers deciding to skip this + * page. + */ + if (crt >= CRT_NR) + return -EINVAL; + result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_prep)); + if (result == 0) + cl_page_io_start(env, pg, crt); + + KLASSERT(ergo(crt == CRT_WRITE && pg->cp_type == CPT_CACHEABLE, + equi(result == 0, + PageWriteback(cl_page_vmpage(env, pg))))); + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + return result; +} +EXPORT_SYMBOL(cl_page_prep); + +/** + * Notify layers about transfer completion. + * + * Invoked by transfer sub-system (which is a part of osc) to notify layers + * that a transfer, of which this page is a part of has completed. + * + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for the VFS/VM interaction runs last + * and can release locks safely. 
+ * + * \pre pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT + * \post pg->cp_state == CPS_CACHED + * + * \see cl_page_operations::cpo_completion() + */ +void cl_page_completion(const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt, int ioret) +{ + struct cl_sync_io *anchor = pg->cp_sync_io; + + PASSERT(env, pg, crt < CRT_NR); + /* cl_page::cp_req already cleared by the caller (osc_completion()) */ + PASSERT(env, pg, pg->cp_req == NULL); + PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt)); + + ENTRY; + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret); + if (crt == CRT_READ && ioret == 0) { + PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED)); + pg->cp_flags |= CPF_READ_COMPLETED; + } + + cl_page_state_set(env, pg, CPS_CACHED); + if (crt >= CRT_NR) + return; + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(io[crt].cpo_completion), + (const struct lu_env *, + const struct cl_page_slice *, int), ioret); + if (anchor) { + LASSERT(cl_page_is_vmlocked(env, pg)); + LASSERT(pg->cp_sync_io == anchor); + pg->cp_sync_io = NULL; + } + /* + * As page->cp_obj is pinned by a reference from page->cp_req, it is + * safe to call cl_page_put() without risking object destruction in a + * non-blocking context. + */ + cl_page_put(env, pg); + + if (anchor) + cl_sync_io_note(anchor, ioret); + + EXIT; +} +EXPORT_SYMBOL(cl_page_completion); + +/** + * Notify layers that transfer formation engine decided to yank this page from + * the cache and to make it a part of a transfer. 
+ * + * \pre pg->cp_state == CPS_CACHED + * \post pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT + * + * \see cl_page_operations::cpo_make_ready() + */ +int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg, + enum cl_req_type crt) +{ + int result; + + PINVRNT(env, pg, crt < CRT_NR); + + ENTRY; + if (crt >= CRT_NR) + RETURN(-EINVAL); + result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(io[crt].cpo_make_ready), + (const struct lu_env *, + const struct cl_page_slice *)); + if (result == 0) { + PASSERT(env, pg, pg->cp_state == CPS_CACHED); + cl_page_io_start(env, pg, crt); + } + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + RETURN(result); +} +EXPORT_SYMBOL(cl_page_make_ready); + +/** + * Notify layers that high level io decided to place this page into a cache + * for future transfer. + * + * The layer implementing transfer engine (osc) has to register this page in + * its queues. + * + * \pre cl_page_is_owned(pg, io) + * \post cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_cache_add() + */ +int cl_page_cache_add(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt) +{ + const struct cl_page_slice *scan; + int result = 0; + + PINVRNT(env, pg, crt < CRT_NR); + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + ENTRY; + + if (crt >= CRT_NR) + RETURN(-EINVAL); + + list_for_each_entry(scan, &pg->cp_layers, cpl_linkage) { + if (scan->cpl_ops->io[crt].cpo_cache_add == NULL) + continue; + + result = scan->cpl_ops->io[crt].cpo_cache_add(env, scan, io); + if (result != 0) + break; + } + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + RETURN(result); +} +EXPORT_SYMBOL(cl_page_cache_add); + +/** + * Called if a pge is being written back by kernel's intention. 
+ * + * \pre cl_page_is_owned(pg, io) + * \post ergo(result == 0, pg->cp_state == CPS_PAGEOUT) + * + * \see cl_page_operations::cpo_flush() + */ +int cl_page_flush(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + int result; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + ENTRY; + + result = cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_flush)); + + CL_PAGE_HEADER(D_TRACE, env, pg, "%d\n", result); + RETURN(result); +} +EXPORT_SYMBOL(cl_page_flush); + +/** + * Checks whether page is protected by any extent lock is at least required + * mode. + * + * \return the same as in cl_page_operations::cpo_is_under_lock() method. + * \see cl_page_operations::cpo_is_under_lock() + */ +int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + int rc; + + PINVRNT(env, page, cl_page_invariant(page)); + + ENTRY; + rc = CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_is_under_lock), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); + PASSERT(env, page, rc != 0); + RETURN(rc); +} +EXPORT_SYMBOL(cl_page_is_under_lock); + +static int page_prune_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + cl_page_own(env, io, page); + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + return CLP_GANG_OKAY; +} + +/** + * Purges all cached pages belonging to the object \a obj. + */ +int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj) +{ + struct cl_thread_info *info; + struct cl_object *obj = cl_object_top(clobj); + struct cl_io *io; + int result; + + ENTRY; + info = cl_env_info(env); + io = &info->clt_io; + + /* + * initialize the io. This is ugly since we never do IO in this + * function, we just make cl_page_list functions happy. 
-jay + */ + io->ci_obj = obj; + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, obj); + if (result != 0) { + cl_io_fini(env, io); + RETURN(io->ci_result); + } + + do { + result = cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF, + page_prune_cb, NULL); + if (result == CLP_GANG_RESCHED) + cond_resched(); + } while (result != CLP_GANG_OKAY); + + cl_io_fini(env, io); + RETURN(result); +} +EXPORT_SYMBOL(cl_pages_prune); + +/** + * Tells transfer engine that only part of a page is to be transmitted. + * + * \see cl_page_operations::cpo_clip() + */ +void cl_page_clip(const struct lu_env *env, struct cl_page *pg, + int from, int to) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", from, to); + CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_clip), + (const struct lu_env *, + const struct cl_page_slice *,int, int), + from, to); +} +EXPORT_SYMBOL(cl_page_clip); + +/** + * Prints human readable representation of \a pg to the \a f. + */ +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_page *pg) +{ + (*printer)(env, cookie, + "page@%p[%d %p:%lu ^%p_%p %d %d %d %p %p %#x]\n", + pg, atomic_read(&pg->cp_ref), pg->cp_obj, + pg->cp_index, pg->cp_parent, pg->cp_child, + pg->cp_state, pg->cp_error, pg->cp_type, + pg->cp_owner, pg->cp_req, pg->cp_flags); +} +EXPORT_SYMBOL(cl_page_header_print); + +/** + * Prints human readable representation of \a pg to the \a f. 
+ */ +void cl_page_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_page *pg) +{ + struct cl_page *scan; + + for (scan = cl_page_top((struct cl_page *)pg); + scan != NULL; scan = scan->cp_child) + cl_page_header_print(env, cookie, printer, scan); + CL_PAGE_INVOKE(env, (struct cl_page *)pg, CL_PAGE_OP(cpo_print), + (const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t p), cookie, printer); + (*printer)(env, cookie, "end page@%p\n", pg); +} +EXPORT_SYMBOL(cl_page_print); + +/** + * Cancel a page which is still in a transfer. + */ +int cl_page_cancel(const struct lu_env *env, struct cl_page *page) +{ + return CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_cancel), + (const struct lu_env *, + const struct cl_page_slice *)); +} +EXPORT_SYMBOL(cl_page_cancel); + +/** + * Converts a byte offset within object \a obj into a page index. + */ +loff_t cl_offset(const struct cl_object *obj, pgoff_t idx) +{ + /* + * XXX for now. + */ + return (loff_t)idx << PAGE_CACHE_SHIFT; +} +EXPORT_SYMBOL(cl_offset); + +/** + * Converts a page index into a byte offset within object \a obj. + */ +pgoff_t cl_index(const struct cl_object *obj, loff_t offset) +{ + /* + * XXX for now. + */ + return offset >> PAGE_CACHE_SHIFT; +} +EXPORT_SYMBOL(cl_index); + +int cl_page_size(const struct cl_object *obj) +{ + return 1 << PAGE_CACHE_SHIFT; +} +EXPORT_SYMBOL(cl_page_size); + +/** + * Adds page slice to the compound page. + * + * This is called by cl_object_operations::coo_page_init() methods to add a + * per-layer state to the page. New state is added at the end of + * cl_page::cp_layers list, that is, it is at the bottom of the stack. 
+ * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add() + */ +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, + const struct cl_page_operations *ops) +{ + ENTRY; + list_add_tail(&slice->cpl_linkage, &page->cp_layers); + slice->cpl_obj = obj; + slice->cpl_ops = ops; + slice->cpl_page = page; + EXIT; +} +EXPORT_SYMBOL(cl_page_slice_add); + +int cl_page_init(void) +{ + return 0; +} + +void cl_page_fini(void) +{ +} diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c new file mode 100644 index 000000000000..af1c2d09c47b --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c @@ -0,0 +1,689 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_CLASS +# include <asm/atomic.h> + +#include <obd_support.h> +#include <obd_class.h> +#include <linux/lnet/lnetctl.h> +#include <lustre_debug.h> +#include <lprocfs_status.h> +#include <lustre/lustre_build_version.h> +#include <linux/list.h> +#include <cl_object.h> +#include "llog_internal.h" + + +struct obd_device *obd_devs[MAX_OBD_DEVICES]; +EXPORT_SYMBOL(obd_devs); +struct list_head obd_types; +DEFINE_RWLOCK(obd_dev_lock); + +__u64 obd_max_pages = 0; +__u64 obd_max_alloc = 0; +DEFINE_SPINLOCK(obd_updatemax_lock); + +/* The following are visible and mutable through /proc/sys/lustre/. */ +unsigned int obd_alloc_fail_rate = 0; +EXPORT_SYMBOL(obd_alloc_fail_rate); +unsigned int obd_debug_peer_on_timeout; +EXPORT_SYMBOL(obd_debug_peer_on_timeout); +unsigned int obd_dump_on_timeout; +EXPORT_SYMBOL(obd_dump_on_timeout); +unsigned int obd_dump_on_eviction; +EXPORT_SYMBOL(obd_dump_on_eviction); +unsigned int obd_max_dirty_pages = 256; +EXPORT_SYMBOL(obd_max_dirty_pages); +atomic_t obd_dirty_pages; +EXPORT_SYMBOL(obd_dirty_pages); +unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(obd_timeout); +unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(ldlm_timeout); +unsigned int obd_timeout_set; +EXPORT_SYMBOL(obd_timeout_set); +unsigned int ldlm_timeout_set; +EXPORT_SYMBOL(ldlm_timeout_set); +/* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */ +unsigned int at_min = 0; +EXPORT_SYMBOL(at_min); +unsigned int at_max = 600; +EXPORT_SYMBOL(at_max); +unsigned int at_history = 600; +EXPORT_SYMBOL(at_history); +int at_early_margin = 5; +EXPORT_SYMBOL(at_early_margin); +int at_extra = 30; +EXPORT_SYMBOL(at_extra); + +atomic_t obd_dirty_transit_pages; +EXPORT_SYMBOL(obd_dirty_transit_pages); + +char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = 
JOBSTATS_DISABLE; +EXPORT_SYMBOL(obd_jobid_var); + +/* Get jobid of current process by reading the environment variable + * stored in between the "env_start" & "env_end" of task struct. + * + * TODO: + * It's better to cache the jobid for later use if there is any + * efficient way, the cl_env code probably could be reused for this + * purpose. + * + * If some job scheduler doesn't store jobid in the "env_start/end", + * then an upcall could be issued here to get the jobid by utilizing + * the userspace tools/api. Then, the jobid must be cached. + */ +int lustre_get_jobid(char *jobid) +{ + int jobid_len = JOBSTATS_JOBID_SIZE; + int rc = 0; + ENTRY; + + memset(jobid, 0, JOBSTATS_JOBID_SIZE); + /* Jobstats isn't enabled */ + if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) + RETURN(0); + + /* Use process name + fsuid as jobid */ + if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { + snprintf(jobid, JOBSTATS_JOBID_SIZE, "%s.%u", + current_comm(), current_fsuid()); + RETURN(0); + } + + rc = cfs_get_environ(obd_jobid_var, jobid, &jobid_len); + if (rc) { + if (rc == -EOVERFLOW) { + /* For the PBS_JOBID and LOADL_STEP_ID keys (which are + * variable length strings instead of just numbers), it + * might make sense to keep the unique parts for JobID, + * instead of just returning an error. That means a + * larger temp buffer for cfs_get_environ(), then + * truncating the string at some separator to fit into + * the specified jobid_len. Fix later if needed. */ + static bool printed; + if (unlikely(!printed)) { + LCONSOLE_ERROR_MSG(0x16b, "%s value too large " + "for JobID buffer (%d)\n", + obd_jobid_var, jobid_len); + printed = true; + } + } else { + CDEBUG((rc == -ENOENT || rc == -EINVAL || + rc == -EDEADLK) ? 
D_INFO : D_ERROR, + "Get jobid for (%s) failed: rc = %d\n", + obd_jobid_var, rc); + } + } + RETURN(rc); +} +EXPORT_SYMBOL(lustre_get_jobid); + +int obd_alloc_fail(const void *ptr, const char *name, const char *type, + size_t size, const char *file, int line) +{ + if (ptr == NULL || + (cfs_rand() & OBD_ALLOC_FAIL_MASK) < obd_alloc_fail_rate) { + CERROR("%s%salloc of %s ("LPU64" bytes) failed at %s:%d\n", + ptr ? "force " :"", type, name, (__u64)size, file, + line); + CERROR(LPU64" total bytes and "LPU64" total pages " + "("LPU64" bytes) allocated by Lustre, " + "%d total bytes by LNET\n", + obd_memory_sum(), + obd_pages_sum() << PAGE_CACHE_SHIFT, + obd_pages_sum(), + atomic_read(&libcfs_kmemory)); + return 1; + } + return 0; +} +EXPORT_SYMBOL(obd_alloc_fail); + +static inline void obd_data2conn(struct lustre_handle *conn, + struct obd_ioctl_data *data) +{ + memset(conn, 0, sizeof *conn); + conn->cookie = data->ioc_cookie; +} + +static inline void obd_conn2data(struct obd_ioctl_data *data, + struct lustre_handle *conn) +{ + data->ioc_cookie = conn->cookie; +} + +int class_resolve_dev_name(__u32 len, const char *name) +{ + int rc; + int dev; + + ENTRY; + if (!len || !name) { + CERROR("No name passed,!\n"); + GOTO(out, rc = -EINVAL); + } + if (name[len - 1] != 0) { + CERROR("Name not nul terminated!\n"); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s\n", name); + dev = class_name2dev(name); + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for name %s!\n", name); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, dev); + rc = dev; + +out: + RETURN(rc); +} + +int class_handle_ioctl(unsigned int cmd, unsigned long arg) +{ + char *buf = NULL; + struct obd_ioctl_data *data; + struct libcfs_debug_ioctl_data *debug_data; + struct obd_device *obd = NULL; + int err = 0, len = 0; + ENTRY; + + /* only for debugging */ + if (cmd == LIBCFS_IOC_DEBUG_MASK) { + debug_data = (struct libcfs_debug_ioctl_data*)arg; + 
libcfs_subsystem_debug = debug_data->subs; + libcfs_debug = debug_data->debug; + return 0; + } + + CDEBUG(D_IOCTL, "cmd = %x\n", cmd); + if (obd_ioctl_getdata(&buf, &len, (void *)arg)) { + CERROR("OBD ioctl: data error\n"); + RETURN(-EINVAL); + } + data = (struct obd_ioctl_data *)buf; + + switch (cmd) { + case OBD_IOC_PROCESS_CFG: { + struct lustre_cfg *lcfg; + + if (!data->ioc_plen1 || !data->ioc_pbuf1) { + CERROR("No config buffer passed!\n"); + GOTO(out, err = -EINVAL); + } + OBD_ALLOC(lcfg, data->ioc_plen1); + if (lcfg == NULL) + GOTO(out, err = -ENOMEM); + err = copy_from_user(lcfg, data->ioc_pbuf1, + data->ioc_plen1); + if (!err) + err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1); + if (!err) + err = class_process_config(lcfg); + + OBD_FREE(lcfg, data->ioc_plen1); + GOTO(out, err); + } + + case OBD_GET_VERSION: + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + GOTO(out, err = -EINVAL); + } + + if (strlen(BUILD_VERSION) + 1 > data->ioc_inllen1) { + CERROR("ioctl buffer too small to hold version\n"); + GOTO(out, err = -EINVAL); + } + + memcpy(data->ioc_bulk, BUILD_VERSION, + strlen(BUILD_VERSION) + 1); + + err = obd_ioctl_popdata((void *)arg, data, len); + if (err) + err = -EFAULT; + GOTO(out, err); + + case OBD_IOC_NAME2DEV: { + /* Resolve a device name. This does not change the + * currently selected device. + */ + int dev; + + dev = class_resolve_dev_name(data->ioc_inllen1, + data->ioc_inlbuf1); + data->ioc_dev = dev; + if (dev < 0) + GOTO(out, err = -EINVAL); + + err = obd_ioctl_popdata((void *)arg, data, sizeof(*data)); + if (err) + err = -EFAULT; + GOTO(out, err); + } + + case OBD_IOC_UUID2DEV: { + /* Resolve a device uuid. This does not change the + * currently selected device. 
+ */ + int dev; + struct obd_uuid uuid; + + if (!data->ioc_inllen1 || !data->ioc_inlbuf1) { + CERROR("No UUID passed!\n"); + GOTO(out, err = -EINVAL); + } + if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) { + CERROR("UUID not NUL terminated!\n"); + GOTO(out, err = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1); + obd_str2uuid(&uuid, data->ioc_inlbuf1); + dev = class_uuid2dev(&uuid); + data->ioc_dev = dev; + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for UUID %s!\n", + data->ioc_inlbuf1); + GOTO(out, err = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1, + dev); + err = obd_ioctl_popdata((void *)arg, data, sizeof(*data)); + if (err) + err = -EFAULT; + GOTO(out, err); + } + + case OBD_IOC_CLOSE_UUID: { + CDEBUG(D_IOCTL, "closing all connections to uuid %s (NOOP)\n", + data->ioc_inlbuf1); + GOTO(out, err = 0); + } + + case OBD_IOC_GETDEVICE: { + int index = data->ioc_count; + char *status, *str; + + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + GOTO(out, err = -EINVAL); + } + if (data->ioc_inllen1 < 128) { + CERROR("ioctl buffer too small to hold version\n"); + GOTO(out, err = -EINVAL); + } + + obd = class_num2obd(index); + if (!obd) + GOTO(out, err = -ENOENT); + + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + str = (char *)data->ioc_bulk; + snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + err = obd_ioctl_popdata((void *)arg, data, len); + + GOTO(out, err = 0); + } + + } + + if (data->ioc_dev == OBD_DEV_BY_DEVNAME) { + if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL) + GOTO(out, err = -EINVAL); + if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME) + GOTO(out, err = -EINVAL); + obd = class_name2obd(data->ioc_inlbuf4); + } else 
if (data->ioc_dev < class_devno_max()) { + obd = class_num2obd(data->ioc_dev); + } else { + CERROR("OBD ioctl: No device\n"); + GOTO(out, err = -EINVAL); + } + + if (obd == NULL) { + CERROR("OBD ioctl : No Device %d\n", data->ioc_dev); + GOTO(out, err = -EINVAL); + } + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + + if (!obd->obd_set_up || obd->obd_stopping) { + CERROR("OBD ioctl: device not setup %d \n", data->ioc_dev); + GOTO(out, err = -EINVAL); + } + + switch(cmd) { + case OBD_IOC_NO_TRANSNO: { + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + GOTO(out, err = -ENODEV); + } + CDEBUG(D_HA, "%s: disabling committed-transno notification\n", + obd->obd_name); + obd->obd_no_transno = 1; + GOTO(out, err = 0); + } + + default: { + err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL); + if (err) + GOTO(out, err); + + err = obd_ioctl_popdata((void *)arg, data, len); + if (err) + err = -EFAULT; + GOTO(out, err); + } + } + + out: + if (buf) + obd_ioctl_freedata(buf, len); + RETURN(err); +} /* class_handle_ioctl */ + +extern psdev_t obd_psdev; + +#define OBD_INIT_CHECK +int obd_init_checks(void) +{ + __u64 u64val, div64val; + char buf[64]; + int len, ret = 0; + + CDEBUG(D_INFO, "LPU64=%s, LPD64=%s, LPX64=%s\n", LPU64, LPD64, LPX64); + + CDEBUG(D_INFO, "OBD_OBJECT_EOF = "LPX64"\n", (__u64)OBD_OBJECT_EOF); + + u64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), LPX64, u64val); + if (len != 18) { + CWARN("LPX64 wrong length! 
strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + + div64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EOVERFLOW; + } + if (u64val >> 8 != OBD_OBJECT_EOF >> 8) { + CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + return -EOVERFLOW; + } + if (do_div(div64val, 256) != (u64val & 255)) { + CERROR("do_div("LPX64",256) != "LPU64"\n", u64val, u64val &255); + return -EOVERFLOW; + } + if (u64val >> 8 != div64val) { + CERROR("do_div("LPX64",256) "LPU64" != "LPU64"\n", + u64val, div64val, u64val >> 8); + return -EOVERFLOW; + } + len = snprintf(buf, sizeof(buf), LPX64, u64val); + if (len != 18) { + CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), LPU64, u64val); + if (len != 20) { + CWARN("LPU64 wrong length! strlen(%s)=%d != 20\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), LPD64, u64val); + if (len != 2) { + CWARN("LPD64 wrong length! 
strlen(%s)=%d != 2\n", buf, len); + ret = -EINVAL; + } + if ((u64val & ~CFS_PAGE_MASK) >= PAGE_CACHE_SIZE) { + CWARN("mask failed: u64val "LPU64" >= "LPU64"\n", u64val, + (__u64)PAGE_CACHE_SIZE); + ret = -EINVAL; + } + + return ret; +} + +extern spinlock_t obd_types_lock; +extern int class_procfs_init(void); +extern int class_procfs_clean(void); + +static int __init init_obdclass(void) +{ + int i, err; + int lustre_register_fs(void); + + for (i = CAPA_SITE_CLIENT; i < CAPA_SITE_MAX; i++) + INIT_LIST_HEAD(&capa_list[i]); + + LCONSOLE_INFO("Lustre: Build Version: "BUILD_VERSION"\n"); + + spin_lock_init(&obd_types_lock); + obd_zombie_impexp_init(); +#ifdef LPROCFS + obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM, + LPROCFS_STATS_FLAG_NONE | + LPROCFS_STATS_FLAG_IRQ_SAFE); + if (obd_memory == NULL) { + CERROR("kmalloc of 'obd_memory' failed\n"); + RETURN(-ENOMEM); + } + + lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT, + LPROCFS_CNTR_AVGMINMAX, + "memused", "bytes"); + lprocfs_counter_init(obd_memory, OBD_MEMORY_PAGES_STAT, + LPROCFS_CNTR_AVGMINMAX, + "pagesused", "pages"); +#endif + err = obd_init_checks(); + if (err == -EOVERFLOW) + return err; + + class_init_uuidlist(); + err = class_handle_init(); + if (err) + return err; + + INIT_LIST_HEAD(&obd_types); + + err = misc_register(&obd_psdev); + if (err) { + CERROR("cannot register %d err %d\n", OBD_DEV_MINOR, err); + return err; + } + + /* This struct is already zeroed for us (static global) */ + for (i = 0; i < class_devno_max(); i++) + obd_devs[i] = NULL; + + /* Default the dirty page cache cap to 1/2 of system memory. + * For clients with less memory, a larger fraction is needed + * for other purposes (mostly for BGL). 
*/ + if (num_physpages <= 512 << (20 - PAGE_CACHE_SHIFT)) + obd_max_dirty_pages = num_physpages / 4; + else + obd_max_dirty_pages = num_physpages / 2; + + err = obd_init_caches(); + if (err) + return err; + err = class_procfs_init(); + if (err) + return err; + + err = lu_global_init(); + if (err) + return err; + + err = cl_global_init(); + if (err != 0) + return err; + + + err = llog_info_init(); + if (err) + return err; + + err = lustre_register_fs(); + + return err; +} + +void obd_update_maxusage(void) +{ + __u64 max1, max2; + + max1 = obd_pages_sum(); + max2 = obd_memory_sum(); + + spin_lock(&obd_updatemax_lock); + if (max1 > obd_max_pages) + obd_max_pages = max1; + if (max2 > obd_max_alloc) + obd_max_alloc = max2; + spin_unlock(&obd_updatemax_lock); +} +EXPORT_SYMBOL(obd_update_maxusage); + +#ifdef LPROCFS +__u64 obd_memory_max(void) +{ + __u64 ret; + + spin_lock(&obd_updatemax_lock); + ret = obd_max_alloc; + spin_unlock(&obd_updatemax_lock); + + return ret; +} +EXPORT_SYMBOL(obd_memory_max); + +__u64 obd_pages_max(void) +{ + __u64 ret; + + spin_lock(&obd_updatemax_lock); + ret = obd_max_pages; + spin_unlock(&obd_updatemax_lock); + + return ret; +} +EXPORT_SYMBOL(obd_pages_max); +#endif + +/* liblustre doesn't call cleanup_obdclass, apparently. we carry on in this + * ifdef to the end of the file to cover module and versioning goo.*/ +static void cleanup_obdclass(void) +{ + int i; + int lustre_unregister_fs(void); + __u64 memory_leaked, pages_leaked; + __u64 memory_max, pages_max; + ENTRY; + + lustre_unregister_fs(); + + misc_deregister(&obd_psdev); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + if (obd && obd->obd_set_up && + OBT(obd) && OBP(obd, detach)) { + /* XXX should this call generic detach otherwise? 
*/ + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + OBP(obd, detach)(obd); + } + } + llog_info_fini(); + cl_global_fini(); + lu_global_fini(); + + obd_cleanup_caches(); + obd_sysctl_clean(); + + class_procfs_clean(); + + class_handle_cleanup(); + class_exit_uuidlist(); + obd_zombie_impexp_stop(); + + memory_leaked = obd_memory_sum(); + pages_leaked = obd_pages_sum(); + + memory_max = obd_memory_max(); + pages_max = obd_pages_max(); + + lprocfs_free_stats(&obd_memory); + CDEBUG((memory_leaked) ? D_ERROR : D_INFO, + "obd_memory max: "LPU64", leaked: "LPU64"\n", + memory_max, memory_leaked); + CDEBUG((pages_leaked) ? D_ERROR : D_INFO, + "obd_memory_pages max: "LPU64", leaked: "LPU64"\n", + pages_max, pages_leaked); + + EXIT; +} + +MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>"); +MODULE_DESCRIPTION("Lustre Class Driver Build Version: " BUILD_VERSION); +MODULE_LICENSE("GPL"); + +cfs_module(obdclass, LUSTRE_VERSION_STRING, init_obdclass, cleanup_obdclass); diff --git a/drivers/staging/lustre/lustre/obdclass/debug.c b/drivers/staging/lustre/lustre/obdclass/debug.c new file mode 100644 index 000000000000..15f71bbb7276 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/debug.c @@ -0,0 +1,124 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/debug.c + * + * Helper routines for dumping data structs for debugging. + */ + +#define DEBUG_SUBSYSTEM D_OTHER + + +#include <obd_ost.h> +#include <obd_support.h> +#include <lustre_debug.h> +#include <lustre_net.h> + +void dump_lniobuf(struct niobuf_local *nb) +{ + CDEBUG(D_RPCTRACE, + "niobuf_local: file_offset="LPD64", len=%d, page=%p, rc=%d\n", + nb->lnb_file_offset, nb->len, nb->page, nb->rc); + CDEBUG(D_RPCTRACE, "nb->page: index = %ld\n", + nb->page ? 
page_index(nb->page) : -1); +} +EXPORT_SYMBOL(dump_lniobuf); + +void dump_lsm(int level, struct lov_stripe_md *lsm) +{ + CDEBUG(level, "lsm %p, objid "DOSTID", maxbytes "LPX64", magic 0x%08X," + " stripe_size %u, stripe_count %u, refc: %d," + " layout_gen %u, pool ["LOV_POOLNAMEF"]\n", lsm, + POSTID(&lsm->lsm_oi), lsm->lsm_maxbytes, lsm->lsm_magic, + lsm->lsm_stripe_size, lsm->lsm_stripe_count, + atomic_read(&lsm->lsm_refc), lsm->lsm_layout_gen, + lsm->lsm_pool_name); +} +EXPORT_SYMBOL(dump_lsm); + +#define LPDS sizeof(__u64) +int block_debug_setup(void *addr, int len, __u64 off, __u64 id) +{ + LASSERT(addr); + + off = cpu_to_le64 (off); + id = cpu_to_le64 (id); + memcpy(addr, (char *)&off, LPDS); + memcpy(addr + LPDS, (char *)&id, LPDS); + + addr += len - LPDS - LPDS; + memcpy(addr, (char *)&off, LPDS); + memcpy(addr + LPDS, (char *)&id, LPDS); + + return 0; +} +EXPORT_SYMBOL(block_debug_setup); + +int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id) +{ + __u64 ne_off; + int err = 0; + + LASSERT(addr); + + ne_off = le64_to_cpu (off); + id = le64_to_cpu (id); + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" off: "LPX64" != " + LPX64"\n", who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" id: "LPX64" != "LPX64"\n", + who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + addr += end - LPDS - LPDS; + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end off: "LPX64" != " + LPX64"\n", who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end id: "LPX64" != " + LPX64"\n", who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + return err; +} +EXPORT_SYMBOL(block_debug_check); +#undef LPDS diff --git 
a/drivers/staging/lustre/lustre/obdclass/dt_object.c b/drivers/staging/lustre/lustre/obdclass/dt_object.c new file mode 100644 index 000000000000..1c962dd3bd2f --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/dt_object.c @@ -0,0 +1,1055 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/dt_object.c + * + * Dt Object. 
+ * Generic functions from dt_object.h + * + * Author: Nikita Danilov <nikita@clusterfs.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <obd.h> +#include <dt_object.h> +#include <linux/list.h> +/* fid_be_to_cpu() */ +#include <lustre_fid.h> + +#include <lustre_quota.h> + +/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini */ +LU_KEY_INIT(dt_global, struct dt_thread_info); +LU_KEY_FINI(dt_global, struct dt_thread_info); + +struct lu_context_key dt_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL, + .lct_init = dt_global_key_init, + .lct_fini = dt_global_key_fini +}; +EXPORT_SYMBOL(dt_key); + +/* no lock is necessary to protect the list, because call-backs + * are added during system startup. Please refer to "struct dt_device". + */ +void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_add(&cb->dtc_linkage, &dev->dd_txn_callbacks); +} +EXPORT_SYMBOL(dt_txn_callback_add); + +void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_del_init(&cb->dtc_linkage); +} +EXPORT_SYMBOL(dt_txn_callback_del); + +int dt_txn_hook_start(const struct lu_env *env, + struct dt_device *dev, struct thandle *th) +{ + int rc = 0; + struct dt_txn_callback *cb; + + if (th->th_local) + return 0; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + if (cb->dtc_txn_start == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + rc = cb->dtc_txn_start(env, th, cb->dtc_cookie); + if (rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_start); + +int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn) +{ + struct dt_device *dev = txn->th_dev; + struct dt_txn_callback *cb; + int rc = 0; + + if (txn->th_local) + return 0; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + if (cb->dtc_txn_stop == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + rc = cb->dtc_txn_stop(env, txn, cb->dtc_cookie); + if 
(rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_stop); + +void dt_txn_hook_commit(struct thandle *txn) +{ + struct dt_txn_callback *cb; + + if (txn->th_local) + return; + + list_for_each_entry(cb, &txn->th_dev->dd_txn_callbacks, + dtc_linkage) { + if (cb->dtc_txn_commit) + cb->dtc_txn_commit(txn, cb->dtc_cookie); + } +} +EXPORT_SYMBOL(dt_txn_hook_commit); + +int dt_device_init(struct dt_device *dev, struct lu_device_type *t) +{ + + INIT_LIST_HEAD(&dev->dd_txn_callbacks); + return lu_device_init(&dev->dd_lu_dev, t); +} +EXPORT_SYMBOL(dt_device_init); + +void dt_device_fini(struct dt_device *dev) +{ + lu_device_fini(&dev->dd_lu_dev); +} +EXPORT_SYMBOL(dt_device_fini); + +int dt_object_init(struct dt_object *obj, + struct lu_object_header *h, struct lu_device *d) + +{ + return lu_object_init(&obj->do_lu, h, d); +} +EXPORT_SYMBOL(dt_object_init); + +void dt_object_fini(struct dt_object *obj) +{ + lu_object_fini(&obj->do_lu); +} +EXPORT_SYMBOL(dt_object_fini); + +int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj) +{ + if (obj->do_index_ops == NULL) + obj->do_ops->do_index_try(env, obj, &dt_directory_features); + return obj->do_index_ops != NULL; +} +EXPORT_SYMBOL(dt_try_as_dir); + +enum dt_format_type dt_mode_to_dft(__u32 mode) +{ + enum dt_format_type result; + + switch (mode & S_IFMT) { + case S_IFDIR: + result = DFT_DIR; + break; + case S_IFREG: + result = DFT_REGULAR; + break; + case S_IFLNK: + result = DFT_SYM; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + result = DFT_NODE; + break; + default: + LBUG(); + break; + } + return result; +} +EXPORT_SYMBOL(dt_mode_to_dft); + +/** + * lookup fid for object named \a name in directory \a dir. 
+ */ + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid) +{ + if (dt_try_as_dir(env, dir)) + return dt_lookup(env, dir, (struct dt_rec *)fid, + (const struct dt_key *)name, BYPASS_CAPA); + return -ENOTDIR; +} +EXPORT_SYMBOL(dt_lookup_dir); + +/* this differs from dt_locate by top_dev as parameter + * but not one from lu_site */ +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, const struct lu_fid *fid, + struct lu_device *top_dev) +{ + struct lu_object *lo, *n; + ENTRY; + + lo = lu_object_find_at(env, top_dev, fid, NULL); + if (IS_ERR(lo)) + return (void *)lo; + + LASSERT(lo != NULL); + + list_for_each_entry(n, &lo->lo_header->loh_layers, lo_linkage) { + if (n->lo_dev == &dev->dd_lu_dev) + return container_of0(n, struct dt_object, do_lu); + } + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL(dt_locate_at); + +/** + * find a object named \a entry in given \a dfh->dfh_o directory. + */ +static int dt_find_entry(const struct lu_env *env, const char *entry, void *data) +{ + struct dt_find_hint *dfh = data; + struct dt_device *dt = dfh->dfh_dt; + struct lu_fid *fid = dfh->dfh_fid; + struct dt_object *obj = dfh->dfh_o; + int result; + + result = dt_lookup_dir(env, obj, entry, fid); + lu_object_put(env, &obj->do_lu); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (IS_ERR(obj)) + result = PTR_ERR(obj); + } + dfh->dfh_o = obj; + return result; +} + +/** + * Abstract function which parses path name. This function feeds + * path component to \a entry_func. 
+ */ +int dt_path_parser(const struct lu_env *env, + char *path, dt_entry_func_t entry_func, + void *data) +{ + char *e; + int rc = 0; + + while (1) { + e = strsep(&path, "/"); + if (e == NULL) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + rc = entry_func(env, e, data); + if (rc) + break; + } + + return rc; +} + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid) +{ + struct dt_thread_info *info = dt_info(env); + struct dt_find_hint *dfh = &info->dti_dfh; + struct dt_object *obj; + char *local = info->dti_buf; + int result; + + + dfh->dfh_dt = dt; + dfh->dfh_fid = fid; + + strncpy(local, path, DT_MAX_PATH); + local[DT_MAX_PATH - 1] = '\0'; + + result = dt->dd_ops->dt_root_get(env, dt, fid); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (!IS_ERR(obj)) { + dfh->dfh_o = obj; + result = dt_path_parser(env, local, dt_find_entry, dfh); + if (result != 0) + obj = ERR_PTR(result); + else + obj = dfh->dfh_o; + } + } else { + obj = ERR_PTR(result); + } + return obj; +} +EXPORT_SYMBOL(dt_store_resolve); + +static struct dt_object *dt_reg_open(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + const char *name, + struct lu_fid *fid) +{ + struct dt_object *o; + int result; + + result = dt_lookup_dir(env, p, name, fid); + if (result == 0){ + o = dt_locate(env, dt, fid); + } + else + o = ERR_PTR(result); + + return o; +} + +/** + * Open dt object named \a filename from \a dirname directory. 
+ * \param dt dt device + * \param fid on success, object fid is stored in *fid + */ +struct dt_object *dt_store_open(const struct lu_env *env, + struct dt_device *dt, + const char *dirname, + const char *filename, + struct lu_fid *fid) +{ + struct dt_object *file; + struct dt_object *dir; + + dir = dt_store_resolve(env, dt, dirname, fid); + if (!IS_ERR(dir)) { + file = dt_reg_open(env, dt, dir, + filename, fid); + lu_object_put(env, &dir->do_lu); + } else { + file = dir; + } + return file; +} +EXPORT_SYMBOL(dt_store_open); + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *at) +{ + struct dt_object *dto; + struct thandle *th; + int rc; + + ENTRY; + + dto = dt_locate(env, dt, fid); + if (IS_ERR(dto)) + RETURN(dto); + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + RETURN(dto); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(trans_stop, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(trans_stop, rc); + + dt_write_lock(env, dto, 0); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); + + CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid)); + + rc = dt_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(unlock, rc); + LASSERT(dt_object_exists(dto)); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, dt, th); +out: + if (rc) { + lu_object_put(env, &dto->do_lu); + RETURN(ERR_PTR(rc)); + } + RETURN(dto); +} +EXPORT_SYMBOL(dt_find_or_create); + +/* dt class init function. */ +int dt_global_init(void) +{ + int result; + + LU_CONTEXT_KEY_INIT(&dt_key); + result = lu_context_key_register(&dt_key); + return result; +} + +void dt_global_fini(void) +{ + lu_context_key_degister(&dt_key); +} + +/** + * Generic read helper. May return an error for partial reads. 
+ * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval real size of data read + * \retval -ve errno on failure + */ +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + return dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA); +} +EXPORT_SYMBOL(dt_read); + +/** + * Read structures of fixed size from storage. Unlike dt_read(), using + * dt_record_read() will return an error for partial reads. + * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval 0 on successfully reading full buffer + * \retval -EFAULT on short read + * \retval -ve errno on failure + */ +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + + rc = dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA); + + if (rc == buf->lb_len) + rc = 0; + else if (rc >= 0) + rc = -EFAULT; + return rc; +} +EXPORT_SYMBOL(dt_record_read); + +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_write); + rc = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, BYPASS_CAPA, 1); + if (rc == buf->lb_len) + rc = 0; + else if (rc >= 0) + rc = -EFAULT; + return rc; +} +EXPORT_SYMBOL(dt_record_write); + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th) +{ + struct 
lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + + LASSERT(o); + vbuf.lb_buf = NULL; + vbuf.lb_len = sizeof(dt_obj_version_t); + return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th); + +} +EXPORT_SYMBOL(dt_declare_version_set); + +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + + rc = dt_xattr_set(env, o, &vbuf, xname, 0, th, BYPASS_CAPA); + if (rc < 0) + CDEBUG(D_INODE, "Can't set version, rc %d\n", rc); + return; +} +EXPORT_SYMBOL(dt_version_set); + +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + dt_obj_version_t version; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + rc = dt_xattr_get(env, o, &vbuf, xname, BYPASS_CAPA); + if (rc != sizeof(version)) { + CDEBUG(D_INODE, "Can't get version, rc %d\n", rc); + version = 0; + } + return version; +} +EXPORT_SYMBOL(dt_version_get); + +/* list of all supported index types */ + +/* directories */ +const struct dt_index_features dt_directory_features; +EXPORT_SYMBOL(dt_directory_features); + +/* scrub iterator */ +const struct dt_index_features dt_otable_features; +EXPORT_SYMBOL(dt_otable_features); + +/* lfsck */ +const struct dt_index_features dt_lfsck_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(struct lu_fid), + .dif_keysize_max = sizeof(struct lu_fid), + .dif_recsize_min = sizeof(__u8), + .dif_recsize_max = sizeof(__u8), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_features); + +/* accounting indexes */ +const struct dt_index_features dt_acct_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct 
lquota_acct_rec), /* 16 bytes */ + .dif_recsize_max = sizeof(struct lquota_acct_rec), /* 16 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_acct_features); + +/* global quota files */ +const struct dt_index_features dt_quota_glb_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_recsize_max = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_quota_glb_features); + +/* slave quota files */ +const struct dt_index_features dt_quota_slv_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_recsize_max = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_quota_slv_features); + +/* helper function returning what dt_index_features structure should be used + * based on the FID sequence. 
This is used by OBD_IDX_READ RPC */ +static inline const struct dt_index_features *dt_index_feat_select(__u64 seq, + __u32 mode) +{ + if (seq == FID_SEQ_QUOTA_GLB) { + /* global quota index */ + if (!S_ISREG(mode)) + /* global quota index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_glb_features; + } else if (seq == FID_SEQ_QUOTA) { + /* quota slave index */ + if (!S_ISREG(mode)) + /* slave index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_slv_features; + } else if (seq >= FID_SEQ_NORMAL) { + /* object is part of the namespace, verify that it is a + * directory */ + if (!S_ISDIR(mode)) + /* sorry, we can only deal with directory */ + return ERR_PTR(-ENOTDIR); + return &dt_directory_features; + } + + return ERR_PTR(-EOPNOTSUPP); +} + +/* + * Fill a lu_idxpage with key/record pairs read for transfer via OBD_IDX_READ + * RPC + * + * \param env - is the environment passed by the caller + * \param lp - is a pointer to the lu_page to fill + * \param nob - is the maximum number of bytes that should be copied + * \param iops - is the index operation vector associated with the index object + * \param it - is a pointer to the current iterator + * \param attr - is the index attribute to pass to iops->rec() + * \param arg - is a pointer to the idx_info structure + */ +static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, + int nob, const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg) +{ + struct idx_info *ii = (struct idx_info *)arg; + struct lu_idxpage *lip = &lp->lp_idx; + char *entry; + int rc, size; + ENTRY; + + /* no support for variable key & record size for now */ + LASSERT((ii->ii_flags & II_FL_VARKEY) == 0); + LASSERT((ii->ii_flags & II_FL_VARREC) == 0); + + /* initialize the header of the new container */ + memset(lip, 0, LIP_HDR_SIZE); + lip->lip_magic = LIP_MAGIC; + nob -= LIP_HDR_SIZE; + + /* compute size needed to store a key/record pair */ + size = ii->ii_recsize 
+ ii->ii_keysize; + if ((ii->ii_flags & II_FL_NOHASH) == 0) + /* add hash if the client wants it */ + size += sizeof(__u64); + + entry = lip->lip_entries; + do { + char *tmp_entry = entry; + struct dt_key *key; + __u64 hash; + + /* fetch 64-bit hash value */ + hash = iops->store(env, it); + ii->ii_hash_end = hash; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) { + if (lip->lip_nr != 0) + GOTO(out, rc = 0); + } + + if (nob < size) { + if (lip->lip_nr == 0) + GOTO(out, rc = -EINVAL); + GOTO(out, rc = 0); + } + + if ((ii->ii_flags & II_FL_NOHASH) == 0) { + /* client wants to the 64-bit hash value associated with + * each record */ + memcpy(tmp_entry, &hash, sizeof(hash)); + tmp_entry += sizeof(hash); + } + + /* then the key value */ + LASSERT(iops->key_size(env, it) == ii->ii_keysize); + key = iops->key(env, it); + memcpy(tmp_entry, key, ii->ii_keysize); + tmp_entry += ii->ii_keysize; + + /* and finally the record */ + rc = iops->rec(env, it, (struct dt_rec *)tmp_entry, attr); + if (rc != -ESTALE) { + if (rc != 0) + GOTO(out, rc); + + /* hash/key/record successfully copied! 
*/ + lip->lip_nr++; + if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0)) + ii->ii_hash_start = hash; + entry = tmp_entry + ii->ii_recsize; + nob -= size; + } + + /* move on to the next record */ + do { + rc = iops->next(env, it); + } while (rc == -ESTALE); + + } while (rc == 0); + + GOTO(out, rc); +out: + if (rc >= 0 && lip->lip_nr > 0) + /* one more container */ + ii->ii_count++; + if (rc > 0) + /* no more entries */ + ii->ii_hash_end = II_END_OFF; + return rc; +} + +/* + * Walk index and fill lu_page containers with key/record pairs + * + * \param env - is the environment passed by the caller + * \param obj - is the index object to parse + * \param rdpg - is the lu_rdpg descriptor associated with the transfer + * \param filler - is the callback function responsible for filling a lu_page + * with key/record pairs in the format wanted by the caller + * \param arg - is an opaq argument passed to the filler function + * + * \retval sum (in bytes) of all filled lu_pages + * \retval -ve errno on failure + */ +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg) +{ + struct dt_it *it; + const struct dt_it_ops *iops; + unsigned int pageidx, nob, nlupgs = 0; + int rc; + ENTRY; + + LASSERT(rdpg->rp_pages != NULL); + LASSERT(obj->do_index_ops != NULL); + + nob = rdpg->rp_count; + if (nob <= 0) + RETURN(-EFAULT); + + /* Iterate through index and fill containers from @rdpg */ + iops = &obj->do_index_ops->dio_it; + LASSERT(iops != NULL); + it = iops->init(env, obj, rdpg->rp_attrs, BYPASS_CAPA); + if (IS_ERR(it)) + RETURN(PTR_ERR(it)); + + rc = iops->load(env, it, rdpg->rp_hash); + if (rc == 0) { + /* + * Iterator didn't find record with exactly the key requested. + * + * It is currently either + * + * - positioned above record with key less than + * requested---skip it. + * - or not positioned at all (is in IAM_IT_SKEWED + * state)---position it on the next item. 
+ */ + rc = iops->next(env, it); + } else if (rc > 0) { + rc = 0; + } + + /* Fill containers one after the other. There might be multiple + * containers per physical page. + * + * At this point and across for-loop: + * rc == 0 -> ok, proceed. + * rc > 0 -> end of index. + * rc < 0 -> error. */ + for (pageidx = 0; rc == 0 && nob > 0; pageidx++) { + union lu_page *lp; + int i; + + LASSERT(pageidx < rdpg->rp_npages); + lp = kmap(rdpg->rp_pages[pageidx]); + + /* fill lu pages */ + for (i = 0; i < LU_PAGE_COUNT; i++, lp++, nob -= LU_PAGE_SIZE) { + rc = filler(env, lp, min_t(int, nob, LU_PAGE_SIZE), + iops, it, rdpg->rp_attrs, arg); + if (rc < 0) + break; + /* one more lu_page */ + nlupgs++; + if (rc > 0) + /* end of index */ + break; + } + kunmap(rdpg->rp_pages[i]); + } + + iops->put(env, it); + iops->fini(env, it); + + if (rc >= 0) + rc = min_t(unsigned int, nlupgs * LU_PAGE_SIZE, rdpg->rp_count); + + RETURN(rc); +} +EXPORT_SYMBOL(dt_index_walk); + +/** + * Walk key/record pairs of an index and copy them into 4KB containers to be + * transferred over the network. This is the common handler for OBD_IDX_READ + * RPC processing. + * + * \param env - is the environment passed by the caller + * \param dev - is the dt_device storing the index + * \param ii - is the idx_info structure packed by the client in the + * OBD_IDX_READ request + * \param rdpg - is the lu_rdpg descriptor + * + * \retval on success, return sum (in bytes) of all filled containers + * \retval appropriate error otherwise. 
+ */ +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg) +{ + const struct dt_index_features *feat; + struct dt_object *obj; + int rc; + ENTRY; + + /* rp_count shouldn't be null and should be a multiple of the container + * size */ + if (rdpg->rp_count <= 0 && (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0) + RETURN(-EFAULT); + + if (fid_seq(&ii->ii_fid) >= FID_SEQ_NORMAL) + /* we don't support directory transfer via OBD_IDX_READ for the + * time being */ + RETURN(-EOPNOTSUPP); + + if (!fid_is_quota(&ii->ii_fid)) + /* block access to all local files except quota files */ + RETURN(-EPERM); + + /* lookup index object subject to the transfer */ + obj = dt_locate(env, dev, &ii->ii_fid); + if (IS_ERR(obj)) + RETURN(PTR_ERR(obj)); + if (dt_object_exists(obj) == 0) + GOTO(out, rc = -ENOENT); + + /* fetch index features associated with index object */ + feat = dt_index_feat_select(fid_seq(&ii->ii_fid), + lu_object_attr(&obj->do_lu)); + if (IS_ERR(feat)) + GOTO(out, rc = PTR_ERR(feat)); + + /* load index feature if not done already */ + if (obj->do_index_ops == NULL) { + rc = obj->do_ops->do_index_try(env, obj, feat); + if (rc) + GOTO(out, rc); + } + + /* fill ii_flags with supported index features */ + ii->ii_flags &= II_FL_NOHASH; + + ii->ii_keysize = feat->dif_keysize_max; + if ((feat->dif_flags & DT_IND_VARKEY) != 0) { + /* key size is variable */ + ii->ii_flags |= II_FL_VARKEY; + /* we don't support variable key size for the time being */ + GOTO(out, rc = -EOPNOTSUPP); + } + + ii->ii_recsize = feat->dif_recsize_max; + if ((feat->dif_flags & DT_IND_VARREC) != 0) { + /* record size is variable */ + ii->ii_flags |= II_FL_VARREC; + /* we don't support variable record size for the time being */ + GOTO(out, rc = -EOPNOTSUPP); + } + + if ((feat->dif_flags & DT_IND_NONUNQ) != 0) + /* key isn't necessarily unique */ + ii->ii_flags |= II_FL_NONUNQ; + + dt_read_lock(env, obj, 0); + /* fetch object version before 
walking the index */ + ii->ii_version = dt_version_get(env, obj); + + /* walk the index and fill lu_idxpages with key/record pairs */ + rc = dt_index_walk(env, obj, rdpg, dt_index_page_build ,ii); + dt_read_unlock(env, obj); + + if (rc == 0) { + /* index is empty */ + LASSERT(ii->ii_count == 0); + ii->ii_hash_end = II_END_OFF; + } + + GOTO(out, rc); +out: + lu_object_put(env, &obj->do_lu); + return rc; +} +EXPORT_SYMBOL(dt_index_read); + +#ifdef LPROCFS + +int lprocfs_dt_rd_blksize(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + *eof = 1; + rc = snprintf(page, count, "%u\n", + (unsigned) osfs.os_bsize); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_blksize); + +int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, LPU64"\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_kbytestotal); + +int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, LPU64"\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_kbytesfree); + +int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = 
osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, LPU64"\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_kbytesavail); + +int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + *eof = 1; + rc = snprintf(page, count, LPU64"\n", osfs.os_files); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_filestotal); + +int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + *eof = 1; + rc = snprintf(page, count, LPU64"\n", osfs.os_ffree); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_filesfree); + +#endif /* LPROCFS */ diff --git a/drivers/staging/lustre/lustre/obdclass/genops.c b/drivers/staging/lustre/lustre/obdclass/genops.c new file mode 100644 index 000000000000..d96876e0bc68 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/genops.c @@ -0,0 +1,1853 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/genops.c + * + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#include <obd_ost.h> +#include <obd_class.h> +#include <lprocfs_status.h> + +extern struct list_head obd_types; +spinlock_t obd_types_lock; + +struct kmem_cache *obd_device_cachep; +struct kmem_cache *obdo_cachep; +EXPORT_SYMBOL(obdo_cachep); +struct kmem_cache *import_cachep; + +struct list_head obd_zombie_imports; +struct list_head obd_zombie_exports; +spinlock_t obd_zombie_impexp_lock; +static void obd_zombie_impexp_notify(void); +static void obd_zombie_export_add(struct obd_export *exp); +static void obd_zombie_import_add(struct obd_import *imp); +static void print_export_data(struct obd_export *exp, + const char *status, int locks); + +int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); +EXPORT_SYMBOL(ptlrpc_put_connection_superhack); + +/* + * support functions: we could use inter-module communication, but this + * is more portable to other OS's + */ +static struct obd_device *obd_device_alloc(void) +{ + struct obd_device *obd; + + OBD_SLAB_ALLOC_PTR_GFP(obd, obd_device_cachep, __GFP_IO); + if (obd != NULL) { + obd->obd_magic = OBD_DEVICE_MAGIC; + } + return obd; +} + +static 
void obd_device_free(struct obd_device *obd) +{ + LASSERT(obd != NULL); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + if (obd->obd_namespace != NULL) { + CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n", + obd, obd->obd_namespace, obd->obd_force); + LBUG(); + } + lu_ref_fini(&obd->obd_reference); + OBD_SLAB_FREE_PTR(obd, obd_device_cachep); +} + +struct obd_type *class_search_type(const char *name) +{ + struct list_head *tmp; + struct obd_type *type; + + spin_lock(&obd_types_lock); + list_for_each(tmp, &obd_types) { + type = list_entry(tmp, struct obd_type, typ_chain); + if (strcmp(type->typ_name, name) == 0) { + spin_unlock(&obd_types_lock); + return type; + } + } + spin_unlock(&obd_types_lock); + return NULL; +} +EXPORT_SYMBOL(class_search_type); + +struct obd_type *class_get_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + + if (!type) { + const char *modname = name; + + if (strcmp(modname, "obdfilter") == 0) + modname = "ofd"; + + if (strcmp(modname, LUSTRE_LWP_NAME) == 0) + modname = LUSTRE_OSP_NAME; + + if (!strncmp(modname, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME))) + modname = LUSTRE_MDT_NAME; + + if (!request_module("%s", modname)) { + CDEBUG(D_INFO, "Loaded module '%s'\n", modname); + type = class_search_type(name); + } else { + LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n", + modname); + } + } + if (type) { + spin_lock(&type->obd_type_lock); + type->typ_refcnt++; + try_module_get(type->typ_dt_ops->o_owner); + spin_unlock(&type->obd_type_lock); + } + return type; +} +EXPORT_SYMBOL(class_get_type); + +void class_put_type(struct obd_type *type) +{ + LASSERT(type); + spin_lock(&type->obd_type_lock); + type->typ_refcnt--; + module_put(type->typ_dt_ops->o_owner); + spin_unlock(&type->obd_type_lock); +} +EXPORT_SYMBOL(class_put_type); + +#define CLASS_MAX_NAME 1024 + +int class_register_type(struct obd_ops *dt_ops, 
struct md_ops *md_ops, + struct lprocfs_vars *vars, const char *name, + struct lu_device_type *ldt) +{ + struct obd_type *type; + int rc = 0; + ENTRY; + + /* sanity check */ + LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); + + if (class_search_type(name)) { + CDEBUG(D_IOCTL, "Type %s already registered\n", name); + RETURN(-EEXIST); + } + + rc = -ENOMEM; + OBD_ALLOC(type, sizeof(*type)); + if (type == NULL) + RETURN(rc); + + OBD_ALLOC_PTR(type->typ_dt_ops); + OBD_ALLOC_PTR(type->typ_md_ops); + OBD_ALLOC(type->typ_name, strlen(name) + 1); + + if (type->typ_dt_ops == NULL || + type->typ_md_ops == NULL || + type->typ_name == NULL) + GOTO (failed, rc); + + *(type->typ_dt_ops) = *dt_ops; + /* md_ops is optional */ + if (md_ops) + *(type->typ_md_ops) = *md_ops; + strcpy(type->typ_name, name); + spin_lock_init(&type->obd_type_lock); + +#ifdef LPROCFS + type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root, + vars, type); + if (IS_ERR(type->typ_procroot)) { + rc = PTR_ERR(type->typ_procroot); + type->typ_procroot = NULL; + GOTO (failed, rc); + } +#endif + if (ldt != NULL) { + type->typ_lu = ldt; + rc = lu_device_type_init(ldt); + if (rc != 0) + GOTO (failed, rc); + } + + spin_lock(&obd_types_lock); + list_add(&type->typ_chain, &obd_types); + spin_unlock(&obd_types_lock); + + RETURN (0); + + failed: + if (type->typ_name != NULL) + OBD_FREE(type->typ_name, strlen(name) + 1); + if (type->typ_md_ops != NULL) + OBD_FREE_PTR(type->typ_md_ops); + if (type->typ_dt_ops != NULL) + OBD_FREE_PTR(type->typ_dt_ops); + OBD_FREE(type, sizeof(*type)); + RETURN(rc); +} +EXPORT_SYMBOL(class_register_type); + +int class_unregister_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + ENTRY; + + if (!type) { + CERROR("unknown obd type\n"); + RETURN(-EINVAL); + } + + if (type->typ_refcnt) { + CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt); + /* This is a bad situation, let's make the best of it */ + /* Remove ops, but leave the 
name for debugging */ + OBD_FREE_PTR(type->typ_dt_ops); + OBD_FREE_PTR(type->typ_md_ops); + RETURN(-EBUSY); + } + + if (type->typ_procroot) { + lprocfs_remove(&type->typ_procroot); + } + + if (type->typ_lu) + lu_device_type_fini(type->typ_lu); + + spin_lock(&obd_types_lock); + list_del(&type->typ_chain); + spin_unlock(&obd_types_lock); + OBD_FREE(type->typ_name, strlen(name) + 1); + if (type->typ_dt_ops != NULL) + OBD_FREE_PTR(type->typ_dt_ops); + if (type->typ_md_ops != NULL) + OBD_FREE_PTR(type->typ_md_ops); + OBD_FREE(type, sizeof(*type)); + RETURN(0); +} /* class_unregister_type */ +EXPORT_SYMBOL(class_unregister_type); + +/** + * Create a new obd device. + * + * Find an empty slot in ::obd_devs[], create a new obd device in it. + * + * \param[in] type_name obd device type string. + * \param[in] name obd device name. + * + * \retval NULL if create fails, otherwise return the obd device + * pointer created. + */ +struct obd_device *class_newdev(const char *type_name, const char *name) +{ + struct obd_device *result = NULL; + struct obd_device *newdev; + struct obd_type *type = NULL; + int i; + int new_obd_minor = 0; + ENTRY; + + if (strlen(name) >= MAX_OBD_NAME) { + CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME); + RETURN(ERR_PTR(-EINVAL)); + } + + type = class_get_type(type_name); + if (type == NULL){ + CERROR("OBD: unknown type: %s\n", type_name); + RETURN(ERR_PTR(-ENODEV)); + } + + newdev = obd_device_alloc(); + if (newdev == NULL) + GOTO(out_type, result = ERR_PTR(-ENOMEM)); + + LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC); + + write_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && (strcmp(name, obd->obd_name) == 0)) { + CERROR("Device %s already exists at %d, won't add\n", + name, i); + if (result) { + LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC, + "%p obd_magic %08x != %08x\n", result, + result->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(result->obd_minor == 
new_obd_minor, + "%p obd_minor %d != %d\n", result, + result->obd_minor, new_obd_minor); + + obd_devs[result->obd_minor] = NULL; + result->obd_name[0]='\0'; + } + result = ERR_PTR(-EEXIST); + break; + } + if (!result && !obd) { + result = newdev; + result->obd_minor = i; + new_obd_minor = i; + result->obd_type = type; + strncpy(result->obd_name, name, + sizeof(result->obd_name) - 1); + obd_devs[i] = result; + } + } + write_unlock(&obd_dev_lock); + + if (result == NULL && i >= class_devno_max()) { + CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n", + class_devno_max()); + GOTO(out, result = ERR_PTR(-EOVERFLOW)); + } + + if (IS_ERR(result)) + GOTO(out, result); + + CDEBUG(D_IOCTL, "Adding new device %s (%p)\n", + result->obd_name, result); + + RETURN(result); +out: + obd_device_free(newdev); +out_type: + class_put_type(type); + return result; +} + +void class_release_dev(struct obd_device *obd) +{ + struct obd_type *obd_type = obd->obd_type; + + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, obd_devs[obd->obd_minor]); + LASSERT(obd_type != NULL); + + CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n", + obd->obd_name, obd->obd_minor, obd->obd_type->typ_name); + + write_lock(&obd_dev_lock); + obd_devs[obd->obd_minor] = NULL; + write_unlock(&obd_dev_lock); + obd_device_free(obd); + + class_put_type(obd_type); +} + +int class_name2dev(const char *name) +{ + int i; + + if (!name) + return -1; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && strcmp(name, obd->obd_name) == 0) { + /* Make sure we finished attaching before we give + out any references */ + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_attached) { + read_unlock(&obd_dev_lock); + return i; + } + break; + } + } + 
read_unlock(&obd_dev_lock); + + return -1; +} +EXPORT_SYMBOL(class_name2dev); + +struct obd_device *class_name2obd(const char *name) +{ + int dev = class_name2dev(name); + + if (dev < 0 || dev > class_devno_max()) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_name2obd); + +int class_uuid2dev(struct obd_uuid *uuid) +{ + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) { + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + read_unlock(&obd_dev_lock); + return i; + } + } + read_unlock(&obd_dev_lock); + + return -1; +} +EXPORT_SYMBOL(class_uuid2dev); + +struct obd_device *class_uuid2obd(struct obd_uuid *uuid) +{ + int dev = class_uuid2dev(uuid); + if (dev < 0) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_uuid2obd); + +/** + * Get obd device from ::obd_devs[] + * + * \param num [in] array index + * + * \retval NULL if ::obd_devs[\a num] does not contains an obd device + * otherwise return the obd device there. + */ +struct obd_device *class_num2obd(int num) +{ + struct obd_device *obd = NULL; + + if (num < class_devno_max()) { + obd = obd_devs[num]; + if (obd == NULL) + return NULL; + + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "%p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd->obd_minor == num, + "%p obd_minor %0d != %0d\n", + obd, obd->obd_minor, num); + } + + return obd; +} +EXPORT_SYMBOL(class_num2obd); + +/** + * Get obd devices count. 
Device in any + * state are counted + * \retval obd device count + */ +int get_devices_count(void) +{ + int index, max_index = class_devno_max(), dev_count = 0; + + read_lock(&obd_dev_lock); + for (index = 0; index <= max_index; index++) { + struct obd_device *obd = class_num2obd(index); + if (obd != NULL) + dev_count++; + } + read_unlock(&obd_dev_lock); + + return dev_count; +} +EXPORT_SYMBOL(get_devices_count); + +void class_obd_list(void) +{ + char *status; + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n", + i, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + } + read_unlock(&obd_dev_lock); + return; +} + +/* Search for a client OBD connected to tgt_uuid. If grp_uuid is + specified, then only the client with that uuid is returned, + otherwise any client connected to the tgt is returned. */ +struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid, + const char * typ_name, + struct obd_uuid *grp_uuid) +{ + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if ((strncmp(obd->obd_type->typ_name, typ_name, + strlen(typ_name)) == 0)) { + if (obd_uuid_equals(tgt_uuid, + &obd->u.cli.cl_target_uuid) && + ((grp_uuid)? obd_uuid_equals(grp_uuid, + &obd->obd_uuid) : 1)) { + read_unlock(&obd_dev_lock); + return obd; + } + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_find_client_obd); + +/* Iterate the obd_device list looking devices have grp_uuid. Start + searching at *next, and if a device is found, the next index to look + at is saved in *next. 
If next is NULL, then the first matching device + will always be returned. */ +struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, int *next) +{ + int i; + + if (next == NULL) + i = 0; + else if (*next >= 0 && *next < class_devno_max()) + i = *next; + else + return NULL; + + read_lock(&obd_dev_lock); + for (; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd_uuid_equals(grp_uuid, &obd->obd_uuid)) { + if (next != NULL) + *next = i+1; + read_unlock(&obd_dev_lock); + return obd; + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_devices_in_group); + +/** + * to notify sptlrpc log for \a fsname has changed, let every relevant OBD + * adjust sptlrpc settings accordingly. + */ +int class_notify_sptlrpc_conf(const char *fsname, int namelen) +{ + struct obd_device *obd; + const char *type; + int i, rc = 0, rc2; + + LASSERT(namelen > 0); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + obd = class_num2obd(i); + + if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping) + continue; + + /* only notify mdc, osc, mdt, ost */ + type = obd->obd_type->typ_name; + if (strcmp(type, LUSTRE_MDC_NAME) != 0 && + strcmp(type, LUSTRE_OSC_NAME) != 0 && + strcmp(type, LUSTRE_MDT_NAME) != 0 && + strcmp(type, LUSTRE_OST_NAME) != 0) + continue; + + if (strncmp(obd->obd_name, fsname, namelen)) + continue; + + class_incref(obd, __FUNCTION__, obd); + read_unlock(&obd_dev_lock); + rc2 = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_SPTLRPC_CONF), + KEY_SPTLRPC_CONF, 0, NULL, NULL); + rc = rc ? 
rc : rc2; + class_decref(obd, __FUNCTION__, obd); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + return rc; +} +EXPORT_SYMBOL(class_notify_sptlrpc_conf); + +void obd_cleanup_caches(void) +{ + ENTRY; + if (obd_device_cachep) { + kmem_cache_destroy(obd_device_cachep); + obd_device_cachep = NULL; + } + if (obdo_cachep) { + kmem_cache_destroy(obdo_cachep); + obdo_cachep = NULL; + } + if (import_cachep) { + kmem_cache_destroy(import_cachep); + import_cachep = NULL; + } + if (capa_cachep) { + kmem_cache_destroy(capa_cachep); + capa_cachep = NULL; + } + EXIT; +} + +int obd_init_caches(void) +{ + ENTRY; + + LASSERT(obd_device_cachep == NULL); + obd_device_cachep = kmem_cache_create("ll_obd_dev_cache", + sizeof(struct obd_device), + 0, 0, NULL); + if (!obd_device_cachep) + GOTO(out, -ENOMEM); + + LASSERT(obdo_cachep == NULL); + obdo_cachep = kmem_cache_create("ll_obdo_cache", sizeof(struct obdo), + 0, 0, NULL); + if (!obdo_cachep) + GOTO(out, -ENOMEM); + + LASSERT(import_cachep == NULL); + import_cachep = kmem_cache_create("ll_import_cache", + sizeof(struct obd_import), + 0, 0, NULL); + if (!import_cachep) + GOTO(out, -ENOMEM); + + LASSERT(capa_cachep == NULL); + capa_cachep = kmem_cache_create("capa_cache", + sizeof(struct obd_capa), 0, 0, NULL); + if (!capa_cachep) + GOTO(out, -ENOMEM); + + RETURN(0); + out: + obd_cleanup_caches(); + RETURN(-ENOMEM); + +} + +/* map connection to client */ +struct obd_export *class_conn2export(struct lustre_handle *conn) +{ + struct obd_export *export; + ENTRY; + + if (!conn) { + CDEBUG(D_CACHE, "looking for null handle\n"); + RETURN(NULL); + } + + if (conn->cookie == -1) { /* this means assign a new connection */ + CDEBUG(D_CACHE, "want a new connection\n"); + RETURN(NULL); + } + + CDEBUG(D_INFO, "looking for export cookie "LPX64"\n", conn->cookie); + export = class_handle2object(conn->cookie); + RETURN(export); +} +EXPORT_SYMBOL(class_conn2export); + +struct obd_device *class_exp2obd(struct obd_export *exp) +{ + if (exp) 
+ return exp->exp_obd; + return NULL; +} +EXPORT_SYMBOL(class_exp2obd); + +struct obd_device *class_conn2obd(struct lustre_handle *conn) +{ + struct obd_export *export; + export = class_conn2export(conn); + if (export) { + struct obd_device *obd = export->exp_obd; + class_export_put(export); + return obd; + } + return NULL; +} +EXPORT_SYMBOL(class_conn2obd); + +struct obd_import *class_exp2cliimp(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} +EXPORT_SYMBOL(class_exp2cliimp); + +struct obd_import *class_conn2cliimp(struct lustre_handle *conn) +{ + struct obd_device *obd = class_conn2obd(conn); + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} +EXPORT_SYMBOL(class_conn2cliimp); + +/* Export management functions */ +static void class_export_destroy(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + ENTRY; + + LASSERT_ATOMIC_ZERO(&exp->exp_refcount); + LASSERT(obd != NULL); + + CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", exp, + exp->exp_client_uuid.uuid, obd->obd_name); + + /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. 
/*
 * Drop one reference on an export.
 *
 * The last put does not free the export inline: after releasing the
 * lprocfs nid-stat reference it queues the export on the zombie list
 * via obd_zombie_export_add(), and actual destruction happens later
 * (deferred teardown).
 *
 * \param exp  export to release; must be non-NULL with a live
 *             (0 < refcount < LI_POISON) reference count.
 */
void class_export_put(struct obd_export *exp)
{
	LASSERT(exp != NULL);
	LASSERT_ATOMIC_GT_LT(&exp->exp_refcount, 0, LI_POISON);
	CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp,
	       atomic_read(&exp->exp_refcount) - 1);

	if (atomic_dec_and_test(&exp->exp_refcount)) {
		/* the export must still be linked on one of the obd's
		 * export lists; the zombie thread unlinks it */
		LASSERT(!list_empty(&exp->exp_obd_chain));
		CDEBUG(D_IOCTL, "final put %p/%s\n",
		       exp, exp->exp_client_uuid.uuid);

		/* release nid stat reference */
		lprocfs_exp_cleanup(exp);

		/* hand the export to the zombie thread for destruction */
		obd_zombie_export_add(exp);
	}
}
EXPORT_SYMBOL(class_export_put);
*/ +struct obd_export *class_new_export(struct obd_device *obd, + struct obd_uuid *cluuid) +{ + struct obd_export *export; + cfs_hash_t *hash = NULL; + int rc = 0; + ENTRY; + + OBD_ALLOC_PTR(export); + if (!export) + return ERR_PTR(-ENOMEM); + + export->exp_conn_cnt = 0; + export->exp_lock_hash = NULL; + export->exp_flock_hash = NULL; + atomic_set(&export->exp_refcount, 2); + atomic_set(&export->exp_rpc_count, 0); + atomic_set(&export->exp_cb_count, 0); + atomic_set(&export->exp_locks_count, 0); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&export->exp_locks_list); + spin_lock_init(&export->exp_locks_list_guard); +#endif + atomic_set(&export->exp_replay_count, 0); + export->exp_obd = obd; + INIT_LIST_HEAD(&export->exp_outstanding_replies); + spin_lock_init(&export->exp_uncommitted_replies_lock); + INIT_LIST_HEAD(&export->exp_uncommitted_replies); + INIT_LIST_HEAD(&export->exp_req_replay_queue); + INIT_LIST_HEAD(&export->exp_handle.h_link); + INIT_LIST_HEAD(&export->exp_hp_rpcs); + class_handle_hash(&export->exp_handle, &export_handle_ops); + export->exp_last_request_time = cfs_time_current_sec(); + spin_lock_init(&export->exp_lock); + spin_lock_init(&export->exp_rpc_lock); + INIT_HLIST_NODE(&export->exp_uuid_hash); + INIT_HLIST_NODE(&export->exp_nid_hash); + spin_lock_init(&export->exp_bl_list_lock); + INIT_LIST_HEAD(&export->exp_bl_list); + + export->exp_sp_peer = LUSTRE_SP_ANY; + export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; + export->exp_client_uuid = *cluuid; + obd_init_export(export); + + spin_lock(&obd->obd_dev_lock); + /* shouldn't happen, but might race */ + if (obd->obd_stopping) + GOTO(exit_unlock, rc = -ENODEV); + + hash = cfs_hash_getref(obd->obd_uuid_hash); + if (hash == NULL) + GOTO(exit_unlock, rc = -ENODEV); + spin_unlock(&obd->obd_dev_lock); + + if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { + rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash); + if (rc != 0) { + LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n", + 
/*
 * Unlink an export from all lookup structures: the global handle table,
 * the obd's uuid hash, and the timed-export list.  The export itself is
 * moved to obd_unlinked_exports (still under obd_dev_lock) and the hash
 * table's reference is dropped, so only remaining callers keep it alive.
 */
void class_unlink_export(struct obd_export *exp)
{
	/* first make the handle unreachable so no new lookups succeed */
	class_handle_unhash(&exp->exp_handle);

	spin_lock(&exp->exp_obd->obd_dev_lock);
	/* delete an uuid-export hashitem from hashtables */
	if (!hlist_unhashed(&exp->exp_uuid_hash))
		cfs_hash_del(exp->exp_obd->obd_uuid_hash,
			     &exp->exp_client_uuid,
			     &exp->exp_uuid_hash);

	/* keep the export findable for debugging until its final put */
	list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports);
	list_del_init(&exp->exp_obd_chain_timed);
	exp->exp_obd->obd_num_exports--;
	spin_unlock(&exp->exp_obd->obd_dev_lock);
	/* drop the reference the hash table held */
	class_export_put(exp);
}
EXPORT_SYMBOL(class_unlink_export);
/*
 * Drop one reference on an import.  The final put queues the import on
 * the zombie list (obd_zombie_import_add()) for deferred destruction by
 * the zombie thread; it is not freed inline.
 *
 * \param imp  import to release; must not already be on the zombie
 *             list and must hold a live (0 < refcount < LI_POISON)
 *             reference count.
 */
void class_import_put(struct obd_import *imp)
{
	ENTRY;

	LASSERT(list_empty(&imp->imp_zombie_chain));
	LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON);

	CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp,
	       atomic_read(&imp->imp_refcount) - 1,
	       imp->imp_obd->obd_name);

	if (atomic_dec_and_test(&imp->imp_refcount)) {
		CDEBUG(D_INFO, "final put import %p\n", imp);
		obd_zombie_import_add(imp);
	}

	/* catch possible import put race */
	/* NOTE(review): this assert reads imp->imp_refcount after the
	 * final reference may have been handed to the zombie thread --
	 * it relies on zombie destruction not racing with this check;
	 * confirm against the zombie-thread implementation. */
	LASSERT_ATOMIC_GE_LT(&imp->imp_refcount, 0, LI_POISON);
	EXIT;
}
EXPORT_SYMBOL(class_import_put);
(But keep hist for proc histogram, worst_ever) */ + at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT, + AT_FLG_NOHIST); + } +} + +struct obd_import *class_new_import(struct obd_device *obd) +{ + struct obd_import *imp; + + OBD_ALLOC(imp, sizeof(*imp)); + if (imp == NULL) + return NULL; + + INIT_LIST_HEAD(&imp->imp_pinger_chain); + INIT_LIST_HEAD(&imp->imp_zombie_chain); + INIT_LIST_HEAD(&imp->imp_replay_list); + INIT_LIST_HEAD(&imp->imp_sending_list); + INIT_LIST_HEAD(&imp->imp_delayed_list); + spin_lock_init(&imp->imp_lock); + imp->imp_last_success_conn = 0; + imp->imp_state = LUSTRE_IMP_NEW; + imp->imp_obd = class_incref(obd, "import", imp); + mutex_init(&imp->imp_sec_mutex); + init_waitqueue_head(&imp->imp_recovery_waitq); + + atomic_set(&imp->imp_refcount, 2); + atomic_set(&imp->imp_unregistering, 0); + atomic_set(&imp->imp_inflight, 0); + atomic_set(&imp->imp_replay_inflight, 0); + atomic_set(&imp->imp_inval_count, 0); + INIT_LIST_HEAD(&imp->imp_conn_list); + INIT_LIST_HEAD(&imp->imp_handle.h_link); + class_handle_hash(&imp->imp_handle, &import_handle_ops); + init_imp_at(&imp->imp_at); + + /* the default magic is V2, will be used in connect RPC, and + * then adjusted according to the flags in request/reply. 
/*
 * Begin tearing down an import: make its handle unreachable, bump the
 * generation under imp_lock (presumably so in-flight users of the old
 * generation can detect staleness -- confirm against import callers),
 * then drop the caller's reference.  Actual destruction happens via
 * class_import_put() / the zombie path once the refcount reaches zero.
 */
void class_destroy_import(struct obd_import *import)
{
	LASSERT(import != NULL);
	LASSERT(import != LP_POISON);

	class_handle_unhash(&import->imp_handle);

	spin_lock(&import->imp_lock);
	import->imp_generation++;
	spin_unlock(&import->imp_lock);
	class_import_put(import);
}
EXPORT_SYMBOL(class_destroy_import);
*/ +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid) +{ + struct obd_export *export; + LASSERT(conn != NULL); + LASSERT(obd != NULL); + LASSERT(cluuid != NULL); + ENTRY; + + export = class_new_export(obd, cluuid); + if (IS_ERR(export)) + RETURN(PTR_ERR(export)); + + conn->cookie = export->exp_handle.h_cookie; + class_export_put(export); + + CDEBUG(D_IOCTL, "connect: client %s, cookie "LPX64"\n", + cluuid->uuid, conn->cookie); + RETURN(0); +} +EXPORT_SYMBOL(class_connect); + +/* if export is involved in recovery then clean up related things */ +void class_export_recovery_cleanup(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + + spin_lock(&obd->obd_recovery_task_lock); + if (exp->exp_delayed) + obd->obd_delayed_clients--; + if (obd->obd_recovering) { + if (exp->exp_in_recovery) { + spin_lock(&exp->exp_lock); + exp->exp_in_recovery = 0; + spin_unlock(&exp->exp_lock); + LASSERT_ATOMIC_POS(&obd->obd_connected_clients); + atomic_dec(&obd->obd_connected_clients); + } + + /* if called during recovery then should update + * obd_stale_clients counter, + * lightweight exports are not counted */ + if (exp->exp_failed && + (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) == 0) + exp->exp_obd->obd_stale_clients++; + } + spin_unlock(&obd->obd_recovery_task_lock); + /** Cleanup req replay fields */ + if (exp->exp_req_replay_needed) { + spin_lock(&exp->exp_lock); + exp->exp_req_replay_needed = 0; + spin_unlock(&exp->exp_lock); + LASSERT(atomic_read(&obd->obd_req_replay_clients)); + atomic_dec(&obd->obd_req_replay_clients); + } + /** Cleanup lock replay data */ + if (exp->exp_lock_replay_needed) { + spin_lock(&exp->exp_lock); + exp->exp_lock_replay_needed = 0; + spin_unlock(&exp->exp_lock); + LASSERT(atomic_read(&obd->obd_lock_replay_clients)); + atomic_dec(&obd->obd_lock_replay_clients); + } +} + +/* This function removes 1-3 references from the export: + * 1 - for export pointer passed + * and if disconnect 
really need + * 2 - removing from hash + * 3 - in client_unlink_export + * The export pointer passed to this function can destroyed */ +int class_disconnect(struct obd_export *export) +{ + int already_disconnected; + ENTRY; + + if (export == NULL) { + CWARN("attempting to free NULL export %p\n", export); + RETURN(-EINVAL); + } + + spin_lock(&export->exp_lock); + already_disconnected = export->exp_disconnected; + export->exp_disconnected = 1; + spin_unlock(&export->exp_lock); + + /* class_cleanup(), abort_recovery(), and class_fail_export() + * all end up in here, and if any of them race we shouldn't + * call extra class_export_puts(). */ + if (already_disconnected) { + LASSERT(hlist_unhashed(&export->exp_nid_hash)); + GOTO(no_disconn, already_disconnected); + } + + CDEBUG(D_IOCTL, "disconnect: cookie "LPX64"\n", + export->exp_handle.h_cookie); + + if (!hlist_unhashed(&export->exp_nid_hash)) + cfs_hash_del(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); + + class_export_recovery_cleanup(export); + class_unlink_export(export); +no_disconn: + class_export_put(export); + RETURN(0); +} +EXPORT_SYMBOL(class_disconnect); + +/* Return non-zero for a fully connected export */ +int class_connected_export(struct obd_export *exp) +{ + if (exp) { + int connected; + spin_lock(&exp->exp_lock); + connected = (exp->exp_conn_cnt > 0); + spin_unlock(&exp->exp_lock); + return connected; + } + return 0; +} +EXPORT_SYMBOL(class_connected_export); + +static void class_disconnect_export_list(struct list_head *list, + enum obd_option flags) +{ + int rc; + struct obd_export *exp; + ENTRY; + + /* It's possible that an export may disconnect itself, but + * nothing else will be added to this list. 
*/ + while (!list_empty(list)) { + exp = list_entry(list->next, struct obd_export, + exp_obd_chain); + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + spin_lock(&exp->exp_lock); + exp->exp_flags = flags; + spin_unlock(&exp->exp_lock); + + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) { + CDEBUG(D_HA, + "exp %p export uuid == obd uuid, don't discon\n", + exp); + /* Need to delete this now so we don't end up pointing + * to work_list later when this export is cleaned up. */ + list_del_init(&exp->exp_obd_chain); + class_export_put(exp); + continue; + } + + class_export_get(exp); + CDEBUG(D_HA, "%s: disconnecting export at %s (%p), " + "last request at "CFS_TIME_T"\n", + exp->exp_obd->obd_name, obd_export_nid2str(exp), + exp, exp->exp_last_request_time); + /* release one export reference anyway */ + rc = obd_disconnect(exp); + + CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n", + obd_export_nid2str(exp), exp, rc); + class_export_put(exp); + } + EXIT; +} + +void class_disconnect_exports(struct obd_device *obd) +{ + struct list_head work_list; + ENTRY; + + /* Move all of the exports from obd_exports to a work list, en masse. */ + INIT_LIST_HEAD(&work_list); + spin_lock(&obd->obd_dev_lock); + list_splice_init(&obd->obd_exports, &work_list); + list_splice_init(&obd->obd_delayed_exports, &work_list); + spin_unlock(&obd->obd_dev_lock); + + if (!list_empty(&work_list)) { + CDEBUG(D_HA, "OBD device %d (%p) has exports, " + "disconnecting them\n", obd->obd_minor, obd); + class_disconnect_export_list(&work_list, + exp_flags_from_obd(obd)); + } else + CDEBUG(D_HA, "OBD device %d (%p) has no exports\n", + obd->obd_minor, obd); + EXIT; +} +EXPORT_SYMBOL(class_disconnect_exports); + +/* Remove exports that have not completed recovery. 
+ */ +void class_disconnect_stale_exports(struct obd_device *obd, + int (*test_export)(struct obd_export *)) +{ + struct list_head work_list; + struct obd_export *exp, *n; + int evicted = 0; + ENTRY; + + INIT_LIST_HEAD(&work_list); + spin_lock(&obd->obd_dev_lock); + list_for_each_entry_safe(exp, n, &obd->obd_exports, + exp_obd_chain) { + /* don't count self-export as client */ + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) + continue; + + /* don't evict clients which have no slot in last_rcvd + * (e.g. lightweight connection) */ + if (exp->exp_target_data.ted_lr_idx == -1) + continue; + + spin_lock(&exp->exp_lock); + if (exp->exp_failed || test_export(exp)) { + spin_unlock(&exp->exp_lock); + continue; + } + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + list_move(&exp->exp_obd_chain, &work_list); + evicted++; + CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n", + obd->obd_name, exp->exp_client_uuid.uuid, + exp->exp_connection == NULL ? "<unknown>" : + libcfs_nid2str(exp->exp_connection->c_peer.nid)); + print_export_data(exp, "EVICTING", 0); + } + spin_unlock(&obd->obd_dev_lock); + + if (evicted) + LCONSOLE_WARN("%s: disconnecting %d stale clients\n", + obd->obd_name, evicted); + + class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); + EXIT; +} +EXPORT_SYMBOL(class_disconnect_stale_exports); + +void class_fail_export(struct obd_export *exp) +{ + int rc, already_failed; + + spin_lock(&exp->exp_lock); + already_failed = exp->exp_failed; + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + if (already_failed) { + CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n", + exp, exp->exp_client_uuid.uuid); + return; + } + + CDEBUG(D_HA, "disconnecting export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + /* Most callers into obd_disconnect are removing 
their own reference + * (request, for example) in addition to the one from the hash table. + * We don't have such a reference here, so make one. */ + class_export_get(exp); + rc = obd_disconnect(exp); + if (rc) + CERROR("disconnecting export %p failed: %d\n", exp, rc); + else + CDEBUG(D_HA, "disconnected export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + class_export_put(exp); +} +EXPORT_SYMBOL(class_fail_export); + +char *obd_export_nid2str(struct obd_export *exp) +{ + if (exp->exp_connection != NULL) + return libcfs_nid2str(exp->exp_connection->c_peer.nid); + + return "(no nid)"; +} +EXPORT_SYMBOL(obd_export_nid2str); + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid) +{ + cfs_hash_t *nid_hash; + struct obd_export *doomed_exp = NULL; + int exports_evicted = 0; + + lnet_nid_t nid_key = libcfs_str2nid((char *)nid); + + spin_lock(&obd->obd_dev_lock); + /* umount has run already, so evict thread should leave + * its task to umount thread now */ + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + nid_hash = obd->obd_nid_hash; + cfs_hash_getref(nid_hash); + spin_unlock(&obd->obd_dev_lock); + + do { + doomed_exp = cfs_hash_lookup(nid_hash, &nid_key); + if (doomed_exp == NULL) + break; + + LASSERTF(doomed_exp->exp_connection->c_peer.nid == nid_key, + "nid %s found, wanted nid %s, requested nid %s\n", + obd_export_nid2str(doomed_exp), + libcfs_nid2str(nid_key), nid); + LASSERTF(doomed_exp != obd->obd_self_export, + "self-export is hashed by NID?\n"); + exports_evicted++; + LCONSOLE_WARN("%s: evicting %s (at %s) by administrative " + "request\n", obd->obd_name, + obd_uuid2str(&doomed_exp->exp_client_uuid), + obd_export_nid2str(doomed_exp)); + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + } while (1); + + cfs_hash_putref(nid_hash); + + if (!exports_evicted) + CDEBUG(D_HA,"%s: can't disconnect NID '%s': no exports found\n", + obd->obd_name, nid); + return exports_evicted; +} 
+EXPORT_SYMBOL(obd_export_evict_by_nid); + +int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid) +{ + cfs_hash_t *uuid_hash; + struct obd_export *doomed_exp = NULL; + struct obd_uuid doomed_uuid; + int exports_evicted = 0; + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + uuid_hash = obd->obd_uuid_hash; + cfs_hash_getref(uuid_hash); + spin_unlock(&obd->obd_dev_lock); + + obd_str2uuid(&doomed_uuid, uuid); + if (obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) { + CERROR("%s: can't evict myself\n", obd->obd_name); + cfs_hash_putref(uuid_hash); + return exports_evicted; + } + + doomed_exp = cfs_hash_lookup(uuid_hash, &doomed_uuid); + + if (doomed_exp == NULL) { + CERROR("%s: can't disconnect %s: no exports found\n", + obd->obd_name, uuid); + } else { + CWARN("%s: evicting %s at adminstrative request\n", + obd->obd_name, doomed_exp->exp_client_uuid.uuid); + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + exports_evicted++; + } + cfs_hash_putref(uuid_hash); + + return exports_evicted; +} +EXPORT_SYMBOL(obd_export_evict_by_uuid); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void (*class_export_dump_hook)(struct obd_export*) = NULL; +EXPORT_SYMBOL(class_export_dump_hook); +#endif + +static void print_export_data(struct obd_export *exp, const char *status, + int locks) +{ + struct ptlrpc_reply_state *rs; + struct ptlrpc_reply_state *first_reply = NULL; + int nreplies = 0; + + spin_lock(&exp->exp_lock); + list_for_each_entry(rs, &exp->exp_outstanding_replies, + rs_exp_list) { + if (nreplies == 0) + first_reply = rs; + nreplies++; + } + spin_unlock(&exp->exp_lock); + + CDEBUG(D_HA, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: %p %s "LPU64"\n", + exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid, + obd_export_nid2str(exp), atomic_read(&exp->exp_refcount), + atomic_read(&exp->exp_rpc_count), + atomic_read(&exp->exp_cb_count), + atomic_read(&exp->exp_locks_count), 
+ exp->exp_disconnected, exp->exp_delayed, exp->exp_failed, + nreplies, first_reply, nreplies > 3 ? "..." : "", + exp->exp_last_committed); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + if (locks && class_export_dump_hook != NULL) + class_export_dump_hook(exp); +#endif +} + +void dump_exports(struct obd_device *obd, int locks) +{ + struct obd_export *exp; + + spin_lock(&obd->obd_dev_lock); + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) + print_export_data(exp, "ACTIVE", locks); + list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain) + print_export_data(exp, "UNLINKED", locks); + list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain) + print_export_data(exp, "DELAYED", locks); + spin_unlock(&obd->obd_dev_lock); + spin_lock(&obd_zombie_impexp_lock); + list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain) + print_export_data(exp, "ZOMBIE", locks); + spin_unlock(&obd_zombie_impexp_lock); +} +EXPORT_SYMBOL(dump_exports); + +void obd_exports_barrier(struct obd_device *obd) +{ + int waited = 2; + LASSERT(list_empty(&obd->obd_exports)); + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_unlinked_exports)) { + spin_unlock(&obd->obd_dev_lock); + schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE, + cfs_time_seconds(waited)); + if (waited > 5 && IS_PO2(waited)) { + LCONSOLE_WARN("%s is waiting for obd_unlinked_exports " + "more than %d seconds. " + "The obd refcount = %d. 
Is it stuck?\n", + obd->obd_name, waited, + atomic_read(&obd->obd_refcount)); + dump_exports(obd, 1); + } + waited *= 2; + spin_lock(&obd->obd_dev_lock); + } + spin_unlock(&obd->obd_dev_lock); +} +EXPORT_SYMBOL(obd_exports_barrier); + +/* Total amount of zombies to be destroyed */ +static int zombies_count = 0; + +/** + * kill zombie imports and exports + */ +void obd_zombie_impexp_cull(void) +{ + struct obd_import *import; + struct obd_export *export; + ENTRY; + + do { + spin_lock(&obd_zombie_impexp_lock); + + import = NULL; + if (!list_empty(&obd_zombie_imports)) { + import = list_entry(obd_zombie_imports.next, + struct obd_import, + imp_zombie_chain); + list_del_init(&import->imp_zombie_chain); + } + + export = NULL; + if (!list_empty(&obd_zombie_exports)) { + export = list_entry(obd_zombie_exports.next, + struct obd_export, + exp_obd_chain); + list_del_init(&export->exp_obd_chain); + } + + spin_unlock(&obd_zombie_impexp_lock); + + if (import != NULL) { + class_import_destroy(import); + spin_lock(&obd_zombie_impexp_lock); + zombies_count--; + spin_unlock(&obd_zombie_impexp_lock); + } + + if (export != NULL) { + class_export_destroy(export); + spin_lock(&obd_zombie_impexp_lock); + zombies_count--; + spin_unlock(&obd_zombie_impexp_lock); + } + + cond_resched(); + } while (import != NULL || export != NULL); + EXIT; +} + +static struct completion obd_zombie_start; +static struct completion obd_zombie_stop; +static unsigned long obd_zombie_flags; +static wait_queue_head_t obd_zombie_waitq; +static pid_t obd_zombie_pid; + +enum { + OBD_ZOMBIE_STOP = 0x0001, +}; + +/** + * check for work for kill zombie import/export thread. + */ +static int obd_zombie_impexp_check(void *arg) +{ + int rc; + + spin_lock(&obd_zombie_impexp_lock); + rc = (zombies_count == 0) && + !test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + spin_unlock(&obd_zombie_impexp_lock); + + RETURN(rc); +} + +/** + * Add export to the obd_zombe thread and notify it. 
+ */ +static void obd_zombie_export_add(struct obd_export *exp) { + spin_lock(&exp->exp_obd->obd_dev_lock); + LASSERT(!list_empty(&exp->exp_obd_chain)); + list_del_init(&exp->exp_obd_chain); + spin_unlock(&exp->exp_obd->obd_dev_lock); + spin_lock(&obd_zombie_impexp_lock); + zombies_count++; + list_add(&exp->exp_obd_chain, &obd_zombie_exports); + spin_unlock(&obd_zombie_impexp_lock); + + obd_zombie_impexp_notify(); +} + +/** + * Add import to the obd_zombe thread and notify it. + */ +static void obd_zombie_import_add(struct obd_import *imp) { + LASSERT(imp->imp_sec == NULL); + LASSERT(imp->imp_rq_pool == NULL); + spin_lock(&obd_zombie_impexp_lock); + LASSERT(list_empty(&imp->imp_zombie_chain)); + zombies_count++; + list_add(&imp->imp_zombie_chain, &obd_zombie_imports); + spin_unlock(&obd_zombie_impexp_lock); + + obd_zombie_impexp_notify(); +} + +/** + * notify import/export destroy thread about new zombie. + */ +static void obd_zombie_impexp_notify(void) +{ + /* + * Make sure obd_zomebie_impexp_thread get this notification. + * It is possible this signal only get by obd_zombie_barrier, and + * barrier gulps this notification and sleeps away and hangs ensues + */ + wake_up_all(&obd_zombie_waitq); +} + +/** + * check whether obd_zombie is idle + */ +static int obd_zombie_is_idle(void) +{ + int rc; + + LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)); + spin_lock(&obd_zombie_impexp_lock); + rc = (zombies_count == 0); + spin_unlock(&obd_zombie_impexp_lock); + return rc; +} + +/** + * wait when obd_zombie import/export queues become empty + */ +void obd_zombie_barrier(void) +{ + struct l_wait_info lwi = { 0 }; + + if (obd_zombie_pid == current_pid()) + /* don't wait for myself */ + return; + l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi); +} +EXPORT_SYMBOL(obd_zombie_barrier); + + +/** + * destroy zombie export/import thread. 
+ */ +static int obd_zombie_impexp_thread(void *unused) +{ + unshare_fs_struct(); + complete(&obd_zombie_start); + + obd_zombie_pid = current_pid(); + + while (!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) { + struct l_wait_info lwi = { 0 }; + + l_wait_event(obd_zombie_waitq, + !obd_zombie_impexp_check(NULL), &lwi); + obd_zombie_impexp_cull(); + + /* + * Notify obd_zombie_barrier callers that queues + * may be empty. + */ + wake_up(&obd_zombie_waitq); + } + + complete(&obd_zombie_stop); + + RETURN(0); +} + + +/** + * start destroy zombie import/export thread + */ +int obd_zombie_impexp_init(void) +{ + task_t *task; + + INIT_LIST_HEAD(&obd_zombie_imports); + INIT_LIST_HEAD(&obd_zombie_exports); + spin_lock_init(&obd_zombie_impexp_lock); + init_completion(&obd_zombie_start); + init_completion(&obd_zombie_stop); + init_waitqueue_head(&obd_zombie_waitq); + obd_zombie_pid = 0; + + task = kthread_run(obd_zombie_impexp_thread, NULL, "obd_zombid"); + if (IS_ERR(task)) + RETURN(PTR_ERR(task)); + + wait_for_completion(&obd_zombie_start); + RETURN(0); +} +/** + * stop destroy zombie import/export thread + */ +void obd_zombie_impexp_stop(void) +{ + set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + obd_zombie_impexp_notify(); + wait_for_completion(&obd_zombie_stop); +} + +/***** Kernel-userspace comm helpers *******/ + +/* Get length of entire message, including header */ +int kuc_len(int payload_len) +{ + return sizeof(struct kuc_hdr) + payload_len; +} +EXPORT_SYMBOL(kuc_len); + +/* Get a pointer to kuc header, given a ptr to the payload + * @param p Pointer to payload area + * @returns Pointer to kuc header + */ +struct kuc_hdr * kuc_ptr(void *p) +{ + struct kuc_hdr *lh = ((struct kuc_hdr *)p) - 1; + LASSERT(lh->kuc_magic == KUC_MAGIC); + return lh; +} +EXPORT_SYMBOL(kuc_ptr); + +/* Test if payload is part of kuc message + * @param p Pointer to payload area + * @returns boolean + */ +int kuc_ispayload(void *p) +{ + struct kuc_hdr *kh = ((struct kuc_hdr *)p) - 1; + + if 
(kh->kuc_magic == KUC_MAGIC) + return 1; + else + return 0; +} +EXPORT_SYMBOL(kuc_ispayload); + +/* Alloc space for a message, and fill in header + * @return Pointer to payload area + */ +void *kuc_alloc(int payload_len, int transport, int type) +{ + struct kuc_hdr *lh; + int len = kuc_len(payload_len); + + OBD_ALLOC(lh, len); + if (lh == NULL) + return ERR_PTR(-ENOMEM); + + lh->kuc_magic = KUC_MAGIC; + lh->kuc_transport = transport; + lh->kuc_msgtype = type; + lh->kuc_msglen = len; + + return (void *)(lh + 1); +} +EXPORT_SYMBOL(kuc_alloc); + +/* Takes pointer to payload area */ +inline void kuc_free(void *p, int payload_len) +{ + struct kuc_hdr *lh = kuc_ptr(p); + OBD_FREE(lh, kuc_len(payload_len)); +} +EXPORT_SYMBOL(kuc_free); diff --git a/drivers/staging/lustre/lustre/obdclass/idmap.c b/drivers/staging/lustre/lustre/obdclass/idmap.c new file mode 100644 index 000000000000..622f8d165275 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/idmap.c @@ -0,0 +1,474 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/idmap.c + * + * Lustre user identity mapping. + * + * Author: Fan Yong <fanyong@clusterfs.com> + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include <lustre_idmap.h> +#include <md_object.h> +#include <obd_support.h> + +#define lustre_get_group_info(group_info) do { \ + atomic_inc(&(group_info)->usage); \ +} while (0) + +#define lustre_put_group_info(group_info) do { \ + if (atomic_dec_and_test(&(group_info)->usage)) \ + groups_free(group_info); \ +} while (0) + +/* + * groups_search() is copied from linux kernel! + * A simple bsearch. + */ +static int lustre_groups_search(group_info_t *group_info, + gid_t grp) +{ + int left, right; + + if (!group_info) + return 0; + + left = 0; + right = group_info->ngroups; + while (left < right) { + int mid = (left + right) / 2; + int cmp = grp - CFS_GROUP_AT(group_info, mid); + + if (cmp > 0) + left = mid + 1; + else if (cmp < 0) + right = mid; + else + return 1; + } + return 0; +} + +void lustre_groups_from_list(group_info_t *ginfo, gid_t *glist) +{ + int i; + int count = ginfo->ngroups; + + /* fill group_info from gid array */ + for (i = 0; i < ginfo->nblocks && count > 0; i++) { + int cp_count = min(CFS_NGROUPS_PER_BLOCK, count); + int off = i * CFS_NGROUPS_PER_BLOCK; + int len = cp_count * sizeof(*glist); + + memcpy(ginfo->blocks[i], glist + off, len); + count -= cp_count; + } +} +EXPORT_SYMBOL(lustre_groups_from_list); + +/* groups_sort() is copied from linux kernel! 
*/ +/* a simple shell-metzner sort */ +void lustre_groups_sort(group_info_t *group_info) +{ + int base, max, stride; + int gidsetsize = group_info->ngroups; + + for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) + ; /* nothing */ + stride /= 3; + + while (stride) { + max = gidsetsize - stride; + for (base = 0; base < max; base++) { + int left = base; + int right = left + stride; + gid_t tmp = CFS_GROUP_AT(group_info, right); + + while (left >= 0 && + CFS_GROUP_AT(group_info, left) > tmp) { + CFS_GROUP_AT(group_info, right) = + CFS_GROUP_AT(group_info, left); + right = left; + left -= stride; + } + CFS_GROUP_AT(group_info, right) = tmp; + } + stride /= 3; + } +} +EXPORT_SYMBOL(lustre_groups_sort); + +int lustre_in_group_p(struct lu_ucred *mu, gid_t grp) +{ + int rc = 1; + + if (grp != mu->uc_fsgid) { + group_info_t *group_info = NULL; + + if (mu->uc_ginfo || !mu->uc_identity || + mu->uc_valid == UCRED_OLD) + if (grp == mu->uc_suppgids[0] || + grp == mu->uc_suppgids[1]) + return 1; + + if (mu->uc_ginfo) + group_info = mu->uc_ginfo; + else if (mu->uc_identity) + group_info = mu->uc_identity->mi_ginfo; + + if (!group_info) + return 0; + + lustre_get_group_info(group_info); + rc = lustre_groups_search(group_info, grp); + lustre_put_group_info(group_info); + } + return rc; +} +EXPORT_SYMBOL(lustre_in_group_p); + +struct lustre_idmap_entry { + struct list_head lie_rmt_uid_hash; /* hashed as lie_rmt_uid; */ + struct list_head lie_lcl_uid_hash; /* hashed as lie_lcl_uid; */ + struct list_head lie_rmt_gid_hash; /* hashed as lie_rmt_gid; */ + struct list_head lie_lcl_gid_hash; /* hashed as lie_lcl_gid; */ + uid_t lie_rmt_uid; /* remote uid */ + uid_t lie_lcl_uid; /* local uid */ + gid_t lie_rmt_gid; /* remote gid */ + gid_t lie_lcl_gid; /* local gid */ +}; + +static inline __u32 lustre_idmap_hashfunc(__u32 id) +{ + return id & (CFS_IDMAP_HASHSIZE - 1); +} + +static +struct lustre_idmap_entry *idmap_entry_alloc(uid_t rmt_uid, uid_t lcl_uid, + gid_t rmt_gid, gid_t 
lcl_gid) +{ + struct lustre_idmap_entry *e; + + OBD_ALLOC_PTR(e); + if (e == NULL) + return NULL; + + INIT_LIST_HEAD(&e->lie_rmt_uid_hash); + INIT_LIST_HEAD(&e->lie_lcl_uid_hash); + INIT_LIST_HEAD(&e->lie_rmt_gid_hash); + INIT_LIST_HEAD(&e->lie_lcl_gid_hash); + e->lie_rmt_uid = rmt_uid; + e->lie_lcl_uid = lcl_uid; + e->lie_rmt_gid = rmt_gid; + e->lie_lcl_gid = lcl_gid; + + return e; +} + +static void idmap_entry_free(struct lustre_idmap_entry *e) +{ + if (!list_empty(&e->lie_rmt_uid_hash)) + list_del(&e->lie_rmt_uid_hash); + if (!list_empty(&e->lie_lcl_uid_hash)) + list_del(&e->lie_lcl_uid_hash); + if (!list_empty(&e->lie_rmt_gid_hash)) + list_del(&e->lie_rmt_gid_hash); + if (!list_empty(&e->lie_lcl_gid_hash)) + list_del(&e->lie_lcl_gid_hash); + OBD_FREE_PTR(e); +} + +/* + * return value + * NULL: not found entry + * ERR_PTR(-EACCES): found 1(remote):N(local) mapped entry + * others: found normal entry + */ +static +struct lustre_idmap_entry *idmap_search_entry(struct lustre_idmap_table *t, + uid_t rmt_uid, uid_t lcl_uid, + gid_t rmt_gid, gid_t lcl_gid) +{ + struct list_head *head; + struct lustre_idmap_entry *e; + + head = &t->lit_idmaps[RMT_UIDMAP_IDX][lustre_idmap_hashfunc(rmt_uid)]; + list_for_each_entry(e, head, lie_rmt_uid_hash) + if (e->lie_rmt_uid == rmt_uid) { + if (e->lie_lcl_uid == lcl_uid) { + if (e->lie_rmt_gid == rmt_gid && + e->lie_lcl_gid == lcl_gid) + /* must be quaternion match */ + return e; + } else { + /* 1:N uid mapping */ + CERROR("rmt uid %u already be mapped to %u" + " (new %u)\n", e->lie_rmt_uid, + e->lie_lcl_uid, lcl_uid); + return ERR_PTR(-EACCES); + } + } + + head = &t->lit_idmaps[RMT_GIDMAP_IDX][lustre_idmap_hashfunc(rmt_gid)]; + list_for_each_entry(e, head, lie_rmt_gid_hash) + if (e->lie_rmt_gid == rmt_gid) { + if (e->lie_lcl_gid == lcl_gid) { + if (unlikely(e->lie_rmt_uid == rmt_uid && + e->lie_lcl_uid == lcl_uid)) + /* after uid mapping search above, + * we should never come here */ + LBUG(); + } else { + /* 1:N gid mapping */ + 
CERROR("rmt gid %u already be mapped to %u" + " (new %u)\n", e->lie_rmt_gid, + e->lie_lcl_gid, lcl_gid); + return ERR_PTR(-EACCES); + } + } + + return NULL; +} + +static __u32 idmap_lookup_uid(struct list_head *hash, int reverse, + __u32 uid) +{ + struct list_head *head = &hash[lustre_idmap_hashfunc(uid)]; + struct lustre_idmap_entry *e; + + if (!reverse) { + list_for_each_entry(e, head, lie_rmt_uid_hash) + if (e->lie_rmt_uid == uid) + return e->lie_lcl_uid; + } else { + list_for_each_entry(e, head, lie_lcl_uid_hash) + if (e->lie_lcl_uid == uid) + return e->lie_rmt_uid; + } + + return CFS_IDMAP_NOTFOUND; +} + +static __u32 idmap_lookup_gid(struct list_head *hash, int reverse, __u32 gid) +{ + struct list_head *head = &hash[lustre_idmap_hashfunc(gid)]; + struct lustre_idmap_entry *e; + + if (!reverse) { + list_for_each_entry(e, head, lie_rmt_gid_hash) + if (e->lie_rmt_gid == gid) + return e->lie_lcl_gid; + } else { + list_for_each_entry(e, head, lie_lcl_gid_hash) + if (e->lie_lcl_gid == gid) + return e->lie_rmt_gid; + } + + return CFS_IDMAP_NOTFOUND; +} + +int lustre_idmap_add(struct lustre_idmap_table *t, + uid_t ruid, uid_t luid, + gid_t rgid, gid_t lgid) +{ + struct lustre_idmap_entry *e0, *e1; + + LASSERT(t); + + spin_lock(&t->lit_lock); + e0 = idmap_search_entry(t, ruid, luid, rgid, lgid); + spin_unlock(&t->lit_lock); + if (!e0) { + e0 = idmap_entry_alloc(ruid, luid, rgid, lgid); + if (!e0) + return -ENOMEM; + + spin_lock(&t->lit_lock); + e1 = idmap_search_entry(t, ruid, luid, rgid, lgid); + if (e1 == NULL) { + list_add_tail(&e0->lie_rmt_uid_hash, + &t->lit_idmaps[RMT_UIDMAP_IDX] + [lustre_idmap_hashfunc(ruid)]); + list_add_tail(&e0->lie_lcl_uid_hash, + &t->lit_idmaps[LCL_UIDMAP_IDX] + [lustre_idmap_hashfunc(luid)]); + list_add_tail(&e0->lie_rmt_gid_hash, + &t->lit_idmaps[RMT_GIDMAP_IDX] + [lustre_idmap_hashfunc(rgid)]); + list_add_tail(&e0->lie_lcl_gid_hash, + &t->lit_idmaps[LCL_GIDMAP_IDX] + [lustre_idmap_hashfunc(lgid)]); + } + spin_unlock(&t->lit_lock); + if 
(e1 != NULL) { + idmap_entry_free(e0); + if (IS_ERR(e1)) + return PTR_ERR(e1); + } + } else if (IS_ERR(e0)) { + return PTR_ERR(e0); + } + + return 0; +} +EXPORT_SYMBOL(lustre_idmap_add); + +int lustre_idmap_del(struct lustre_idmap_table *t, + uid_t ruid, uid_t luid, + gid_t rgid, gid_t lgid) +{ + struct lustre_idmap_entry *e; + int rc = 0; + + LASSERT(t); + + spin_lock(&t->lit_lock); + e = idmap_search_entry(t, ruid, luid, rgid, lgid); + if (IS_ERR(e)) + rc = PTR_ERR(e); + else if (e) + idmap_entry_free(e); + spin_unlock(&t->lit_lock); + + return rc; +} +EXPORT_SYMBOL(lustre_idmap_del); + +int lustre_idmap_lookup_uid(struct lu_ucred *mu, + struct lustre_idmap_table *t, + int reverse, uid_t uid) +{ + struct list_head *hash; + + if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) { + if (!reverse) { + if (uid == mu->uc_o_uid) + return mu->uc_uid; + else if (uid == mu->uc_o_fsuid) + return mu->uc_fsuid; + } else { + if (uid == mu->uc_uid) + return mu->uc_o_uid; + else if (uid == mu->uc_fsuid) + return mu->uc_o_fsuid; + } + } + + if (t == NULL) + return CFS_IDMAP_NOTFOUND; + + hash = t->lit_idmaps[reverse ? LCL_UIDMAP_IDX : RMT_UIDMAP_IDX]; + + spin_lock(&t->lit_lock); + uid = idmap_lookup_uid(hash, reverse, uid); + spin_unlock(&t->lit_lock); + + return uid; +} +EXPORT_SYMBOL(lustre_idmap_lookup_uid); + +int lustre_idmap_lookup_gid(struct lu_ucred *mu, struct lustre_idmap_table *t, + int reverse, gid_t gid) +{ + struct list_head *hash; + + if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) { + if (!reverse) { + if (gid == mu->uc_o_gid) + return mu->uc_gid; + else if (gid == mu->uc_o_fsgid) + return mu->uc_fsgid; + } else { + if (gid == mu->uc_gid) + return mu->uc_o_gid; + else if (gid == mu->uc_fsgid) + return mu->uc_o_fsgid; + } + } + + if (t == NULL) + return CFS_IDMAP_NOTFOUND; + + hash = t->lit_idmaps[reverse ? 
LCL_GIDMAP_IDX : RMT_GIDMAP_IDX]; + + spin_lock(&t->lit_lock); + gid = idmap_lookup_gid(hash, reverse, gid); + spin_unlock(&t->lit_lock); + + return gid; +} +EXPORT_SYMBOL(lustre_idmap_lookup_gid); + +struct lustre_idmap_table *lustre_idmap_init(void) +{ + struct lustre_idmap_table *t; + int i, j; + + OBD_ALLOC_PTR(t); + if(unlikely(t == NULL)) + return (ERR_PTR(-ENOMEM)); + + spin_lock_init(&t->lit_lock); + for (i = 0; i < ARRAY_SIZE(t->lit_idmaps); i++) + for (j = 0; j < ARRAY_SIZE(t->lit_idmaps[i]); j++) + INIT_LIST_HEAD(&t->lit_idmaps[i][j]); + + return t; +} +EXPORT_SYMBOL(lustre_idmap_init); + +void lustre_idmap_fini(struct lustre_idmap_table *t) +{ + struct list_head *list; + struct lustre_idmap_entry *e; + int i; + LASSERT(t); + + list = t->lit_idmaps[RMT_UIDMAP_IDX]; + spin_lock(&t->lit_lock); + for (i = 0; i < CFS_IDMAP_HASHSIZE; i++) + while (!list_empty(&list[i])) { + e = list_entry(list[i].next, + struct lustre_idmap_entry, + lie_rmt_uid_hash); + idmap_entry_free(e); + } + spin_unlock(&t->lit_lock); + + OBD_FREE_PTR(t); +} +EXPORT_SYMBOL(lustre_idmap_fini); diff --git a/drivers/staging/lustre/lustre/obdclass/linkea.c b/drivers/staging/lustre/lustre/obdclass/linkea.c new file mode 100644 index 000000000000..b5c19ac1470f --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/linkea.c @@ -0,0 +1,194 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, Intel Corporation. + * Use is subject to license terms. + * + * Author: Di Wang <di.wang@intel.com> + */ + +#include <lustre/lustre_idl.h> +#include <obd.h> +#include <lustre_linkea.h> + +int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf) +{ + ldata->ld_buf = lu_buf_check_and_alloc(buf, PAGE_CACHE_SIZE); + if (ldata->ld_buf->lb_buf == NULL) + return -ENOMEM; + ldata->ld_leh = ldata->ld_buf->lb_buf; + ldata->ld_leh->leh_magic = LINK_EA_MAGIC; + ldata->ld_leh->leh_len = sizeof(struct link_ea_header); + ldata->ld_leh->leh_reccount = 0; + return 0; +} +EXPORT_SYMBOL(linkea_data_new); + +int linkea_init(struct linkea_data *ldata) +{ + struct link_ea_header *leh; + + LASSERT(ldata->ld_buf != NULL); + leh = ldata->ld_buf->lb_buf; + if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) { + leh->leh_magic = LINK_EA_MAGIC; + leh->leh_reccount = __swab32(leh->leh_reccount); + leh->leh_len = __swab64(leh->leh_len); + /* entries are swabbed by linkea_entry_unpack */ + } + if (leh->leh_magic != LINK_EA_MAGIC) + return -EINVAL; + if (leh->leh_reccount == 0) + return -ENODATA; + + ldata->ld_leh = leh; + return 0; +} +EXPORT_SYMBOL(linkea_init); + +/** + * Pack a link_ea_entry. + * All elements are stored as chars to avoid alignment issues. 
+ * Numbers are always big-endian + * \retval record length + */ +static int linkea_entry_pack(struct link_ea_entry *lee, + const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_fid tmpfid; + int reclen; + + fid_cpu_to_be(&tmpfid, pfid); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LINKEA_CRASH)) + tmpfid.f_ver = ~0; + memcpy(&lee->lee_parent_fid, &tmpfid, sizeof(tmpfid)); + memcpy(lee->lee_name, lname->ln_name, lname->ln_namelen); + reclen = sizeof(struct link_ea_entry) + lname->ln_namelen; + + lee->lee_reclen[0] = (reclen >> 8) & 0xff; + lee->lee_reclen[1] = reclen & 0xff; + return reclen; +} + +void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, + struct lu_name *lname, struct lu_fid *pfid) +{ + *reclen = (lee->lee_reclen[0] << 8) | lee->lee_reclen[1]; + memcpy(pfid, &lee->lee_parent_fid, sizeof(*pfid)); + fid_be_to_cpu(pfid, pfid); + lname->ln_name = lee->lee_name; + lname->ln_namelen = *reclen - sizeof(struct link_ea_entry); +} +EXPORT_SYMBOL(linkea_entry_unpack); + +/** + * Add a record to the end of link ea buf + **/ +int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + LASSERT(ldata->ld_leh != NULL); + + if (lname == NULL || pfid == NULL) + return -EINVAL; + + ldata->ld_reclen = lname->ln_namelen + sizeof(struct link_ea_entry); + if (ldata->ld_leh->leh_len + ldata->ld_reclen > + ldata->ld_buf->lb_len) { + if (lu_buf_check_and_grow(ldata->ld_buf, + ldata->ld_leh->leh_len + + ldata->ld_reclen) < 0) + return -ENOMEM; + } + + ldata->ld_leh = ldata->ld_buf->lb_buf; + ldata->ld_lee = ldata->ld_buf->lb_buf + ldata->ld_leh->leh_len; + ldata->ld_reclen = linkea_entry_pack(ldata->ld_lee, lname, pfid); + ldata->ld_leh->leh_len += ldata->ld_reclen; + ldata->ld_leh->leh_reccount++; + CDEBUG(D_INODE, "New link_ea name '%.*s' is added\n", + lname->ln_namelen, lname->ln_name); + return 0; +} +EXPORT_SYMBOL(linkea_add_buf); + +/** Del the current record from the link ea buf */ +void 
linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname) +{ + LASSERT(ldata->ld_leh != NULL && ldata->ld_lee != NULL); + + ldata->ld_leh->leh_reccount--; + ldata->ld_leh->leh_len -= ldata->ld_reclen; + memmove(ldata->ld_lee, (char *)ldata->ld_lee + ldata->ld_reclen, + (char *)ldata->ld_leh + ldata->ld_leh->leh_len - + (char *)ldata->ld_lee); + CDEBUG(D_INODE, "Old link_ea name '%.*s' is removed\n", + lname->ln_namelen, lname->ln_name); +} +EXPORT_SYMBOL(linkea_del_buf); + +/** + * Check if such a link exists in linkEA. + * + * \param ldata link data the search to be done on + * \param lname name in the parent's directory entry pointing to this object + * \param pfid parent fid the link to be found for + * + * \retval 0 success + * \retval -ENOENT link does not exist + * \retval -ve on error + */ +int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_name tmpname; + struct lu_fid tmpfid; + int count; + + LASSERT(ldata->ld_leh != NULL); + + /* link #0 */ + ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1); + + for (count = 0; count < ldata->ld_leh->leh_reccount; count++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, + &tmpname, &tmpfid); + if (tmpname.ln_namelen == lname->ln_namelen && + lu_fid_eq(&tmpfid, pfid) && + (strncmp(tmpname.ln_name, lname->ln_name, + tmpname.ln_namelen) == 0)) + break; + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + } + + if (count == ldata->ld_leh->leh_reccount) { + CDEBUG(D_INODE, "Old link_ea name '%.*s' not found\n", + lname->ln_namelen, lname->ln_name); + ldata->ld_lee = NULL; + return -ENOENT; + } + return 0; +} +EXPORT_SYMBOL(linkea_links_find); diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c new file mode 100644 index 000000000000..d2c3072541d1 --- /dev/null +++ 
b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c @@ -0,0 +1,408 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/linux/linux-module.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/sched.h> +#include <linux/lp.h> +#include <linux/slab.h> +#include <linux/ioport.h> +#include <linux/fcntl.h> +#include <linux/delay.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/fs.h> +#include <linux/poll.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/highmem.h> +#include <asm/io.h> +#include <asm/ioctls.h> +#include <asm/poll.h> +#include <asm/uaccess.h> +#include <linux/miscdevice.h> +#include <linux/seq_file.h> + +#include <linux/libcfs/libcfs.h> +#include <obd_support.h> +#include <obd_class.h> +#include <linux/lnet/lnetctl.h> +#include <lprocfs_status.h> +#include <lustre_ver.h> +#include <lustre/lustre_build_version.h> + +int proc_version; + +/* buffer MUST be at least the size of obd_ioctl_hdr */ +int obd_ioctl_getdata(char **buf, int *len, void *arg) +{ + struct obd_ioctl_hdr hdr; + struct obd_ioctl_data *data; + int err; + int offset = 0; + ENTRY; + + err = copy_from_user(&hdr, (void *)arg, sizeof(hdr)); + if ( err ) + RETURN(err); + + if (hdr.ioc_version != OBD_IOCTL_VERSION) { + CERROR("Version mismatch kernel (%x) vs application (%x)\n", + OBD_IOCTL_VERSION, hdr.ioc_version); + RETURN(-EINVAL); + } + + if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { + CERROR("User buffer len %d exceeds %d max buffer\n", + hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); + RETURN(-EINVAL); + } + + if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { + CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); + RETURN(-EINVAL); + } + + /* When there are lots of processes calling vmalloc on multi-core + * system, the high lock contention will hurt performance badly, + * 
obdfilter-survey is an example, which relies on ioctl. So we'd + * better avoid vmalloc on ioctl path. LU-66 */ + OBD_ALLOC_LARGE(*buf, hdr.ioc_len); + if (*buf == NULL) { + CERROR("Cannot allocate control buffer of len %d\n", + hdr.ioc_len); + RETURN(-EINVAL); + } + *len = hdr.ioc_len; + data = (struct obd_ioctl_data *)*buf; + + err = copy_from_user(*buf, (void *)arg, hdr.ioc_len); + if ( err ) { + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(err); + } + + if (obd_ioctl_is_invalid(data)) { + CERROR("ioctl not correctly formatted\n"); + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1) { + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + offset += cfs_size_round(data->ioc_inllen1); + } + + if (data->ioc_inllen2) { + data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen2); + } + + if (data->ioc_inllen3) { + data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen3); + } + + if (data->ioc_inllen4) { + data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; + } + + EXIT; + return 0; +} +EXPORT_SYMBOL(obd_ioctl_getdata); + +int obd_ioctl_popdata(void *arg, void *data, int len) +{ + int err; + + err = copy_to_user(arg, data, len); + if (err) + err = -EFAULT; + return err; +} +EXPORT_SYMBOL(obd_ioctl_popdata); + +/* opening /dev/obd */ +static int obd_class_open(struct inode * inode, struct file * file) +{ + ENTRY; + + try_module_get(THIS_MODULE); + RETURN(0); +} + +/* closing /dev/obd */ +static int obd_class_release(struct inode * inode, struct file * file) +{ + ENTRY; + + module_put(THIS_MODULE); + RETURN(0); +} + +/* to control /dev/obd */ +static long obd_class_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + ENTRY; + + /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */ + if (!cfs_capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET)) + RETURN(err = -EACCES); + if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore 
all tty ioctls */ + RETURN(err = -ENOTTY); + + err = class_handle_ioctl(cmd, (unsigned long)arg); + + RETURN(err); +} + +/* declare character device */ +static struct file_operations obd_psdev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ + .open = obd_class_open, /* open */ + .release = obd_class_release, /* release */ +}; + +/* modules setup */ +psdev_t obd_psdev = { + .minor = OBD_DEV_MINOR, + .name = OBD_DEV_NAME, + .fops = &obd_psdev_fops, +}; + + +#ifdef LPROCFS +int obd_proc_version_seq_show(struct seq_file *m, void *v) +{ + return seq_printf(m, "lustre: %s\nkernel: %s\nbuild: %s\n", + LUSTRE_VERSION_STRING, "patchless_client", + BUILD_VERSION); +} +LPROC_SEQ_FOPS_RO(obd_proc_version); + +int obd_proc_pinger_seq_show(struct seq_file *m, void *v) +{ + return seq_printf(m, "%s\n", "on"); +} +LPROC_SEQ_FOPS_RO(obd_proc_pinger); + +static int obd_proc_health_seq_show(struct seq_file *m, void *v) +{ + int rc = 0, i; + + if (libcfs_catastrophe) + seq_printf(m, "LBUG\n"); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd; + + obd = class_num2obd(i); + if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) + continue; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + continue; + + class_incref(obd, __FUNCTION__, current); + read_unlock(&obd_dev_lock); + + if (obd_health_check(NULL, obd)) { + seq_printf(m, "device %s reported unhealthy\n", + obd->obd_name); + rc++; + } + class_decref(obd, __FUNCTION__, current); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + + if (rc == 0) + return seq_printf(m, "healthy\n"); + + seq_printf(m, "NOT HEALTHY\n"); + return 0; +} +LPROC_SEQ_FOPS_RO(obd_proc_health); + +static int obd_proc_jobid_var_seq_show(struct seq_file *m, void *v) +{ + return seq_printf(m, "%s\n", obd_jobid_var); +} + +static ssize_t obd_proc_jobid_var_seq_write(struct file *file, const char *buffer, + size_t count, loff_t 
*off) +{ + if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) + return -EINVAL; + + memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); + /* Trim the trailing '\n' if any */ + memcpy(obd_jobid_var, buffer, count - (buffer[count - 1] == '\n')); + return count; +} +LPROC_SEQ_FOPS(obd_proc_jobid_var); + +/* Root for /proc/fs/lustre */ +struct proc_dir_entry *proc_lustre_root = NULL; +EXPORT_SYMBOL(proc_lustre_root); + +struct lprocfs_vars lprocfs_base[] = { + { "version", &obd_proc_version_fops }, + { "pinger", &obd_proc_pinger_fops }, + { "health_check", &obd_proc_health_fops }, + { "jobid_var", &obd_proc_jobid_var_fops }, + { 0 } +}; +#else +#define lprocfs_base NULL +#endif /* LPROCFS */ + +static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) +{ + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static void obd_device_list_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static int obd_device_list_seq_show(struct seq_file *p, void *v) +{ + loff_t index = *(loff_t *)v; + struct obd_device *obd = class_num2obd((int)index); + char *status; + + if (obd == NULL) + return 0; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_inactive) + status = "IN"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + + return seq_printf(p, "%3d %s %s %s %s %d\n", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); +} + +struct seq_operations obd_device_list_sops = { + .start = obd_device_list_seq_start, + .stop = obd_device_list_seq_stop, + .next = obd_device_list_seq_next, + .show = obd_device_list_seq_show, +}; + +static int obd_device_list_open(struct inode *inode, struct file *file) +{ + 
struct seq_file *seq; + int rc = seq_open(file, &obd_device_list_sops); + + if (rc) + return rc; + + seq = file->private_data; + seq->private = PDE_DATA(inode); + + return 0; +} + +struct file_operations obd_device_list_fops = { + .owner = THIS_MODULE, + .open = obd_device_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +int class_procfs_init(void) +{ + int rc; + ENTRY; + + obd_sysctl_init(); + proc_lustre_root = lprocfs_register("fs/lustre", NULL, + lprocfs_base, NULL); + rc = lprocfs_seq_create(proc_lustre_root, "devices", 0444, + &obd_device_list_fops, NULL); + if (rc) + CERROR("error adding /proc/fs/lustre/devices file\n"); + RETURN(0); +} + +int class_procfs_clean(void) +{ + ENTRY; + if (proc_lustre_root) { + lprocfs_remove(&proc_lustre_root); + } + RETURN(0); +} diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c new file mode 100644 index 000000000000..6ee347153a16 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c @@ -0,0 +1,222 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/linux/linux-obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <linux/module.h> +#include <obd_class.h> +#include <lustre/lustre_idl.h> + +#include <linux/fs.h> +#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */ + +/*FIXME: Just copy from obdo_from_inode*/ +void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid) +{ + obd_flag newvalid = 0; + + if (valid & LA_ATIME) { + dst->o_atime = la->la_atime; + newvalid |= OBD_MD_FLATIME; + } + if (valid & LA_MTIME) { + dst->o_mtime = la->la_mtime; + newvalid |= OBD_MD_FLMTIME; + } + if (valid & LA_CTIME) { + dst->o_ctime = la->la_ctime; + newvalid |= OBD_MD_FLCTIME; + } + if (valid & LA_SIZE) { + dst->o_size = la->la_size; + newvalid |= OBD_MD_FLSIZE; + } + if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = la->la_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & LA_TYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (la->la_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & LA_MODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (la->la_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & 
LA_UID) { + dst->o_uid = la->la_uid; + newvalid |= OBD_MD_FLUID; + } + if (valid & LA_GID) { + dst->o_gid = la->la_gid; + newvalid |= OBD_MD_FLGID; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_la); + +/*FIXME: Just copy from obdo_from_inode*/ +void la_from_obdo(struct lu_attr *dst, struct obdo *obdo, obd_flag valid) +{ + __u64 newvalid = 0; + + valid &= obdo->o_valid; + + if (valid & OBD_MD_FLATIME) { + dst->la_atime = obdo->o_atime; + newvalid |= LA_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->la_mtime = obdo->o_mtime; + newvalid |= LA_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->la_ctime = obdo->o_ctime; + newvalid |= LA_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->la_size = obdo->o_size; + newvalid |= LA_SIZE; + } + if (valid & OBD_MD_FLBLOCKS) { + dst->la_blocks = obdo->o_blocks; + newvalid |= LA_BLOCKS; + } + if (valid & OBD_MD_FLTYPE) { + dst->la_mode = (dst->la_mode & S_IALLUGO) | + (obdo->o_mode & S_IFMT); + newvalid |= LA_TYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->la_mode = (dst->la_mode & S_IFMT) | + (obdo->o_mode & S_IALLUGO); + newvalid |= LA_MODE; + } + if (valid & OBD_MD_FLUID) { + dst->la_uid = obdo->o_uid; + newvalid |= LA_UID; + } + if (valid & OBD_MD_FLGID) { + dst->la_gid = obdo->o_gid; + newvalid |= LA_GID; + } + dst->la_valid = newvalid; +} +EXPORT_SYMBOL(la_from_obdo); + +void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid) +{ + valid &= src->o_valid; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, + "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n", + src->o_valid, LTIME_S(dst->i_mtime), + LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime); + + if (valid & OBD_MD_FLATIME && src->o_atime > LTIME_S(dst->i_atime)) + LTIME_S(dst->i_atime) = src->o_atime; + if (valid & OBD_MD_FLMTIME && src->o_mtime > LTIME_S(dst->i_mtime)) + LTIME_S(dst->i_mtime) = src->o_mtime; + if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime)) + LTIME_S(dst->i_ctime) = 
src->o_ctime; + if (valid & OBD_MD_FLSIZE) + i_size_write(dst, src->o_size); + /* optimum IO size */ + if (valid & OBD_MD_FLBLKSZ && src->o_blksize > (1 << dst->i_blkbits)) + dst->i_blkbits = ffs(src->o_blksize) - 1; + + if (dst->i_blkbits < PAGE_CACHE_SHIFT) + dst->i_blkbits = PAGE_CACHE_SHIFT; + + /* allocation of space */ + if (valid & OBD_MD_FLBLOCKS && src->o_blocks > dst->i_blocks) + /* + * XXX shouldn't overflow be checked here like in + * obdo_to_inode(). + */ + dst->i_blocks = src->o_blocks; +} +EXPORT_SYMBOL(obdo_refresh_inode); + +void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid) +{ + valid &= src->o_valid; + + LASSERTF(!(valid & (OBD_MD_FLTYPE | OBD_MD_FLGENER | OBD_MD_FLFID | + OBD_MD_FLID | OBD_MD_FLGROUP)), + "object "DOSTID", valid %x\n", POSTID(&src->o_oi), valid); + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, + "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n", + src->o_valid, LTIME_S(dst->i_mtime), + LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime); + + if (valid & OBD_MD_FLATIME) + LTIME_S(dst->i_atime) = src->o_atime; + if (valid & OBD_MD_FLMTIME) + LTIME_S(dst->i_mtime) = src->o_mtime; + if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime)) + LTIME_S(dst->i_ctime) = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + i_size_write(dst, src->o_size); + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space */ + dst->i_blocks = src->o_blocks; + if (dst->i_blocks < src->o_blocks) /* overflow */ + dst->i_blocks = -1; + + } + if (valid & OBD_MD_FLBLKSZ) + dst->i_blkbits = ffs(src->o_blksize)-1; + if (valid & OBD_MD_FLMODE) + dst->i_mode = (dst->i_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->i_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + dst->i_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + dst->i_flags = src->o_flags; +} +EXPORT_SYMBOL(obdo_to_inode); diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c 
b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c new file mode 100644 index 000000000000..46aad6813cab --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c @@ -0,0 +1,445 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#include <linux/module.h> +#include <linux/sysctl.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <linux/version.h> +#include <linux/proc_fs.h> +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/ctype.h> +#include <asm/bitops.h> +#include <asm/uaccess.h> +#include <linux/utsname.h> + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <obd_support.h> +#include <lprocfs_status.h> + +#ifdef CONFIG_SYSCTL +ctl_table_header_t *obd_table_header = NULL; +#endif + + +#define OBD_SYSCTL 300 + +enum { + OBD_TIMEOUT = 3, /* RPC timeout before recovery/intr */ + OBD_DUMP_ON_TIMEOUT, /* dump kernel debug log upon eviction */ + OBD_MEMUSED, /* bytes currently OBD_ALLOCated */ + OBD_PAGESUSED, /* pages currently OBD_PAGE_ALLOCated */ + OBD_MAXMEMUSED, /* maximum bytes OBD_ALLOCated concurrently */ + OBD_MAXPAGESUSED, /* maximum pages OBD_PAGE_ALLOCated concurrently */ + OBD_SYNCFILTER, /* XXX temporary, as we play with sync osts.. */ + OBD_LDLM_TIMEOUT, /* LDLM timeout for ASTs before client eviction */ + OBD_DUMP_ON_EVICTION, /* dump kernel debug log upon eviction */ + OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */ + OBD_ALLOC_FAIL_RATE, /* memory allocation random failure rate */ + OBD_MAX_DIRTY_PAGES, /* maximum dirty pages */ + OBD_AT_MIN, /* Adaptive timeouts params */ + OBD_AT_MAX, + OBD_AT_EXTRA, + OBD_AT_EARLY_MARGIN, + OBD_AT_HISTORY, +}; + + +int LL_PROC_PROTO(proc_set_timeout) +{ + int rc; + + rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (ldlm_timeout >= obd_timeout) + ldlm_timeout = max(obd_timeout / 3, 1U); + return rc; +} + +int LL_PROC_PROTO(proc_memory_alloc) +{ + char buf[22]; + int len; + DECLARE_LL_PROC_PPOS_DECL; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), LPU64"\n", obd_memory_sum()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, 
len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +int LL_PROC_PROTO(proc_pages_alloc) +{ + char buf[22]; + int len; + DECLARE_LL_PROC_PPOS_DECL; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), LPU64"\n", obd_pages_sum()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +int LL_PROC_PROTO(proc_mem_max) +{ + char buf[22]; + int len; + DECLARE_LL_PROC_PPOS_DECL; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), LPU64"\n", obd_memory_max()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +int LL_PROC_PROTO(proc_pages_max) +{ + char buf[22]; + int len; + DECLARE_LL_PROC_PPOS_DECL; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), LPU64"\n", obd_pages_max()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +int LL_PROC_PROTO(proc_max_dirty_pages_in_mb) +{ + int rc = 0; + DECLARE_LL_PROC_PPOS_DECL; + + if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) { + rc = lprocfs_write_frac_helper(buffer, *lenp, + (unsigned int*)table->data, + 1 << (20 - PAGE_CACHE_SHIFT)); + /* Don't allow them to let dirty pages exceed 90% of system + * memory and set a hard minimum of 4MB. 
*/ + if (obd_max_dirty_pages > ((num_physpages / 10) * 9)) { + CERROR("Refusing to set max dirty pages to %u, which " + "is more than 90%% of available RAM; setting " + "to %lu\n", obd_max_dirty_pages, + ((num_physpages / 10) * 9)); + obd_max_dirty_pages = ((num_physpages / 10) * 9); + } else if (obd_max_dirty_pages < 4 << (20 - PAGE_CACHE_SHIFT)) { + obd_max_dirty_pages = 4 << (20 - PAGE_CACHE_SHIFT); + } + } else { + char buf[21]; + int len; + + len = lprocfs_read_frac_helper(buf, sizeof(buf), + *(unsigned int*)table->data, + 1 << (20 - PAGE_CACHE_SHIFT)); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + } + *ppos += *lenp; + return rc; +} + +int LL_PROC_PROTO(proc_alloc_fail_rate) +{ + int rc = 0; + DECLARE_LL_PROC_PPOS_DECL; + + if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) { + rc = lprocfs_write_frac_helper(buffer, *lenp, + (unsigned int*)table->data, + OBD_ALLOC_FAIL_MULT); + } else { + char buf[21]; + int len; + + len = lprocfs_read_frac_helper(buf, 21, + *(unsigned int*)table->data, + OBD_ALLOC_FAIL_MULT); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + } + *ppos += *lenp; + return rc; +} + +int LL_PROC_PROTO(proc_at_min) +{ + return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +int LL_PROC_PROTO(proc_at_max) +{ + return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +int LL_PROC_PROTO(proc_at_extra) +{ + return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +int LL_PROC_PROTO(proc_at_early_margin) +{ + return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +int LL_PROC_PROTO(proc_at_history) +{ + return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); +} + +#ifdef CONFIG_SYSCTL +static ctl_table_t obd_table[] = { + { + INIT_CTL_NAME(OBD_TIMEOUT) + .procname = "timeout", + 
.data = &obd_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_set_timeout + }, + { + INIT_CTL_NAME(OBD_DEBUG_PEER_ON_TIMEOUT) + .procname = "debug_peer_on_timeout", + .data = &obd_debug_peer_on_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(OBD_DUMP_ON_TIMEOUT) + .procname = "dump_on_timeout", + .data = &obd_dump_on_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(OBD_DUMP_ON_EVICTION) + .procname = "dump_on_eviction", + .data = &obd_dump_on_eviction, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(OBD_MEMUSED) + .procname = "memused", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_memory_alloc + }, + { + INIT_CTL_NAME(OBD_PAGESUSED) + .procname = "pagesused", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_pages_alloc + }, + { + INIT_CTL_NAME(OBD_MAXMEMUSED) + .procname = "memused_max", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_mem_max + }, + { + INIT_CTL_NAME(OBD_MAXPAGESUSED) + .procname = "pagesused_max", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_pages_max + }, + { + INIT_CTL_NAME(OBD_LDLM_TIMEOUT) + .procname = "ldlm_timeout", + .data = &ldlm_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_set_timeout + }, + { + INIT_CTL_NAME(OBD_ALLOC_FAIL_RATE) + .procname = "alloc_fail_rate", + .data = &obd_alloc_fail_rate, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_alloc_fail_rate + }, + { + INIT_CTL_NAME(OBD_MAX_DIRTY_PAGES) + .procname = "max_dirty_mb", + .data = &obd_max_dirty_pages, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_max_dirty_pages_in_mb + }, + { + INIT_CTL_NAME(OBD_AT_MIN) + .procname = "at_min", + .data = &at_min, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_at_min + }, + { 
+ INIT_CTL_NAME(OBD_AT_MAX) + .procname = "at_max", + .data = &at_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_at_max + }, + { + INIT_CTL_NAME(OBD_AT_EXTRA) + .procname = "at_extra", + .data = &at_extra, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_at_extra + }, + { + INIT_CTL_NAME(OBD_AT_EARLY_MARGIN) + .procname = "at_early_margin", + .data = &at_early_margin, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_at_early_margin + }, + { + INIT_CTL_NAME(OBD_AT_HISTORY) + .procname = "at_history", + .data = &at_history, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_at_history + }, + { INIT_CTL_NAME(0) } +}; + +static ctl_table_t parent_table[] = { + { + INIT_CTL_NAME(OBD_SYSCTL) + .procname = "lustre", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = obd_table + }, + { INIT_CTL_NAME(0) } +}; +#endif + +void obd_sysctl_init (void) +{ +#ifdef CONFIG_SYSCTL + if ( !obd_table_header ) + obd_table_header = cfs_register_sysctl_table(parent_table, 0); +#endif +} + +void obd_sysctl_clean (void) +{ +#ifdef CONFIG_SYSCTL + if ( obd_table_header ) + unregister_sysctl_table(obd_table_header); + obd_table_header = NULL; +#endif +} diff --git a/drivers/staging/lustre/lustre/obdclass/llog.c b/drivers/staging/lustre/lustre/obdclass/llog.c new file mode 100644 index 000000000000..b1d215e56991 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog.c @@ -0,0 +1,966 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog.c + * + * OST<->MDS recovery logging infrastructure. + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger <adilger@clusterfs.com> + * Author: Alex Zhuravlev <bzzz@whamcloud.com> + * Author: Mikhail Pershin <tappro@whamcloud.com> + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include <obd_class.h> +#include <lustre_log.h> +#include "llog_internal.h" + +/* + * Allocate a new log or catalog handle + * Used inside llog_open(). + */ +struct llog_handle *llog_alloc_handle(void) +{ + struct llog_handle *loghandle; + + OBD_ALLOC_PTR(loghandle); + if (loghandle == NULL) + return ERR_PTR(-ENOMEM); + + init_rwsem(&loghandle->lgh_lock); + spin_lock_init(&loghandle->lgh_hdr_lock); + INIT_LIST_HEAD(&loghandle->u.phd.phd_entry); + atomic_set(&loghandle->lgh_refcount, 1); + + return loghandle; +} + +/* + * Free llog handle and header data if exists. 
Used in llog_close() only + */ +void llog_free_handle(struct llog_handle *loghandle) +{ + LASSERT(loghandle != NULL); + + /* failed llog_init_handle */ + if (!loghandle->lgh_hdr) + goto out; + + if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) + LASSERT(list_empty(&loghandle->u.phd.phd_entry)); + else if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + LASSERT(list_empty(&loghandle->u.chd.chd_head)); + LASSERT(sizeof(*(loghandle->lgh_hdr)) == LLOG_CHUNK_SIZE); + OBD_FREE(loghandle->lgh_hdr, LLOG_CHUNK_SIZE); +out: + OBD_FREE_PTR(loghandle); +} + +void llog_handle_get(struct llog_handle *loghandle) +{ + atomic_inc(&loghandle->lgh_refcount); +} + +void llog_handle_put(struct llog_handle *loghandle) +{ + LASSERT(atomic_read(&loghandle->lgh_refcount) > 0); + if (atomic_dec_and_test(&loghandle->lgh_refcount)) + llog_free_handle(loghandle); +} + +/* returns negative on error; 0 if success; 1 if success & log destroyed */ +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index) +{ + struct llog_log_hdr *llh = loghandle->lgh_hdr; + int rc = 0; + ENTRY; + + CDEBUG(D_RPCTRACE, "Canceling %d in log "DOSTID"\n", + index, POSTID(&loghandle->lgh_id.lgl_oi)); + + if (index == 0) { + CERROR("Can't cancel index 0 which is header\n"); + RETURN(-EINVAL); + } + + spin_lock(&loghandle->lgh_hdr_lock); + if (!ext2_clear_bit(index, llh->llh_bitmap)) { + spin_unlock(&loghandle->lgh_hdr_lock); + CDEBUG(D_RPCTRACE, "Catalog index %u already clear?\n", index); + RETURN(-ENOENT); + } + + llh->llh_count--; + + if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1) && + (loghandle->lgh_last_idx == (LLOG_BITMAP_BYTES * 8) - 1)) { + spin_unlock(&loghandle->lgh_hdr_lock); + rc = llog_destroy(env, loghandle); + if (rc < 0) { + CERROR("%s: can't destroy empty llog #"DOSTID + "#%08x: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, rc); + GOTO(out_err, rc); + } + RETURN(1); + 
} + spin_unlock(&loghandle->lgh_hdr_lock); + + rc = llog_write(env, loghandle, &llh->llh_hdr, NULL, 0, NULL, 0); + if (rc < 0) { + CERROR("%s: fail to write header for llog #"DOSTID + "#%08x: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, rc); + GOTO(out_err, rc); + } + RETURN(0); +out_err: + spin_lock(&loghandle->lgh_hdr_lock); + ext2_set_bit(index, llh->llh_bitmap); + llh->llh_count++; + spin_unlock(&loghandle->lgh_hdr_lock); + return rc; +} +EXPORT_SYMBOL(llog_cancel_rec); + +static int llog_read_header(const struct lu_env *env, + struct llog_handle *handle, + struct obd_uuid *uuid) +{ + struct llog_operations *lop; + int rc; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + + if (lop->lop_read_header == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_read_header(env, handle); + if (rc == LLOG_EEMPTY) { + struct llog_log_hdr *llh = handle->lgh_hdr; + + handle->lgh_last_idx = 0; /* header is record with index 0 */ + llh->llh_count = 1; /* for the header record */ + llh->llh_hdr.lrh_type = LLOG_HDR_MAGIC; + llh->llh_hdr.lrh_len = llh->llh_tail.lrt_len = LLOG_CHUNK_SIZE; + llh->llh_hdr.lrh_index = llh->llh_tail.lrt_index = 0; + llh->llh_timestamp = cfs_time_current_sec(); + if (uuid) + memcpy(&llh->llh_tgtuuid, uuid, + sizeof(llh->llh_tgtuuid)); + llh->llh_bitmap_offset = offsetof(typeof(*llh), llh_bitmap); + ext2_set_bit(0, llh->llh_bitmap); + rc = 0; + } + return rc; +} + +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct obd_uuid *uuid) +{ + struct llog_log_hdr *llh; + int rc; + + ENTRY; + LASSERT(handle->lgh_hdr == NULL); + + OBD_ALLOC_PTR(llh); + if (llh == NULL) + RETURN(-ENOMEM); + handle->lgh_hdr = llh; + /* first assign flags to use llog_client_ops */ + llh->llh_flags = flags; + rc = llog_read_header(env, handle, uuid); + if (rc == 0) { + if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN && + flags & LLOG_F_IS_CAT) || + 
(llh->llh_flags & LLOG_F_IS_CAT && + flags & LLOG_F_IS_PLAIN))) { + CERROR("%s: llog type is %s but initializing %s\n", + handle->lgh_ctxt->loc_obd->obd_name, + llh->llh_flags & LLOG_F_IS_CAT ? + "catalog" : "plain", + flags & LLOG_F_IS_CAT ? "catalog" : "plain"); + GOTO(out, rc = -EINVAL); + } else if (llh->llh_flags & + (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) { + /* + * it is possible to open llog without specifying llog + * type so it is taken from llh_flags + */ + flags = llh->llh_flags; + } else { + /* for some reason the llh_flags has no type set */ + CERROR("llog type is not specified!\n"); + GOTO(out, rc = -EINVAL); + } + if (unlikely(uuid && + !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) { + CERROR("%s: llog uuid mismatch: %s/%s\n", + handle->lgh_ctxt->loc_obd->obd_name, + (char *)uuid->uuid, + (char *)llh->llh_tgtuuid.uuid); + GOTO(out, rc = -EEXIST); + } + } + if (flags & LLOG_F_IS_CAT) { + LASSERT(list_empty(&handle->u.chd.chd_head)); + INIT_LIST_HEAD(&handle->u.chd.chd_head); + llh->llh_size = sizeof(struct llog_logid_rec); + } else if (!(flags & LLOG_F_IS_PLAIN)) { + CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n", + handle->lgh_ctxt->loc_obd->obd_name, + flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN); + rc = -EINVAL; + } +out: + if (rc) { + OBD_FREE_PTR(llh); + handle->lgh_hdr = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_init_handle); + +int llog_copy_handler(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *rec, + void *data) +{ + struct llog_rec_hdr local_rec = *rec; + struct llog_handle *local_llh = (struct llog_handle *)data; + char *cfg_buf = (char*) (rec + 1); + struct lustre_cfg *lcfg; + int rc = 0; + ENTRY; + + /* Append all records */ + local_rec.lrh_len -= sizeof(*rec) + sizeof(struct llog_rec_tail); + rc = llog_write(env, local_llh, &local_rec, NULL, 0, + (void *)cfg_buf, -1); + + lcfg = (struct lustre_cfg *)cfg_buf; + CDEBUG(D_INFO, "idx=%d, rc=%d, len=%d, cmd %x %s %s\n", + rec->lrh_index, rc, rec->lrh_len, 
lcfg->lcfg_command, + lustre_cfg_string(lcfg, 0), lustre_cfg_string(lcfg, 1)); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_copy_handler); + +static int llog_process_thread(void *arg) +{ + struct llog_process_info *lpi = arg; + struct llog_handle *loghandle = lpi->lpi_loghandle; + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = lpi->lpi_catdata; + char *buf; + __u64 cur_offset = LLOG_CHUNK_SIZE; + __u64 last_offset; + int rc = 0, index = 1, last_index; + int saved_index = 0; + int last_called_index = 0; + + ENTRY; + + LASSERT(llh); + + OBD_ALLOC(buf, LLOG_CHUNK_SIZE); + if (!buf) { + lpi->lpi_rc = -ENOMEM; + RETURN(0); + } + + if (cd != NULL) { + last_called_index = cd->lpcd_first_idx; + index = cd->lpcd_first_idx + 1; + } + if (cd != NULL && cd->lpcd_last_idx) + last_index = cd->lpcd_last_idx; + else + last_index = LLOG_BITMAP_BYTES * 8 - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + + /* skip records not set in bitmap */ + while (index <= last_index && + !ext2_test_bit(index, llh->llh_bitmap)) + ++index; + + LASSERT(index <= last_index + 1); + if (index == last_index + 1) + break; +repeat: + CDEBUG(D_OTHER, "index: %d last_index %d\n", + index, last_index); + + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, LLOG_CHUNK_SIZE); + last_offset = cur_offset; + rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index, + index, &cur_offset, buf, LLOG_CHUNK_SIZE); + if (rc) + GOTO(out, rc); + + /* NB: when rec->lrh_len is accessed it is already swabbed + * since it is used at the "end" of the loop and the rec + * swabbing is done at the beginning of the loop. 
*/ + for (rec = (struct llog_rec_hdr *)buf; + (char *)rec < buf + LLOG_CHUNK_SIZE; + rec = (struct llog_rec_hdr *)((char *)rec + rec->lrh_len)){ + + CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n", + rec, rec->lrh_type); + + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n", + rec->lrh_type, rec->lrh_index); + + if (rec->lrh_index == 0) { + /* probably another rec just got added? */ + if (index <= loghandle->lgh_last_idx) + GOTO(repeat, rc = 0); + GOTO(out, rc = 0); /* no more records */ + } + if (rec->lrh_len == 0 || + rec->lrh_len > LLOG_CHUNK_SIZE) { + CWARN("invalid length %d in llog record for " + "index %d/%d\n", rec->lrh_len, + rec->lrh_index, index); + GOTO(out, rc = -EINVAL); + } + + if (rec->lrh_index < index) { + CDEBUG(D_OTHER, "skipping lrh_index %d\n", + rec->lrh_index); + continue; + } + + CDEBUG(D_OTHER, + "lrh_index: %d lrh_len: %d (%d remains)\n", + rec->lrh_index, rec->lrh_len, + (int)(buf + LLOG_CHUNK_SIZE - (char *)rec)); + + loghandle->lgh_cur_idx = rec->lrh_index; + loghandle->lgh_cur_offset = (char *)rec - (char *)buf + + last_offset; + + /* if set, process the callback on this record */ + if (ext2_test_bit(index, llh->llh_bitmap)) { + rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec, + lpi->lpi_cbdata); + last_called_index = index; + if (rc == LLOG_PROC_BREAK) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + llog_cancel_rec(lpi->lpi_env, + loghandle, + rec->lrh_index); + rc = 0; + } + if (rc) + GOTO(out, rc); + } else { + CDEBUG(D_OTHER, "Skipped index %d\n", index); + } + + /* next record, still in buffer? 
*/ + ++index; + if (index > last_index) + GOTO(out, rc = 0); + } + } + +out: + if (cd != NULL) + cd->lpcd_last_idx = last_called_index; + + OBD_FREE(buf, LLOG_CHUNK_SIZE); + lpi->lpi_rc = rc; + return 0; +} + +static int llog_process_thread_daemonize(void *arg) +{ + struct llog_process_info *lpi = arg; + struct lu_env env; + int rc; + + unshare_fs_struct(); + + /* client env has no keys, tags is just 0 */ + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + goto out; + lpi->lpi_env = &env; + + rc = llog_process_thread(arg); + + lu_env_fini(&env); +out: + complete(&lpi->lpi_completion); + return rc; +} + +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork) +{ + struct llog_process_info *lpi; + int rc; + + ENTRY; + + OBD_ALLOC_PTR(lpi); + if (lpi == NULL) { + CERROR("cannot alloc pointer\n"); + RETURN(-ENOMEM); + } + lpi->lpi_loghandle = loghandle; + lpi->lpi_cb = cb; + lpi->lpi_cbdata = data; + lpi->lpi_catdata = catdata; + + if (fork) { + /* The new thread can't use parent env, + * init the new one in llog_process_thread_daemonize. 
*/ + lpi->lpi_env = NULL; + init_completion(&lpi->lpi_completion); + rc = PTR_ERR(kthread_run(llog_process_thread_daemonize, lpi, + "llog_process_thread")); + if (IS_ERR_VALUE(rc)) { + CERROR("%s: cannot start thread: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, rc); + OBD_FREE_PTR(lpi); + RETURN(rc); + } + wait_for_completion(&lpi->lpi_completion); + } else { + lpi->lpi_env = env; + llog_process_thread(lpi); + } + rc = lpi->lpi_rc; + OBD_FREE_PTR(lpi); + RETURN(rc); +} +EXPORT_SYMBOL(llog_process_or_fork); + +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata) +{ + return llog_process_or_fork(env, loghandle, cb, data, catdata, true); +} +EXPORT_SYMBOL(llog_process); + +inline int llog_get_size(struct llog_handle *loghandle) +{ + if (loghandle && loghandle->lgh_hdr) + return loghandle->lgh_hdr->llh_count; + return 0; +} +EXPORT_SYMBOL(llog_get_size); + +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata) +{ + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = catdata; + void *buf; + int rc = 0, first_index = 1, index, idx; + ENTRY; + + OBD_ALLOC(buf, LLOG_CHUNK_SIZE); + if (!buf) + RETURN(-ENOMEM); + + if (cd != NULL) + first_index = cd->lpcd_first_idx + 1; + if (cd != NULL && cd->lpcd_last_idx) + index = cd->lpcd_last_idx; + else + index = LLOG_BITMAP_BYTES * 8 - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + struct llog_rec_tail *tail; + + /* skip records not set in bitmap */ + while (index >= first_index && + !ext2_test_bit(index, llh->llh_bitmap)) + --index; + + LASSERT(index >= first_index - 1); + if (index == first_index - 1) + break; + + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, LLOG_CHUNK_SIZE); + rc = llog_prev_block(env, loghandle, index, buf, + LLOG_CHUNK_SIZE); + if (rc) + GOTO(out, rc); + + rec = buf; + idx = rec->lrh_index; + 
CDEBUG(D_RPCTRACE, "index %u : idx %u\n", index, idx); + while (idx < index) { + rec = (void *)rec + rec->lrh_len; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + idx ++; + } + LASSERT(idx == index); + tail = (void *)rec + rec->lrh_len - sizeof(*tail); + + /* process records in buffer, starting where we found one */ + while ((void *)tail > buf) { + if (tail->lrt_index == 0) + GOTO(out, rc = 0); /* no more records */ + + /* if set, process the callback on this record */ + if (ext2_test_bit(index, llh->llh_bitmap)) { + rec = (void *)tail - tail->lrt_len + + sizeof(*tail); + + rc = cb(env, loghandle, rec, data); + if (rc == LLOG_PROC_BREAK) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + llog_cancel_rec(env, loghandle, + tail->lrt_index); + rc = 0; + } + if (rc) + GOTO(out, rc); + } + + /* previous record, still in buffer? */ + --index; + if (index < first_index) + GOTO(out, rc = 0); + tail = (void *)tail - tail->lrt_len; + } + } + +out: + if (buf) + OBD_FREE(buf, LLOG_CHUNK_SIZE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_reverse_process); + +/** + * new llog API + * + * API functions: + * llog_open - open llog, may not exist + * llog_exist - check if llog exists + * llog_close - close opened llog, pair for open, frees llog_handle + * llog_declare_create - declare llog creation + * llog_create - create new llog on disk, need transaction handle + * llog_declare_write_rec - declaration of llog write + * llog_write_rec - write llog record on disk, need transaction handle + * llog_declare_add - declare llog catalog record addition + * llog_add - add llog record in catalog, need transaction handle + */ +int llog_exist(struct llog_handle *loghandle) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_exist == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_exist(loghandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_exist); + +int llog_declare_create(const struct 
lu_env *env, + struct llog_handle *loghandle, struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_declare_create == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_declare_create(env, loghandle, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_declare_create); + +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_create == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_create(env, handle, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_create); + +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + LASSERT(lop); + if (lop->lop_declare_write_rec == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_declare_write_rec(env, handle, rec, idx, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_declare_write_rec); + +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int numcookies, void *buf, int idx, struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc, buflen; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + 
RETURN(rc); + + LASSERT(lop); + if (lop->lop_write_rec == NULL) + RETURN(-EOPNOTSUPP); + + if (buf) + buflen = rec->lrh_len + sizeof(struct llog_rec_hdr) + + sizeof(struct llog_rec_tail); + else + buflen = rec->lrh_len; + LASSERT(cfs_size_round(buflen) == buflen); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_write_rec(env, handle, rec, logcookies, numcookies, + buf, idx, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_write_rec); + +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + void *buf, struct thandle *th) +{ + int raised, rc; + + ENTRY; + + if (lgh->lgh_logops->lop_add == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lgh->lgh_logops->lop_add(env, lgh, rec, logcookies, buf, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_add); + +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th) +{ + int raised, rc; + + ENTRY; + + if (lgh->lgh_logops->lop_declare_add == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lgh->lgh_logops->lop_declare_add(env, lgh, rec, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_declare_add); + +/** + * Helper function to open llog or create it if doesn't exist. + * It hides all transaction handling from caller. 
+ */ +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name) +{ + struct thandle *th; + int rc; + + ENTRY; + + rc = llog_open(env, ctxt, res, logid, name, LLOG_OPEN_NEW); + if (rc) + RETURN(rc); + + if (llog_exist(*res)) + RETURN(0); + + if ((*res)->lgh_obj != NULL) { + struct dt_device *d; + + d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = llog_declare_create(env, *res, th); + if (rc == 0) { + rc = dt_trans_start_local(env, d, th); + if (rc == 0) + rc = llog_create(env, *res, th); + } + dt_trans_stop(env, d, th); + } else { + /* lvfs compat code */ + LASSERT((*res)->lgh_file == NULL); + rc = llog_create(env, *res, NULL); + } +out: + if (rc) + llog_close(env, *res); + RETURN(rc); +} +EXPORT_SYMBOL(llog_open_create); + +/** + * Helper function to delete existent llog. + */ +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name) +{ + struct llog_handle *handle; + int rc = 0, rc2; + + ENTRY; + + /* nothing to erase */ + if (name == NULL && logid == NULL) + RETURN(0); + + rc = llog_open(env, ctxt, &handle, logid, name, LLOG_OPEN_EXISTS); + if (rc < 0) + RETURN(rc); + + rc = llog_init_handle(env, handle, LLOG_F_IS_PLAIN, NULL); + if (rc == 0) + rc = llog_destroy(env, handle); + + rc2 = llog_close(env, handle); + if (rc == 0) + rc = rc2; + RETURN(rc); +} +EXPORT_SYMBOL(llog_erase); + +/* + * Helper function for write record in llog. + * It hides all transaction handling from caller. + * Valid only with local llog. 
+ */ +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + int cookiecount, void *buf, int idx) +{ + int rc; + + ENTRY; + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + if (loghandle->lgh_obj != NULL) { + struct dt_device *dt; + struct thandle *th; + + dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_declare_write_rec(env, loghandle, rec, idx, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(out_trans, rc); + + down_write(&loghandle->lgh_lock); + rc = llog_write_rec(env, loghandle, rec, reccookie, + cookiecount, buf, idx, th); + up_write(&loghandle->lgh_lock); +out_trans: + dt_trans_stop(env, dt, th); + } else { /* lvfs compatibility */ + down_write(&loghandle->lgh_lock); + rc = llog_write_rec(env, loghandle, rec, reccookie, + cookiecount, buf, idx, NULL); + up_write(&loghandle->lgh_lock); + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_write); + +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + int raised; + int rc; + + ENTRY; + + LASSERT(ctxt); + LASSERT(ctxt->loc_logops); + + if (ctxt->loc_logops->lop_open == NULL) { + *lgh = NULL; + RETURN(-EOPNOTSUPP); + } + + *lgh = llog_alloc_handle(); + if (*lgh == NULL) + RETURN(-ENOMEM); + (*lgh)->lgh_ctxt = ctxt; + (*lgh)->lgh_logops = ctxt->loc_logops; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + if (rc) { + llog_free_handle(*lgh); + *lgh = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_open); + +int llog_close(const struct lu_env *env, struct llog_handle *loghandle) +{ + struct llog_operations 
*lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + GOTO(out, rc); + if (lop->lop_close == NULL) + GOTO(out, rc = -EOPNOTSUPP); + rc = lop->lop_close(env, loghandle); +out: + llog_handle_put(loghandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_close); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_cat.c b/drivers/staging/lustre/lustre/obdclass/llog_cat.c new file mode 100644 index 000000000000..cf00b2f550ac --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_cat.c @@ -0,0 +1,833 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_cat.c + * + * OST<->MDS recovery logging infrastructure. 
+ * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger <adilger@clusterfs.com> + * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com> + * Author: Mikhail Pershin <mike.pershin@intel.com> + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include <obd_class.h> + +#include "llog_internal.h" + +/* Create a new log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + */ +static int llog_cat_new_log(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_handle *loghandle, + struct thandle *th) +{ + + struct llog_log_hdr *llh; + struct llog_logid_rec rec = { { 0 }, }; + int rc, index, bitmap_size; + ENTRY; + + llh = cathandle->lgh_hdr; + bitmap_size = LLOG_BITMAP_SIZE(llh); + + index = (cathandle->lgh_last_idx + 1) % bitmap_size; + + /* maximum number of available slots in catlog is bitmap_size - 2 */ + if (llh->llh_cat_idx == index) { + CERROR("no free catalog slots for log...\n"); + RETURN(-ENOSPC); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED)) + RETURN(-ENOSPC); + + rc = llog_create(env, loghandle, th); + /* if llog is already created, no need to initialize it */ + if (rc == -EEXIST) { + RETURN(0); + } else if (rc != 0) { + CERROR("%s: can't create new plain llog in catalog: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, loghandle, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &cathandle->lgh_hdr->llh_tgtuuid); + if (rc) + GOTO(out_destroy, rc); + + if (index == 0) + index = 1; + + spin_lock(&loghandle->lgh_hdr_lock); + llh->llh_count++; + if (ext2_set_bit(index, llh->llh_bitmap)) { + CERROR("argh, index %u already set in log bitmap?\n", + index); + 
spin_unlock(&loghandle->lgh_hdr_lock); + LBUG(); /* should never happen */ + } + spin_unlock(&loghandle->lgh_hdr_lock); + + cathandle->lgh_last_idx = index; + llh->llh_tail.lrt_index = index; + + CDEBUG(D_RPCTRACE,"new recovery log "DOSTID":%x for index %u of catalog" + DOSTID"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, index, + POSTID(&cathandle->lgh_id.lgl_oi)); + /* build the record for this log in the catalog */ + rec.lid_hdr.lrh_len = sizeof(rec); + rec.lid_hdr.lrh_index = index; + rec.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + rec.lid_id = loghandle->lgh_id; + rec.lid_tail.lrt_len = sizeof(rec); + rec.lid_tail.lrt_index = index; + + /* update the catalog: header and record */ + rc = llog_write_rec(env, cathandle, &rec.lid_hdr, + &loghandle->u.phd.phd_cookie, 1, NULL, index, th); + if (rc < 0) + GOTO(out_destroy, rc); + + loghandle->lgh_hdr->llh_cat_idx = index; + RETURN(0); +out_destroy: + llog_destroy(env, loghandle); + RETURN(rc); +} + +/* Open an existent log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + * We return a lock on the handle to ensure nobody yanks it from us. 
+ * + * This takes extra reference on llog_handle via llog_handle_get() and require + * this reference to be put by caller using llog_handle_put() + */ +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid) +{ + struct llog_handle *loghandle; + int rc = 0; + + ENTRY; + + if (cathandle == NULL) + RETURN(-EBADF); + + down_write(&cathandle->lgh_lock); + list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_logid *cgl = &loghandle->lgh_id; + + if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) && + ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) { + if (cgl->lgl_ogen != logid->lgl_ogen) { + CERROR("%s: log "DOSTID" generation %x != %x\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), cgl->lgl_ogen, + logid->lgl_ogen); + continue; + } + loghandle->u.phd.phd_cat_handle = cathandle; + up_write(&cathandle->lgh_lock); + GOTO(out, rc = 0); + } + } + up_write(&cathandle->lgh_lock); + + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL, + LLOG_OPEN_EXISTS); + if (rc < 0) { + CERROR("%s: error opening log id "DOSTID":%x: rc = %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), logid->lgl_ogen, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, NULL); + if (rc < 0) { + llog_close(env, loghandle); + loghandle = NULL; + RETURN(rc); + } + + down_write(&cathandle->lgh_lock); + list_add(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head); + up_write(&cathandle->lgh_lock); + + loghandle->u.phd.phd_cat_handle = cathandle; + loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id; + loghandle->u.phd.phd_cookie.lgc_index = + loghandle->lgh_hdr->llh_cat_idx; + EXIT; +out: + llog_handle_get(loghandle); + *res = loghandle; + return 0; +} + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) +{ + struct llog_handle *loghandle, *n; + int rc; + + 
ENTRY; + + list_for_each_entry_safe(loghandle, n, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_log_hdr *llh = loghandle->lgh_hdr; + int index; + + /* unlink open-not-created llogs */ + list_del_init(&loghandle->u.phd.phd_entry); + llh = loghandle->lgh_hdr; + if (loghandle->lgh_obj != NULL && llh != NULL && + (llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1)) { + rc = llog_destroy(env, loghandle); + if (rc) + CERROR("%s: failure destroying log during " + "cleanup: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + rc); + + index = loghandle->u.phd.phd_cookie.lgc_index; + llog_cat_cleanup(env, cathandle, NULL, index); + } + llog_close(env, loghandle); + } + /* if handle was stored in ctxt, remove it too */ + if (cathandle->lgh_ctxt->loc_handle == cathandle) + cathandle->lgh_ctxt->loc_handle = NULL; + rc = llog_close(env, cathandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_close); + +/** + * lockdep markers for nested struct llog_handle::lgh_lock locking. + */ +enum { + LLOGH_CAT, + LLOGH_LOG +}; + +/** Return the currently active log handle. If the current log handle doesn't + * have enough space left for the current record, start a new one. + * + * If reclen is 0, we only want to know what the currently active log is, + * otherwise we get a lock on this log so nobody can steal our space. + * + * Assumes caller has already pushed us into the kernel context and is locking. 
+ * + * NOTE: loghandle is write-locked upon successful return + */ +static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, + struct thandle *th) +{ + struct llog_handle *loghandle = NULL; + ENTRY; + + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + if (llh == NULL || + loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) { + up_read(&cathandle->lgh_lock); + RETURN(loghandle); + } else { + up_write(&loghandle->lgh_lock); + } + } + up_read(&cathandle->lgh_lock); + + /* time to use next log */ + + /* first, we have to make sure the state hasn't changed */ + down_write_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + LASSERT(llh); + if (loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) { + up_write(&cathandle->lgh_lock); + RETURN(loghandle); + } else { + up_write(&loghandle->lgh_lock); + } + } + + CDEBUG(D_INODE, "use next log\n"); + + loghandle = cathandle->u.chd.chd_next_log; + cathandle->u.chd.chd_current_log = loghandle; + cathandle->u.chd.chd_next_log = NULL; + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + up_write(&cathandle->lgh_lock); + LASSERT(loghandle); + RETURN(loghandle); +} + +/* Add a single record to the recovery log(s) using a catalog + * Returns as llog_write_record + * + * Assumes caller has already pushed us into the kernel context. 
+ */ +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf, struct thandle *th) +{ + struct llog_handle *loghandle; + int rc; + ENTRY; + + LASSERT(rec->lrh_len <= LLOG_CHUNK_SIZE); + loghandle = llog_cat_current_log(cathandle, th); + LASSERT(!IS_ERR(loghandle)); + + /* loghandle is already locked by llog_cat_current_log() for us */ + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, th); + if (rc < 0) { + up_write(&loghandle->lgh_lock); + RETURN(rc); + } + } + /* now let's try to add the record */ + rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, -1, th); + if (rc < 0) + CDEBUG_LIMIT(rc == -ENOSPC ? D_HA : D_ERROR, + "llog_write_rec %d: lh=%p\n", rc, loghandle); + up_write(&loghandle->lgh_lock); + if (rc == -ENOSPC) { + /* try to use next log */ + loghandle = llog_cat_current_log(cathandle, th); + LASSERT(!IS_ERR(loghandle)); + /* new llog can be created concurrently */ + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, th); + if (rc < 0) { + up_write(&loghandle->lgh_lock); + RETURN(rc); + } + } + /* now let's try to add the record */ + rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, + -1, th); + if (rc < 0) + CERROR("llog_write_rec %d: lh=%p\n", rc, loghandle); + up_write(&loghandle->lgh_lock); + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_add_rec); + +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th) +{ + struct llog_handle *loghandle, *next; + int rc = 0; + + ENTRY; + + if (cathandle->u.chd.chd_current_log == NULL) { + /* declare new plain llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == NULL) { + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (rc == 0) { + cathandle->u.chd.chd_current_log = loghandle; + 
				list_add_tail(&loghandle->u.phd.phd_entry,
					      &cathandle->u.chd.chd_head);
			}
		}
		up_write(&cathandle->lgh_lock);
	} else if (cathandle->u.chd.chd_next_log == NULL) {
		/* declare next plain llog */
		down_write(&cathandle->lgh_lock);
		if (cathandle->u.chd.chd_next_log == NULL) {
			rc = llog_open(env, cathandle->lgh_ctxt, &loghandle,
				       NULL, NULL, LLOG_OPEN_NEW);
			if (rc == 0) {
				cathandle->u.chd.chd_next_log = loghandle;
				list_add_tail(&loghandle->u.phd.phd_entry,
					      &cathandle->u.chd.chd_head);
			}
		}
		up_write(&cathandle->lgh_lock);
	}
	if (rc)
		GOTO(out, rc);

	if (!llog_exist(cathandle->u.chd.chd_current_log)) {
		rc = llog_declare_create(env, cathandle->u.chd.chd_current_log,
					 th);
		if (rc)
			GOTO(out, rc);
		/* declare the catalog-record write that llog_cat_new_log()
		 * will perform; return value deliberately not checked here */
		llog_declare_write_rec(env, cathandle, NULL, -1, th);
	}
	/* declare records in the llogs */
	rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log,
				    rec, -1, th);
	if (rc)
		GOTO(out, rc);

	next = cathandle->u.chd.chd_next_log;
	if (next) {
		if (!llog_exist(next)) {
			/* NOTE(review): rc from llog_declare_create() here is
			 * overwritten by the llog_declare_write_rec() below
			 * before it is ever tested — confirm intended. */
			rc = llog_declare_create(env, next, th);
			llog_declare_write_rec(env, cathandle, NULL, -1, th);
		}
		llog_declare_write_rec(env, next, rec, -1, th);
	}
out:
	RETURN(rc);
}
EXPORT_SYMBOL(llog_cat_declare_add_rec);

/*
 * Add one record to the catalog.  On a dt-based (OSD) backend this
 * declares and runs its own local transaction; otherwise it falls back to
 * the lvfs compatibility path with th == NULL.
 */
int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle,
		 struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
		 void *buf)
{
	struct llog_ctxt *ctxt;
	struct dt_device *dt;
	struct thandle *th = NULL;
	int rc;

	ctxt = cathandle->lgh_ctxt;
	LASSERT(ctxt);
	LASSERT(ctxt->loc_exp);

	if (cathandle->lgh_obj != NULL) {
		dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
		LASSERT(dt);

		th = dt_trans_create(env, dt);
		if (IS_ERR(th))
			RETURN(PTR_ERR(th));

		rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
		if (rc)
			GOTO(out_trans, rc);

		rc = dt_trans_start_local(env, dt, th);
		if (rc)
			GOTO(out_trans, rc);
		rc = llog_cat_add_rec(env, cathandle, rec, reccookie, buf, th);
out_trans:
		dt_trans_stop(env, dt, th);
	} else { /* lvfs compat code */
		LASSERT(cathandle->lgh_file != NULL);
		rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
		if (rc == 0)
			rc = llog_cat_add_rec(env, cathandle, rec, reccookie,
					      buf, th);
	}
	RETURN(rc);
}
EXPORT_SYMBOL(llog_cat_add);

/* For each cookie in the cookie array, we clear the log in-use bit and either:
 * - the log is empty, so mark it free in the catalog header and delete it
 * - the log is not empty, just write out the log header
 *
 * The cookies may be in different log files, so we need to get new logs
 * each time.
 *
 * Assumes caller has already pushed us into the kernel context.
 */
int llog_cat_cancel_records(const struct lu_env *env,
			    struct llog_handle *cathandle, int count,
			    struct llog_cookie *cookies)
{
	int i, index, rc = 0, failed = 0;

	ENTRY;

	/* Best-effort loop: a failure on one cookie does not stop the rest;
	 * the last error (with -ENOENT given lowest priority) is returned. */
	for (i = 0; i < count; i++, cookies++) {
		struct llog_handle *loghandle;
		struct llog_logid *lgl = &cookies->lgc_lgl;
		int lrc;

		rc = llog_cat_id2handle(env, cathandle, &loghandle, lgl);
		if (rc) {
			CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
			       cathandle->lgh_ctxt->loc_obd->obd_name,
			       POSTID(&lgl->lgl_oi), rc);
			failed++;
			continue;
		}

		lrc = llog_cancel_rec(env, loghandle, cookies->lgc_index);
		if (lrc == 1) {	/* log has been destroyed */
			index = loghandle->u.phd.phd_cookie.lgc_index;
			rc = llog_cat_cleanup(env, cathandle, loghandle,
					      index);
		} else if (lrc == -ENOENT) {
			if (rc == 0) /* ENOENT shouldn't rewrite any error */
				rc = lrc;
		} else if (lrc < 0) {
			failed++;
			rc = lrc;
		}
		llog_handle_put(loghandle);
	}
	if (rc)
		CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n",
		       cathandle->lgh_ctxt->loc_obd->obd_name, failed, count,
		       rc);

	RETURN(rc);
}
EXPORT_SYMBOL(llog_cat_cancel_records);

/*
 * llog_process() callback invoked for each catalog record: opens the
 * referenced plain llog and runs the user callback over its records.
 */
int llog_cat_process_cb(const struct lu_env *env, struct llog_handle *cat_llh,
			struct llog_rec_hdr *rec, void *data)
{
	struct llog_process_data *d = data;
struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *llh; + int rc; + + ENTRY; + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog " + DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen, + rec->lrh_index, POSTID(&cat_llh->lgh_id.lgl_oi)); + + rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); + if (rc) { + CERROR("%s: cannot find handle for llog "DOSTID": %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + POSTID(&lir->lid_id.lgl_oi), rc); + RETURN(rc); + } + + if (rec->lrh_index < d->lpd_startcat) + /* Skip processing of the logs until startcat */ + RETURN(0); + + if (d->lpd_startidx > 0) { + struct llog_process_cat_data cd; + + cd.lpcd_first_idx = d->lpd_startidx; + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + &cd, false); + /* Continue processing the next log from idx 0 */ + d->lpd_startidx = 0; + } else { + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + NULL, false); + } + llog_handle_put(llh); + + RETURN(rc); +} + +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, + int startidx, bool fork) +{ + struct llog_process_data d; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + ENTRY; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + d.lpd_startcat = startcat; + d.lpd_startidx = startidx; + + if (llh->llh_cat_idx > cat_llh->lgh_last_idx) { + struct llog_process_cat_data cd; + + CWARN("catlog "DOSTID" crosses index zero\n", + POSTID(&cat_llh->lgh_id.lgl_oi)); + + cd.lpcd_first_idx = llh->llh_cat_idx; + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb, + &d, &cd, fork); + if (rc != 0) + RETURN(rc); + + cd.lpcd_first_idx = 0; + cd.lpcd_last_idx = cat_llh->lgh_last_idx; + rc = 
		     llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
					  &d, &cd, fork);
	} else {
		rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
					  &d, NULL, fork);
	}

	RETURN(rc);
}
EXPORT_SYMBOL(llog_cat_process_or_fork);

/* Convenience wrapper: process the catalog in the current thread. */
int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh,
		     llog_cb_t cb, void *data, int startcat, int startidx)
{
	return llog_cat_process_or_fork(env, cat_llh, cb, data, startcat,
					startidx, false);
}
EXPORT_SYMBOL(llog_cat_process);

/*
 * Reverse-iteration counterpart of llog_cat_process_cb(): opens the plain
 * llog named by a catalog record and reverse-processes its records.
 * Note this callback byte-swaps lrh_type/lrh_index with le32_to_cpu()
 * where the forward callback reads them natively.
 */
static int llog_cat_reverse_process_cb(const struct lu_env *env,
				       struct llog_handle *cat_llh,
				       struct llog_rec_hdr *rec, void *data)
{
	struct llog_process_data *d = data;
	struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
	struct llog_handle *llh;
	int rc;

	if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) {
		CERROR("invalid record in catalog\n");
		RETURN(-EINVAL);
	}
	CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
	       DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
	       le32_to_cpu(rec->lrh_index), POSTID(&cat_llh->lgh_id.lgl_oi));

	rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id);
	if (rc) {
		CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
		       cat_llh->lgh_ctxt->loc_obd->obd_name,
		       POSTID(&lir->lid_id.lgl_oi), rc);
		RETURN(rc);
	}

	rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL);
	llog_handle_put(llh);
	RETURN(rc);
}

/*
 * Process the whole catalog in reverse record order.  Like the forward
 * walk, a catalog that wrapped past index zero is handled in two passes,
 * newest range first.
 */
int llog_cat_reverse_process(const struct lu_env *env,
			     struct llog_handle *cat_llh,
			     llog_cb_t cb, void *data)
{
	struct llog_process_data d;
	struct llog_process_cat_data cd;
	struct llog_log_hdr *llh = cat_llh->lgh_hdr;
	int rc;
	ENTRY;

	LASSERT(llh->llh_flags & LLOG_F_IS_CAT);
	d.lpd_data = data;
	d.lpd_cb = cb;

	if (llh->llh_cat_idx > cat_llh->lgh_last_idx) {
		CWARN("catalog "DOSTID" crosses index zero\n",
		      POSTID(&cat_llh->lgh_id.lgl_oi));

		cd.lpcd_first_idx = 0;
		cd.lpcd_last_idx = cat_llh->lgh_last_idx;
		rc =
		     llog_reverse_process(env, cat_llh,
					  llog_cat_reverse_process_cb,
					  &d, &cd);
		if (rc != 0)
			RETURN(rc);

		cd.lpcd_first_idx = le32_to_cpu(llh->llh_cat_idx);
		cd.lpcd_last_idx = 0;
		rc = llog_reverse_process(env, cat_llh,
					  llog_cat_reverse_process_cb,
					  &d, &cd);
	} else {
		rc = llog_reverse_process(env, cat_llh,
					  llog_cat_reverse_process_cb,
					  &d, NULL);
	}

	RETURN(rc);
}
EXPORT_SYMBOL(llog_cat_reverse_process);

/*
 * Advance llh_cat_idx (the oldest live catalog slot) after slot @index was
 * cancelled.  Only acts when @index is exactly the slot following the
 * current llh_cat_idx; it then additionally skips over any further slots
 * whose in-use bit is already clear, wrapping through the bitmap.
 * Always returns 0.
 */
int llog_cat_set_first_idx(struct llog_handle *cathandle, int index)
{
	struct llog_log_hdr *llh = cathandle->lgh_hdr;
	int i, bitmap_size, idx;
	ENTRY;

	bitmap_size = LLOG_BITMAP_SIZE(llh);
	if (llh->llh_cat_idx == (index - 1)) {
		idx = llh->llh_cat_idx + 1;
		llh->llh_cat_idx = idx;
		if (idx == cathandle->lgh_last_idx)
			goto out;
		/* Coalesce: keep advancing past already-free slots until the
		 * last used slot or a still-set bit is reached. */
		for (i = (index + 1) % bitmap_size;
		     i != cathandle->lgh_last_idx;
		     i = (i + 1) % bitmap_size) {
			if (!ext2_test_bit(i, llh->llh_bitmap)) {
				idx = llh->llh_cat_idx + 1;
				llh->llh_cat_idx = idx;
			} else if (i == 0) {
				/* wrapped around: restart counting at zero */
				llh->llh_cat_idx = 0;
			} else {
				break;
			}
		}
out:
		CDEBUG(D_RPCTRACE, "set catlog "DOSTID" first idx %u\n",
		       POSTID(&cathandle->lgh_id.lgl_oi), llh->llh_cat_idx);
	}

	RETURN(0);
}

/* Cleanup deleted plain llog traces from catalog */
int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
		     struct llog_handle *loghandle, int index)
{
	int rc;

	LASSERT(index);
	if (loghandle != NULL) {
		/* remove destroyed llog from catalog list and
		 * chd_current_log variable */
		down_write(&cathandle->lgh_lock);
		if (cathandle->u.chd.chd_current_log == loghandle)
			cathandle->u.chd.chd_current_log = NULL;
		list_del_init(&loghandle->u.phd.phd_entry);
		up_write(&cathandle->lgh_lock);
		LASSERT(index == loghandle->u.phd.phd_cookie.lgc_index);
		/* llog was opened and keep in a list, close it now */
		llog_close(env, loghandle);
	}
	/* remove plain llog entry from catalog by index */
	llog_cat_set_first_idx(cathandle, index);
	rc =
	     llog_cancel_rec(env, cathandle, index);
	if (rc == 0)
		CDEBUG(D_HA, "cancel plain log at index"
		       " %u of catalog "DOSTID"\n",
		       index, POSTID(&cathandle->lgh_id.lgl_oi));
	return rc;
}

/*
 * Catalog-processing callback used at setup time: destroys plain llogs
 * that are flagged LLOG_F_ZAP_WHEN_EMPTY and hold only their header
 * record, and prunes catalog entries pointing at vanished logs.
 */
int cat_cancel_cb(const struct lu_env *env, struct llog_handle *cathandle,
		  struct llog_rec_hdr *rec, void *data)
{
	struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
	struct llog_handle *loghandle;
	struct llog_log_hdr *llh;
	int rc;

	ENTRY;

	if (rec->lrh_type != LLOG_LOGID_MAGIC) {
		CERROR("invalid record in catalog\n");
		RETURN(-EINVAL);
	}

	CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
	       DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
	       rec->lrh_index, POSTID(&cathandle->lgh_id.lgl_oi));

	rc = llog_cat_id2handle(env, cathandle, &loghandle, &lir->lid_id);
	if (rc) {
		CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
		       cathandle->lgh_ctxt->loc_obd->obd_name,
		       POSTID(&lir->lid_id.lgl_oi), rc);
		if (rc == -ENOENT || rc == -ESTALE) {
			/* remove index from catalog */
			llog_cat_cleanup(env, cathandle, NULL, rec->lrh_index);
		}
		RETURN(rc);
	}

	llh = loghandle->lgh_hdr;
	/* llh_count == 1 means only the header record remains: empty log */
	if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
	    (llh->llh_count == 1)) {
		rc = llog_destroy(env, loghandle);
		if (rc)
			CERROR("%s: fail to destroy empty log: rc = %d\n",
			       loghandle->lgh_ctxt->loc_obd->obd_name, rc);

		llog_cat_cleanup(env, cathandle, loghandle,
				 loghandle->u.phd.phd_cookie.lgc_index);
	}
	llog_handle_put(loghandle);

	RETURN(rc);
}
EXPORT_SYMBOL(cat_cancel_cb);

/* helper to initialize catalog llog and process it to cancel */
int llog_cat_init_and_process(const struct lu_env *env,
			      struct llog_handle *llh)
{
	int rc;

	rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, NULL);
	if (rc)
		RETURN(rc);

	/* Processing failures here are logged but NOT propagated: the
	 * function still returns 0 below (only llog_init_handle() errors
	 * are fatal). */
	rc = llog_process_or_fork(env, llh, cat_cancel_cb, NULL, NULL, false);
	if (rc)
		CERROR("%s: llog_process() with cat_cancel_cb failed: rc = "
		       "%d\n", llh->lgh_ctxt->loc_obd->obd_name, rc);
RETURN(0); +} +EXPORT_SYMBOL(llog_cat_init_and_process); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_internal.h b/drivers/staging/lustre/lustre/obdclass/llog_internal.h new file mode 100644 index 000000000000..539e1d4f9d4c --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_internal.h @@ -0,0 +1,98 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
 */

#ifndef __LLOG_INTERNAL_H__
#define __LLOG_INTERNAL_H__

#include <lustre_log.h>

/* Arguments/results ferried to the thread spawned by llog_process_or_fork(). */
struct llog_process_info {
	struct llog_handle *lpi_loghandle;	/* log being processed */
	llog_cb_t	    lpi_cb;		/* per-record callback */
	void		   *lpi_cbdata;		/* callback private data */
	void		   *lpi_catdata;	/* catalog iteration data */
	int		    lpi_rc;		/* processing result code */
	struct completion   lpi_completion;	/* completed when done */
	const struct lu_env *lpi_env;

};

/* Per-thread scratch state for llog code, keyed by llog_thread_key. */
struct llog_thread_info {
	struct lu_attr			 lgi_attr;
	struct lu_fid			 lgi_fid;
	struct dt_object_format		 lgi_dof;
	struct lu_buf			 lgi_buf;
	loff_t				 lgi_off;
	struct llog_rec_hdr		 lgi_lrh;
	struct llog_rec_tail		 lgi_tail;
};

extern struct lu_context_key llog_thread_key;

/* Fetch this env's llog_thread_info; must have been set up via the key. */
static inline struct llog_thread_info *llog_info(const struct lu_env *env)
{
	struct llog_thread_info *lgi;

	lgi = lu_context_key_get(&env->le_ctx, &llog_thread_key);
	LASSERT(lgi);
	return lgi;
}

/* Build a llog logid from an lvfs inode number and generation. */
static inline void
lustre_build_llog_lvfs_oid(struct llog_logid *logid, __u64 ino, __u32 gen)
{
	ostid_set_seq_llog(&logid->lgl_oi);
	ostid_set_id(&logid->lgl_oi, ino);
	logid->lgl_ogen = gen;
}

int llog_info_init(void);
void llog_info_fini(void);

void llog_handle_get(struct llog_handle *loghandle);
void llog_handle_put(struct llog_handle *loghandle);
int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle,
		       struct llog_handle **res, struct llog_logid *logid);
int class_config_dump_handler(const struct lu_env *env,
			      struct llog_handle *handle,
			      struct llog_rec_hdr *rec, void *data);
int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size);
int llog_process_or_fork(const struct lu_env *env,
			 struct llog_handle *loghandle,
			 llog_cb_t cb, void *data, void *catdata, bool fork);
int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
		     struct llog_handle *loghandle, int index);
#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c b/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c
new file mode 100644
index 000000000000..0732874e26c5
--- /dev/null
+++
b/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c @@ -0,0 +1,427 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
 */

#define DEBUG_SUBSYSTEM S_LOG

#include <obd_class.h>
#include <lustre_log.h>
#include "llog_internal.h"

/*
 * Parse a "#<id>#<seq>#<ogen>" string into @logid; <ogen> is hex, the
 * others accept any base recognized by simple_strtoull().
 * NOTE: parses in place — the '#' separators in @str are overwritten with
 * NUL bytes, so the caller's buffer is modified.  Returns 0 or -EINVAL.
 */
static int str2logid(struct llog_logid *logid, char *str, int len)
{
	char *start, *end, *endp;
	__u64 id, seq;

	ENTRY;
	start = str;
	if (*start != '#')
		RETURN(-EINVAL);

	start++;
	if (start - str >= len - 1)
		RETURN(-EINVAL);
	end = strchr(start, '#');
	if (end == NULL || end == start)
		RETURN(-EINVAL);

	*end = '\0';
	id = simple_strtoull(start, &endp, 0);
	if (endp != end)
		RETURN(-EINVAL);

	start = ++end;
	if (start - str >= len - 1)
		RETURN(-EINVAL);
	end = strchr(start, '#');
	if (end == NULL || end == start)
		RETURN(-EINVAL);

	*end = '\0';
	seq = simple_strtoull(start, &endp, 0);
	if (endp != end)
		RETURN(-EINVAL);

	ostid_set_seq(&logid->lgl_oi, seq);
	ostid_set_id(&logid->lgl_oi, id);

	start = ++end;
	if (start - str >= len - 1)
		RETURN(-EINVAL);
	logid->lgl_ogen = simple_strtoul(start, &endp, 16);
	if (*endp != '\0')
		RETURN(-EINVAL);

	RETURN(0);
}

/*
 * llog_process() callback for OBD_IOC_LLOG_CHECK: validates record types
 * and prints a summary line per record into the ioctl bulk buffer.
 * NOTE(review): carries state (buffer cursor, index range) across
 * invocations in function-scope "static" locals — not reentrant and not
 * safe for concurrent ioctls; confirm callers serialize this path.
 */
static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle,
			 struct llog_rec_hdr *rec, void *data)
{
	struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data;
	static int l, remains, from, to;
	static char *out;
	char *endp;
	int cur_index, rc = 0;

	ENTRY;

	/* First invocation (inllen1 still set): initialize the static
	 * cursor/range state from the ioctl arguments, then clear inllen1
	 * so later invocations skip this block. */
	if (ioc_data && ioc_data->ioc_inllen1 > 0) {
		l = 0;
		remains = ioc_data->ioc_inllen4 +
			  cfs_size_round(ioc_data->ioc_inllen1) +
			  cfs_size_round(ioc_data->ioc_inllen2) +
			  cfs_size_round(ioc_data->ioc_inllen3);
		from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0);
		if (*endp != '\0')
			RETURN(-EINVAL);
		to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0);
		if (*endp != '\0')
			RETURN(-EINVAL);
		ioc_data->ioc_inllen1 = 0;
		out = ioc_data->ioc_bulk;
	}

	cur_index = rec->lrh_index;
	if (cur_index < from)
		RETURN(0);
	if (to > 0 && cur_index > to)
		RETURN(-LLOG_EEMPTY);

	if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) {
		struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
		struct llog_handle *loghandle;

		if (rec->lrh_type != LLOG_LOGID_MAGIC) {
			/* NOTE(review): this only formats a "failed" line;
			 * "out"/"remains" are not advanced and execution
			 * falls through to llog_cat_id2handle() with a
			 * record that is not a logid — confirm intended. */
			l = snprintf(out, remains, "[index]: %05d [type]: "
				     "%02x [len]: %04d failed\n",
				     cur_index, rec->lrh_type,
				     rec->lrh_len);
		}
		if (handle->lgh_ctxt == NULL)
			RETURN(-EOPNOTSUPP);
		/* Recurse into the referenced plain log and check it too. */
		rc = llog_cat_id2handle(env, handle, &loghandle, &lir->lid_id);
		if (rc) {
			CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n",
			       POSTID(&lir->lid_id.lgl_oi),
			       lir->lid_id.lgl_ogen);
			RETURN(rc);
		}
		rc = llog_process(env, loghandle, llog_check_cb, NULL, NULL);
		llog_handle_put(loghandle);
	} else {
		bool ok;

		/* Plain log record: only the known record types are "ok". */
		switch (rec->lrh_type) {
		case OST_SZ_REC:
		case MDS_UNLINK_REC:
		case MDS_UNLINK64_REC:
		case MDS_SETATTR64_REC:
		case OBD_CFG_REC:
		case LLOG_GEN_REC:
		case LLOG_HDR_MAGIC:
			ok = true;
			break;
		default:
			ok = false;
		}

		l = snprintf(out, remains, "[index]: %05d [type]: "
			     "%02x [len]: %04d %s\n",
			     cur_index, rec->lrh_type, rec->lrh_len,
			     ok ?
			     "ok" : "failed");
		out += l;
		remains -= l;
		if (remains <= 0) {
			CERROR("%s: no space to print log records\n",
			       handle->lgh_ctxt->loc_obd->obd_name);
			RETURN(-LLOG_EEMPTY);
		}
	}
	RETURN(rc);
}

/*
 * llog_process() callback for OBD_IOC_LLOG_PRINT: pretty-prints each
 * record (catalog entry, config record, or generic summary) into the
 * ioctl bulk buffer.  Shares the static-local state pattern (and its
 * reentrancy caveat) with llog_check_cb() above.
 */
static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle,
			 struct llog_rec_hdr *rec, void *data)
{
	struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data;
	static int l, remains, from, to;
	static char *out;
	char *endp;
	int cur_index;

	ENTRY;
	/* First invocation: capture cursor and [from,to] index range. */
	if (ioc_data != NULL && ioc_data->ioc_inllen1 > 0) {
		l = 0;
		remains = ioc_data->ioc_inllen4 +
			  cfs_size_round(ioc_data->ioc_inllen1) +
			  cfs_size_round(ioc_data->ioc_inllen2) +
			  cfs_size_round(ioc_data->ioc_inllen3);
		from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0);
		if (*endp != '\0')
			RETURN(-EINVAL);
		to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0);
		if (*endp != '\0')
			RETURN(-EINVAL);
		out = ioc_data->ioc_bulk;
		ioc_data->ioc_inllen1 = 0;
	}

	cur_index = rec->lrh_index;
	if (cur_index < from)
		RETURN(0);
	if (to > 0 && cur_index > to)
		RETURN(-LLOG_EEMPTY);

	if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) {
		struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;

		if (rec->lrh_type != LLOG_LOGID_MAGIC) {
			CERROR("invalid record in catalog\n");
			RETURN(-EINVAL);
		}

		l = snprintf(out, remains,
			     "[index]: %05d [logid]: #"DOSTID"#%08x\n",
			     cur_index, POSTID(&lir->lid_id.lgl_oi),
			     lir->lid_id.lgl_ogen);
	} else if (rec->lrh_type == OBD_CFG_REC) {
		int rc;

		rc = class_config_parse_rec(rec, out, remains);
		if (rc < 0)
			RETURN(rc);
		l = rc;
	} else {
		l = snprintf(out, remains,
			     "[index]: %05d [type]: %02x [len]: %04d\n",
			     cur_index, rec->lrh_type, rec->lrh_len);
	}
	out += l;
	remains -= l;
	if (remains <= 0) {
		CERROR("not enough space for print log records\n");
		RETURN(-LLOG_EEMPTY);
	}

	RETURN(0);
}
/* Destroy the plain llog @logid and remove its entry from catalog @cat. */
static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat,
			   struct llog_logid *logid)
{
	struct llog_handle *log;
	int rc;

	ENTRY;

	rc = llog_cat_id2handle(env, cat, &log, logid);
	if (rc) {
		CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n",
		       POSTID(&logid->lgl_oi), logid->lgl_ogen);
		RETURN(-ENOENT);
	}

	rc = llog_destroy(env, log);
	if (rc) {
		CDEBUG(D_IOCTL, "cannot destroy log\n");
		GOTO(out, rc);
	}
	llog_cat_cleanup(env, cat, log, log->u.phd.phd_cookie.lgc_index);
out:
	llog_handle_put(log);
	RETURN(rc);

}

/* Catalog callback for OBD_IOC_LLOG_REMOVE: destroy each referenced log. */
static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle,
			  struct llog_rec_hdr *rec, void *data)
{
	struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
	int rc;

	ENTRY;
	if (rec->lrh_type != LLOG_LOGID_MAGIC)
		RETURN(-EINVAL);
	rc = llog_remove_log(env, handle, &lir->lid_id);

	RETURN(rc);
}


/*
 * Dispatch llog ioctls (INFO/CHECK/PRINT/CANCEL/REMOVE).  The target log
 * is named in ioc_inlbuf1 either by "#id#seq#ogen" logid or by "$name".
 */
int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd,
	       struct obd_ioctl_data *data)
{
	struct llog_logid logid;
	int rc = 0;
	struct llog_handle *handle = NULL;

	ENTRY;

	if (*data->ioc_inlbuf1 == '#') {
		rc = str2logid(&logid, data->ioc_inlbuf1, data->ioc_inllen1);
		if (rc)
			RETURN(rc);
		rc = llog_open(env, ctxt, &handle, &logid, NULL,
			       LLOG_OPEN_EXISTS);
		if (rc)
			RETURN(rc);
	} else if (*data->ioc_inlbuf1 == '$') {
		char *name = data->ioc_inlbuf1 + 1;

		rc = llog_open(env, ctxt, &handle, NULL, name,
			       LLOG_OPEN_EXISTS);
		if (rc)
			RETURN(rc);
	} else {
		RETURN(-EINVAL);
	}

	rc = llog_init_handle(env, handle, 0, NULL);
	if (rc)
		GOTO(out_close, rc = -ENOENT);

	switch (cmd) {
	case OBD_IOC_LLOG_INFO: {
		int l;
		int remains = data->ioc_inllen2 +
			      cfs_size_round(data->ioc_inllen1);
		char *out = data->ioc_bulk;

		l = snprintf(out, remains,
			     "logid: #"DOSTID"#%08x\n"
			     "flags: %x (%s)\n"
			     "records count: %d\n"
			     "last index: %d\n",
			     POSTID(&handle->lgh_id.lgl_oi),
			     handle->lgh_id.lgl_ogen,
			     handle->lgh_hdr->llh_flags,
			     handle->lgh_hdr->llh_flags &
			     LLOG_F_IS_CAT ?
"cat" : "plain", + handle->lgh_hdr->llh_count, + handle->lgh_last_idx); + out += l; + remains -= l; + if (remains <= 0) { + CERROR("%s: not enough space for log header info\n", + ctxt->loc_obd->obd_name); + rc = -ENOSPC; + } + break; + } + case OBD_IOC_LLOG_CHECK: + LASSERT(data->ioc_inllen1 > 0); + rc = llog_process(env, handle, llog_check_cb, data, NULL); + if (rc == -LLOG_EEMPTY) + rc = 0; + else if (rc) + GOTO(out_close, rc); + break; + case OBD_IOC_LLOG_PRINT: + LASSERT(data->ioc_inllen1 > 0); + rc = llog_process(env, handle, llog_print_cb, data, NULL); + if (rc == -LLOG_EEMPTY) + rc = 0; + else if (rc) + GOTO(out_close, rc); + break; + case OBD_IOC_LLOG_CANCEL: { + struct llog_cookie cookie; + struct llog_logid plain; + char *endp; + + cookie.lgc_index = simple_strtoul(data->ioc_inlbuf3, &endp, 0); + if (*endp != '\0') + GOTO(out_close, rc = -EINVAL); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { + rc = llog_cancel_rec(NULL, handle, cookie.lgc_index); + GOTO(out_close, rc); + } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) { + GOTO(out_close, rc = -EINVAL); + } + + if (data->ioc_inlbuf2 == NULL) /* catalog but no logid */ + GOTO(out_close, rc = -ENOTTY); + + rc = str2logid(&plain, data->ioc_inlbuf2, data->ioc_inllen2); + if (rc) + GOTO(out_close, rc); + cookie.lgc_lgl = plain; + rc = llog_cat_cancel_records(env, handle, 1, &cookie); + if (rc) + GOTO(out_close, rc); + break; + } + case OBD_IOC_LLOG_REMOVE: { + struct llog_logid plain; + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { + rc = llog_destroy(env, handle); + GOTO(out_close, rc); + } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) { + GOTO(out_close, rc = -EINVAL); + } + + if (data->ioc_inlbuf2 > 0) { + /* remove indicate log from the catalog */ + rc = str2logid(&plain, data->ioc_inlbuf2, + data->ioc_inllen2); + if (rc) + GOTO(out_close, rc); + rc = llog_remove_log(env, handle, &plain); + } else { + /* remove all the log of the catalog */ + rc = llog_process(env, 
					  handle, llog_delete_cb, NULL,
					  NULL);
			if (rc)
				GOTO(out_close, rc);
		}
		break;
	}
	default:
		CERROR("%s: Unknown ioctl cmd %#x\n",
		       ctxt->loc_obd->obd_name, cmd);
		GOTO(out_close, rc = -ENOTTY);
	}

out_close:
	/* a catalog must be closed via llog_cat_close() so its plain log
	 * handles are released too */
	if (handle->lgh_hdr &&
	    handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)
		llog_cat_close(env, handle);
	else
		llog_close(env, handle);
	RETURN(rc);
}
EXPORT_SYMBOL(llog_ioctl);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c b/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c
new file mode 100644
index 000000000000..7e12dc62141f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c
@@ -0,0 +1,862 @@
/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lustre/obdclass/llog_lvfs.c
 *
 * OST<->MDS recovery logging infrastructure.
 * Invariants in implementation:
 * - we do not share logs among different OST<->MDS connections, so that
 *  if an OST or MDS fails it need only look at log(s) relevant to itself
 *
 * Author: Andreas Dilger <adilger@clusterfs.com>
 */

#define DEBUG_SUBSYSTEM S_LOG


#include <obd.h>
#include <obd_class.h>
#include <lustre_log.h>
#include <obd_ost.h>
#include <linux/list.h>
#include <lvfs.h>
#include <lustre_fsfilt.h>
#include <lustre_disk.h>
#include "llog_internal.h"

#if defined(LLOG_LVFS)

/*
 * Write a pad record of @len bytes at the current file position so the
 * next real record starts on a fresh chunk.  The pad consists of a
 * LLOG_PAD_MAGIC header, a gap, and a matching tail; @index is recorded
 * in both, and no bitmap bit is set for it.
 */
static int llog_lvfs_pad(struct obd_device *obd, struct l_file *file,
			 int len, int index)
{
	struct llog_rec_hdr rec = { 0 };
	struct llog_rec_tail tail;
	int rc;
	ENTRY;

	/* pad length must be at least a minimal record and 8-byte aligned */
	LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0);

	tail.lrt_len = rec.lrh_len = len;
	tail.lrt_index = rec.lrh_index = index;
	rec.lrh_type = LLOG_PAD_MAGIC;

	rc = fsfilt_write_record(obd, file, &rec, sizeof(rec), &file->f_pos, 0);
	if (rc) {
		CERROR("error writing padding record: rc %d\n", rc);
		goto out;
	}

	/* skip the unwritten middle of the pad; only header+tail hit disk */
	file->f_pos += len - sizeof(rec) - sizeof(tail);
	rc = fsfilt_write_record(obd, file, &tail, sizeof(tail), &file->f_pos, 0);
	if (rc) {
		CERROR("error writing padding record: rc %d\n", rc);
		goto out;
	}

 out:
	RETURN(rc);
}

/*
 * Write one log record at offset @off.  With buf == NULL, @rec is written
 * as-is; otherwise a header / payload / tail triple is written and
 * rec->lrh_len is updated to the framed length.  f_pos is restored if the
 * write landed below the previous position (header rewrite case).
 */
static int llog_lvfs_write_blob(struct obd_device *obd, struct l_file *file,
				struct llog_rec_hdr *rec, void *buf, loff_t off)
{
	int rc;
	struct llog_rec_tail end;
	loff_t saved_off = file->f_pos;
	int buflen = rec->lrh_len;

	ENTRY;

	file->f_pos = off;

	if (buflen == 0)
		CWARN("0-length record\n");

	if (!buf) {
		rc = fsfilt_write_record(obd, file, rec, buflen, &file->f_pos, 0);
		if (rc) {
			CERROR("error writing log record: rc %d\n", rc);
			goto out;
		}
		GOTO(out, rc = 0);
	}

	/* the buf case */
	rec->lrh_len = sizeof(*rec) + buflen + sizeof(end);
	rc = fsfilt_write_record(obd, file, rec, sizeof(*rec),
				 &file->f_pos, 0);
	if (rc) {
		CERROR("error writing log hdr: rc %d\n", rc);
		goto out;
	}

	rc = fsfilt_write_record(obd, file, buf, buflen, &file->f_pos, 0);
	if (rc) {
		CERROR("error writing log buffer: rc %d\n", rc);
		goto out;
	}

	end.lrt_len = rec->lrh_len;
	end.lrt_index = rec->lrh_index;
	rc = fsfilt_write_record(obd, file, &end, sizeof(end), &file->f_pos, 0);
	if (rc) {
		CERROR("error writing log tail: rc %d\n", rc);
		goto out;
	}

	rc = 0;
 out:
	/* never move f_pos backwards: keep the high-water mark for appends */
	if (saved_off > file->f_pos)
		file->f_pos = saved_off;
	LASSERT(rc <= 0);
	RETURN(rc);
}

/* Read @size bytes at offset @off into @buf without moving file->f_pos. */
static int llog_lvfs_read_blob(struct obd_device *obd, struct l_file *file,
			       void *buf, int size, loff_t off)
{
	loff_t offset = off;
	int rc;
	ENTRY;

	rc = fsfilt_read_record(obd, file, buf, size, &offset);
	if (rc) {
		CERROR("error reading log record: rc %d\n", rc);
		RETURN(rc);
	}
	RETURN(0);
}

/*
 * Read and validate the llog header chunk, byte-swapping it if it was
 * written with the other endianness, and position lgh_last_idx / f_pos
 * for appending.  Returns LLOG_EEMPTY for a 0-byte log, -EIO on a bad
 * magic or chunk size.
 */
static int llog_lvfs_read_header(const struct lu_env *env,
				 struct llog_handle *handle)
{
	struct obd_device *obd;
	int rc;
	ENTRY;

	LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE);

	obd = handle->lgh_ctxt->loc_exp->exp_obd;

	if (i_size_read(handle->lgh_file->f_dentry->d_inode) == 0) {
		CDEBUG(D_HA, "not reading header from 0-byte log\n");
		RETURN(LLOG_EEMPTY);
	}

	rc = llog_lvfs_read_blob(obd, handle->lgh_file, handle->lgh_hdr,
				 LLOG_CHUNK_SIZE, 0);
	if (rc) {
		CERROR("error reading log header from %.*s\n",
		       handle->lgh_file->f_dentry->d_name.len,
		       handle->lgh_file->f_dentry->d_name.name);
	} else {
		struct llog_rec_hdr *llh_hdr = &handle->lgh_hdr->llh_hdr;

		if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
			lustre_swab_llog_hdr(handle->lgh_hdr);

		if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
			CERROR("bad log %.*s header magic: %#x (expected %#x)\n",
			       handle->lgh_file->f_dentry->d_name.len,
			       handle->lgh_file->f_dentry->d_name.name,
			       llh_hdr->lrh_type, LLOG_HDR_MAGIC);
			rc = -EIO;
		} else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
			CERROR("incorrectly
sized log %.*s header: %#x " + "(expected %#x)\n", + handle->lgh_file->f_dentry->d_name.len, + handle->lgh_file->f_dentry->d_name.name, + llh_hdr->lrh_len, LLOG_CHUNK_SIZE); + CERROR("you may need to re-run lconf --write_conf.\n"); + rc = -EIO; + } + } + + handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index; + handle->lgh_file->f_pos = i_size_read(handle->lgh_file->f_dentry->d_inode); + + RETURN(rc); +} + +/* returns negative in on error; 0 if success && reccookie == 0; 1 otherwise */ +/* appends if idx == -1, otherwise overwrites record idx. */ +static int llog_lvfs_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *reccookie, int cookiecount, + void *buf, int idx, struct thandle *th) +{ + struct llog_log_hdr *llh; + int reclen = rec->lrh_len, index, rc; + struct llog_rec_tail *lrt; + struct obd_device *obd; + struct file *file; + size_t left; + ENTRY; + + llh = loghandle->lgh_hdr; + file = loghandle->lgh_file; + obd = loghandle->lgh_ctxt->loc_exp->exp_obd; + + /* record length should not bigger than LLOG_CHUNK_SIZE */ + if (buf) + rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) - + sizeof(struct llog_rec_tail)) ? -E2BIG : 0; + else + rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0; + if (rc) + RETURN(rc); + + if (buf) + /* write_blob adds header and tail to lrh_len. 
*/ + reclen = sizeof(*rec) + rec->lrh_len + + sizeof(struct llog_rec_tail); + + if (idx != -1) { + loff_t saved_offset; + + /* no header: only allowed to insert record 1 */ + if (idx != 1 && !i_size_read(file->f_dentry->d_inode)) { + CERROR("idx != -1 in empty log\n"); + LBUG(); + } + + if (idx && llh->llh_size && llh->llh_size != rec->lrh_len) + RETURN(-EINVAL); + + if (!ext2_test_bit(idx, llh->llh_bitmap)) + CERROR("Modify unset record %u\n", idx); + if (idx != rec->lrh_index) + CERROR("Index mismatch %d %u\n", idx, rec->lrh_index); + + rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0); + /* we are done if we only write the header or on error */ + if (rc || idx == 0) + RETURN(rc); + + if (buf) { + /* We assume that caller has set lgh_cur_* */ + saved_offset = loghandle->lgh_cur_offset; + CDEBUG(D_OTHER, + "modify record "DOSTID": idx:%d/%u/%d, len:%u " + "offset %llu\n", + POSTID(&loghandle->lgh_id.lgl_oi), idx, rec->lrh_index, + loghandle->lgh_cur_idx, rec->lrh_len, + (long long)(saved_offset - sizeof(*llh))); + if (rec->lrh_index != loghandle->lgh_cur_idx) { + CERROR("modify idx mismatch %u/%d\n", + idx, loghandle->lgh_cur_idx); + RETURN(-EFAULT); + } + } else { + /* Assumes constant lrh_len */ + saved_offset = sizeof(*llh) + (idx - 1) * reclen; + } + + rc = llog_lvfs_write_blob(obd, file, rec, buf, saved_offset); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = idx; + rc = 1; + } + RETURN(rc); + } + + /* Make sure that records don't cross a chunk boundary, so we can + * process them page-at-a-time if needed. If it will cross a chunk + * boundary, write in a fake (but referenced) entry to pad the chunk. + * + * We know that llog_current_log() will return a loghandle that is + * big enough to hold reclen, so all we care about is padding here. 
+ */ + left = LLOG_CHUNK_SIZE - (file->f_pos & (LLOG_CHUNK_SIZE - 1)); + + /* NOTE: padding is a record, but no bit is set */ + if (left != 0 && left != reclen && + left < (reclen + LLOG_MIN_REC_SIZE)) { + index = loghandle->lgh_last_idx + 1; + rc = llog_lvfs_pad(obd, file, left, index); + if (rc) + RETURN(rc); + loghandle->lgh_last_idx++; /*for pad rec*/ + } + /* if it's the last idx in log file, then return -ENOSPC */ + if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1) + RETURN(-ENOSPC); + loghandle->lgh_last_idx++; + index = loghandle->lgh_last_idx; + LASSERT(index < LLOG_BITMAP_SIZE(llh)); + rec->lrh_index = index; + if (buf == NULL) { + lrt = (struct llog_rec_tail *) + ((char *)rec + rec->lrh_len - sizeof(*lrt)); + lrt->lrt_len = rec->lrh_len; + lrt->lrt_index = rec->lrh_index; + } + /*The caller should make sure only 1 process access the lgh_last_idx, + *Otherwise it might hit the assert.*/ + LASSERT(index < LLOG_BITMAP_SIZE(llh)); + spin_lock(&loghandle->lgh_hdr_lock); + if (ext2_set_bit(index, llh->llh_bitmap)) { + CERROR("argh, index %u already set in log bitmap?\n", index); + spin_unlock(&loghandle->lgh_hdr_lock); + LBUG(); /* should never happen */ + } + llh->llh_count++; + spin_unlock(&loghandle->lgh_hdr_lock); + llh->llh_tail.lrt_index = index; + + rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0); + if (rc) + RETURN(rc); + + rc = llog_lvfs_write_blob(obd, file, rec, buf, file->f_pos); + if (rc) + RETURN(rc); + + CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u \n", + POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = index; + if ((rec->lrh_type == MDS_UNLINK_REC) || + (rec->lrh_type == MDS_SETATTR64_REC)) + reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT; + else if (rec->lrh_type == OST_SZ_REC) + reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT; + else + reccookie->lgc_subsys = -1; + rc = 1; + } + if (rc == 0 && rec->lrh_type == 
LLOG_GEN_REC) + rc = 1; + + RETURN(rc); +} + +/* We can skip reading at least as many log blocks as the number of +* minimum sized log records we are skipping. If it turns out +* that we are not far enough along the log (because the +* actual records are larger than minimum size) we just skip +* some more records. */ + +static void llog_skip_over(__u64 *off, int curr, int goal) +{ + if (goal <= curr) + return; + *off = (*off + (goal-curr-1) * LLOG_MIN_REC_SIZE) & + ~(LLOG_CHUNK_SIZE - 1); +} + + +/* sets: + * - cur_offset to the furthest point read in the log file + * - cur_idx to the log index preceeding cur_offset + * returns -EIO/-EINVAL on error + */ +static int llog_lvfs_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + int rc; + ENTRY; + + if (len == 0 || len & (LLOG_CHUNK_SIZE - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n", + next_idx, *cur_idx, *cur_offset); + + while (*cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + loff_t ppos; + int llen; + + llog_skip_over(cur_offset, *cur_idx, next_idx); + + /* read up to next LLOG_CHUNK_SIZE block */ + ppos = *cur_offset; + llen = LLOG_CHUNK_SIZE - (*cur_offset & (LLOG_CHUNK_SIZE - 1)); + rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd, + loghandle->lgh_file, buf, llen, + cur_offset); + if (rc < 0) { + CERROR("Cant read llog block at log id "DOSTID + "/%u offset "LPU64"\n", + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, + *cur_offset); + RETURN(rc); + } + + /* put number of bytes read into rc to make code simpler */ + rc = *cur_offset - ppos; + if (rc < len) { + /* signal the end of the valid buffer to llog_process */ + memset(buf + rc, 0, len - rc); + } + + if (rc == 0) /* end of file, nothing to do */ + RETURN(0); + + if (rc < sizeof(*tail)) { + 
CERROR("Invalid llog block at log id "DOSTID"/%u offset" + LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, *cur_offset); + RETURN(-EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)(buf + rc - + sizeof(struct llog_rec_tail)); + + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)(buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + *cur_idx = tail->lrt_index; + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("Invalid llog tail at log id "DOSTID"/%u offset " + LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, *cur_offset); + RETURN(-EINVAL); + } + if (tail->lrt_index < next_idx) + continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > next_idx) { + CERROR("missed desired record? 
%u > %u\n", + rec->lrh_index, next_idx); + RETURN(-ENOENT); + } + RETURN(0); + } + RETURN(-EIO); +} + +static int llog_lvfs_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + __u64 cur_offset; + int rc; + ENTRY; + + if (len == 0 || len & (LLOG_CHUNK_SIZE - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx); + + cur_offset = LLOG_CHUNK_SIZE; + llog_skip_over(&cur_offset, 0, prev_idx); + + while (cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + loff_t ppos = cur_offset; + + rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd, + loghandle->lgh_file, buf, len, + &cur_offset); + if (rc < 0) { + CERROR("Cant read llog block at log id "DOSTID + "/%u offset "LPU64"\n", + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, + cur_offset); + RETURN(rc); + } + + /* put number of bytes read into rc to make code simpler */ + rc = cur_offset - ppos; + + if (rc == 0) /* end of file, nothing to do */ + RETURN(0); + + if (rc < sizeof(*tail)) { + CERROR("Invalid llog block at log id "DOSTID"/%u offset" + LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, cur_offset); + RETURN(-EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)(buf + rc - + sizeof(struct llog_rec_tail)); + + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)(buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("Invalid llog tail at log id "DOSTID"/%u offset" + LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, cur_offset); + RETURN(-EINVAL); + } + if (tail->lrt_index < prev_idx) 
+ continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > prev_idx) { + CERROR("missed desired record? %u > %u\n", + rec->lrh_index, prev_idx); + RETURN(-ENOENT); + } + RETURN(0); + } + RETURN(-EIO); +} + +static struct file *llog_filp_open(char *dir, char *name, int flags, int mode) +{ + char *logname; + struct file *filp; + int len; + + OBD_ALLOC(logname, PATH_MAX); + if (logname == NULL) + return ERR_PTR(-ENOMEM); + + len = snprintf(logname, PATH_MAX, "%s/%s", dir, name); + if (len >= PATH_MAX - 1) { + filp = ERR_PTR(-ENAMETOOLONG); + } else { + filp = l_filp_open(logname, flags, mode); + if (IS_ERR(filp) && PTR_ERR(filp) != -ENOENT) + CERROR("logfile creation %s: %ld\n", logname, + PTR_ERR(filp)); + } + OBD_FREE(logname, PATH_MAX); + return filp; +} + +static int llog_lvfs_open(const struct lu_env *env, struct llog_handle *handle, + struct llog_logid *logid, char *name, + enum llog_open_param open_param) +{ + struct llog_ctxt *ctxt = handle->lgh_ctxt; + struct l_dentry *dchild = NULL; + struct obd_device *obd; + int rc = 0; + + ENTRY; + + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + LASSERT(ctxt->loc_exp->exp_obd); + obd = ctxt->loc_exp->exp_obd; + + LASSERT(handle); + if (logid != NULL) { + dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &logid->lgl_oi, + logid->lgl_ogen); + if (IS_ERR(dchild)) { + rc = PTR_ERR(dchild); + CERROR("%s: error looking up logfile #"DOSTID "#%08x:" + " rc = %d\n", ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), logid->lgl_ogen, rc); + GOTO(out, rc); + } + if (dchild->d_inode == NULL) { + l_dput(dchild); + rc = -ENOENT; + CERROR("%s: nonexistent llog #"DOSTID"#%08x:" + "rc = %d\n", ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), logid->lgl_ogen, rc); + GOTO(out, rc); + } + handle->lgh_file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild, + O_RDWR | O_LARGEFILE); + l_dput(dchild); + if (IS_ERR(handle->lgh_file)) { + rc = 
PTR_ERR(handle->lgh_file); + handle->lgh_file = NULL; + CERROR("%s: error opening llog #"DOSTID"#%08x:" + "rc = %d\n", ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), logid->lgl_ogen, rc); + GOTO(out, rc); + } + handle->lgh_id = *logid; + } else if (name) { + handle->lgh_file = llog_filp_open(MOUNT_CONFIGS_DIR, name, + O_RDWR | O_LARGEFILE, 0644); + if (IS_ERR(handle->lgh_file)) { + rc = PTR_ERR(handle->lgh_file); + handle->lgh_file = NULL; + if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) { + OBD_ALLOC(handle->lgh_name, strlen(name) + 1); + if (handle->lgh_name) + strcpy(handle->lgh_name, name); + else + GOTO(out, rc = -ENOMEM); + rc = 0; + } else { + GOTO(out, rc); + } + } else { + lustre_build_llog_lvfs_oid(&handle->lgh_id, + handle->lgh_file->f_dentry->d_inode->i_ino, + handle->lgh_file->f_dentry->d_inode->i_generation); + } + } else { + LASSERTF(open_param == LLOG_OPEN_NEW, "%#x\n", open_param); + handle->lgh_file = NULL; + } + + /* No new llog is expected but doesn't exist */ + if (open_param != LLOG_OPEN_NEW && handle->lgh_file == NULL) + GOTO(out_name, rc = -ENOENT); + + RETURN(0); +out_name: + if (handle->lgh_name != NULL) + OBD_FREE(handle->lgh_name, strlen(name) + 1); +out: + RETURN(rc); +} + +static int llog_lvfs_exist(struct llog_handle *handle) +{ + return (handle->lgh_file != NULL); +} + +/* This is a callback from the llog_* functions. + * Assumes caller has already pushed us into the kernel context. 
*/ +static int llog_lvfs_create(const struct lu_env *env, + struct llog_handle *handle, + struct thandle *th) +{ + struct llog_ctxt *ctxt = handle->lgh_ctxt; + struct obd_device *obd; + struct l_dentry *dchild = NULL; + struct file *file; + struct obdo *oa = NULL; + int rc = 0; + int open_flags = O_RDWR | O_CREAT | O_LARGEFILE; + + ENTRY; + + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + obd = ctxt->loc_exp->exp_obd; + LASSERT(handle->lgh_file == NULL); + + if (handle->lgh_name) { + file = llog_filp_open(MOUNT_CONFIGS_DIR, handle->lgh_name, + open_flags, 0644); + if (IS_ERR(file)) + RETURN(PTR_ERR(file)); + + lustre_build_llog_lvfs_oid(&handle->lgh_id, + file->f_dentry->d_inode->i_ino, + file->f_dentry->d_inode->i_generation); + handle->lgh_file = file; + } else { + OBDO_ALLOC(oa); + if (oa == NULL) + RETURN(-ENOMEM); + + ostid_set_seq_llog(&oa->o_oi); + oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLGROUP; + + rc = obd_create(NULL, ctxt->loc_exp, oa, NULL, NULL); + if (rc) + GOTO(out, rc); + + /* FIXME: rationalize the misuse of o_generation in + * this API along with mds_obd_{create,destroy}. + * Hopefully it is only an internal API issue. 
*/ +#define o_generation o_parent_oid + dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &oa->o_oi, + oa->o_generation); + if (IS_ERR(dchild)) + GOTO(out, rc = PTR_ERR(dchild)); + + file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild, open_flags); + l_dput(dchild); + if (IS_ERR(file)) + GOTO(out, rc = PTR_ERR(file)); + handle->lgh_id.lgl_oi = oa->o_oi; + handle->lgh_id.lgl_ogen = oa->o_generation; + handle->lgh_file = file; +out: + OBDO_FREE(oa); + } + RETURN(rc); +} + +static int llog_lvfs_close(const struct lu_env *env, + struct llog_handle *handle) +{ + int rc; + + ENTRY; + + if (handle->lgh_file == NULL) + RETURN(0); + rc = filp_close(handle->lgh_file, 0); + if (rc) + CERROR("%s: error closing llog #"DOSTID"#%08x: " + "rc = %d\n", handle->lgh_ctxt->loc_obd->obd_name, + POSTID(&handle->lgh_id.lgl_oi), + handle->lgh_id.lgl_ogen, rc); + handle->lgh_file = NULL; + if (handle->lgh_name) { + OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1); + handle->lgh_name = NULL; + } + RETURN(rc); +} + +static int llog_lvfs_destroy(const struct lu_env *env, + struct llog_handle *handle) +{ + struct dentry *fdentry; + struct obdo *oa; + struct obd_device *obd = handle->lgh_ctxt->loc_exp->exp_obd; + char *dir; + void *th; + struct inode *inode; + int rc, rc1; + ENTRY; + + dir = MOUNT_CONFIGS_DIR; + + LASSERT(handle->lgh_file); + fdentry = handle->lgh_file->f_dentry; + inode = fdentry->d_parent->d_inode; + if (strcmp(fdentry->d_parent->d_name.name, dir) == 0) { + struct lvfs_run_ctxt saved; + struct vfsmount *mnt = mntget(handle->lgh_file->f_vfsmnt); + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + dget(fdentry); + rc = llog_lvfs_close(env, handle); + if (rc == 0) { + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + rc = ll_vfs_unlink(inode, fdentry, mnt); + mutex_unlock(&inode->i_mutex); + } + mntput(mnt); + + dput(fdentry); + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + RETURN(rc); + } + + OBDO_ALLOC(oa); + if (oa == NULL) + RETURN(-ENOMEM); + + oa->o_oi = 
handle->lgh_id.lgl_oi; + oa->o_generation = handle->lgh_id.lgl_ogen; +#undef o_generation + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLGENER; + + rc = llog_lvfs_close(env, handle); + if (rc) + GOTO(out, rc); + + th = fsfilt_start_log(obd, inode, FSFILT_OP_UNLINK, NULL, 1); + if (IS_ERR(th)) { + CERROR("fsfilt_start failed: %ld\n", PTR_ERR(th)); + GOTO(out, rc = PTR_ERR(th)); + } + + rc = obd_destroy(NULL, handle->lgh_ctxt->loc_exp, oa, + NULL, NULL, NULL, NULL); + + rc1 = fsfilt_commit(obd, inode, th, 0); + if (rc == 0 && rc1 != 0) + rc = rc1; + out: + OBDO_FREE(oa); + RETURN(rc); +} + +static int llog_lvfs_declare_create(const struct lu_env *env, + struct llog_handle *res, + struct thandle *th) +{ + return 0; +} + +static int llog_lvfs_declare_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + int idx, struct thandle *th) +{ + return 0; +} + +struct llog_operations llog_lvfs_ops = { + .lop_write_rec = llog_lvfs_write_rec, + .lop_next_block = llog_lvfs_next_block, + .lop_prev_block = llog_lvfs_prev_block, + .lop_read_header = llog_lvfs_read_header, + .lop_create = llog_lvfs_create, + .lop_destroy = llog_lvfs_destroy, + .lop_close = llog_lvfs_close, + .lop_open = llog_lvfs_open, + .lop_exist = llog_lvfs_exist, + .lop_declare_create = llog_lvfs_declare_create, + .lop_declare_write_rec = llog_lvfs_declare_write_rec, +}; +EXPORT_SYMBOL(llog_lvfs_ops); +#else /* !__KERNEL__ */ +struct llog_operations llog_lvfs_ops = {}; +#endif diff --git a/drivers/staging/lustre/lustre/obdclass/llog_obd.c b/drivers/staging/lustre/lustre/obdclass/llog_obd.c new file mode 100644 index 000000000000..7e2290796315 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_obd.c @@ -0,0 +1,319 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include <obd_class.h> +#include <lustre_log.h> +#include "llog_internal.h" + +/* helper functions for calling the llog obd methods */ +static struct llog_ctxt* llog_new_ctxt(struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + OBD_ALLOC_PTR(ctxt); + if (!ctxt) + return NULL; + + ctxt->loc_obd = obd; + atomic_set(&ctxt->loc_refcount, 1); + + return ctxt; +} + +static void llog_ctxt_destroy(struct llog_ctxt *ctxt) +{ + if (ctxt->loc_exp) { + class_export_put(ctxt->loc_exp); + ctxt->loc_exp = NULL; + } + if (ctxt->loc_imp) { + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = NULL; + } + OBD_FREE_PTR(ctxt); +} + +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct obd_llog_group *olg = ctxt->loc_olg; + struct obd_device *obd; + int rc = 0; + + spin_lock(&olg->olg_lock); + if (!atomic_dec_and_test(&ctxt->loc_refcount)) { + spin_unlock(&olg->olg_lock); + return rc; + } + olg->olg_ctxts[ctxt->loc_idx] = NULL; + spin_unlock(&olg->olg_lock); + + obd = ctxt->loc_obd; + spin_lock(&obd->obd_dev_lock); + /* sync with llog ctxt user thread */ + spin_unlock(&obd->obd_dev_lock); + + /* obd->obd_starting is needed for the case of cleanup + * in error case while obd is starting up. 
*/ + LASSERTF(obd->obd_starting == 1 || + obd->obd_stopping == 1 || obd->obd_set_up == 0, + "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, + !!obd->obd_stopping, !!obd->obd_set_up); + + /* cleanup the llog ctxt here */ + if (CTXTP(ctxt, cleanup)) + rc = CTXTP(ctxt, cleanup)(env, ctxt); + + llog_ctxt_destroy(ctxt); + wake_up(&olg->olg_waitq); + return rc; +} +EXPORT_SYMBOL(__llog_ctxt_put); + +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct obd_llog_group *olg; + int rc, idx; + ENTRY; + + LASSERT(ctxt != NULL); + LASSERT(ctxt != LP_POISON); + + olg = ctxt->loc_olg; + LASSERT(olg != NULL); + LASSERT(olg != LP_POISON); + + idx = ctxt->loc_idx; + + /* + * Banlance the ctxt get when calling llog_cleanup() + */ + LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON); + LASSERT(atomic_read(&ctxt->loc_refcount) > 1); + llog_ctxt_put(ctxt); + + /* + * Try to free the ctxt. + */ + rc = __llog_ctxt_put(env, ctxt); + if (rc) + CERROR("Error %d while cleaning up ctxt %p\n", + rc, ctxt); + + l_wait_event(olg->olg_waitq, + llog_group_ctxt_null(olg, idx), &lwi); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cleanup); + +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, struct llog_operations *op) +{ + struct llog_ctxt *ctxt; + int rc = 0; + ENTRY; + + if (index < 0 || index >= LLOG_MAX_CTXTS) + RETURN(-EINVAL); + + LASSERT(olg != NULL); + + ctxt = llog_new_ctxt(obd); + if (!ctxt) + RETURN(-ENOMEM); + + ctxt->loc_obd = obd; + ctxt->loc_olg = olg; + ctxt->loc_idx = index; + ctxt->loc_logops = op; + mutex_init(&ctxt->loc_mutex); + ctxt->loc_exp = class_export_get(disk_obd->obd_self_export); + ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED; + + rc = llog_group_set_ctxt(olg, ctxt, index); + if (rc) { + llog_ctxt_destroy(ctxt); + if (rc == -EEXIST) { + ctxt = llog_group_get_ctxt(olg, index); + if (ctxt) { + /* 
+ * mds_lov_update_desc() might call here multiple + * times. So if the llog is already set up then + * don't to do it again. + */ + CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n", + obd->obd_name, index); + LASSERT(ctxt->loc_olg == olg); + LASSERT(ctxt->loc_obd == obd); + LASSERT(ctxt->loc_exp == disk_obd->obd_self_export); + LASSERT(ctxt->loc_logops == op); + llog_ctxt_put(ctxt); + } + rc = 0; + } + RETURN(rc); + } + + if (op->lop_setup) { + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP)) + rc = -EOPNOTSUPP; + else + rc = op->lop_setup(env, obd, olg, index, disk_obd); + } + + if (rc) { + CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n", + obd->obd_name, index, op->lop_setup, rc); + llog_group_clear_ctxt(olg, index); + llog_ctxt_destroy(ctxt); + } else { + CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", + obd->obd_name, index); + ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_setup); + +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags) +{ + int rc = 0; + ENTRY; + + if (!ctxt) + RETURN(0); + + if (CTXTP(ctxt, sync)) + rc = CTXTP(ctxt, sync)(ctxt, exp, flags); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_sync); + +int llog_obd_add(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_rec_hdr *rec, struct lov_stripe_md *lsm, + struct llog_cookie *logcookies, int numcookies) +{ + int raised, rc; + ENTRY; + + if (!ctxt) { + CERROR("No ctxt\n"); + RETURN(-ENODEV); + } + + if (ctxt->loc_flags & LLOG_CTXT_FLAG_UNINITIALIZED) + RETURN(-ENXIO); + + CTXT_CHECK_OP(ctxt, obd_add, -EOPNOTSUPP); + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = CTXTP(ctxt, obd_add)(env, ctxt, rec, lsm, logcookies, + numcookies); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_obd_add); + +int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, + struct lov_stripe_md *lsm, int count, + struct llog_cookie 
*cookies, int flags) +{ + int rc; + ENTRY; + + if (!ctxt) { + CERROR("No ctxt\n"); + RETURN(-ENODEV); + } + + CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP); + rc = CTXTP(ctxt, cancel)(env, ctxt, lsm, count, cookies, flags); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cancel); + +int obd_llog_init(struct obd_device *obd, struct obd_llog_group *olg, + struct obd_device *disk_obd, int *index) +{ + int rc; + ENTRY; + OBD_CHECK_DT_OP(obd, llog_init, 0); + OBD_COUNTER_INCREMENT(obd, llog_init); + + rc = OBP(obd, llog_init)(obd, olg, disk_obd, index); + RETURN(rc); +} +EXPORT_SYMBOL(obd_llog_init); + +int obd_llog_finish(struct obd_device *obd, int count) +{ + int rc; + ENTRY; + OBD_CHECK_DT_OP(obd, llog_finish, 0); + OBD_COUNTER_INCREMENT(obd, llog_finish); + + rc = OBP(obd, llog_finish)(obd, count); + RETURN(rc); +} +EXPORT_SYMBOL(obd_llog_finish); + +/* context key constructor/destructor: llog_key_init, llog_key_fini */ +LU_KEY_INIT_FINI(llog, struct llog_thread_info); +/* context key: llog_thread_key */ +LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL); +LU_KEY_INIT_GENERIC(llog); +EXPORT_SYMBOL(llog_thread_key); + +int llog_info_init(void) +{ + llog_key_init_generic(&llog_thread_key, NULL); + lu_context_key_register(&llog_thread_key); + return 0; +} + +void llog_info_fini(void) +{ + lu_context_key_degister(&llog_thread_key); +} diff --git a/drivers/staging/lustre/lustre/obdclass/llog_osd.c b/drivers/staging/lustre/lustre/obdclass/llog_osd.c new file mode 100644 index 000000000000..6dbd21a863c2 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_osd.c @@ -0,0 +1,1323 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/llog_osd.c - low level llog routines on top of OSD API + * + * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com> + * Author: Mikhail Pershin <mike.pershin@intel.com> + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#ifndef EXPORT_SYMTAB +#define EXPORT_SYMTAB +#endif + +#include <obd.h> +#include <obd_class.h> +#include <lustre_fid.h> +#include <dt_object.h> + +#include "llog_internal.h" +#include "local_storage.h" + +/* + * - multi-chunks or big-declaration approach + * - use unique sequence instead of llog sb tracking unique ids + * - re-use existing environment + * - named llog support (can be used for testing only at the present) + * - llog_origin_connect() work with OSD API + */ + +static int llog_osd_declare_new_object(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + return local_object_declare_create(env, los, o, &lgi->lgi_attr, + &lgi->lgi_dof, th); +} + +static int llog_osd_create_new_object(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + return local_object_create(env, los, o, &lgi->lgi_attr, + &lgi->lgi_dof, th); +} + +static int llog_osd_pad(const struct lu_env *env, struct dt_object *o, + loff_t *off, int len, int index, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + int rc; + + ENTRY; + + LASSERT(th); + LASSERT(off); + LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0); + + lgi->lgi_tail.lrt_len = lgi->lgi_lrh.lrh_len = len; + lgi->lgi_tail.lrt_index = lgi->lgi_lrh.lrh_index = index; + 
lgi->lgi_lrh.lrh_type = LLOG_PAD_MAGIC; + + lgi->lgi_buf.lb_buf = &lgi->lgi_lrh; + lgi->lgi_buf.lb_len = sizeof(lgi->lgi_lrh); + dt_write_lock(env, o, 0); + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) { + CERROR("%s: error writing padding record: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out, rc); + } + + lgi->lgi_buf.lb_buf = &lgi->lgi_tail; + lgi->lgi_buf.lb_len = sizeof(lgi->lgi_tail); + *off += len - sizeof(lgi->lgi_lrh) - sizeof(lgi->lgi_tail); + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) + CERROR("%s: error writing padding record: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); +out: + dt_write_unlock(env, o); + RETURN(rc); +} + +static int llog_osd_write_blob(const struct lu_env *env, struct dt_object *o, + struct llog_rec_hdr *rec, void *buf, + loff_t *off, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + int buflen = rec->lrh_len; + int rc; + + ENTRY; + + LASSERT(env); + LASSERT(o); + + if (buflen == 0) + CWARN("0-length record\n"); + + CDEBUG(D_OTHER, "write blob with type %x, buf %p/%u at off %llu\n", + rec->lrh_type, buf, buflen, *off); + + lgi->lgi_attr.la_valid = LA_SIZE; + lgi->lgi_attr.la_size = *off; + + if (!buf) { + lgi->lgi_buf.lb_len = buflen; + lgi->lgi_buf.lb_buf = rec; + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) + CERROR("%s: error writing log record: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out, rc); + } + + /* the buf case */ + /* protect the following 3 writes from concurrent read */ + dt_write_lock(env, o, 0); + rec->lrh_len = sizeof(*rec) + buflen + sizeof(lgi->lgi_tail); + lgi->lgi_buf.lb_len = sizeof(*rec); + lgi->lgi_buf.lb_buf = rec; + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) { + CERROR("%s: error writing log hdr: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out_unlock, rc); + } + + lgi->lgi_buf.lb_len = buflen; + lgi->lgi_buf.lb_buf = buf; + rc = 
dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) { + CERROR("%s: error writing log buffer: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out_unlock, rc); + } + + lgi->lgi_tail.lrt_len = rec->lrh_len; + lgi->lgi_tail.lrt_index = rec->lrh_index; + lgi->lgi_buf.lb_len = sizeof(lgi->lgi_tail); + lgi->lgi_buf.lb_buf = &lgi->lgi_tail; + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) + CERROR("%s: error writing log tail: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + +out_unlock: + dt_write_unlock(env, o); + +out: + /* cleanup the content written above */ + if (rc) { + dt_punch(env, o, lgi->lgi_attr.la_size, OBD_OBJECT_EOF, th, + BYPASS_CAPA); + dt_attr_set(env, o, &lgi->lgi_attr, th, BYPASS_CAPA); + } + + RETURN(rc); +} + +static int llog_osd_read_header(const struct lu_env *env, + struct llog_handle *handle) +{ + struct llog_rec_hdr *llh_hdr; + struct dt_object *o; + struct llog_thread_info *lgi; + int rc; + + ENTRY; + + LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE); + + o = handle->lgh_obj; + LASSERT(o); + + lgi = llog_info(env); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL); + if (rc) + RETURN(rc); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + + if (lgi->lgi_attr.la_size == 0) { + CDEBUG(D_HA, "not reading header from 0-byte log\n"); + RETURN(LLOG_EEMPTY); + } + + lgi->lgi_off = 0; + lgi->lgi_buf.lb_buf = handle->lgh_hdr; + lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE; + + rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off); + if (rc) { + CERROR("%s: error reading log header from "DFID": rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), rc); + RETURN(rc); + } + + llh_hdr = &handle->lgh_hdr->llh_hdr; + if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr)) + lustre_swab_llog_hdr(handle->lgh_hdr); + + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { + CERROR("%s: bad log %s "DFID" header magic: %#x " + "(expected %#x)\n", o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? 
handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + llh_hdr->lrh_type, LLOG_HDR_MAGIC); + RETURN(-EIO); + } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) { + CERROR("%s: incorrectly sized log %s "DFID" header: " + "%#x (expected %#x)\n" + "you may need to re-run lconf --write_conf.\n", + o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + llh_hdr->lrh_len, LLOG_CHUNK_SIZE); + RETURN(-EIO); + } + + handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index; + + RETURN(0); +} + +static int llog_osd_declare_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + int idx, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + int rc; + + ENTRY; + + LASSERT(env); + LASSERT(th); + LASSERT(loghandle); + + o = loghandle->lgh_obj; + LASSERT(o); + + /* each time we update header */ + rc = dt_declare_record_write(env, o, sizeof(struct llog_log_hdr), 0, + th); + if (rc || idx == 0) /* if error or just header */ + RETURN(rc); + + if (dt_object_exists(o)) { + rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA); + lgi->lgi_off = lgi->lgi_attr.la_size; + LASSERT(ergo(rc == 0, lgi->lgi_attr.la_valid & LA_SIZE)); + if (rc) + RETURN(rc); + + rc = dt_declare_punch(env, o, lgi->lgi_off, OBD_OBJECT_EOF, th); + if (rc) + RETURN(rc); + } else { + lgi->lgi_off = 0; + } + + /* XXX: implement declared window or multi-chunks approach */ + rc = dt_declare_record_write(env, o, 32 * 1024, lgi->lgi_off, th); + + RETURN(rc); +} + +/* returns negative in on error; 0 if success && reccookie == 0; 1 otherwise */ +/* appends if idx == -1, otherwise overwrites record idx. 
*/ +static int llog_osd_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *reccookie, int cookiecount, + void *buf, int idx, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_log_hdr *llh; + int reclen = rec->lrh_len; + int index, rc, old_tail_idx; + struct llog_rec_tail *lrt; + struct dt_object *o; + size_t left; + + ENTRY; + + LASSERT(env); + llh = loghandle->lgh_hdr; + LASSERT(llh); + o = loghandle->lgh_obj; + LASSERT(o); + LASSERT(th); + + CDEBUG(D_OTHER, "new record %x to "DFID"\n", + rec->lrh_type, PFID(lu_object_fid(&o->do_lu))); + + /* record length should not bigger than LLOG_CHUNK_SIZE */ + if (buf) + rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) - + sizeof(struct llog_rec_tail)) ? -E2BIG : 0; + else + rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0; + if (rc) + RETURN(rc); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL); + if (rc) + RETURN(rc); + + if (buf) + /* write_blob adds header and tail to lrh_len. 
*/ + reclen = sizeof(*rec) + rec->lrh_len + + sizeof(struct llog_rec_tail); + + if (idx != -1) { + /* no header: only allowed to insert record 1 */ + if (idx != 1 && lgi->lgi_attr.la_size == 0) + LBUG(); + + if (idx && llh->llh_size && llh->llh_size != rec->lrh_len) + RETURN(-EINVAL); + + if (!ext2_test_bit(idx, llh->llh_bitmap)) + CERROR("%s: modify unset record %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx); + if (idx != rec->lrh_index) + CERROR("%s: index mismatch %d %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx, + rec->lrh_index); + + lgi->lgi_off = 0; + rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL, + &lgi->lgi_off, th); + /* we are done if we only write the header or on error */ + if (rc || idx == 0) + RETURN(rc); + + if (buf) { + /* We assume that caller has set lgh_cur_* */ + lgi->lgi_off = loghandle->lgh_cur_offset; + CDEBUG(D_OTHER, + "modify record "DOSTID": idx:%d/%u/%d, len:%u " + "offset %llu\n", + POSTID(&loghandle->lgh_id.lgl_oi), idx, + rec->lrh_index, + loghandle->lgh_cur_idx, rec->lrh_len, + (long long)(lgi->lgi_off - sizeof(*llh))); + if (rec->lrh_index != loghandle->lgh_cur_idx) { + CERROR("%s: modify idx mismatch %u/%d\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx, + loghandle->lgh_cur_idx); + RETURN(-EFAULT); + } + } else { + /* Assumes constant lrh_len */ + lgi->lgi_off = sizeof(*llh) + (idx - 1) * reclen; + } + + rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = idx; + rc = 1; + } + RETURN(rc); + } + + /* Make sure that records don't cross a chunk boundary, so we can + * process them page-at-a-time if needed. If it will cross a chunk + * boundary, write in a fake (but referenced) entry to pad the chunk. + * + * We know that llog_current_log() will return a loghandle that is + * big enough to hold reclen, so all we care about is padding here. 
+ */ + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + lgi->lgi_off = lgi->lgi_attr.la_size; + left = LLOG_CHUNK_SIZE - (lgi->lgi_off & (LLOG_CHUNK_SIZE - 1)); + /* NOTE: padding is a record, but no bit is set */ + if (left != 0 && left != reclen && + left < (reclen + LLOG_MIN_REC_SIZE)) { + index = loghandle->lgh_last_idx + 1; + rc = llog_osd_pad(env, o, &lgi->lgi_off, left, index, th); + if (rc) + RETURN(rc); + loghandle->lgh_last_idx++; /*for pad rec*/ + } + /* if it's the last idx in log file, then return -ENOSPC */ + if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1) + RETURN(-ENOSPC); + + loghandle->lgh_last_idx++; + index = loghandle->lgh_last_idx; + LASSERT(index < LLOG_BITMAP_SIZE(llh)); + rec->lrh_index = index; + if (buf == NULL) { + lrt = (struct llog_rec_tail *)((char *)rec + rec->lrh_len - + sizeof(*lrt)); + lrt->lrt_len = rec->lrh_len; + lrt->lrt_index = rec->lrh_index; + } + /* The caller should make sure only 1 process access the lgh_last_idx, + * Otherwise it might hit the assert.*/ + LASSERT(index < LLOG_BITMAP_SIZE(llh)); + spin_lock(&loghandle->lgh_hdr_lock); + if (ext2_set_bit(index, llh->llh_bitmap)) { + CERROR("%s: index %u already set in log bitmap\n", + o->do_lu.lo_dev->ld_obd->obd_name, index); + spin_unlock(&loghandle->lgh_hdr_lock); + LBUG(); /* should never happen */ + } + llh->llh_count++; + spin_unlock(&loghandle->lgh_hdr_lock); + old_tail_idx = llh->llh_tail.lrt_index; + llh->llh_tail.lrt_index = index; + + lgi->lgi_off = 0; + rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL, &lgi->lgi_off, + th); + if (rc) + GOTO(out, rc); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL); + if (rc) + GOTO(out, rc); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + lgi->lgi_off = lgi->lgi_attr.la_size; + + rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th); + +out: + /* cleanup llog for error case */ + if (rc) { + spin_lock(&loghandle->lgh_hdr_lock); + ext2_clear_bit(index, llh->llh_bitmap); + llh->llh_count--; + 
spin_unlock(&loghandle->lgh_hdr_lock); + + /* restore the header */ + loghandle->lgh_last_idx--; + llh->llh_tail.lrt_index = old_tail_idx; + lgi->lgi_off = 0; + llog_osd_write_blob(env, o, &llh->llh_hdr, NULL, + &lgi->lgi_off, th); + } + + CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u\n", + POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = index; + if ((rec->lrh_type == MDS_UNLINK_REC) || + (rec->lrh_type == MDS_SETATTR64_REC)) + reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT; + else if (rec->lrh_type == OST_SZ_REC) + reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT; + else + reccookie->lgc_subsys = -1; + rc = 1; + } + RETURN(rc); +} + +/* We can skip reading at least as many log blocks as the number of + * minimum sized log records we are skipping. If it turns out + * that we are not far enough along the log (because the + * actual records are larger than minimum size) we just skip + * some more records. 
+ */ +static void llog_skip_over(__u64 *off, int curr, int goal) +{ + if (goal <= curr) + return; + *off = (*off + (goal - curr - 1) * LLOG_MIN_REC_SIZE) & + ~(LLOG_CHUNK_SIZE - 1); +} + +/* sets: + * - cur_offset to the furthest point read in the log file + * - cur_idx to the log index preceeding cur_offset + * returns -EIO/-EINVAL on error + */ +static int llog_osd_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + struct dt_device *dt; + int rc; + + ENTRY; + + LASSERT(env); + LASSERT(lgi); + + if (len == 0 || len & (LLOG_CHUNK_SIZE - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n", + next_idx, *cur_idx, *cur_offset); + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + LASSERT(dt_object_exists(o)); + dt = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(dt); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA); + if (rc) + GOTO(out, rc); + + while (*cur_offset < lgi->lgi_attr.la_size) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + + llog_skip_over(cur_offset, *cur_idx, next_idx); + + /* read up to next LLOG_CHUNK_SIZE block */ + lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE - + (*cur_offset & (LLOG_CHUNK_SIZE - 1)); + lgi->lgi_buf.lb_buf = buf; + + /* Note: read lock is not needed around la_size get above at + * the time of dt_attr_get(). There are only two cases that + * matter. Either la_size == cur_offset, in which case the + * entire read is skipped, or la_size > cur_offset and the loop + * is entered and this thread is blocked at dt_read_lock() + * until the write is completed. When the write completes, then + * the dt_read() will be done with the full length, and will + * get the full data. 
+ */ + dt_read_lock(env, o, 0); + rc = dt_read(env, o, &lgi->lgi_buf, cur_offset); + dt_read_unlock(env, o); + if (rc < 0) { + CERROR("%s: can't read llog block from log "DFID + " offset "LPU64": rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), *cur_offset, + rc); + GOTO(out, rc); + } + + if (rc < len) { + /* signal the end of the valid buffer to + * llog_process */ + memset(buf + rc, 0, len - rc); + } + + if (rc == 0) /* end of file, nothing to do */ + GOTO(out, rc); + + if (rc < sizeof(*tail)) { + CERROR("%s: invalid llog block at log id "DOSTID"/%u " + "offset "LPU64"\n", + o->do_lu.lo_dev->ld_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, *cur_offset); + GOTO(out, rc = -EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)((char *)buf + rc - + sizeof(struct llog_rec_tail)); + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)((char *)buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + *cur_idx = tail->lrt_index; + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("%s: invalid llog tail at log id "DOSTID"/%u " + "offset "LPU64"\n", + o->do_lu.lo_dev->ld_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, *cur_offset); + GOTO(out, rc = -EINVAL); + } + if (tail->lrt_index < next_idx) + continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > next_idx) { + CERROR("%s: missed desired record? 
%u > %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, + rec->lrh_index, next_idx); + GOTO(out, rc = -ENOENT); + } + GOTO(out, rc = 0); + } + GOTO(out, rc = -EIO); +out: + return rc; +} + +static int llog_osd_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + struct dt_device *dt; + loff_t cur_offset; + int rc; + + ENTRY; + + if (len == 0 || len & (LLOG_CHUNK_SIZE - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx); + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + LASSERT(dt_object_exists(o)); + dt = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(dt); + + cur_offset = LLOG_CHUNK_SIZE; + llog_skip_over(&cur_offset, 0, prev_idx); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA); + if (rc) + GOTO(out, rc); + + while (cur_offset < lgi->lgi_attr.la_size) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + + lgi->lgi_buf.lb_len = len; + lgi->lgi_buf.lb_buf = buf; + /* It is OK to have locking around dt_read() only, see + * comment in llog_osd_next_block for details + */ + dt_read_lock(env, o, 0); + rc = dt_read(env, o, &lgi->lgi_buf, &cur_offset); + dt_read_unlock(env, o); + if (rc < 0) { + CERROR("%s: can't read llog block from log "DFID + " offset "LPU64": rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), cur_offset, rc); + GOTO(out, rc); + } + + if (rc == 0) /* end of file, nothing to do */ + GOTO(out, rc); + + if (rc < sizeof(*tail)) { + CERROR("%s: invalid llog block at log id "DOSTID"/%u " + "offset "LPU64"\n", + o->do_lu.lo_dev->ld_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, cur_offset); + GOTO(out, rc = -EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)((char *)buf + rc - + 
sizeof(struct llog_rec_tail)); + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)((char *)buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("%s: invalid llog tail at log id "DOSTID"/%u " + "offset "LPU64"\n", + o->do_lu.lo_dev->ld_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, cur_offset); + GOTO(out, rc = -EINVAL); + } + if (tail->lrt_index < prev_idx) + continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > prev_idx) { + CERROR("%s: missed desired record? %u > %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, + rec->lrh_index, prev_idx); + GOTO(out, rc = -ENOENT); + } + GOTO(out, rc = 0); + } + GOTO(out, rc = -EIO); +out: + return rc; +} + +struct dt_object *llog_osd_dir_get(const struct lu_env *env, + struct llog_ctxt *ctxt) +{ + struct dt_device *dt; + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dir; + int rc; + + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + if (ctxt->loc_dir == NULL) { + rc = dt_root_get(env, dt, &dti->dti_fid); + if (rc) + return ERR_PTR(rc); + dir = dt_locate(env, dt, &dti->dti_fid); + } else { + lu_object_get(&ctxt->loc_dir->do_lu); + dir = ctxt->loc_dir; + } + + return dir; +} + +static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle, + struct llog_logid *logid, char *name, + enum llog_open_param open_param) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_ctxt *ctxt = handle->lgh_ctxt; + struct dt_object *o; + struct dt_device *dt; + struct ls_device *ls; + struct local_oid_storage *los; + int rc = 0; + + ENTRY; + + LASSERT(env); + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + LASSERT(ctxt->loc_exp->exp_obd); + dt = 
ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + LASSERT(dt); + + ls = ls_device_get(dt); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + los = dt_los_find(ls, name != NULL ? FID_SEQ_LLOG_NAME : FID_SEQ_LLOG); + mutex_unlock(&ls->ls_los_mutex); + LASSERT(los); + ls_device_put(env, ls); + + LASSERT(handle); + + if (logid != NULL) { + logid_to_fid(logid, &lgi->lgi_fid); + } else if (name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + GOTO(out, rc = PTR_ERR(llog_dir)); + dt_read_lock(env, llog_dir, 0); + rc = dt_lookup_dir(env, llog_dir, name, &lgi->lgi_fid); + dt_read_unlock(env, llog_dir); + lu_object_put(env, &llog_dir->do_lu); + if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) { + /* generate fid for new llog */ + rc = local_object_fid_generate(env, los, + &lgi->lgi_fid); + } + if (rc < 0) + GOTO(out, rc); + OBD_ALLOC(handle->lgh_name, strlen(name) + 1); + if (handle->lgh_name) + strcpy(handle->lgh_name, name); + else + GOTO(out, rc = -ENOMEM); + } else { + LASSERTF(open_param & LLOG_OPEN_NEW, "%#x\n", open_param); + /* generate fid for new llog */ + rc = local_object_fid_generate(env, los, &lgi->lgi_fid); + if (rc < 0) + GOTO(out, rc); + } + + o = ls_locate(env, ls, &lgi->lgi_fid); + if (IS_ERR(o)) + GOTO(out_name, rc = PTR_ERR(o)); + + /* No new llog is expected but doesn't exist */ + if (open_param != LLOG_OPEN_NEW && !dt_object_exists(o)) + GOTO(out_put, rc = -ENOENT); + + fid_to_logid(&lgi->lgi_fid, &handle->lgh_id); + handle->lgh_obj = o; + handle->private_data = los; + LASSERT(handle->lgh_ctxt); + + RETURN(rc); + +out_put: + lu_object_put(env, &o->do_lu); +out_name: + if (handle->lgh_name != NULL) + OBD_FREE(handle->lgh_name, strlen(name) + 1); +out: + dt_los_put(los); + RETURN(rc); +} + +static int llog_osd_exist(struct llog_handle *handle) +{ + LASSERT(handle->lgh_obj); + return (dt_object_exists(handle->lgh_obj) && + !lu_object_is_dying(handle->lgh_obj->do_lu.lo_header)); 
+} + +static int llog_osd_declare_create(const struct lu_env *env, + struct llog_handle *res, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct local_oid_storage *los; + struct dt_object *o; + int rc; + + ENTRY; + + LASSERT(res->lgh_obj); + LASSERT(th); + + /* object can be created by another thread */ + o = res->lgh_obj; + if (dt_object_exists(o)) + RETURN(0); + + los = res->private_data; + LASSERT(los); + + rc = llog_osd_declare_new_object(env, los, o, th); + if (rc) + RETURN(rc); + + rc = dt_declare_record_write(env, o, LLOG_CHUNK_SIZE, 0, th); + if (rc) + RETURN(rc); + + if (res->lgh_name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, res->lgh_ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + logid_to_fid(&res->lgh_id, &lgi->lgi_fid); + rc = dt_declare_insert(env, llog_dir, + (struct dt_rec *)&lgi->lgi_fid, + (struct dt_key *)res->lgh_name, th); + lu_object_put(env, &llog_dir->do_lu); + if (rc) + CERROR("%s: can't declare named llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + res->lgh_name, rc); + } + RETURN(rc); +} + +/* This is a callback from the llog_* functions. + * Assumes caller has already pushed us into the kernel context. 
*/ +static int llog_osd_create(const struct lu_env *env, struct llog_handle *res, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct local_oid_storage *los; + struct dt_object *o; + int rc = 0; + + ENTRY; + + LASSERT(env); + o = res->lgh_obj; + LASSERT(o); + + /* llog can be already created */ + if (dt_object_exists(o)) + RETURN(-EEXIST); + + los = res->private_data; + LASSERT(los); + + dt_write_lock(env, o, 0); + if (!dt_object_exists(o)) + rc = llog_osd_create_new_object(env, los, o, th); + else + rc = -EEXIST; + + dt_write_unlock(env, o); + if (rc) + RETURN(rc); + + if (res->lgh_name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, res->lgh_ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + + logid_to_fid(&res->lgh_id, &lgi->lgi_fid); + dt_read_lock(env, llog_dir, 0); + rc = dt_insert(env, llog_dir, + (struct dt_rec *)&lgi->lgi_fid, + (struct dt_key *)res->lgh_name, + th, BYPASS_CAPA, 1); + dt_read_unlock(env, llog_dir); + lu_object_put(env, &llog_dir->do_lu); + if (rc) + CERROR("%s: can't create named llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + res->lgh_name, rc); + } + RETURN(rc); +} + +static int llog_osd_close(const struct lu_env *env, struct llog_handle *handle) +{ + struct local_oid_storage *los; + int rc = 0; + + ENTRY; + + LASSERT(handle->lgh_obj); + + lu_object_put(env, &handle->lgh_obj->do_lu); + + los = handle->private_data; + LASSERT(los); + dt_los_put(los); + + if (handle->lgh_name) + OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1); + + RETURN(rc); +} + +static int llog_osd_destroy(const struct lu_env *env, + struct llog_handle *loghandle) +{ + struct llog_ctxt *ctxt; + struct dt_object *o, *llog_dir = NULL; + struct dt_device *d; + struct thandle *th; + char *name = NULL; + int rc; + + ENTRY; + + ctxt = loghandle->lgh_ctxt; + LASSERT(ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + + d = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(d); + LASSERT(d == 
ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt); + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + if (loghandle->lgh_name) { + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + GOTO(out_trans, rc = PTR_ERR(llog_dir)); + + name = loghandle->lgh_name; + rc = dt_declare_delete(env, llog_dir, + (struct dt_key *)name, th); + if (rc) + GOTO(out_trans, rc); + } + + dt_declare_ref_del(env, o, th); + + rc = dt_declare_destroy(env, o, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(out_trans, rc); + + dt_write_lock(env, o, 0); + if (dt_object_exists(o)) { + if (name) { + dt_read_lock(env, llog_dir, 0); + rc = dt_delete(env, llog_dir, + (struct dt_key *) name, + th, BYPASS_CAPA); + dt_read_unlock(env, llog_dir); + if (rc) { + CERROR("%s: can't remove llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + name, rc); + GOTO(out_unlock, rc); + } + } + dt_ref_del(env, o, th); + rc = dt_destroy(env, o, th); + if (rc) + GOTO(out_unlock, rc); + } +out_unlock: + dt_write_unlock(env, o); +out_trans: + dt_trans_stop(env, d, th); + if (llog_dir != NULL) + lu_object_put(env, &llog_dir->do_lu); + RETURN(rc); +} + +static int llog_osd_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int ctxt_idx, + struct obd_device *disk_obd) +{ + struct local_oid_storage *los; + struct llog_thread_info *lgi = llog_info(env); + struct llog_ctxt *ctxt; + int rc = 0; + + ENTRY; + + LASSERT(obd); + LASSERT(olg->olg_ctxts[ctxt_idx]); + + ctxt = llog_ctxt_get(olg->olg_ctxts[ctxt_idx]); + LASSERT(ctxt); + + /* initialize data allowing to generate new fids, + * literally we need a sequece */ + lgi->lgi_fid.f_seq = FID_SEQ_LLOG; + lgi->lgi_fid.f_oid = 1; + lgi->lgi_fid.f_ver = 0; + rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt, + &lgi->lgi_fid, &los); + if (rc < 0) + return rc; + + lgi->lgi_fid.f_seq = FID_SEQ_LLOG_NAME; + lgi->lgi_fid.f_oid = 1; + lgi->lgi_fid.f_ver = 
0; + rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt, + &lgi->lgi_fid, &los); + llog_ctxt_put(ctxt); + return rc; +} + +static int llog_osd_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct dt_device *dt; + struct ls_device *ls; + struct local_oid_storage *los, *nlos; + + LASSERT(ctxt->loc_exp->exp_obd); + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + ls = ls_device_get(dt); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + los = dt_los_find(ls, FID_SEQ_LLOG); + nlos = dt_los_find(ls, FID_SEQ_LLOG_NAME); + mutex_unlock(&ls->ls_los_mutex); + if (los != NULL) { + dt_los_put(los); + local_oid_storage_fini(env, los); + } + if (nlos != NULL) { + dt_los_put(nlos); + local_oid_storage_fini(env, nlos); + } + ls_device_put(env, ls); + return 0; +} + +struct llog_operations llog_osd_ops = { + .lop_next_block = llog_osd_next_block, + .lop_prev_block = llog_osd_prev_block, + .lop_read_header = llog_osd_read_header, + .lop_destroy = llog_osd_destroy, + .lop_setup = llog_osd_setup, + .lop_cleanup = llog_osd_cleanup, + .lop_open = llog_osd_open, + .lop_exist = llog_osd_exist, + .lop_declare_create = llog_osd_declare_create, + .lop_create = llog_osd_create, + .lop_declare_write_rec = llog_osd_declare_write_rec, + .lop_write_rec = llog_osd_write_rec, + .lop_close = llog_osd_close, +}; +EXPORT_SYMBOL(llog_osd_ops); + +/* reads the catalog list */ +int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o = NULL; + struct thandle *th; + int rc, size; + + ENTRY; + + LASSERT(d); + + size = sizeof(*idarray) * count; + lgi->lgi_off = idx * sizeof(*idarray); + + lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID); + + o = dt_locate(env, d, &lgi->lgi_fid); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + if (!dt_object_exists(o)) { + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = 
PTR_ERR(th));
+
+		lgi->lgi_attr.la_valid = LA_MODE;
+		lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+		lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+		rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL,
+				       &lgi->lgi_dof, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		rc = dt_trans_start_local(env, d, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		dt_write_lock(env, o, 0);
+		if (!dt_object_exists(o))
+			rc = dt_create(env, o, &lgi->lgi_attr, NULL,
+				       &lgi->lgi_dof, th);
+		dt_write_unlock(env, o);
+out_trans:
+		dt_trans_stop(env, d, th);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+	if (rc)
+		GOTO(out, rc);
+
+	if (!S_ISREG(lgi->lgi_attr.la_mode)) {
+		CERROR("%s: CATALOGS is not a regular file!: mode = %o\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       lgi->lgi_attr.la_mode);
+		GOTO(out, rc = -ENOENT);
+	}
+
+	CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n",
+	       (int)lgi->lgi_attr.la_size, size);
+
+	/* return just number of llogs */
+	if (idarray == NULL) {
+		rc = lgi->lgi_attr.la_size / sizeof(*idarray);
+		GOTO(out, rc);
+	}
+
+	/* read for new ost index or for empty file */
+	memset(idarray, 0, size);
+	if (lgi->lgi_attr.la_size <= lgi->lgi_off) /* nothing stored yet */
+		GOTO(out, rc = 0);
+	if (lgi->lgi_attr.la_size < lgi->lgi_off + size) /* partial read */
+		size = lgi->lgi_attr.la_size - lgi->lgi_off;
+
+	lgi->lgi_buf.lb_buf = idarray;
+	lgi->lgi_buf.lb_len = size;
+	rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off);
+	if (rc) {
+		CERROR("%s: error reading CATALOGS: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, rc);
+		GOTO(out, rc);
+	}
+
+	EXIT;
+out:
+	lu_object_put(env, &o->do_lu);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_osd_get_cat_list);
+
+/* writes the cat list */
+int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d,
+			  int idx, int count, struct llog_catid *idarray)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct dt_object	*o = NULL;
+	struct thandle		*th;
+	int			 rc, size;
+
+	if (!count)
+		RETURN(0);
+
+
LASSERT(d); + + size = sizeof(*idarray) * count; + lgi->lgi_off = idx * sizeof(*idarray); + + lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID); + + o = dt_locate(env, d, &lgi->lgi_fid); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + if (!dt_object_exists(o)) + GOTO(out, rc = -ENOENT); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA); + if (rc) + GOTO(out, rc); + + if (!S_ISREG(lgi->lgi_attr.la_mode)) { + CERROR("%s: CATALOGS is not a regular file!: mode = %o\n", + o->do_lu.lo_dev->ld_obd->obd_name, + lgi->lgi_attr.la_mode); + GOTO(out, rc = -ENOENT); + } + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_record_write(env, o, size, lgi->lgi_off, th); + if (rc) + GOTO(out, rc); + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(out_trans, rc); + + lgi->lgi_buf.lb_buf = idarray; + lgi->lgi_buf.lb_len = size; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc) + CDEBUG(D_INODE, "error writeing CATALOGS: rc = %d\n", rc); +out_trans: + dt_trans_stop(env, d, th); +out: + lu_object_put(env, &o->do_lu); + RETURN(rc); +} +EXPORT_SYMBOL(llog_osd_put_cat_list); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_swab.c b/drivers/staging/lustre/lustre/obdclass/llog_swab.c new file mode 100644 index 000000000000..dedfecff95bc --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_swab.c @@ -0,0 +1,407 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_swab.c + * + * Swabbing of llog datatypes (from disk or over the wire). + * + * Author: jacob berkman <jacob@clusterfs.com> + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include <lustre_log.h> + +static void print_llogd_body(struct llogd_body *d) +{ + CDEBUG(D_OTHER, "llogd body: %p\n", d); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi: "DOSTID"\n", + POSTID(&d->lgd_logid.lgl_oi)); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen); + CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx); + CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags); + CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index); + CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index); + CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len); + CDEBUG(D_OTHER, "\tlgd_cur_offset: "LPX64"\n", d->lgd_cur_offset); +} + +void lustre_swab_lu_fid(struct lu_fid *fid) +{ + __swab64s (&fid->f_seq); + __swab32s (&fid->f_oid); + __swab32s (&fid->f_ver); +} +EXPORT_SYMBOL(lustre_swab_lu_fid); + +void lustre_swab_ost_id(struct ost_id *oid) +{ + if (fid_seq_is_mdt0(oid->oi.oi_seq)) { + __swab64s(&oid->oi.oi_id); + __swab64s(&oid->oi.oi_seq); + } else { 
+ lustre_swab_lu_fid(&oid->oi_fid); + } +} +EXPORT_SYMBOL(lustre_swab_ost_id); + +void lustre_swab_llog_id(struct llog_logid *log_id) +{ + __swab64s(&log_id->lgl_oi.oi.oi_id); + __swab64s(&log_id->lgl_oi.oi.oi_seq); + __swab32s(&log_id->lgl_ogen); +} +EXPORT_SYMBOL(lustre_swab_llog_id); + +void lustre_swab_llogd_body (struct llogd_body *d) +{ + ENTRY; + print_llogd_body(d); + lustre_swab_llog_id(&d->lgd_logid); + __swab32s (&d->lgd_ctxt_idx); + __swab32s (&d->lgd_llh_flags); + __swab32s (&d->lgd_index); + __swab32s (&d->lgd_saved_index); + __swab32s (&d->lgd_len); + __swab64s (&d->lgd_cur_offset); + print_llogd_body(d); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_llogd_body); + +void lustre_swab_llogd_conn_body (struct llogd_conn_body *d) +{ + __swab64s (&d->lgdc_gen.mnt_cnt); + __swab64s (&d->lgdc_gen.conn_cnt); + lustre_swab_llog_id(&d->lgdc_logid); + __swab32s (&d->lgdc_ctxt_idx); +} +EXPORT_SYMBOL(lustre_swab_llogd_conn_body); + +void lustre_swab_ll_fid(struct ll_fid *fid) +{ + __swab64s (&fid->id); + __swab32s (&fid->generation); + __swab32s (&fid->f_type); +} +EXPORT_SYMBOL(lustre_swab_ll_fid); + +void lustre_swab_lu_seq_range(struct lu_seq_range *range) +{ + __swab64s (&range->lsr_start); + __swab64s (&range->lsr_end); + __swab32s (&range->lsr_index); + __swab32s (&range->lsr_flags); +} +EXPORT_SYMBOL(lustre_swab_lu_seq_range); + +void lustre_swab_llog_rec(struct llog_rec_hdr *rec) +{ + struct llog_rec_tail *tail = NULL; + + __swab32s(&rec->lrh_len); + __swab32s(&rec->lrh_index); + __swab32s(&rec->lrh_type); + __swab32s(&rec->lrh_id); + + switch (rec->lrh_type) { + case OST_SZ_REC: + { + struct llog_size_change_rec *lsc = + (struct llog_size_change_rec *)rec; + + lustre_swab_ll_fid(&lsc->lsc_fid); + __swab32s(&lsc->lsc_ioepoch); + tail = &lsc->lsc_tail; + break; + } + case MDS_UNLINK_REC: + { + struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; + + __swab64s(&lur->lur_oid); + __swab32s(&lur->lur_oseq); + __swab32s(&lur->lur_count); + tail = 
&lur->lur_tail; + break; + } + case MDS_UNLINK64_REC: + { + struct llog_unlink64_rec *lur = + (struct llog_unlink64_rec *)rec; + + lustre_swab_lu_fid(&lur->lur_fid); + __swab32s(&lur->lur_count); + tail = &lur->lur_tail; + break; + } + case CHANGELOG_REC: + { + struct llog_changelog_rec *cr = (struct llog_changelog_rec*)rec; + + __swab16s(&cr->cr.cr_namelen); + __swab16s(&cr->cr.cr_flags); + __swab32s(&cr->cr.cr_type); + __swab64s(&cr->cr.cr_index); + __swab64s(&cr->cr.cr_prev); + __swab64s(&cr->cr.cr_time); + lustre_swab_lu_fid(&cr->cr.cr_tfid); + lustre_swab_lu_fid(&cr->cr.cr_pfid); + if (CHANGELOG_REC_EXTENDED(&cr->cr)) { + struct llog_changelog_ext_rec *ext = + (struct llog_changelog_ext_rec *)rec; + + lustre_swab_lu_fid(&ext->cr.cr_sfid); + lustre_swab_lu_fid(&ext->cr.cr_spfid); + tail = &ext->cr_tail; + } else { + tail = &cr->cr_tail; + } + break; + } + case CHANGELOG_USER_REC: + { + struct llog_changelog_user_rec *cur = + (struct llog_changelog_user_rec*)rec; + + __swab32s(&cur->cur_id); + __swab64s(&cur->cur_endrec); + tail = &cur->cur_tail; + break; + } + + case MDS_SETATTR64_REC: + { + struct llog_setattr64_rec *lsr = + (struct llog_setattr64_rec *)rec; + + lustre_swab_ost_id(&lsr->lsr_oi); + __swab32s(&lsr->lsr_uid); + __swab32s(&lsr->lsr_uid_h); + __swab32s(&lsr->lsr_gid); + __swab32s(&lsr->lsr_gid_h); + tail = &lsr->lsr_tail; + break; + } + case OBD_CFG_REC: + /* these are swabbed as they are consumed */ + break; + case LLOG_HDR_MAGIC: + { + struct llog_log_hdr *llh = (struct llog_log_hdr *)rec; + + __swab64s(&llh->llh_timestamp); + __swab32s(&llh->llh_count); + __swab32s(&llh->llh_bitmap_offset); + __swab32s(&llh->llh_flags); + __swab32s(&llh->llh_size); + __swab32s(&llh->llh_cat_idx); + tail = &llh->llh_tail; + break; + } + case LLOG_LOGID_MAGIC: + { + struct llog_logid_rec *lid = (struct llog_logid_rec *)rec; + + lustre_swab_llog_id(&lid->lid_id); + tail = &lid->lid_tail; + break; + } + case LLOG_GEN_REC: + { + struct llog_gen_rec *lgr = (struct 
llog_gen_rec *)rec; + + __swab64s(&lgr->lgr_gen.mnt_cnt); + __swab64s(&lgr->lgr_gen.conn_cnt); + tail = &lgr->lgr_tail; + break; + } + case LLOG_PAD_MAGIC: + break; + default: + CERROR("Unknown llog rec type %#x swabbing rec %p\n", + rec->lrh_type, rec); + } + + if (tail) { + __swab32s(&tail->lrt_len); + __swab32s(&tail->lrt_index); + } +} +EXPORT_SYMBOL(lustre_swab_llog_rec); + +static void print_llog_hdr(struct llog_log_hdr *h) +{ + CDEBUG(D_OTHER, "llog header: %p\n", h); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type); + CDEBUG(D_OTHER, "\tllh_timestamp: "LPX64"\n", h->llh_timestamp); + CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count); + CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset); + CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags); + CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size); + CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx); + CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", h->llh_tail.lrt_index); + CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", h->llh_tail.lrt_len); +} + +void lustre_swab_llog_hdr (struct llog_log_hdr *h) +{ + ENTRY; + print_llog_hdr(h); + + lustre_swab_llog_rec(&h->llh_hdr); + + print_llog_hdr(h); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_llog_hdr); + +static void print_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + ENTRY; + + if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ + return; + CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); + CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); + CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); + CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); + CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid)); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", 
lcfg->lcfg_bufcount); + if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT) + for (i = 0; i < lcfg->lcfg_bufcount; i++) + CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d\n", + i, lcfg->lcfg_buflens[i]); + EXIT; +} + +void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + ENTRY; + + __swab32s(&lcfg->lcfg_version); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { + CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", + lcfg->lcfg_version, LUSTRE_CFG_VERSION); + EXIT; + return; + } + + __swab32s(&lcfg->lcfg_command); + __swab32s(&lcfg->lcfg_num); + __swab32s(&lcfg->lcfg_flags); + __swab64s(&lcfg->lcfg_nid); + __swab32s(&lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) + __swab32s(&lcfg->lcfg_buflens[i]); + + print_lustre_cfg(lcfg); + EXIT; + return; +} +EXPORT_SYMBOL(lustre_swab_lustre_cfg); + +/* used only for compatibility with old on-disk cfg_marker data */ +struct cfg_marker32 { + __u32 cm_step; + __u32 cm_flags; + __u32 cm_vers; + __u32 padding; + __u32 cm_createtime; + __u32 cm_canceltime; + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +#define MTI_NAMELEN32 (MTI_NAME_MAXLEN - \ + (sizeof(struct cfg_marker) - sizeof(struct cfg_marker32))) + +void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size) +{ + struct cfg_marker32 *cm32 = (struct cfg_marker32*)marker; + ENTRY; + + if (swab) { + __swab32s(&marker->cm_step); + __swab32s(&marker->cm_flags); + __swab32s(&marker->cm_vers); + } + if (size == sizeof(*cm32)) { + __u32 createtime, canceltime; + /* There was a problem with the original declaration of + * cfg_marker on 32-bit systems because it used time_t as + * a wire protocol structure, and didn't verify this in + * wirecheck. We now have to convert the offsets of the + * later fields in order to work on 32- and 64-bit systems. 
+ * + * Fortunately, the cm_comment field has no functional use + * so can be sacrificed when converting the timestamp size. + * + * Overwrite fields from the end first, so they are not + * clobbered, and use memmove() instead of memcpy() because + * the source and target buffers overlap. bug 16771 */ + createtime = cm32->cm_createtime; + canceltime = cm32->cm_canceltime; + memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32); + marker->cm_comment[MTI_NAMELEN32 - 1] = '\0'; + memmove(marker->cm_tgtname, cm32->cm_tgtname, + sizeof(marker->cm_tgtname)); + if (swab) { + __swab32s(&createtime); + __swab32s(&canceltime); + } + marker->cm_createtime = createtime; + marker->cm_canceltime = canceltime; + CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) " + "for target %s, converting\n", + marker->cm_tgtname); + } else if (swab) { + __swab64s(&marker->cm_createtime); + __swab64s(&marker->cm_canceltime); + } + + EXIT; + return; +} +EXPORT_SYMBOL(lustre_swab_cfg_marker); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_test.c b/drivers/staging/lustre/lustre/obdclass/llog_test.c new file mode 100644 index 000000000000..d397f781ec43 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_test.c @@ -0,0 +1,1087 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_test.c + * + * Author: Phil Schwan <phil@clusterfs.com> + * Author: Mikhail Pershin <mike.pershin@intel.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <linux/module.h> +#include <linux/init.h> + +#include <obd_class.h> +#include <lustre_fid.h> +#include <lustre_log.h> + +/* This is slightly more than the number of records that can fit into a + * single llog file, because the llog_log_header takes up some of the + * space in the first block that cannot be used for the bitmap. 
*/ +#define LLOG_TEST_RECNUM (LLOG_CHUNK_SIZE * 8) + +static int llog_test_rand; +static struct obd_uuid uuid = { .uuid = "test_uuid" }; +static struct llog_logid cat_logid; + +struct llog_mini_rec { + struct llog_rec_hdr lmr_hdr; + struct llog_rec_tail lmr_tail; +} __attribute__((packed)); + +static int verify_handle(char *test, struct llog_handle *llh, int num_recs) +{ + int i; + int last_idx = 0; + int active_recs = 0; + + for (i = 0; i < LLOG_BITMAP_BYTES * 8; i++) { + if (ext2_test_bit(i, llh->lgh_hdr->llh_bitmap)) { + last_idx = i; + active_recs++; + } + } + + if (active_recs != num_recs) { + CERROR("%s: expected %d active recs after write, found %d\n", + test, num_recs, active_recs); + RETURN(-ERANGE); + } + + if (llh->lgh_hdr->llh_count != num_recs) { + CERROR("%s: handle->count is %d, expected %d after write\n", + test, llh->lgh_hdr->llh_count, num_recs); + RETURN(-ERANGE); + } + + if (llh->lgh_last_idx < last_idx) { + CERROR("%s: handle->last_idx is %d, expected %d after write\n", + test, llh->lgh_last_idx, last_idx); + RETURN(-ERANGE); + } + + RETURN(0); +} + +/* Test named-log create/open, close */ +static int llog_test_1(const struct lu_env *env, + struct obd_device *obd, char *name) +{ + struct llog_handle *llh; + struct llog_ctxt *ctxt; + int rc; + int rc2; + + ENTRY; + + CWARN("1a: create a log with name: %s\n", name); + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + rc = llog_open_create(env, ctxt, &llh, NULL, name); + if (rc) { + CERROR("1a: llog_create with name %s failed: %d\n", name, rc); + GOTO(out, rc); + } + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("1a: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + rc = verify_handle("1", llh, 1); + + CWARN("1b: close newly-created log\n"); +out_close: + rc2 = llog_close(env, llh); + if (rc2) { + CERROR("1b: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + 
+/* Test named-log reopen; returns opened log on success */ +static int llog_test_2(const struct lu_env *env, struct obd_device *obd, + char *name, struct llog_handle **llh) +{ + struct llog_ctxt *ctxt; + struct llog_handle *loghandle; + struct llog_logid logid; + int rc; + + ENTRY; + + CWARN("2a: re-open a log with name: %s\n", name); + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + rc = llog_open(env, ctxt, llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("2a: re-open log with name %s failed: %d\n", name, rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, *llh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2a: can't init llog handle: %d\n", rc); + GOTO(out_close_llh, rc); + } + + rc = verify_handle("2", *llh, 1); + if (rc) + GOTO(out_close_llh, rc); + + /* XXX: there is known issue with tests 2b, MGS is not able to create + * anonymous llog, exit now to allow following tests run. + * It is fixed in upcoming llog over OSD code */ + GOTO(out_put, rc); + + CWARN("2b: create a log without specified NAME & LOGID\n"); + rc = llog_open_create(env, ctxt, &loghandle, NULL, NULL); + if (rc) { + CERROR("2b: create log failed\n"); + GOTO(out_close_llh, rc); + } + rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2b: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + logid = loghandle->lgh_id; + llog_close(env, loghandle); + + CWARN("2c: re-open the log by LOGID\n"); + rc = llog_open(env, ctxt, &loghandle, &logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("2c: re-open log by LOGID failed\n"); + GOTO(out_close_llh, rc); + } + + rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2c: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + CWARN("2b: destroy this log\n"); + rc = llog_destroy(env, loghandle); + if (rc) + CERROR("2d: destroy log failed\n"); +out_close: + llog_close(env, loghandle); +out_close_llh: + if (rc) + 
llog_close(env, *llh); +out_put: + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +/* Test record writing, single and in bulk */ +static int llog_test_3(const struct lu_env *env, struct obd_device *obd, + struct llog_handle *llh) +{ + struct llog_gen_rec lgr; + int rc, i; + int num_recs = 1; /* 1 for the header */ + + ENTRY; + + lgr.lgr_hdr.lrh_len = lgr.lgr_tail.lrt_len = sizeof(lgr); + lgr.lgr_hdr.lrh_type = LLOG_GEN_REC; + + CWARN("3a: write one create_rec\n"); + rc = llog_write(env, llh, &lgr.lgr_hdr, NULL, 0, NULL, -1); + num_recs++; + if (rc < 0) { + CERROR("3a: write one log record failed: %d\n", rc); + RETURN(rc); + } + + rc = verify_handle("3a", llh, num_recs); + if (rc) + RETURN(rc); + + CWARN("3b: write 10 cfg log records with 8 bytes bufs\n"); + for (i = 0; i < 10; i++) { + struct llog_rec_hdr hdr; + char buf[8]; + + hdr.lrh_len = 8; + hdr.lrh_type = OBD_CFG_REC; + memset(buf, 0, sizeof buf); + rc = llog_write(env, llh, &hdr, NULL, 0, buf, -1); + if (rc < 0) { + CERROR("3b: write 10 records failed at #%d: %d\n", + i + 1, rc); + RETURN(rc); + } + num_recs++; + } + + rc = verify_handle("3b", llh, num_recs); + if (rc) + RETURN(rc); + + CWARN("3c: write 1000 more log records\n"); + for (i = 0; i < 1000; i++) { + rc = llog_write(env, llh, &lgr.lgr_hdr, NULL, 0, NULL, -1); + if (rc < 0) { + CERROR("3c: write 1000 records failed at #%d: %d\n", + i + 1, rc); + RETURN(rc); + } + num_recs++; + } + + rc = verify_handle("3c", llh, num_recs); + if (rc) + RETURN(rc); + + CWARN("3d: write log more than BITMAP_SIZE, return -ENOSPC\n"); + for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr) + 1; i++) { + struct llog_rec_hdr hdr; + char buf_even[24]; + char buf_odd[32]; + + memset(buf_odd, 0, sizeof buf_odd); + memset(buf_even, 0, sizeof buf_even); + if ((i % 2) == 0) { + hdr.lrh_len = 24; + hdr.lrh_type = OBD_CFG_REC; + rc = llog_write(env, llh, &hdr, NULL, 0, buf_even, -1); + } else { + hdr.lrh_len = 32; + hdr.lrh_type = OBD_CFG_REC; + rc = llog_write(env, llh, &hdr, NULL, 0, 
buf_odd, -1); + } + if (rc == -ENOSPC) { + break; + } else if (rc < 0) { + CERROR("3d: write recs failed at #%d: %d\n", + i + 1, rc); + RETURN(rc); + } + num_recs++; + } + if (rc != -ENOSPC) { + CWARN("3d: write record more than BITMAP size!\n"); + RETURN(-EINVAL); + } + CWARN("3d: wrote %d more records before end of llog is reached\n", + num_recs); + + rc = verify_handle("3d", llh, num_recs); + + RETURN(rc); +} + +/* Test catalogue additions */ +static int llog_test_4(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *cath; + char name[10]; + int rc, rc2, i, buflen; + struct llog_mini_rec lmr; + struct llog_cookie cookie; + struct llog_ctxt *ctxt; + int num_recs = 0; + char *buf; + struct llog_rec_hdr rec; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = 0xf00f00; + + sprintf(name, "%x", llog_test_rand + 1); + CWARN("4a: create a catalog log with name: %s\n", name); + rc = llog_open_create(env, ctxt, &cath, NULL, name); + if (rc) { + CERROR("4a: llog_create with name %s failed: %d\n", name, rc); + GOTO(ctxt_release, rc); + } + rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("4a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + num_recs++; + cat_logid = cath->lgh_id; + + CWARN("4b: write 1 record into the catalog\n"); + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, &cookie, NULL); + if (rc != 1) { + CERROR("4b: write 1 catalog record failed at: %d\n", rc); + GOTO(out, rc); + } + num_recs++; + rc = verify_handle("4b", cath, 2); + if (rc) + GOTO(out, rc); + + rc = verify_handle("4b", cath->u.chd.chd_current_log, num_recs); + if (rc) + GOTO(out, rc); + + CWARN("4c: cancel 1 log record\n"); + rc = llog_cat_cancel_records(env, cath, 1, &cookie); + if (rc) { + CERROR("4c: cancel 1 catalog based record failed: %d\n", rc); + GOTO(out, rc); + } + num_recs--; + + rc = verify_handle("4c", 
cath->u.chd.chd_current_log, num_recs); + if (rc) + GOTO(out, rc); + + CWARN("4d: write %d more log records\n", LLOG_TEST_RECNUM); + for (i = 0; i < LLOG_TEST_RECNUM; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL, NULL); + if (rc) { + CERROR("4d: write %d records failed at #%d: %d\n", + LLOG_TEST_RECNUM, i + 1, rc); + GOTO(out, rc); + } + num_recs++; + } + + /* make sure new plain llog appears */ + rc = verify_handle("4d", cath, 3); + if (rc) + GOTO(out, rc); + + CWARN("4e: add 5 large records, one record per block\n"); + buflen = LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) - + sizeof(struct llog_rec_tail); + OBD_ALLOC(buf, buflen); + if (buf == NULL) + GOTO(out, rc = -ENOMEM); + for (i = 0; i < 5; i++) { + rec.lrh_len = buflen; + rec.lrh_type = OBD_CFG_REC; + rc = llog_cat_add(env, cath, &rec, NULL, buf); + if (rc) { + CERROR("4e: write 5 records failed at #%d: %d\n", + i + 1, rc); + GOTO(out_free, rc); + } + num_recs++; + } +out_free: + OBD_FREE(buf, buflen); +out: + CWARN("4f: put newly-created catalog\n"); + rc2 = llog_cat_close(env, cath); + if (rc2) { + CERROR("4: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static int cat_counter; + +static int cat_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct lu_fid fid = {0}; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&lir->lid_id, &fid); + + CWARN("seeing record at index %d - "DFID" in log "DFID"\n", + rec->lrh_index, PFID(&fid), + PFID(lu_object_fid(&llh->lgh_obj->do_lu))); + + cat_counter++; + + RETURN(0); +} + +static int plain_counter; + +static int plain_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct lu_fid fid = {0}; + + if (!(llh->lgh_hdr->llh_flags 
& LLOG_F_IS_PLAIN)) { + CERROR("log is not plain\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&llh->lgh_id, &fid); + + CDEBUG(D_INFO, "seeing record at index %d in log "DFID"\n", + rec->lrh_index, PFID(&fid)); + + plain_counter++; + + RETURN(0); +} + +static int cancel_count; + +static int llog_cancel_rec_cb(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_cookie cookie; + + if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) { + CERROR("log is not plain\n"); + RETURN(-EINVAL); + } + + cookie.lgc_lgl = llh->lgh_id; + cookie.lgc_index = rec->lrh_index; + + llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1, &cookie); + cancel_count++; + if (cancel_count == LLOG_TEST_RECNUM) + RETURN(-LLOG_EEMPTY); + RETURN(0); +} + +/* Test log and catalogue processing */ +static int llog_test_5(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = 0xf00f00; + + CWARN("5a: re-open catalog by id\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("5a: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("5a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + CWARN("5b: print the catalog entries.. 
we expect 2\n"); + cat_counter = 0; + rc = llog_process(env, llh, cat_print_cb, "test 5", NULL); + if (rc) { + CERROR("5b: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 2) { + CERROR("5b: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5c: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + cancel_count = 0; + rc = llog_cat_process(env, llh, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("5c: process with cat_cancel_cb failed: %d\n", rc); + GOTO(out, rc); + } + + CWARN("5c: print the catalog entries.. we expect 1\n"); + cat_counter = 0; + rc = llog_process(env, llh, cat_print_cb, "test 5", NULL); + if (rc) { + CERROR("5c: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 1) { + CERROR("5c: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5d: add 1 record to the log with many canceled empty pages\n"); + rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL, NULL); + if (rc) { + CERROR("5d: add record to the log with many canceled empty " + "pages failed\n"); + GOTO(out, rc); + } + + CWARN("5e: print plain log entries.. expect 6\n"); + plain_counter = 0; + rc = llog_cat_process(env, llh, plain_print_cb, "foobar", 0, 0); + if (rc) { + CERROR("5e: process with plain_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (plain_counter != 6) { + CERROR("5e: found %d records\n", plain_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5f: print plain log entries reversely.. 
expect 6\n"); + plain_counter = 0; + rc = llog_cat_reverse_process(env, llh, plain_print_cb, "foobar"); + if (rc) { + CERROR("5f: reversely process with plain_print_cb failed:" + "%d\n", rc); + GOTO(out, rc); + } + if (plain_counter != 6) { + CERROR("5f: found %d records\n", plain_counter); + GOTO(out, rc = -EINVAL); + } + +out: + CWARN("5g: close re-opened catalog\n"); + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("5g: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out_put: + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +/* Test client api; open log by name and process */ +static int llog_test_6(const struct lu_env *env, struct obd_device *obd, + char *name) +{ + struct obd_device *mgc_obd; + struct llog_ctxt *ctxt; + struct obd_uuid *mgs_uuid; + struct obd_export *exp; + struct obd_uuid uuid = { "LLOG_TEST6_UUID" }; + struct llog_handle *llh = NULL; + struct llog_ctxt *nctxt; + int rc, rc2; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + mgs_uuid = &ctxt->loc_exp->exp_obd->obd_uuid; + + CWARN("6a: re-open log %s using client API\n", name); + mgc_obd = class_find_client_obd(mgs_uuid, LUSTRE_MGC_NAME, NULL); + if (mgc_obd == NULL) { + CERROR("6a: no MGC devices connected to %s found.\n", + mgs_uuid->uuid); + GOTO(ctxt_release, rc = -ENOENT); + } + + rc = obd_connect(NULL, &exp, mgc_obd, &uuid, + NULL /* obd_connect_data */, NULL); + if (rc != -EALREADY) { + CERROR("6a: connect on connected MGC (%s) failed to return" + " -EALREADY", mgc_obd->obd_name); + if (rc == 0) + obd_disconnect(exp); + GOTO(ctxt_release, rc = -EINVAL); + } + + nctxt = llog_get_context(mgc_obd, LLOG_CONFIG_REPL_CTXT); + rc = llog_open(env, nctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("6a: llog_open failed %d\n", rc); + GOTO(nctxt_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) { + CERROR("6a: llog_init_handle failed %d\n", rc); + GOTO(parse_out, rc); + } + + plain_counter = 1; /* 
llog header is first record */ + CWARN("6b: process log %s using client API\n", name); + rc = llog_process(env, llh, plain_print_cb, NULL, NULL); + if (rc) + CERROR("6b: llog_process failed %d\n", rc); + CWARN("6b: processed %d records\n", plain_counter); + + rc = verify_handle("6b", llh, plain_counter); + if (rc) + GOTO(parse_out, rc); + + plain_counter = 1; /* llog header is first record */ + CWARN("6c: process log %s reversely using client API\n", name); + rc = llog_reverse_process(env, llh, plain_print_cb, NULL, NULL); + if (rc) + CERROR("6c: llog_reverse_process failed %d\n", rc); + CWARN("6c: processed %d records\n", plain_counter); + + rc = verify_handle("6c", llh, plain_counter); + if (rc) + GOTO(parse_out, rc); + +parse_out: + rc2 = llog_close(env, llh); + if (rc2) { + CERROR("6: llog_close failed: rc = %d\n", rc2); + if (rc == 0) + rc = rc2; + } +nctxt_put: + llog_ctxt_put(nctxt); +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static union { + struct llog_rec_hdr lrh; /* common header */ + struct llog_logid_rec llr; /* LLOG_LOGID_MAGIC */ + struct llog_unlink64_rec lur; /* MDS_UNLINK64_REC */ + struct llog_setattr64_rec lsr64; /* MDS_SETATTR64_REC */ + struct llog_size_change_rec lscr; /* OST_SZ_REC */ + struct llog_changelog_rec lcr; /* CHANGELOG_REC */ + struct llog_changelog_user_rec lcur; /* CHANGELOG_USER_REC */ + struct llog_gen_rec lgr; /* LLOG_GEN_REC */ +} llog_records; + +static int test_7_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct lu_fid fid = {0}; + + logid_to_fid(&llh->lgh_id, &fid); + + CDEBUG(D_OTHER, "record type %#x at index %d in log "DFID"\n", + rec->lrh_type, rec->lrh_index, PFID(&fid)); + + plain_counter++; + return 0; +} + +static int test_7_cancel_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + plain_counter++; + /* test LLOG_DEL_RECORD is working */ + return LLOG_DEL_RECORD; +} + +static int 
llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct llog_handle *llh; + int rc = 0, i, process_count; + int num_recs = 0; + + ENTRY; + + rc = llog_open_create(env, ctxt, &llh, NULL, NULL); + if (rc) { + CERROR("7_sub: create log failed\n"); + RETURN(rc); + } + + rc = llog_init_handle(env, llh, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &uuid); + if (rc) { + CERROR("7_sub: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr); i++) { + rc = llog_write(env, llh, &llog_records.lrh, NULL, 0, + NULL, -1); + if (rc == -ENOSPC) { + break; + } else if (rc < 0) { + CERROR("7_sub: write recs failed at #%d: %d\n", + i + 1, rc); + GOTO(out_close, rc); + } + num_recs++; + } + if (rc != -ENOSPC) { + CWARN("7_sub: write record more than BITMAP size!\n"); + GOTO(out_close, rc = -EINVAL); + } + + rc = verify_handle("7_sub", llh, num_recs + 1); + if (rc) { + CERROR("7_sub: verify handle failed: %d\n", rc); + GOTO(out_close, rc); + } + if (num_recs < LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1) + CWARN("7_sub: records are not aligned, written %d from %u\n", + num_recs, LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1); + + plain_counter = 0; + rc = llog_process(env, llh, test_7_print_cb, "test 7", NULL); + if (rc) { + CERROR("7_sub: llog process failed: %d\n", rc); + GOTO(out_close, rc); + } + process_count = plain_counter; + if (process_count != num_recs) { + CERROR("7_sub: processed %d records from %d total\n", + process_count, num_recs); + GOTO(out_close, rc = -EINVAL); + } + + plain_counter = 0; + rc = llog_reverse_process(env, llh, test_7_cancel_cb, "test 7", NULL); + if (rc) { + CERROR("7_sub: reverse llog process failed: %d\n", rc); + GOTO(out_close, rc); + } + if (process_count != plain_counter) { + CERROR("7_sub: Reverse/direct processing found different" + "number of records: %d/%d\n", + plain_counter, process_count); + GOTO(out_close, rc = -EINVAL); + } + if (llog_exist(llh)) { + CERROR("7_sub: llog exists but 
should be zapped\n"); + GOTO(out_close, rc = -EEXIST); + } + + rc = verify_handle("7_sub", llh, 1); +out_close: + if (rc) + llog_destroy(env, llh); + llog_close(env, llh); + RETURN(rc); +} + +/* Test all llog records writing and processing */ +static int llog_test_7(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + + CWARN("7a: test llog_logid_rec\n"); + llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr); + llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr); + llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7a: llog_logid_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7b: test llog_unlink64_rec\n"); + llog_records.lur.lur_hdr.lrh_len = sizeof(llog_records.lur); + llog_records.lur.lur_tail.lrt_len = sizeof(llog_records.lur); + llog_records.lur.lur_hdr.lrh_type = MDS_UNLINK64_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7b: llog_unlink_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7c: test llog_setattr64_rec\n"); + llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64); + llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64); + llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7c: llog_setattr64_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7d: test llog_size_change_rec\n"); + llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_hdr.lrh_type = OST_SZ_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7d: llog_size_change_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7e: test llog_changelog_rec\n"); + llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_tail.lrt_len = sizeof(llog_records.lcr); + 
llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7e: llog_changelog_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7f: test llog_changelog_user_rec\n"); + llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7f: llog_changelog_user_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7g: test llog_gen_rec\n"); + llog_records.lgr.lgr_hdr.lrh_len = sizeof(llog_records.lgr); + llog_records.lgr.lgr_tail.lrt_len = sizeof(llog_records.lgr); + llog_records.lgr.lgr_hdr.lrh_type = LLOG_GEN_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7g: llog_size_change_rec test failed\n"); + GOTO(out, rc); + } +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +/* ------------------------------------------------------------------------- + * Tests above, boring obd functions below + * ------------------------------------------------------------------------- */ +static int llog_run_tests(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + struct llog_ctxt *ctxt; + int rc, err; + char name[10]; + + ENTRY; + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + sprintf(name, "%x", llog_test_rand); + + rc = llog_test_1(env, obd, name); + if (rc) + GOTO(cleanup_ctxt, rc); + + rc = llog_test_2(env, obd, name, &llh); + if (rc) + GOTO(cleanup_ctxt, rc); + + rc = llog_test_3(env, obd, llh); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_4(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_5(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_6(env, obd, name); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_7(env, obd); + if (rc) + GOTO(cleanup, rc); + +cleanup: + err = llog_destroy(env, llh); + if (err) + CERROR("cleanup: llog_destroy failed: %d\n", 
err); + llog_close(env, llh); + if (rc == 0) + rc = err; +cleanup_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +#ifdef LPROCFS +static struct lprocfs_vars lprocfs_llog_test_obd_vars[] = { {0} }; +static struct lprocfs_vars lprocfs_llog_test_module_vars[] = { {0} }; +static void lprocfs_llog_test_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_llog_test_module_vars; + lvars->obd_vars = lprocfs_llog_test_obd_vars; +} +#endif + +static int llog_test_cleanup(struct obd_device *obd) +{ + struct obd_device *tgt; + struct lu_env env; + int rc; + + ENTRY; + + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + RETURN(rc); + + tgt = obd->obd_lvfs_ctxt.dt->dd_lu_dev.ld_obd; + rc = llog_cleanup(&env, llog_get_context(tgt, LLOG_TEST_ORIG_CTXT)); + if (rc) + CERROR("failed to llog_test_llog_finish: %d\n", rc); + lu_env_fini(&env); + RETURN(rc); +} + +static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_device *tgt; + struct llog_ctxt *ctxt; + struct dt_object *o; + struct lu_env env; + struct lu_context test_session; + int rc; + + ENTRY; + + if (lcfg->lcfg_bufcount < 2) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + if (lcfg->lcfg_buflens[1] < 1) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + /* disk obd */ + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("target device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + RETURN(-EINVAL); + } + + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + RETURN(rc); + + rc = lu_context_init(&test_session, LCT_SESSION); + if (rc) + GOTO(cleanup_env, rc); + test_session.lc_thread = (struct ptlrpc_thread *)current; + lu_context_enter(&test_session); + env.le_ses = &test_session; + + CWARN("Setup llog-test device over %s device\n", + lustre_cfg_string(lcfg, 1)); + + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + 
obd->obd_lvfs_ctxt.dt = lu2dt_dev(tgt->obd_lu_dev); + + rc = llog_setup(&env, tgt, &tgt->obd_olg, LLOG_TEST_ORIG_CTXT, tgt, + &llog_osd_ops); + if (rc) + GOTO(cleanup_session, rc); + + /* use MGS llog dir for tests */ + ctxt = llog_get_context(tgt, LLOG_CONFIG_ORIG_CTXT); + LASSERT(ctxt); + o = ctxt->loc_dir; + llog_ctxt_put(ctxt); + + ctxt = llog_get_context(tgt, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + ctxt->loc_dir = o; + llog_ctxt_put(ctxt); + + llog_test_rand = cfs_rand(); + + rc = llog_run_tests(&env, tgt); + if (rc) + llog_test_cleanup(obd); +cleanup_session: + lu_context_exit(&test_session); + lu_context_fini(&test_session); +cleanup_env: + lu_env_fini(&env); + RETURN(rc); +} + +static struct obd_ops llog_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = llog_test_setup, + .o_cleanup = llog_test_cleanup, +}; + +static int __init llog_test_init(void) +{ + struct lprocfs_static_vars lvars; + + lprocfs_llog_test_init_vars(&lvars); + return class_register_type(&llog_obd_ops, NULL, + lvars.module_vars, "llog_test", NULL); +} + +static void __exit llog_test_exit(void) +{ + class_unregister_type("llog_test"); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>"); +MODULE_DESCRIPTION("llog test module"); +MODULE_LICENSE("GPL"); + +module_init(llog_test_init); +module_exit(llog_test_exit); diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.c b/drivers/staging/lustre/lustre/obdclass/local_storage.c new file mode 100644 index 000000000000..3be35a83a495 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/local_storage.c @@ -0,0 +1,903 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, Intel Corporation. + */ +/* + * lustre/obdclass/local_storage.c + * + * Local storage for file/objects with fid generation. Works on top of OSD. + * + * Author: Mikhail Pershin <mike.pershin@intel.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "local_storage.h" + +/* all initialized local storages on this node are linked on this */ +static LIST_HEAD(ls_list_head); +static DEFINE_MUTEX(ls_list_mutex); + +static int ls_object_init(const struct lu_env *env, struct lu_object *o, + const struct lu_object_conf *unused) +{ + struct ls_device *ls; + struct lu_object *below; + struct lu_device *under; + + ENTRY; + + ls = container_of0(o->lo_dev, struct ls_device, ls_top_dev.dd_lu_dev); + under = &ls->ls_osd->dd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under); + if (below == NULL) + RETURN(-ENOMEM); + + lu_object_add(o, below); + + RETURN(0); +} + +static void ls_object_free(const struct lu_env *env, struct lu_object *o) +{ + struct ls_object *obj = lu2ls_obj(o); + struct lu_object_header *h = o->lo_header; + + dt_object_fini(&obj->ls_obj); + lu_object_header_fini(h); + OBD_FREE_PTR(obj); +} + +struct lu_object_operations ls_lu_obj_ops = { + .loo_object_init = ls_object_init, + .loo_object_free = ls_object_free, +}; + +struct lu_object *ls_object_alloc(const struct lu_env *env, + const struct lu_object_header *_h, + struct lu_device *d) +{ + struct 
lu_object_header *h;
+	struct ls_object	*o;
+	struct lu_object	*l;
+
+	/* local-storage objects are always allocated as top objects,
+	 * so no pre-existing header may be passed in */
+	LASSERT(_h == NULL);
+
+	OBD_ALLOC_PTR(o);
+	if (o != NULL) {
+		l = &o->ls_obj.do_lu;
+		h = &o->ls_header;
+
+		lu_object_header_init(h);
+		dt_object_init(&o->ls_obj, h, d);
+		lu_object_add_top(h, l);
+
+		l->lo_ops = &ls_lu_obj_ops;
+
+		return l;
+	} else {
+		return NULL;
+	}
+}
+
+static struct lu_device_operations ls_lu_dev_ops = {
+	.ldo_object_alloc = ls_object_alloc
+};
+
+/* Find an already-initialized ls_device wrapping @dev on the global
+ * ls_list_head list.  Caller must hold ls_list_mutex.  On success a
+ * reference is taken (ls_refcount) which the caller must drop with
+ * ls_device_put(); returns NULL if no matching device is found. */
+static struct ls_device *__ls_find_dev(struct dt_device *dev)
+{
+	struct ls_device *ls, *ret = NULL;
+
+	list_for_each_entry(ls, &ls_list_head, ls_linkage) {
+		if (ls->ls_osd == dev) {
+			atomic_inc(&ls->ls_refcount);
+			ret = ls;
+			break;
+		}
+	}
+	return ret;
+}
+
+/* Locked wrapper around __ls_find_dev(): look up (and reference) the
+ * ls_device for @dev without creating one if it does not exist. */
+struct ls_device *ls_find_dev(struct dt_device *dev)
+{
+	struct ls_device *ls;
+
+	mutex_lock(&ls_list_mutex);
+	ls = __ls_find_dev(dev);
+	mutex_unlock(&ls_list_mutex);
+
+	return ls;
+}
+
+static struct lu_device_type_operations ls_device_type_ops = {
+	.ldto_start = NULL,
+	.ldto_stop = NULL,
+};
+
+static struct lu_device_type ls_lu_type = {
+	.ldt_name = "local_storage",
+	.ldt_ops = &ls_device_type_ops,
+};
+
+/* Get (find or create) the ls_device stacked on top of OSD @dev.
+ *
+ * Returns the referenced ls_device, or ERR_PTR(-ENOMEM) if a new one
+ * could not be allocated.  The reference is released by
+ * ls_device_put(). */
+struct ls_device *ls_device_get(struct dt_device *dev)
+{
+	struct ls_device *ls;
+
+	ENTRY;
+
+	mutex_lock(&ls_list_mutex);
+	ls = __ls_find_dev(dev);
+	if (ls)
+		GOTO(out_ls, ls);
+
+	/* not found, then create */
+	OBD_ALLOC_PTR(ls);
+	if (ls == NULL)
+		GOTO(out_ls, ls = ERR_PTR(-ENOMEM));
+
+	atomic_set(&ls->ls_refcount, 1);
+	INIT_LIST_HEAD(&ls->ls_los_list);
+	mutex_init(&ls->ls_los_mutex);
+
+	ls->ls_osd = dev;
+
+	/* the top device shares the site of the underlying OSD */
+	LASSERT(dev->dd_lu_dev.ld_site);
+	lu_device_init(&ls->ls_top_dev.dd_lu_dev, &ls_lu_type);
+	ls->ls_top_dev.dd_lu_dev.ld_ops = &ls_lu_dev_ops;
+	ls->ls_top_dev.dd_lu_dev.ld_site = dev->dd_lu_dev.ld_site;
+
+	/* finally add ls to the list */
+	list_add(&ls->ls_linkage, &ls_list_head);
+out_ls:
+	mutex_unlock(&ls_list_mutex);
+	RETURN(ls);
+}
+
+/* Drop a reference on @ls; on the last put the device is unlinked from
+ * the global list, its site is purged and the memory is freed. */
+void ls_device_put(const struct lu_env *env, struct ls_device *ls)
+{
+	LASSERT(env);
+	
if (!atomic_dec_and_test(&ls->ls_refcount)) + return; + + mutex_lock(&ls_list_mutex); + if (atomic_read(&ls->ls_refcount) == 0) { + LASSERT(list_empty(&ls->ls_los_list)); + list_del(&ls->ls_linkage); + lu_site_purge(env, ls->ls_top_dev.dd_lu_dev.ld_site, ~0); + lu_device_fini(&ls->ls_top_dev.dd_lu_dev); + OBD_FREE_PTR(ls); + } + mutex_unlock(&ls_list_mutex); +} + +/** + * local file fid generation + */ +int local_object_fid_generate(const struct lu_env *env, + struct local_oid_storage *los, + struct lu_fid *fid) +{ + LASSERT(los->los_dev); + LASSERT(los->los_obj); + + /* take next OID */ + + /* to make it unique after reboot we store + * the latest generated fid atomically with + * object creation see local_object_create() */ + + mutex_lock(&los->los_id_lock); + fid->f_seq = los->los_seq; + fid->f_oid = ++los->los_last_oid; + fid->f_ver = 0; + mutex_unlock(&los->los_id_lock); + + return 0; +} + +int local_object_declare_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, struct lu_attr *attr, + struct dt_object_format *dof, + struct thandle *th) +{ + struct dt_thread_info *dti = dt_info(env); + int rc; + + ENTRY; + + /* update fid generation file */ + if (los != NULL) { + LASSERT(dt_object_exists(los->los_obj)); + rc = dt_declare_record_write(env, los->los_obj, + sizeof(struct los_ondisk), 0, th); + if (rc) + RETURN(rc); + } + + rc = dt_declare_create(env, o, attr, NULL, dof, th); + if (rc) + RETURN(rc); + + dti->dti_lb.lb_buf = NULL; + dti->dti_lb.lb_len = sizeof(dti->dti_lma); + rc = dt_declare_xattr_set(env, o, &dti->dti_lb, XATTR_NAME_LMA, 0, th); + + RETURN(rc); +} + +int local_object_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, struct lu_attr *attr, + struct dt_object_format *dof, struct thandle *th) +{ + struct dt_thread_info *dti = dt_info(env); + obd_id lastid; + int rc; + + ENTRY; + + rc = dt_create(env, o, attr, NULL, dof, th); + if (rc) + RETURN(rc); + + if (los == NULL) + 
RETURN(rc); + + LASSERT(los->los_obj); + LASSERT(dt_object_exists(los->los_obj)); + + /* many threads can be updated this, serialize + * them here to avoid the race where one thread + * takes the value first, but writes it last */ + mutex_lock(&los->los_id_lock); + + /* update local oid number on disk so that + * we know the last one used after reboot */ + lastid = cpu_to_le64(los->los_last_oid); + + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + rc = dt_record_write(env, los->los_obj, &dti->dti_lb, &dti->dti_off, + th); + mutex_unlock(&los->los_id_lock); + + RETURN(rc); +} + +/* + * Create local named object (file, directory or index) in parent directory. + */ +struct dt_object *__local_file_create(const struct lu_env *env, + const struct lu_fid *fid, + struct local_oid_storage *los, + struct ls_device *ls, + struct dt_object *parent, + const char *name, struct lu_attr *attr, + struct dt_object_format *dof) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + struct thandle *th; + int rc; + + dto = ls_locate(env, ls, fid); + if (unlikely(IS_ERR(dto))) + RETURN(dto); + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + GOTO(out, rc = -EEXIST); + + th = dt_trans_create(env, ls->ls_osd); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = local_object_declare_create(env, los, dto, attr, dof, th); + if (rc) + GOTO(trans_stop, rc); + + if (dti->dti_dof.dof_type == DFT_DIR) { + dt_declare_ref_add(env, dto, th); + dt_declare_ref_add(env, parent, th); + } + + rc = dt_declare_insert(env, parent, (void *)fid, (void *)name, th); + if (rc) + GOTO(trans_stop, rc); + + rc = dt_trans_start_local(env, ls->ls_osd, th); + if (rc) + GOTO(trans_stop, rc); + + dt_write_lock(env, dto, 0); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); + + CDEBUG(D_OTHER, "create new object "DFID"\n", + PFID(lu_object_fid(&dto->do_lu))); + rc = local_object_create(env, los, dto, attr, dof, th); + if (rc) + GOTO(unlock, 
rc); + LASSERT(dt_object_exists(dto)); + + if (dti->dti_dof.dof_type == DFT_DIR) { + if (!dt_try_as_dir(env, dto)) + GOTO(destroy, rc = -ENOTDIR); + /* Add "." and ".." for newly created dir */ + rc = dt_insert(env, dto, (void *)fid, (void *)".", th, + BYPASS_CAPA, 1); + if (rc) + GOTO(destroy, rc); + dt_ref_add(env, dto, th); + rc = dt_insert(env, dto, (void *)lu_object_fid(&parent->do_lu), + (void *)"..", th, BYPASS_CAPA, 1); + if (rc) + GOTO(destroy, rc); + } + + dt_write_lock(env, parent, 0); + rc = dt_insert(env, parent, (const struct dt_rec *)fid, + (const struct dt_key *)name, th, BYPASS_CAPA, 1); + if (dti->dti_dof.dof_type == DFT_DIR) + dt_ref_add(env, parent, th); + dt_write_unlock(env, parent); + if (rc) + GOTO(destroy, rc); +destroy: + if (rc) + dt_destroy(env, dto, th); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, ls->ls_osd, th); +out: + if (rc) { + lu_object_put_nocache(env, &dto->do_lu); + dto = ERR_PTR(rc); + } + RETURN(dto); +} + +/* + * Look up and create (if it does not exist) a local named file or directory in + * parent directory. 
+ */ +struct dt_object *local_file_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) + /* name is found, get the object */ + dto = ls_locate(env, dt2ls_dev(los->los_dev), &dti->dti_fid); + else if (rc != -ENOENT) + dto = ERR_PTR(rc); + else { + rc = local_object_fid_generate(env, los, &dti->dti_fid); + if (rc < 0) { + dto = ERR_PTR(rc); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT); + dto = __local_file_create(env, &dti->dti_fid, los, + dt2ls_dev(los->los_dev), + parent, name, &dti->dti_attr, + &dti->dti_dof); + } + } + return dto; +} +EXPORT_SYMBOL(local_file_find_or_create); + +struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, + __u32 mode) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + dto = dt_locate(env, dt, &dti->dti_fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + struct ls_device *ls; + + ls = ls_device_get(dt); + if (IS_ERR(ls)) { + dto = ERR_PTR(PTR_ERR(ls)); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT); + dto = __local_file_create(env, fid, NULL, ls, parent, + name, &dti->dti_attr, + &dti->dti_dof); + /* ls_device_put() will finalize the ls device, we + * have to open the object in other device stack */ + if (!IS_ERR(dto)) { + dti->dti_fid = dto->do_lu.lo_header->loh_fid; + 
lu_object_put_nocache(env, &dto->do_lu); + dto = dt_locate(env, dt, &dti->dti_fid); + } + ls_device_put(env, ls); + } + } + return dto; +} +EXPORT_SYMBOL(local_file_find_or_create_with_fid); + +/* + * Look up and create (if it does not exist) a local named index file in parent + * directory. + */ +struct dt_object *local_index_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + /* name is found, get the object */ + dto = ls_locate(env, dt2ls_dev(los->los_dev), &dti->dti_fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + rc = local_object_fid_generate(env, los, &dti->dti_fid); + if (rc < 0) { + dto = ERR_PTR(rc); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = DFT_INDEX; + dti->dti_dof.u.dof_idx.di_feat = ft; + dto = __local_file_create(env, &dti->dti_fid, los, + dt2ls_dev(los->los_dev), + parent, name, &dti->dti_attr, + &dti->dti_dof); + } + } + return dto; + +} +EXPORT_SYMBOL(local_index_find_or_create); + +struct dt_object * +local_index_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + /* name is found, get the object */ + if (!lu_fid_eq(fid, &dti->dti_fid)) + dto = ERR_PTR(-EINVAL); + else + dto = dt_locate(env, dt, fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + struct ls_device *ls; + + ls = ls_device_get(dt); + if 
(IS_ERR(ls)) { + dto = ERR_PTR(PTR_ERR(ls)); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = DFT_INDEX; + dti->dti_dof.u.dof_idx.di_feat = ft; + dto = __local_file_create(env, fid, NULL, ls, parent, + name, &dti->dti_attr, + &dti->dti_dof); + /* ls_device_put() will finalize the ls device, we + * have to open the object in other device stack */ + if (!IS_ERR(dto)) { + dti->dti_fid = dto->do_lu.lo_header->loh_fid; + lu_object_put_nocache(env, &dto->do_lu); + dto = dt_locate(env, dt, &dti->dti_fid); + } + ls_device_put(env, ls); + } + } + return dto; +} +EXPORT_SYMBOL(local_index_find_or_create_with_fid); + +static int local_object_declare_unlink(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + struct dt_object *c, const char *name, + struct thandle *th) +{ + int rc; + + rc = dt_declare_delete(env, p, (const struct dt_key *)name, th); + if (rc < 0) + return rc; + + rc = dt_declare_ref_del(env, c, th); + if (rc < 0) + return rc; + + return dt_declare_destroy(env, c, th); +} + +int local_object_unlink(const struct lu_env *env, struct dt_device *dt, + struct dt_object *parent, const char *name) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + struct thandle *th; + int rc; + + ENTRY; + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == -ENOENT) + RETURN(0); + else if (rc < 0) + RETURN(rc); + + dto = dt_locate(env, dt, &dti->dti_fid); + if (unlikely(IS_ERR(dto))) + RETURN(PTR_ERR(dto)); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = local_object_declare_unlink(env, dt, parent, dto, name, th); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc < 0) + GOTO(stop, rc); + + dt_write_lock(env, dto, 0); + rc = dt_delete(env, parent, (struct dt_key *)name, th, BYPASS_CAPA); + if (rc < 0) + GOTO(unlock, rc); + + rc = dt_ref_del(env, dto, th); + if (rc < 
0) { + rc = dt_insert(env, parent, + (const struct dt_rec *)&dti->dti_fid, + (const struct dt_key *)name, th, BYPASS_CAPA, 1); + GOTO(unlock, rc); + } + + rc = dt_destroy(env, dto, th); +unlock: + dt_write_unlock(env, dto); +stop: + dt_trans_stop(env, dt, th); +out: + lu_object_put_nocache(env, &dto->do_lu); + return rc; +} +EXPORT_SYMBOL(local_object_unlink); + +struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq) +{ + struct local_oid_storage *los, *ret = NULL; + + list_for_each_entry(los, &ls->ls_los_list, los_list) { + if (los->los_seq == seq) { + atomic_inc(&los->los_refcount); + ret = los; + break; + } + } + return ret; +} + +void dt_los_put(struct local_oid_storage *los) +{ + if (atomic_dec_and_test(&los->los_refcount)) + /* should never happen, only local_oid_storage_fini should + * drop refcount to zero */ + LBUG(); + return; +} + +/* after Lustre 2.3 release there may be old file to store last generated FID + * If such file exists then we have to read its content + */ +int lastid_compat_check(const struct lu_env *env, struct dt_device *dev, + __u64 lastid_seq, __u32 *first_oid, struct ls_device *ls) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *root = NULL; + struct los_ondisk losd; + struct dt_object *o = NULL; + int rc = 0; + + rc = dt_root_get(env, dev, &dti->dti_fid); + if (rc) + return rc; + + root = ls_locate(env, ls, &dti->dti_fid); + if (IS_ERR(root)) + return PTR_ERR(root); + + /* find old last_id file */ + snprintf(dti->dti_buf, sizeof(dti->dti_buf), "seq-"LPX64"-lastid", + lastid_seq); + rc = dt_lookup_dir(env, root, dti->dti_buf, &dti->dti_fid); + lu_object_put_nocache(env, &root->do_lu); + if (rc == -ENOENT) { + /* old llog lastid accessed by FID only */ + if (lastid_seq != FID_SEQ_LLOG) + return 0; + dti->dti_fid.f_seq = FID_SEQ_LLOG; + dti->dti_fid.f_oid = 1; + dti->dti_fid.f_ver = 0; + o = ls_locate(env, ls, &dti->dti_fid); + if (IS_ERR(o)) + return PTR_ERR(o); + + if (!dt_object_exists(o)) { + 
lu_object_put_nocache(env, &o->do_lu); + return 0; + } + CDEBUG(D_INFO, "Found old llog lastid file\n"); + } else if (rc < 0) { + return rc; + } else { + CDEBUG(D_INFO, "Found old lastid file for sequence "LPX64"\n", + lastid_seq); + o = ls_locate(env, ls, &dti->dti_fid); + if (IS_ERR(o)) + return PTR_ERR(o); + } + /* let's read seq-NNNNNN-lastid file value */ + LASSERT(dt_object_exists(o)); + dti->dti_off = 0; + dti->dti_lb.lb_buf = &losd; + dti->dti_lb.lb_len = sizeof(losd); + dt_read_lock(env, o, 0); + rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off); + dt_read_unlock(env, o); + lu_object_put_nocache(env, &o->do_lu); + if (rc == 0 && le32_to_cpu(losd.lso_magic) != LOS_MAGIC) { + CERROR("%s: wrong content of seq-"LPX64"-lastid file, magic %x\n", + o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq, + le32_to_cpu(losd.lso_magic)); + return -EINVAL; + } else if (rc < 0) { + CERROR("%s: failed to read seq-"LPX64"-lastid: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq, rc); + return rc; + } + *first_oid = le32_to_cpu(losd.lso_next_oid); + return rc; +} + +/** + * Initialize local OID storage for required sequence. + * That may be needed for services that uses local files and requires + * dynamic OID allocation for them. + * + * Per each sequence we have an object with 'first_fid' identificator + * containing the counter for OIDs of locally created files with that + * sequence. + * + * It is used now by llog subsystem and MGS for NID tables + * + * Function gets first_fid to create counter object. 
+ * All dynamic fids will be generated with the same sequence and incremented + * OIDs + * + * Returned local_oid_storage is in-memory representaion of OID storage + */ +int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *first_fid, + struct local_oid_storage **los) +{ + struct dt_thread_info *dti = dt_info(env); + struct ls_device *ls; + obd_id lastid; + struct dt_object *o = NULL; + struct thandle *th; + __u32 first_oid = fid_oid(first_fid); + int rc = 0; + + ENTRY; + + ls = ls_device_get(dev); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + *los = dt_los_find(ls, fid_seq(first_fid)); + if (*los != NULL) + GOTO(out, rc = 0); + + /* not found, then create */ + OBD_ALLOC_PTR(*los); + if (*los == NULL) + GOTO(out, rc = -ENOMEM); + + atomic_set(&(*los)->los_refcount, 1); + mutex_init(&(*los)->los_id_lock); + (*los)->los_dev = &ls->ls_top_dev; + atomic_inc(&ls->ls_refcount); + list_add(&(*los)->los_list, &ls->ls_los_list); + + /* Use {seq, 0, 0} to create the LAST_ID file for every + * sequence. OIDs start at LUSTRE_FID_INIT_OID. 
+ */ + dti->dti_fid.f_seq = fid_seq(first_fid); + dti->dti_fid.f_oid = LUSTRE_FID_LASTID_OID; + dti->dti_fid.f_ver = 0; + o = ls_locate(env, ls, &dti->dti_fid); + if (IS_ERR(o)) + GOTO(out_los, rc = PTR_ERR(o)); + + if (!dt_object_exists(o)) { + rc = lastid_compat_check(env, dev, fid_seq(first_fid), + &first_oid, ls); + if (rc < 0) + GOTO(out_los, rc); + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out_los, rc = PTR_ERR(th)); + + dti->dti_attr.la_valid = LA_MODE | LA_TYPE; + dti->dti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dti->dti_dof.dof_type = dt_mode_to_dft(S_IFREG); + + rc = dt_declare_create(env, o, &dti->dti_attr, NULL, + &dti->dti_dof, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_declare_record_write(env, o, sizeof(lastid), 0, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(out_trans, rc); + + dt_write_lock(env, o, 0); + if (dt_object_exists(o)) + GOTO(out_lock, rc = 0); + + rc = dt_create(env, o, &dti->dti_attr, NULL, &dti->dti_dof, + th); + if (rc) + GOTO(out_lock, rc); + + lastid = cpu_to_le64(first_oid); + + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + rc = dt_record_write(env, o, &dti->dti_lb, &dti->dti_off, th); + if (rc) + GOTO(out_lock, rc); +out_lock: + dt_write_unlock(env, o); +out_trans: + dt_trans_stop(env, dev, th); + } else { + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + dt_read_lock(env, o, 0); + rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off); + dt_read_unlock(env, o); + if (rc == 0 && le64_to_cpu(lastid) > OBIF_MAX_OID) { + CERROR("%s: bad oid "LPU64" is read from LAST_ID\n", + o->do_lu.lo_dev->ld_obd->obd_name, + le64_to_cpu(lastid)); + rc = -EINVAL; + } + } +out_los: + if (rc != 0) { + list_del(&(*los)->los_list); + atomic_dec(&ls->ls_refcount); + OBD_FREE_PTR(*los); + *los = NULL; + if (o != NULL && !IS_ERR(o)) + lu_object_put_nocache(env, &o->do_lu); + } else { 
+ (*los)->los_seq = fid_seq(first_fid); + (*los)->los_last_oid = le64_to_cpu(lastid); + (*los)->los_obj = o; + /* read value should not be less than initial one */ + LASSERTF((*los)->los_last_oid >= first_oid, "%u < %u\n", + (*los)->los_last_oid, first_oid); + } +out: + mutex_unlock(&ls->ls_los_mutex); + ls_device_put(env, ls); + return rc; +} +EXPORT_SYMBOL(local_oid_storage_init); + +void local_oid_storage_fini(const struct lu_env *env, + struct local_oid_storage *los) +{ + struct ls_device *ls; + + if (!atomic_dec_and_test(&los->los_refcount)) + return; + + LASSERT(env); + LASSERT(los->los_dev); + ls = dt2ls_dev(los->los_dev); + + mutex_lock(&ls->ls_los_mutex); + if (atomic_read(&los->los_refcount) == 0) { + if (los->los_obj) + lu_object_put_nocache(env, &los->los_obj->do_lu); + list_del(&los->los_list); + OBD_FREE_PTR(los); + } + mutex_unlock(&ls->ls_los_mutex); + ls_device_put(env, ls); +} +EXPORT_SYMBOL(local_oid_storage_fini); diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.h b/drivers/staging/lustre/lustre/obdclass/local_storage.h new file mode 100644 index 000000000000..d553c3752703 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/local_storage.h @@ -0,0 +1,88 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. 
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/obdclass/local_storage.h
+ *
+ * Local storage for file/objects with fid generation. Works on top of OSD.
+ *
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+/* NOTE(review): this header has no include guard; it relies on being
+ * included exactly once per translation unit -- confirm before reuse. */
+#include <dt_object.h>
+#include <obd.h>
+#include <lustre_fid.h>
+#include <lustre_disk.h>
+
+/* Per-OSD local-storage device: a thin dt_device stacked on top of an
+ * OSD so local objects can be located through ls_locate(). */
+struct ls_device {
+	struct dt_device	 ls_top_dev;
+	/* all initialized ls_devices on this node linked by this */
+	struct list_head	 ls_linkage;
+	/* how many handles reference this local storage */
+	atomic_t		 ls_refcount;
+	/* underlying OSD device */
+	struct dt_device	*ls_osd;
+	/* list of all local OID storages */
+	struct list_head	 ls_los_list;
+	struct mutex		 ls_los_mutex;
+};
+
+/* Convert a dt_device embedded as ls_top_dev back to its ls_device. */
+static inline struct ls_device *dt2ls_dev(struct dt_device *d)
+{
+	return container_of0(d, struct ls_device, ls_top_dev);
+}
+
+/* A local-storage object: lu_object_header plus the dt_object itself,
+ * allocated as a single chunk by ls_object_alloc(). */
+struct ls_object {
+	struct lu_object_header	 ls_header;
+	struct dt_object	 ls_obj;
+};
+
+/* Convert a lu_object embedded in ls_obj back to its ls_object. */
+static inline struct ls_object *lu2ls_obj(struct lu_object *o)
+{
+	return container_of0(o, struct ls_object, ls_obj.do_lu);
+}
+
+/* Locate/instantiate the object with @fid on the underlying OSD via the
+ * local-storage top device of @ls. */
+static inline struct dt_object *ls_locate(const struct lu_env *env,
+					  struct ls_device *ls,
+					  const struct lu_fid *fid)
+{
+	return dt_locate_at(env, ls->ls_osd, fid, &ls->ls_top_dev.dd_lu_dev);
+}
+
+struct ls_device *ls_device_get(struct dt_device *dev);
+void ls_device_put(const struct lu_env *env, struct ls_device *ls);
+struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq);
+void dt_los_put(struct local_oid_storage *los);
+
+/* Lustre 2.3 on-disk structure describing local object OIDs storage
+ * the structure to be used with any sequence managed by
+ * local object library.
+ * Obsoleted since 2.4 but is kept for compatibility reasons, + * see lastid_compat_check() in obdclass/local_storage.c */ +struct los_ondisk { + __u32 lso_magic; + __u32 lso_next_oid; +}; + +#define LOS_MAGIC 0xdecafbee diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c new file mode 100644 index 000000000000..e2d57fef0da3 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c @@ -0,0 +1,562 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011, 2012, Intel Corporation. + * Use is subject to license terms. + * + * Author: Niu Yawei <niu@whamcloud.com> + */ +/* + * lustre/obdclass/lprocfs_jobstats.c + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_CLASS + + +#include <obd_class.h> +#include <lprocfs_status.h> +#include <lustre/lustre_idl.h> + +#if defined(LPROCFS) + +/* + * JobID formats & JobID environment variable names for supported + * job schedulers: + * + * SLURM: + * JobID format: 32 bit integer. + * JobID env var: SLURM_JOB_ID. + * SGE: + * JobID format: Decimal integer range to 99999. + * JobID env var: JOB_ID. 
+ * LSF: + * JobID format: 6 digit integer by default (up to 999999), can be + * increased to 10 digit (up to 2147483646). + * JobID env var: LSB_JOBID. + * Loadleveler: + * JobID format: String of machine_name.cluster_id.process_id, for + * example: fr2n02.32.0 + * JobID env var: LOADL_STEP_ID. + * PBS: + * JobID format: String of sequence_number[.server_name][@server]. + * JobID env var: PBS_JOBID. + * Maui/MOAB: + * JobID format: Same as PBS. + * JobID env var: Same as PBS. + */ + +struct job_stat { + struct hlist_node js_hash; + struct list_head js_list; + atomic_t js_refcount; + char js_jobid[JOBSTATS_JOBID_SIZE]; + time_t js_timestamp; /* seconds */ + struct lprocfs_stats *js_stats; + struct obd_job_stats *js_jobstats; +}; + +static unsigned job_stat_hash(cfs_hash_t *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, strlen(key), mask); +} + +static void *job_stat_key(struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + return job->js_jobid; +} + +static int job_stat_keycmp(const void *key, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + return (strlen(job->js_jobid) == strlen(key)) && + !strncmp(job->js_jobid, key, strlen(key)); +} + +static void *job_stat_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct job_stat, js_hash); +} + +static void job_stat_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + atomic_inc(&job->js_refcount); +} + +static void job_free(struct job_stat *job) +{ + LASSERT(atomic_read(&job->js_refcount) == 0); + LASSERT(job->js_jobstats); + + write_lock(&job->js_jobstats->ojs_lock); + list_del_init(&job->js_list); + write_unlock(&job->js_jobstats->ojs_lock); + + lprocfs_free_stats(&job->js_stats); + OBD_FREE_PTR(job); +} + +static void job_putref(struct job_stat *job) +{ + 
LASSERT(atomic_read(&job->js_refcount) > 0); + if (atomic_dec_and_test(&job->js_refcount)) + job_free(job); +} + +static void job_stat_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + job_putref(job); +} + +static void job_stat_exit(cfs_hash_t *hs, struct hlist_node *hnode) +{ + CERROR("Should not have any items!"); +} + +static cfs_hash_ops_t job_stats_hash_ops = { + .hs_hash = job_stat_hash, + .hs_key = job_stat_key, + .hs_keycmp = job_stat_keycmp, + .hs_object = job_stat_object, + .hs_get = job_stat_get, + .hs_put_locked = job_stat_put_locked, + .hs_exit = job_stat_exit, +}; + +static int job_iter_callback(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *data) +{ + time_t oldest = *((time_t *)data); + struct job_stat *job; + + job = hlist_entry(hnode, struct job_stat, js_hash); + if (!oldest || job->js_timestamp < oldest) + cfs_hash_bd_del_locked(hs, bd, hnode); + + return 0; +} + +static void lprocfs_job_cleanup(struct obd_job_stats *stats, bool force) +{ + time_t oldest, now; + + if (stats->ojs_cleanup_interval == 0) + return; + + now = cfs_time_current_sec(); + if (!force && now < stats->ojs_last_cleanup + + stats->ojs_cleanup_interval) + return; + + oldest = now - stats->ojs_cleanup_interval; + cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback, + &oldest); + stats->ojs_last_cleanup = cfs_time_current_sec(); +} + +static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs) +{ + struct job_stat *job; + + LASSERT(jobs->ojs_cntr_num && jobs->ojs_cntr_init_fn); + + OBD_ALLOC_PTR(job); + if (job == NULL) + return NULL; + + job->js_stats = lprocfs_alloc_stats(jobs->ojs_cntr_num, 0); + if (job->js_stats == NULL) { + OBD_FREE_PTR(job); + return NULL; + } + + jobs->ojs_cntr_init_fn(job->js_stats); + + memcpy(job->js_jobid, jobid, JOBSTATS_JOBID_SIZE); + job->js_timestamp = cfs_time_current_sec(); + job->js_jobstats = jobs; + 
INIT_HLIST_NODE(&job->js_hash); + INIT_LIST_HEAD(&job->js_list); + atomic_set(&job->js_refcount, 1); + + return job; +} + +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, + int event, long amount) +{ + struct obd_job_stats *stats = &obd->u.obt.obt_jobstats; + struct job_stat *job, *job2; + ENTRY; + + LASSERT(stats && stats->ojs_hash); + + lprocfs_job_cleanup(stats, false); + + if (!jobid || !strlen(jobid)) + RETURN(-EINVAL); + + if (strlen(jobid) >= JOBSTATS_JOBID_SIZE) { + CERROR("Invalid jobid size (%lu), expect(%d)\n", + (unsigned long)strlen(jobid) + 1, JOBSTATS_JOBID_SIZE); + RETURN(-EINVAL); + } + + job = cfs_hash_lookup(stats->ojs_hash, jobid); + if (job) + goto found; + + job = job_alloc(jobid, stats); + if (job == NULL) + RETURN(-ENOMEM); + + job2 = cfs_hash_findadd_unique(stats->ojs_hash, job->js_jobid, + &job->js_hash); + if (job2 != job) { + job_putref(job); + job = job2; + /* We cannot LASSERT(!list_empty(&job->js_list)) here, + * since we just lost the race for inserting "job" into the + * ojs_list, and some other thread is doing it _right_now_. + * Instead, be content the other thread is doing this, since + * "job2" was initialized in job_alloc() already. 
LU-2163 */ + } else { + LASSERT(list_empty(&job->js_list)); + write_lock(&stats->ojs_lock); + list_add_tail(&job->js_list, &stats->ojs_list); + write_unlock(&stats->ojs_lock); + } + +found: + LASSERT(stats == job->js_jobstats); + LASSERT(stats->ojs_cntr_num > event); + job->js_timestamp = cfs_time_current_sec(); + lprocfs_counter_add(job->js_stats, event, amount); + + job_putref(job); + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_job_stats_log); + +void lprocfs_job_stats_fini(struct obd_device *obd) +{ + struct obd_job_stats *stats = &obd->u.obt.obt_jobstats; + time_t oldest = 0; + + if (stats->ojs_hash == NULL) + return; + cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback, &oldest); + cfs_hash_putref(stats->ojs_hash); + stats->ojs_hash = NULL; + LASSERT(list_empty(&stats->ojs_list)); +} +EXPORT_SYMBOL(lprocfs_job_stats_fini); + +static void *lprocfs_jobstats_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_job_stats *stats = p->private; + loff_t off = *pos; + struct job_stat *job; + + read_lock(&stats->ojs_lock); + if (off == 0) + return SEQ_START_TOKEN; + off--; + list_for_each_entry(job, &stats->ojs_list, js_list) { + if (!off--) + return job; + } + return NULL; +} + +static void lprocfs_jobstats_seq_stop(struct seq_file *p, void *v) +{ + struct obd_job_stats *stats = p->private; + + read_unlock(&stats->ojs_lock); +} + +static void *lprocfs_jobstats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_job_stats *stats = p->private; + struct job_stat *job; + struct list_head *next; + + ++*pos; + if (v == SEQ_START_TOKEN) { + next = stats->ojs_list.next; + } else { + job = (struct job_stat *)v; + next = job->js_list.next; + } + + return next == &stats->ojs_list ? 
NULL : + list_entry(next, struct job_stat, js_list); +} + +/* + * Example of output on MDT: + * + * job_stats: + * - job_id: test_id.222.25844 + * snapshot_time: 1322494486 + * open: { samples: 3, unit: reqs } + * close: { samples: 3, unit: reqs } + * mknod: { samples: 0, unit: reqs } + * link: { samples: 0, unit: reqs } + * unlink: { samples: 0, unit: reqs } + * mkdir: { samples: 0, unit: reqs } + * rmdir: { samples: 0, unit: reqs } + * rename: { samples: 1, unit: reqs } + * getattr: { samples: 7, unit: reqs } + * setattr: { samples: 0, unit: reqs } + * getxattr: { samples: 0, unit: reqs } + * setxattr: { samples: 0, unit: reqs } + * statfs: { samples: 0, unit: reqs } + * sync: { samples: 0, unit: reqs } + * + * Example of output on OST: + * + * job_stats: + * - job_id 4854 + * snapshot_time: 1322494602 + * read: { samples: 0, unit: bytes, min: 0, max: 0, sum: 0 } + * write: { samples: 1, unit: bytes, min: 10, max: 10, sum: 10 } + * setattr: { samples: 0, unit: reqs } + * punch: { samples: 0, unit: reqs } + * sync: { samples: 0, unit: reqs } + */ + +static const char spaces[] = " "; + +static int inline width(const char *str, int len) +{ + return len - min((int)strlen(str), 15); +} + +static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v) +{ + struct job_stat *job = v; + struct lprocfs_stats *s; + struct lprocfs_counter ret; + struct lprocfs_counter *cntr; + struct lprocfs_counter_header *cntr_header; + int i; + + if (v == SEQ_START_TOKEN) { + seq_printf(p, "job_stats:\n"); + return 0; + } + + seq_printf(p, "- %-16s %s\n", "job_id:", job->js_jobid); + seq_printf(p, " %-16s %ld\n", "snapshot_time:", job->js_timestamp); + + s = job->js_stats; + for (i = 0; i < s->ls_num; i++) { + cntr = lprocfs_stats_counter_get(s, 0, i); + cntr_header = &s->ls_cnt_header[i]; + lprocfs_stats_collect(s, i, &ret); + + seq_printf(p, " %s:%.*s { samples: %11"LPF64"u", + cntr_header->lc_name, + width(cntr_header->lc_name, 15), spaces, + ret.lc_count); + if 
(cntr_header->lc_units[0] != '\0') + seq_printf(p, ", unit: %5s", cntr_header->lc_units); + + if (cntr_header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + seq_printf(p, ", min:%8"LPF64"u, max:%8"LPF64"u," + " sum:%16"LPF64"u", + ret.lc_count ? ret.lc_min : 0, + ret.lc_count ? ret.lc_max : 0, + ret.lc_count ? ret.lc_sum : 0); + } + if (cntr_header->lc_config & LPROCFS_CNTR_STDDEV) { + seq_printf(p, ", sumsq: %18"LPF64"u", + ret.lc_count ? ret.lc_sumsquare : 0); + } + + seq_printf(p, " }\n"); + + } + return 0; +} + +struct seq_operations lprocfs_jobstats_seq_sops = { + start: lprocfs_jobstats_seq_start, + stop: lprocfs_jobstats_seq_stop, + next: lprocfs_jobstats_seq_next, + show: lprocfs_jobstats_seq_show, +}; + +static int lprocfs_jobstats_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lprocfs_jobstats_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = PDE_DATA(inode); + return 0; +} + +static ssize_t lprocfs_jobstats_seq_write(struct file *file, const char *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_job_stats *stats = seq->private; + char jobid[JOBSTATS_JOBID_SIZE]; + int all = 0; + struct job_stat *job; + + if (!memcmp(buf, "clear", strlen("clear"))) { + all = 1; + } else if (len < JOBSTATS_JOBID_SIZE) { + memset(jobid, 0, JOBSTATS_JOBID_SIZE); + /* Trim '\n' if any */ + if (buf[len - 1] == '\n') + memcpy(jobid, buf, len - 1); + else + memcpy(jobid, buf, len); + } else { + return -EINVAL; + } + + LASSERT(stats->ojs_hash); + if (all) { + time_t oldest = 0; + cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback, + &oldest); + return len; + } + + if (!strlen(jobid)) + return -EINVAL; + + job = cfs_hash_lookup(stats->ojs_hash, jobid); + if (!job) + return -EINVAL; + + cfs_hash_del_key(stats->ojs_hash, jobid); + + job_putref(job); + return len; +} + +struct file_operations lprocfs_jobstats_seq_fops = { + .owner = THIS_MODULE, + 
.open = lprocfs_jobstats_seq_open, + .read = seq_read, + .write = lprocfs_jobstats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; + +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback init_fn) +{ + struct proc_dir_entry *entry; + struct obd_job_stats *stats; + ENTRY; + + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_type->typ_name); + + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME)) { + CERROR("Invalid obd device type.\n"); + RETURN(-EINVAL); + } + stats = &obd->u.obt.obt_jobstats; + + LASSERT(stats->ojs_hash == NULL); + stats->ojs_hash = cfs_hash_create("JOB_STATS", + HASH_JOB_STATS_CUR_BITS, + HASH_JOB_STATS_MAX_BITS, + HASH_JOB_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &job_stats_hash_ops, + CFS_HASH_DEFAULT); + if (stats->ojs_hash == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&stats->ojs_list); + rwlock_init(&stats->ojs_lock); + stats->ojs_cntr_num = cntr_num; + stats->ojs_cntr_init_fn = init_fn; + stats->ojs_cleanup_interval = 600; /* 10 mins by default */ + stats->ojs_last_cleanup = cfs_time_current_sec(); + + entry = proc_create_data("job_stats", 0644, obd->obd_proc_entry, + &lprocfs_jobstats_seq_fops, stats); + if (entry) + RETURN(0); + else + RETURN(-ENOMEM); +} +EXPORT_SYMBOL(lprocfs_job_stats_init); + +int lprocfs_rd_job_interval(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_job_stats *stats; + + LASSERT(obd != NULL); + stats = &obd->u.obt.obt_jobstats; + return seq_printf(m, "%d\n", stats->ojs_cleanup_interval); +} +EXPORT_SYMBOL(lprocfs_rd_job_interval); + +int lprocfs_wr_job_interval(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_job_stats *stats; + int val, rc; + + LASSERT(obd != NULL); + stats = &obd->u.obt.obt_jobstats; + + rc = 
lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + stats->ojs_cleanup_interval = val; + lprocfs_job_cleanup(stats, true); + + return count; + +} +EXPORT_SYMBOL(lprocfs_wr_job_interval); + +#endif /* LPROCFS*/ diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c new file mode 100644 index 000000000000..f7af3d6a4efc --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c @@ -0,0 +1,1985 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/lprocfs_status.c + * + * Author: Hariharan Thantry <thantry@users.sourceforge.net> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + + +#include <obd_class.h> +#include <lprocfs_status.h> +#include <lustre/lustre_idl.h> +#include <linux/seq_file.h> + +#if defined(LPROCFS) + +static int lprocfs_no_percpu_stats = 0; +CFS_MODULE_PARM(lprocfs_no_percpu_stats, "i", int, 0644, + "Do not alloc percpu data for lprocfs stats"); + +#define MAX_STRING_SIZE 128 + +int lprocfs_single_release(struct inode *inode, struct file *file) +{ + return single_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_single_release); + +int lprocfs_seq_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_seq_release); + +/* lprocfs API calls */ + +proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root, + char *name, void *data, + struct file_operations *fops) +{ + proc_dir_entry_t *proc; + umode_t mode = 0; + + if (root == NULL || name == NULL || fops == NULL) + return ERR_PTR(-EINVAL); + + if (fops->read) + mode = 0444; + if (fops->write) + mode |= 0200; + proc = proc_create_data(name, mode, root, fops, data); + if (!proc) { + CERROR("LprocFS: No memory to create /proc entry %s", name); + return ERR_PTR(-ENOMEM); + } + return proc; +} +EXPORT_SYMBOL(lprocfs_add_simple); + +struct proc_dir_entry *lprocfs_add_symlink(const char *name, + struct proc_dir_entry *parent, const char *format, ...) 
+{ + struct proc_dir_entry *entry; + char *dest; + va_list ap; + + if (parent == NULL || format == NULL) + return NULL; + + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (dest == NULL) + return NULL; + + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); + + entry = proc_symlink(name, parent, dest); + if (entry == NULL) + CERROR("LprocFS: Could not create symbolic link from %s to %s", + name, dest); + + OBD_FREE(dest, MAX_STRING_SIZE + 1); + return entry; +} +EXPORT_SYMBOL(lprocfs_add_symlink); + +static struct file_operations lprocfs_generic_fops = { }; + +/** + * Add /proc entries. + * + * \param root [in] The parent proc entry on which new entry will be added. + * \param list [in] Array of proc entries to be added. + * \param data [in] The argument to be passed when entries read/write routines + * are called through /proc file. + * + * \retval 0 on success + * < 0 on error + */ +int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, + void *data) +{ + if (root == NULL || list == NULL) + return -EINVAL; + + while (list->name != NULL) { + struct proc_dir_entry *proc; + umode_t mode = 0; + + if (list->proc_mode != 0000) { + mode = list->proc_mode; + } else if (list->fops) { + if (list->fops->read) + mode = 0444; + if (list->fops->write) + mode |= 0200; + } + proc = proc_create_data(list->name, mode, root, + list->fops ?: &lprocfs_generic_fops, + list->data ?: data); + if (proc == NULL) + return -ENOMEM; + list++; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_add_vars); + +void lprocfs_remove(struct proc_dir_entry **rooth) +{ + proc_remove(*rooth); + *rooth = NULL; +} +EXPORT_SYMBOL(lprocfs_remove); + +void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + LASSERT(parent != NULL); + remove_proc_entry(name, parent); +} +EXPORT_SYMBOL(lprocfs_remove_proc_entry); + +struct proc_dir_entry *lprocfs_register(const char *name, + struct proc_dir_entry *parent, + struct lprocfs_vars *list, 
void *data) +{ + struct proc_dir_entry *newchild; + + newchild = proc_mkdir(name, parent); + if (newchild != NULL && list != NULL) { + int rc = lprocfs_add_vars(newchild, list, data); + if (rc) { + lprocfs_remove(&newchild); + return ERR_PTR(rc); + } + } + return newchild; +} +EXPORT_SYMBOL(lprocfs_register); + +/* Generic callbacks */ +int lprocfs_rd_uint(struct seq_file *m, void *data) +{ + return seq_printf(m, "%u\n", *(unsigned int *)data); +} +EXPORT_SYMBOL(lprocfs_rd_uint); + +int lprocfs_wr_uint(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned *p = data; + char dummy[MAX_STRING_SIZE + 1], *end; + unsigned long tmp; + + dummy[MAX_STRING_SIZE] = '\0'; + if (copy_from_user(dummy, buffer, MAX_STRING_SIZE)) + return -EFAULT; + + tmp = simple_strtoul(dummy, &end, 0); + if (dummy == end) + return -EINVAL; + + *p = (unsigned int)tmp; + return count; +} +EXPORT_SYMBOL(lprocfs_wr_uint); + +int lprocfs_rd_u64(struct seq_file *m, void *data) +{ + return seq_printf(m, LPU64"\n", *(__u64 *)data); +} +EXPORT_SYMBOL(lprocfs_rd_u64); + +int lprocfs_rd_atomic(struct seq_file *m, void *data) +{ + atomic_t *atom = data; + LASSERT(atom != NULL); + return seq_printf(m, "%d\n", atomic_read(atom)); +} +EXPORT_SYMBOL(lprocfs_rd_atomic); + +int lprocfs_wr_atomic(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + atomic_t *atm = data; + int val = 0; + int rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + + if (val <= 0) + return -ERANGE; + + atomic_set(atm, val); + return count; +} +EXPORT_SYMBOL(lprocfs_wr_atomic); + +int lprocfs_rd_uuid(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + + LASSERT(obd != NULL); + return seq_printf(m, "%s\n", obd->obd_uuid.uuid); +} +EXPORT_SYMBOL(lprocfs_rd_uuid); + +int lprocfs_rd_name(struct seq_file *m, void *data) +{ + struct obd_device *dev = data; + + LASSERT(dev != NULL); + return seq_printf(m, "%s\n", 
dev->obd_name); +} +EXPORT_SYMBOL(lprocfs_rd_name); + +int lprocfs_rd_blksize(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + rc = seq_printf(m, "%u\n", osfs.os_bsize); + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_blksize); + +int lprocfs_rd_kbytestotal(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + rc = seq_printf(m, LPU64"\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_kbytestotal); + +int lprocfs_rd_kbytesfree(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + rc = seq_printf(m, LPU64"\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_kbytesfree); + +int lprocfs_rd_kbytesavail(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + rc = seq_printf(m, LPU64"\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_kbytesavail); + +int lprocfs_rd_filestotal(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = 
obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + rc = seq_printf(m, LPU64"\n", osfs.os_files); + + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_filestotal); + +int lprocfs_rd_filesfree(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + rc = seq_printf(m, LPU64"\n", osfs.os_ffree); + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_filesfree); + +int lprocfs_rd_server_uuid(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp; + char *imp_state_name = NULL; + int rc = 0; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + imp_state_name = ptlrpc_import_state_name(imp->imp_state); + rc = seq_printf(m, "%s\t%s%s\n", obd2cli_tgt(obd), imp_state_name, + imp->imp_deactive ? "\tDEACTIVATED" : ""); + + LPROCFS_CLIMP_EXIT(obd); + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_server_uuid); + +int lprocfs_rd_conn_uuid(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct ptlrpc_connection *conn; + int rc = 0; + + LASSERT(obd != NULL); + + LPROCFS_CLIMP_CHECK(obd); + conn = obd->u.cli.cl_import->imp_connection; + if (conn && obd->u.cli.cl_import) + rc = seq_printf(m, "%s\n", conn->c_remote_uuid.uuid); + else + rc = seq_printf(m, "%s\n", "<none>"); + + LPROCFS_CLIMP_EXIT(obd); + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_conn_uuid); + +/** add up per-cpu counters */ +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ + unsigned int num_entry; + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *cntr_header; + int i; + unsigned long flags = 0; + + memset(cnt, 0, sizeof(*cnt)); + + if (stats == NULL) { + /* set count to 1 to avoid divide-by-zero errs in callers */ + cnt->lc_count = 
1; + return; + } + + cnt->lc_min = LC_MIN_INIT; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + cntr_header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, i, idx); + + cnt->lc_count += percpu_cntr->lc_count; + cnt->lc_sum += percpu_cntr->lc_sum; + if (percpu_cntr->lc_min < cnt->lc_min) + cnt->lc_min = percpu_cntr->lc_min; + if (percpu_cntr->lc_max > cnt->lc_max) + cnt->lc_max = percpu_cntr->lc_max; + cnt->lc_sumsquare += percpu_cntr->lc_sumsquare; + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_stats_collect); + +/** + * Append a space separated list of current set flags to str. + */ +#define flag2str(flag, first) \ + do { \ + if (imp->imp_##flag) \ + seq_printf(m, "%s" #flag, first ? "" : ", "); \ + } while (0) +static int obd_import_flags2str(struct obd_import *imp, struct seq_file *m) +{ + bool first = true; + + if (imp->imp_obd->obd_no_recov) { + seq_printf(m, "no_recov"); + first = false; + } + + flag2str(invalid, first); + first = false; + flag2str(deactive, first); + flag2str(replayable, first); + flag2str(pingable, first); + return 0; +} +#undef flags2str + +static const char *obd_connect_names[] = { + "read_only", + "lov_index", + "unused", + "write_grant", + "server_lock", + "version", + "request_portal", + "acl", + "xattr", + "create_on_write", + "truncate_lock", + "initial_transno", + "inode_bit_locks", + "join_file(obsolete)", + "getattr_by_fid", + "no_oh_for_devices", + "remote_client", + "remote_client_by_force", + "max_byte_per_rpc", + "64bit_qdata", + "mds_capability", + "oss_capability", + "early_lock_cancel", + "som", + "adaptive_timeouts", + "lru_resize", + "mds_mds_connection", + "real_conn", + "change_qunit_size", + "alt_checksum_algorithm", + "fid_is_enabled", + "version_recovery", + "pools", + "grant_shrink", + "skip_orphan", + "large_ea", + "full20", + 
"layout_lock", + "64bithash", + "object_max_bytes", + "imp_recov", + "jobstats", + "umask", + "einprogress", + "grant_param", + "flock_owner", + "lvb_type", + "nanoseconds_times", + "lightweight_conn", + "short_io", + "pingless", + "unknown", + NULL +}; + +static void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, char *sep) +{ + __u64 mask = 1; + int i; + bool first = true; + + for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags & mask) { + seq_printf(m, "%s%s", + first ? sep : "", obd_connect_names[i]); + first = false; + } + } + if (flags & ~(mask - 1)) + seq_printf(m, "%sunknown flags "LPX64, + first ? sep : "", flags & ~(mask - 1)); +} + +int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep) +{ + __u64 mask = 1; + int i, ret = 0; + + for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags & mask) + ret += snprintf(page + ret, count - ret, "%s%s", + ret ? sep : "", obd_connect_names[i]); + } + if (flags & ~(mask - 1)) + ret += snprintf(page + ret, count - ret, + "%sunknown flags "LPX64, + ret ? 
sep : "", flags & ~(mask - 1)); + return ret; +} +EXPORT_SYMBOL(obd_connect_flags2str); + +int lprocfs_rd_import(struct seq_file *m, void *data) +{ + struct lprocfs_counter ret; + struct lprocfs_counter_header *header; + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + struct obd_import_conn *conn; + int j; + int k; + int rw = 0; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + + seq_printf(m, + "import:\n" + " name: %s\n" + " target: %s\n" + " state: %s\n" + " instance: %u\n" + " connect_flags: [", + obd->obd_name, + obd2cli_tgt(obd), + ptlrpc_import_state_name(imp->imp_state), + imp->imp_connect_data.ocd_instance); + obd_connect_seq_flags2str(m, imp->imp_connect_data.ocd_connect_flags, ", "); + seq_printf(m, + "]\n" + " import_flags: ["); + obd_import_flags2str(imp, m); + + seq_printf(m, + "]\n" + " connection:\n" + " failover_nids: ["); + spin_lock(&imp->imp_lock); + j = 0; + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + seq_printf(m, "%s%s", j ? ", " : "", + libcfs_nid2str(conn->oic_conn->c_peer.nid)); + j++; + } + seq_printf(m, + "]\n" + " current_connection: %s\n" + " connection_attempts: %u\n" + " generation: %u\n" + " in-progress_invalidations: %u\n", + imp->imp_connection == NULL ? 
"<none>" : + libcfs_nid2str(imp->imp_connection->c_peer.nid), + imp->imp_conn_cnt, + imp->imp_generation, + atomic_read(&imp->imp_inval_count)); + spin_unlock(&imp->imp_lock); + + if (obd->obd_svc_stats == NULL) + goto out_climp; + + header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR]; + lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret); + if (ret.lc_count != 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + } else + ret.lc_sum = 0; + seq_printf(m, + " rpcs:\n" + " inflight: %u\n" + " unregistering: %u\n" + " timeouts: %u\n" + " avg_waittime: "LPU64" %s\n", + atomic_read(&imp->imp_inflight), + atomic_read(&imp->imp_unregistering), + atomic_read(&imp->imp_timeouts), + ret.lc_sum, header->lc_units); + + k = 0; + for(j = 0; j < IMP_AT_MAX_PORTALS; j++) { + if (imp->imp_at.iat_portal[j] == 0) + break; + k = max_t(unsigned int, k, + at_get(&imp->imp_at.iat_service_estimate[j])); + } + seq_printf(m, + " service_estimates:\n" + " services: %u sec\n" + " network: %u sec\n", + k, + at_get(&imp->imp_at.iat_net_latency)); + + seq_printf(m, + " transactions:\n" + " last_replay: "LPU64"\n" + " peer_committed: "LPU64"\n" + " last_checked: "LPU64"\n", + imp->imp_last_replay_transno, + imp->imp_peer_committed_transno, + imp->imp_last_transno_checked); + + /* avg data rates */ + for (rw = 0; rw <= 1; rw++) { + lprocfs_stats_collect(obd->obd_svc_stats, + PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw, + &ret); + if (ret.lc_sum > 0 && ret.lc_count > 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + seq_printf(m, + " %s_data_averages:\n" + " bytes_per_rpc: "LPU64"\n", + rw ? 
"write" : "read", + ret.lc_sum); + } + k = (int)ret.lc_sum; + j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES; + header = &obd->obd_svc_stats->ls_cnt_header[j]; + lprocfs_stats_collect(obd->obd_svc_stats, j, &ret); + if (ret.lc_sum > 0 && ret.lc_count != 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + seq_printf(m, + " %s_per_rpc: "LPU64"\n", + header->lc_units, ret.lc_sum); + j = (int)ret.lc_sum; + if (j > 0) + seq_printf(m, + " MB_per_sec: %u.%.02u\n", + k / j, (100 * k / j) % 100); + } + } + +out_climp: + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_import); + +int lprocfs_rd_state(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + int j, k; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + + seq_printf(m, "current_state: %s\n", + ptlrpc_import_state_name(imp->imp_state)); + seq_printf(m, "state_history:\n"); + k = imp->imp_state_hist_idx; + for (j = 0; j < IMP_STATE_HIST_LEN; j++) { + struct import_state_hist *ish = + &imp->imp_state_hist[(k + j) % IMP_STATE_HIST_LEN]; + if (ish->ish_state == 0) + continue; + seq_printf(m, " - ["CFS_TIME_T", %s]\n", + ish->ish_time, + ptlrpc_import_state_name(ish->ish_state)); + } + + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_state); + +int lprocfs_at_hist_helper(struct seq_file *m, struct adaptive_timeout *at) +{ + int i; + for (i = 0; i < AT_BINS; i++) + seq_printf(m, "%3u ", at->at_hist[i]); + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_at_hist_helper); + +/* See also ptlrpc_lprocfs_rd_timeouts */ +int lprocfs_rd_timeouts(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + unsigned int cur, worst; + time_t now, worstt; + struct dhms ts; + int i; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = 
obd->u.cli.cl_import; + + now = cfs_time_current_sec(); + + /* Some network health info for kicks */ + s2dhms(&ts, now - imp->imp_last_reply_time); + seq_printf(m, "%-10s : %ld, "DHMS_FMT" ago\n", + "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts)); + + cur = at_get(&imp->imp_at.iat_net_latency); + worst = imp->imp_at.iat_net_latency.at_worst_ever; + worstt = imp->imp_at.iat_net_latency.at_worst_time; + s2dhms(&ts, now - worstt); + seq_printf(m, "%-10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ", + "network", cur, worst, worstt, DHMS_VARS(&ts)); + lprocfs_at_hist_helper(m, &imp->imp_at.iat_net_latency); + + for(i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (imp->imp_at.iat_portal[i] == 0) + break; + cur = at_get(&imp->imp_at.iat_service_estimate[i]); + worst = imp->imp_at.iat_service_estimate[i].at_worst_ever; + worstt = imp->imp_at.iat_service_estimate[i].at_worst_time; + s2dhms(&ts, now - worstt); + seq_printf(m, "portal %-2d : cur %3u worst %3u (at %ld, " + DHMS_FMT" ago) ", imp->imp_at.iat_portal[i], + cur, worst, worstt, DHMS_VARS(&ts)); + lprocfs_at_hist_helper(m, &imp->imp_at.iat_service_estimate[i]); + } + + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_timeouts); + +int lprocfs_rd_connect_flags(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + __u64 flags; + + LPROCFS_CLIMP_CHECK(obd); + flags = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags; + seq_printf(m, "flags="LPX64"\n", flags); + obd_connect_seq_flags2str(m, flags, "\n"); + seq_printf(m, "\n"); + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_connect_flags); + +int lprocfs_rd_num_exports(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + + LASSERT(obd != NULL); + return seq_printf(m, "%u\n", obd->obd_num_exports); +} +EXPORT_SYMBOL(lprocfs_rd_num_exports); + +int lprocfs_rd_numrefs(struct seq_file *m, void *data) +{ + struct obd_type *class = (struct obd_type*) data; + + LASSERT(class != NULL); + return 
seq_printf(m, "%d\n", class->typ_refcnt); +} +EXPORT_SYMBOL(lprocfs_rd_numrefs); + +int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list) +{ + int rc = 0; + + LASSERT(obd != NULL); + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(obd->obd_type->typ_procroot != NULL); + + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + obd->obd_type->typ_procroot, + list, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n",rc,obd->obd_name); + obd->obd_proc_entry = NULL; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_obd_setup); + +int lprocfs_obd_cleanup(struct obd_device *obd) +{ + if (!obd) + return -EINVAL; + if (obd->obd_proc_exports_entry) { + /* Should be no exports left */ + lprocfs_remove(&obd->obd_proc_exports_entry); + obd->obd_proc_exports_entry = NULL; + } + if (obd->obd_proc_entry) { + lprocfs_remove(&obd->obd_proc_entry); + obd->obd_proc_entry = NULL; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_obd_cleanup); + +static void lprocfs_free_client_stats(struct nid_stat *client_stat) +{ + CDEBUG(D_CONFIG, "stat %p - data %p/%p\n", client_stat, + client_stat->nid_proc, client_stat->nid_stats); + + LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0, + "nid %s:count %d\n", libcfs_nid2str(client_stat->nid), + atomic_read(&client_stat->nid_exp_ref_count)); + + if (client_stat->nid_proc) + lprocfs_remove(&client_stat->nid_proc); + + if (client_stat->nid_stats) + lprocfs_free_stats(&client_stat->nid_stats); + + if (client_stat->nid_ldlm_stats) + lprocfs_free_stats(&client_stat->nid_ldlm_stats); + + OBD_FREE_PTR(client_stat); + return; + +} + +void lprocfs_free_per_client_stats(struct obd_device *obd) +{ + cfs_hash_t *hash = obd->obd_nid_stats_hash; + struct nid_stat *stat; + ENTRY; + + /* we need extra list - because hash_exit called to early */ + /* not need locking because all clients is died */ + while (!list_empty(&obd->obd_nid_stats)) { + stat = 
list_entry(obd->obd_nid_stats.next, + struct nid_stat, nid_list); + list_del_init(&stat->nid_list); + cfs_hash_del(hash, &stat->nid, &stat->nid_hash); + lprocfs_free_client_stats(stat); + } + EXIT; +} +EXPORT_SYMBOL(lprocfs_free_per_client_stats); + +struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, + enum lprocfs_stats_flags flags) +{ + struct lprocfs_stats *stats; + unsigned int num_entry; + unsigned int percpusize = 0; + int i; + + if (num == 0) + return NULL; + + if (lprocfs_no_percpu_stats != 0) + flags |= LPROCFS_STATS_FLAG_NOPERCPU; + + if (flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + /* alloc percpu pointers for all possible cpu slots */ + LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); + if (stats == NULL) + return NULL; + + stats->ls_num = num; + stats->ls_flags = flags; + spin_lock_init(&stats->ls_lock); + + /* alloc num of counter headers */ + LIBCFS_ALLOC(stats->ls_cnt_header, + stats->ls_num * sizeof(struct lprocfs_counter_header)); + if (stats->ls_cnt_header == NULL) + goto fail; + + if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) { + /* contains only one set counters */ + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize); + if (stats->ls_percpu[0] == NULL) + goto fail; + stats->ls_biggest_alloc_num = 1; + } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) { + /* alloc all percpu data, currently only obd_memory use this */ + for (i = 0; i < num_entry; ++i) + if (lprocfs_stats_alloc_one(stats, i) < 0) + goto fail; + } + + return stats; + +fail: + lprocfs_free_stats(&stats); + return NULL; +} +EXPORT_SYMBOL(lprocfs_alloc_stats); + +void lprocfs_free_stats(struct lprocfs_stats **statsh) +{ + struct lprocfs_stats *stats = *statsh; + unsigned int num_entry; + unsigned int percpusize; + unsigned int i; + + if (stats == NULL || stats->ls_num == 0) + return; + *statsh = NULL; + + if (stats->ls_flags & 
LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + percpusize = lprocfs_stats_counter_size(stats); + for (i = 0; i < num_entry; i++) + if (stats->ls_percpu[i] != NULL) + LIBCFS_FREE(stats->ls_percpu[i], percpusize); + if (stats->ls_cnt_header != NULL) + LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num * + sizeof(struct lprocfs_counter_header)); + LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); +} +EXPORT_SYMBOL(lprocfs_free_stats); + +void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int i; + int j; + unsigned int num_entry; + unsigned long flags = 0; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + for (j = 0; j < stats->ls_num; j++) { + header = &stats->ls_cnt_header[j]; + percpu_cntr = lprocfs_stats_counter_get(stats, i, j); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + percpu_cntr->lc_sum_irq = 0; + } + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_clear_stats); + +static ssize_t lprocfs_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct lprocfs_stats *stats = seq->private; + + lprocfs_clear_stats(stats); + + return len; +} + +static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos) +{ + struct lprocfs_stats *stats = p->private; + + return (*pos < stats->ls_num) ? 
pos : NULL; +} + +static void lprocfs_stats_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + (*pos)++; + return lprocfs_stats_seq_start(p, pos); +} + +/* seq file export of one lprocfs counter */ +static int lprocfs_stats_seq_show(struct seq_file *p, void *v) +{ + struct lprocfs_stats *stats = p->private; + struct lprocfs_counter_header *hdr; + struct lprocfs_counter ctr; + int idx = *(loff_t *)v; + int rc = 0; + + if (idx == 0) { + struct timeval now; + do_gettimeofday(&now); + rc = seq_printf(p, "%-25s %lu.%lu secs.usecs\n", + "snapshot_time", now.tv_sec, now.tv_usec); + if (rc < 0) + return rc; + } + hdr = &stats->ls_cnt_header[idx]; + lprocfs_stats_collect(stats, idx, &ctr); + + if (ctr.lc_count == 0) + goto out; + + rc = seq_printf(p, "%-25s "LPD64" samples [%s]", hdr->lc_name, + ctr.lc_count, hdr->lc_units); + + if (rc < 0) + goto out; + + if ((hdr->lc_config & LPROCFS_CNTR_AVGMINMAX) && (ctr.lc_count > 0)) { + rc = seq_printf(p, " "LPD64" "LPD64" "LPD64, + ctr.lc_min, ctr.lc_max, ctr.lc_sum); + if (rc < 0) + goto out; + if (hdr->lc_config & LPROCFS_CNTR_STDDEV) + rc = seq_printf(p, " "LPD64, ctr.lc_sumsquare); + if (rc < 0) + goto out; + } + rc = seq_printf(p, "\n"); +out: + return (rc < 0) ? 
rc : 0; +} + +struct seq_operations lprocfs_stats_seq_sops = { + .start = lprocfs_stats_seq_start, + .stop = lprocfs_stats_seq_stop, + .next = lprocfs_stats_seq_next, + .show = lprocfs_stats_seq_show, +}; + +static int lprocfs_stats_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lprocfs_stats_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = PDE_DATA(inode); + return 0; +} + +struct file_operations lprocfs_stats_seq_fops = { + .owner = THIS_MODULE, + .open = lprocfs_stats_seq_open, + .read = seq_read, + .write = lprocfs_stats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; + +int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, + struct lprocfs_stats *stats) +{ + struct proc_dir_entry *entry; + LASSERT(root != NULL); + + entry = proc_create_data(name, 0644, root, + &lprocfs_stats_seq_fops, stats); + if (entry == NULL) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(lprocfs_register_stats); + +void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, const char *units) +{ + struct lprocfs_counter_header *header; + struct lprocfs_counter *percpu_cntr; + unsigned long flags = 0; + unsigned int i; + unsigned int num_cpu; + + LASSERT(stats != NULL); + + header = &stats->ls_cnt_header[index]; + LASSERTF(header != NULL, "Failed to allocate stats header:[%d]%s/%s\n", + index, name, units); + + header->lc_config = conf; + header->lc_name = name; + header->lc_units = units; + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; ++i) { + if (stats->ls_percpu[i] == NULL) + continue; + percpu_cntr = lprocfs_stats_counter_get(stats, i, index); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + 
percpu_cntr->lc_sum_irq = 0; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_init); + +#define LPROCFS_OBD_OP_INIT(base, stats, op) \ +do { \ + unsigned int coffset = base + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < stats->ls_num); \ + lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \ +} while (0) + +void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) +{ + LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, attach); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, detach); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_alloc); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, precreate); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, create); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, create_async); + 
LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, merge_lvb); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, punch); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, sync); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, migrate); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, copy); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, iterate); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, enqueue); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, change_cbdata); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, find_cbdata); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel_unused); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, extent_calc); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_init); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_connect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_finish); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pin); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpin); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck); + 
LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, ping); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref); +} +EXPORT_SYMBOL(lprocfs_init_ops_stats); + +int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) +{ + struct lprocfs_stats *stats; + unsigned int num_stats; + int rc, i; + + LASSERT(obd->obd_stats == NULL); + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_cntr_base == 0); + + num_stats = ((int)sizeof(*obd->obd_type->typ_dt_ops) / sizeof(void *)) + + num_private_stats - 1 /* o_owner */; + stats = lprocfs_alloc_stats(num_stats, 0); + if (stats == NULL) + return -ENOMEM; + + lprocfs_init_ops_stats(num_private_stats, stats); + + for (i = num_private_stats; i < num_stats; i++) { + /* If this LBUGs, it is likely that an obd + * operation was added to struct obd_ops in + * <obd.h>, and that the corresponding line item + * LPROCFS_OBD_OP_INIT(.., .., opname) + * is missing from the list above. 
*/ + LASSERTF(stats->ls_cnt_header[i].lc_name != NULL, + "Missing obd_stat initializer obd_op " + "operation at offset %d.\n", i - num_private_stats); + } + rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats); + if (rc < 0) { + lprocfs_free_stats(&stats); + } else { + obd->obd_stats = stats; + obd->obd_cntr_base = num_private_stats; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_obd_stats); + +void lprocfs_free_obd_stats(struct obd_device *obd) +{ + if (obd->obd_stats) + lprocfs_free_stats(&obd->obd_stats); +} +EXPORT_SYMBOL(lprocfs_free_obd_stats); + +#define LPROCFS_MD_OP_INIT(base, stats, op) \ +do { \ + unsigned int coffset = base + MD_COUNTER_OFFSET(op); \ + LASSERT(coffset < stats->ls_num); \ + lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \ +} while (0) + +void lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats) +{ + LPROCFS_MD_OP_INIT(num_private_stats, stats, getstatus); + LPROCFS_MD_OP_INIT(num_private_stats, stats, null_inode); + LPROCFS_MD_OP_INIT(num_private_stats, stats, find_cbdata); + LPROCFS_MD_OP_INIT(num_private_stats, stats, close); + LPROCFS_MD_OP_INIT(num_private_stats, stats, create); + LPROCFS_MD_OP_INIT(num_private_stats, stats, done_writing); + LPROCFS_MD_OP_INIT(num_private_stats, stats, enqueue); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr_name); + LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_lock); + LPROCFS_MD_OP_INIT(num_private_stats, stats, link); + LPROCFS_MD_OP_INIT(num_private_stats, stats, rename); + LPROCFS_MD_OP_INIT(num_private_stats, stats, is_subdir); + LPROCFS_MD_OP_INIT(num_private_stats, stats, setattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, sync); + LPROCFS_MD_OP_INIT(num_private_stats, stats, readpage); + LPROCFS_MD_OP_INIT(num_private_stats, stats, unlink); + LPROCFS_MD_OP_INIT(num_private_stats, stats, setxattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getxattr); + 
LPROCFS_MD_OP_INIT(num_private_stats, stats, init_ea_size); + LPROCFS_MD_OP_INIT(num_private_stats, stats, get_lustre_md); + LPROCFS_MD_OP_INIT(num_private_stats, stats, free_lustre_md); + LPROCFS_MD_OP_INIT(num_private_stats, stats, set_open_replay_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, clear_open_replay_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, set_lock_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match); + LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused); + LPROCFS_MD_OP_INIT(num_private_stats, stats, renew_capa); + LPROCFS_MD_OP_INIT(num_private_stats, stats, unpack_capa); + LPROCFS_MD_OP_INIT(num_private_stats, stats, get_remote_perm); + LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async); + LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock); +} +EXPORT_SYMBOL(lprocfs_init_mps_stats); + +int lprocfs_alloc_md_stats(struct obd_device *obd, + unsigned num_private_stats) +{ + struct lprocfs_stats *stats; + unsigned int num_stats; + int rc, i; + + LASSERT(obd->md_stats == NULL); + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->md_cntr_base == 0); + + num_stats = 1 + MD_COUNTER_OFFSET(revalidate_lock) + + num_private_stats; + stats = lprocfs_alloc_stats(num_stats, 0); + if (stats == NULL) + return -ENOMEM; + + lprocfs_init_mps_stats(num_private_stats, stats); + + for (i = num_private_stats; i < num_stats; i++) { + if (stats->ls_cnt_header[i].lc_name == NULL) { + CERROR("Missing md_stat initializer md_op " + "operation at offset %d. 
Aborting.\n", + i - num_private_stats); + LBUG(); + } + } + rc = lprocfs_register_stats(obd->obd_proc_entry, "md_stats", stats); + if (rc < 0) { + lprocfs_free_stats(&stats); + } else { + obd->md_stats = stats; + obd->md_cntr_base = num_private_stats; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_md_stats); + +void lprocfs_free_md_stats(struct obd_device *obd) +{ + struct lprocfs_stats *stats = obd->md_stats; + + if (stats != NULL) { + obd->md_stats = NULL; + obd->md_cntr_base = 0; + lprocfs_free_stats(&stats); + } +} +EXPORT_SYMBOL(lprocfs_free_md_stats); + +void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ + lprocfs_counter_init(ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC, + 0, "ldlm_enqueue", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC, + 0, "ldlm_convert", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC, + 0, "ldlm_cancel", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_bl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_cp_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_gl_callback", "reqs"); +} +EXPORT_SYMBOL(lprocfs_init_ldlm_stats); + +int lprocfs_exp_print_uuid(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *data) + +{ + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct seq_file *m = (struct seq_file *)data; + + if (exp->exp_nid_stats) + seq_printf(m, "%s\n", obd_uuid2str(&exp->exp_client_uuid)); + + return 0; +} + +static int +lproc_exp_uuid_seq_show(struct seq_file *m, void *unused) +{ + struct nid_stat *stats = (struct nid_stat *)m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_uuid, m); + return 0; +} + +LPROC_SEQ_FOPS_RO(lproc_exp_uuid); + +struct exp_hash_cb_data { + struct seq_file *m; + bool first; +}; + 
+int lprocfs_exp_print_hash(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *cb_data) + +{ + struct exp_hash_cb_data *data = (struct exp_hash_cb_data *)cb_data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + + if (exp->exp_lock_hash != NULL) { + if (data->first) { + cfs_hash_debug_header(data->m); + data->first = false; + } + cfs_hash_debug_str(hs, data->m); + } + + return 0; +} + +static int +lproc_exp_hash_seq_show(struct seq_file *m, void *unused) +{ + struct nid_stat *stats = (struct nid_stat *)m->private; + struct obd_device *obd = stats->nid_obd; + struct exp_hash_cb_data cb_data = {m, true}; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_hash, &cb_data); + return 0; +} + +LPROC_SEQ_FOPS_RO(lproc_exp_hash); + +int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data) +{ + return seq_printf(m, "%s\n", + "Write into this file to clear all nid stats and " + "stale nid entries"); +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_read); + +static int lprocfs_nid_stats_clear_write_cb(void *obj, void *data) +{ + struct nid_stat *stat = obj; + ENTRY; + + CDEBUG(D_INFO,"refcnt %d\n", atomic_read(&stat->nid_exp_ref_count)); + if (atomic_read(&stat->nid_exp_ref_count) == 1) { + /* object has only hash references. 
*/ + spin_lock(&stat->nid_obd->obd_nid_lock); + list_move(&stat->nid_list, data); + spin_unlock(&stat->nid_obd->obd_nid_lock); + RETURN(1); + } + /* we has reference to object - only clear data*/ + if (stat->nid_stats) + lprocfs_clear_stats(stat->nid_stats); + + RETURN(0); +} + +int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct nid_stat *client_stat; + LIST_HEAD(free_list); + + cfs_hash_cond_del(obd->obd_nid_stats_hash, + lprocfs_nid_stats_clear_write_cb, &free_list); + + while (!list_empty(&free_list)) { + client_stat = list_entry(free_list.next, struct nid_stat, + nid_list); + list_del_init(&client_stat->nid_list); + lprocfs_free_client_stats(client_stat); + } + + return count; +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_write); + +int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid) +{ + struct nid_stat *new_stat, *old_stat; + struct obd_device *obd = NULL; + proc_dir_entry_t *entry; + char *buffer = NULL; + int rc = 0; + ENTRY; + + *newnid = 0; + + if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry || + !exp->exp_obd->obd_nid_stats_hash) + RETURN(-EINVAL); + + /* not test against zero because eric say: + * You may only test nid against another nid, or LNET_NID_ANY. 
+ * Anything else is nonsense.*/ + if (!nid || *nid == LNET_NID_ANY) + RETURN(0); + + obd = exp->exp_obd; + + CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash); + + OBD_ALLOC_PTR(new_stat); + if (new_stat == NULL) + RETURN(-ENOMEM); + + new_stat->nid = *nid; + new_stat->nid_obd = exp->exp_obd; + /* we need set default refcount to 1 to balance obd_disconnect */ + atomic_set(&new_stat->nid_exp_ref_count, 1); + + old_stat = cfs_hash_findadd_unique(obd->obd_nid_stats_hash, + nid, &new_stat->nid_hash); + CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n", + old_stat, libcfs_nid2str(*nid), + atomic_read(&new_stat->nid_exp_ref_count)); + + /* We need to release old stats because lprocfs_exp_cleanup() hasn't + * been and will never be called. */ + if (exp->exp_nid_stats) { + nidstat_putref(exp->exp_nid_stats); + exp->exp_nid_stats = NULL; + } + + /* Return -EALREADY here so that we know that the /proc + * entry already has been created */ + if (old_stat != new_stat) { + exp->exp_nid_stats = old_stat; + GOTO(destroy_new, rc = -EALREADY); + } + /* not found - create */ + OBD_ALLOC(buffer, LNET_NIDSTR_SIZE); + if (buffer == NULL) + GOTO(destroy_new, rc = -ENOMEM); + + memcpy(buffer, libcfs_nid2str(*nid), LNET_NIDSTR_SIZE); + new_stat->nid_proc = lprocfs_register(buffer, + obd->obd_proc_exports_entry, + NULL, NULL); + OBD_FREE(buffer, LNET_NIDSTR_SIZE); + + if (new_stat->nid_proc == NULL) { + CERROR("Error making export directory for nid %s\n", + libcfs_nid2str(*nid)); + GOTO(destroy_new_ns, rc = -ENOMEM); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "uuid", + new_stat, &lproc_exp_uuid_fops); + if (IS_ERR(entry)) { + CWARN("Error adding the NID stats file\n"); + rc = PTR_ERR(entry); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "hash", + new_stat, &lproc_exp_hash_fops); + if (IS_ERR(entry)) { + CWARN("Error adding the hash file\n"); + rc = PTR_ERR(entry); + GOTO(destroy_new_ns, rc); + } + + exp->exp_nid_stats = 
new_stat; + *newnid = 1; + /* protect competitive add to list, not need locking on destroy */ + spin_lock(&obd->obd_nid_lock); + list_add(&new_stat->nid_list, &obd->obd_nid_stats); + spin_unlock(&obd->obd_nid_lock); + + RETURN(rc); + +destroy_new_ns: + if (new_stat->nid_proc != NULL) + lprocfs_remove(&new_stat->nid_proc); + cfs_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash); + +destroy_new: + nidstat_putref(new_stat); + OBD_FREE_PTR(new_stat); + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_exp_setup); + +int lprocfs_exp_cleanup(struct obd_export *exp) +{ + struct nid_stat *stat = exp->exp_nid_stats; + + if(!stat || !exp->exp_obd) + RETURN(0); + + nidstat_putref(exp->exp_nid_stats); + exp->exp_nid_stats = NULL; + + return 0; +} +EXPORT_SYMBOL(lprocfs_exp_cleanup); + +int lprocfs_write_helper(const char *buffer, unsigned long count, + int *val) +{ + return lprocfs_write_frac_helper(buffer, count, val, 1); +} +EXPORT_SYMBOL(lprocfs_write_helper); + +int lprocfs_write_frac_helper(const char *buffer, unsigned long count, + int *val, int mult) +{ + char kernbuf[20], *end, *pbuf; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + pbuf = kernbuf; + if (*pbuf == '-') { + mult = -mult; + pbuf++; + } + + *val = (int)simple_strtoul(pbuf, &end, 10) * mult; + if (pbuf == end) + return -EINVAL; + + if (end != NULL && *end == '.') { + int temp_val, pow = 1; + int i; + + pbuf = end + 1; + if (strlen(pbuf) > 5) + pbuf[5] = '\0'; /*only allow 5bits fractional*/ + + temp_val = (int)simple_strtoul(pbuf, &end, 10) * mult; + + if (pbuf < end) { + for (i = 0; i < (end - pbuf); i++) + pow *= 10; + + *val += temp_val / pow; + } + } + return 0; +} +EXPORT_SYMBOL(lprocfs_write_frac_helper); + +int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, + int mult) +{ + long decimal_val, frac_val; + int prtn; + + if (count < 10) + return -EINVAL; + + decimal_val = val / 
mult; + prtn = snprintf(buffer, count, "%ld", decimal_val); + frac_val = val % mult; + + if (prtn < (count - 4) && frac_val > 0) { + long temp_frac; + int i, temp_mult = 1, frac_bits = 0; + + temp_frac = frac_val * 10; + buffer[prtn++] = '.'; + while (frac_bits < 2 && (temp_frac / mult) < 1 ) { + /* only reserved 2 bits fraction */ + buffer[prtn++] ='0'; + temp_frac *= 10; + frac_bits++; + } + /* + * Need to think these cases : + * 1. #echo x.00 > /proc/xxx output result : x + * 2. #echo x.0x > /proc/xxx output result : x.0x + * 3. #echo x.x0 > /proc/xxx output result : x.x + * 4. #echo x.xx > /proc/xxx output result : x.xx + * Only reserved 2 bits fraction. + */ + for (i = 0; i < (5 - prtn); i++) + temp_mult *= 10; + + frac_bits = min((int)count - prtn, 3 - frac_bits); + prtn += snprintf(buffer + prtn, frac_bits, "%ld", + frac_val * temp_mult / mult); + + prtn--; + while(buffer[prtn] < '1' || buffer[prtn] > '9') { + prtn--; + if (buffer[prtn] == '.') { + prtn--; + break; + } + } + prtn++; + } + buffer[prtn++] ='\n'; + return prtn; +} +EXPORT_SYMBOL(lprocfs_read_frac_helper); + +int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult) +{ + long decimal_val, frac_val; + + decimal_val = val / mult; + seq_printf(m, "%ld", decimal_val); + frac_val = val % mult; + + if (frac_val > 0) { + frac_val *= 100; + frac_val /= mult; + } + if (frac_val > 0) { + /* Three cases: x0, xx, 0x */ + if ((frac_val % 10) != 0) + seq_printf(m, ".%ld", frac_val); + else + seq_printf(m, ".%ld", frac_val / 10); + } + + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_seq_read_frac_helper); + +int lprocfs_write_u64_helper(const char *buffer, unsigned long count,__u64 *val) +{ + return lprocfs_write_frac_u64_helper(buffer, count, val, 1); +} +EXPORT_SYMBOL(lprocfs_write_u64_helper); + +int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count, + __u64 *val, int mult) +{ + char kernbuf[22], *end, *pbuf; + __u64 whole, frac = 0, units; + unsigned frac_d = 
1; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + pbuf = kernbuf; + if (*pbuf == '-') { + mult = -mult; + pbuf++; + } + + whole = simple_strtoull(pbuf, &end, 10); + if (pbuf == end) + return -EINVAL; + + if (end != NULL && *end == '.') { + int i; + pbuf = end + 1; + + /* need to limit frac_d to a __u32 */ + if (strlen(pbuf) > 10) + pbuf[10] = '\0'; + + frac = simple_strtoull(pbuf, &end, 10); + /* count decimal places */ + for (i = 0; i < (end - pbuf); i++) + frac_d *= 10; + } + + units = 1; + switch(*end) { + case 'p': case 'P': + units <<= 10; + case 't': case 'T': + units <<= 10; + case 'g': case 'G': + units <<= 10; + case 'm': case 'M': + units <<= 10; + case 'k': case 'K': + units <<= 10; + } + /* Specified units override the multiplier */ + if (units) + mult = mult < 0 ? -units : units; + + frac *= mult; + do_div(frac, frac_d); + *val = whole * mult + frac; + return 0; +} +EXPORT_SYMBOL(lprocfs_write_frac_u64_helper); + +static char *lprocfs_strnstr(const char *s1, const char *s2, size_t len) +{ + size_t l2; + + l2 = strlen(s2); + if (!l2) + return (char *)s1; + while (len >= l2) { + len--; + if (!memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} + +/** + * Find the string \a name in the input \a buffer, and return a pointer to the + * value immediately following \a name, reducing \a count appropriately. + * If \a name is not found the original \a buffer is returned. 
+ */ +char *lprocfs_find_named_value(const char *buffer, const char *name, + unsigned long *count) +{ + char *val; + size_t buflen = *count; + + /* there is no strnstr() in rhel5 and ubuntu kernels */ + val = lprocfs_strnstr(buffer, name, buflen); + if (val == NULL) + return (char *)buffer; + + val += strlen(name); /* skip prefix */ + while (val < buffer + buflen && isspace(*val)) /* skip separator */ + val++; + + *count = 0; + while (val < buffer + buflen && isalnum(*val)) { + ++*count; + ++val; + } + + return val - *count; +} +EXPORT_SYMBOL(lprocfs_find_named_value); + +int lprocfs_seq_create(proc_dir_entry_t *parent, + const char *name, + umode_t mode, + const struct file_operations *seq_fops, + void *data) +{ + struct proc_dir_entry *entry; + ENTRY; + + /* Disallow secretly (un)writable entries. */ + LASSERT((seq_fops->write == NULL) == ((mode & 0222) == 0)); + entry = proc_create_data(name, mode, parent, seq_fops, data); + + if (entry == NULL) + RETURN(-ENOMEM); + + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_seq_create); + +int lprocfs_obd_seq_create(struct obd_device *dev, + const char *name, + umode_t mode, + const struct file_operations *seq_fops, + void *data) +{ + return (lprocfs_seq_create(dev->obd_proc_entry, name, + mode, seq_fops, data)); +} +EXPORT_SYMBOL(lprocfs_obd_seq_create); + +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ + if (value >= OBD_HIST_MAX) + value = OBD_HIST_MAX - 1; + + spin_lock(&oh->oh_lock); + oh->oh_buckets[value]++; + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_tally); + +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ + unsigned int val; + + for (val = 0; ((1 << val) < value) && (val <= OBD_HIST_MAX); val++) + ; + + lprocfs_oh_tally(oh, val); +} +EXPORT_SYMBOL(lprocfs_oh_tally_log2); + +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ + unsigned long ret = 0; + int i; + + for (i = 0; i < OBD_HIST_MAX; i++) + ret += oh->oh_buckets[i]; + return ret; +} 
+EXPORT_SYMBOL(lprocfs_oh_sum); + +void lprocfs_oh_clear(struct obd_histogram *oh) +{ + spin_lock(&oh->oh_lock); + memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets)); + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_clear); + +int lprocfs_obd_rd_max_pages_per_rpc(struct seq_file *m, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + int rc; + + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = seq_printf(m, "%d\n", cli->cl_max_pages_per_rpc); + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +} +EXPORT_SYMBOL(lprocfs_obd_rd_max_pages_per_rpc); + +#endif /* LPROCFS*/ diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c new file mode 100644 index 000000000000..fdf0ed367693 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c @@ -0,0 +1,2185 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. 
All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_object.c + * + * Lustre Object. + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + * + * Author: Nikita Danilov <nikita.danilov@sun.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <linux/libcfs/libcfs.h> + +# include <linux/module.h> + +/* hash_long() */ +#include <linux/libcfs/libcfs_hash.h> +#include <obd_class.h> +#include <obd_support.h> +#include <lustre_disk.h> +#include <lustre_fid.h> +#include <lu_object.h> +#include <lu_ref.h> +#include <linux/list.h> + +static void lu_object_free(const struct lu_env *env, struct lu_object *o); + +/** + * Decrease reference counter on object. If last reference is freed, return + * object to the cache, unless lu_object_is_dying(o) holds. In the latter + * case, free object immediately. + */ +void lu_object_put(const struct lu_env *env, struct lu_object *o) +{ + struct lu_site_bkt_data *bkt; + struct lu_object_header *top; + struct lu_site *site; + struct lu_object *orig; + cfs_hash_bd_t bd; + const struct lu_fid *fid; + + top = o->lo_header; + site = o->lo_dev->ld_site; + orig = o; + + /* + * till we have full fids-on-OST implemented anonymous objects + * are possible in OSP. such an object isn't listed in the site + * so we should not remove it from the site. 
+ */ + fid = lu_object_fid(o); + if (fid_is_zero(fid)) { + LASSERT(top->loh_hash.next == NULL + && top->loh_hash.pprev == NULL); + LASSERT(list_empty(&top->loh_lru)); + if (!atomic_dec_and_test(&top->loh_ref)) + return; + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + lu_object_free(env, orig); + return; + } + + cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd); + bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); + + if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) { + if (lu_object_is_dying(top)) { + + /* + * somebody may be waiting for this, currently only + * used for cl_object, see cl_object_put_last(). + */ + wake_up_all(&bkt->lsb_marche_funebre); + } + return; + } + + LASSERT(bkt->lsb_busy > 0); + bkt->lsb_busy--; + /* + * When last reference is released, iterate over object + * layers, and notify them that object is no longer busy. + */ + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + + if (!lu_object_is_dying(top)) { + LASSERT(list_empty(&top->loh_lru)); + list_add_tail(&top->loh_lru, &bkt->lsb_lru); + cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); + return; + } + + /* + * If object is dying (will not be cached), removed it + * from hash table and LRU. + * + * This is done with hash table and LRU lists locked. As the only + * way to acquire first reference to previously unreferenced + * object is through hash-table lookup (lu_object_find()), + * or LRU scanning (lu_site_purge()), that are done under hash-table + * and LRU lock, no race with concurrent object lookup is possible + * and we can safely destroy object below. 
+ */ + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) + cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash); + cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); + /* + * Object was already removed from hash and lru above, can + * kill it. + */ + lu_object_free(env, orig); +} +EXPORT_SYMBOL(lu_object_put); + +/** + * Put object and don't keep in cache. This is temporary solution for + * multi-site objects when its layering is not constant. + */ +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o) +{ + set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags); + return lu_object_put(env, o); +} +EXPORT_SYMBOL(lu_object_put_nocache); + +/** + * Kill the object and take it out of LRU cache. + * Currently used by client code for layout change. + */ +void lu_object_unhash(const struct lu_env *env, struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags); + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) { + cfs_hash_t *obj_hash = o->lo_dev->ld_site->ls_obj_hash; + cfs_hash_bd_t bd; + + cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1); + list_del_init(&top->loh_lru); + cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash); + cfs_hash_bd_unlock(obj_hash, &bd, 1); + } +} +EXPORT_SYMBOL(lu_object_unhash); + +/** + * Allocate new object. + * + * This follows object creation protocol, described in the comment within + * struct lu_device_operations definition. + */ +static struct lu_object *lu_object_alloc(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *scan; + struct lu_object *top; + struct list_head *layers; + int clean; + int result; + ENTRY; + + /* + * Create top-level object slice. This will also create + * lu_object_header. 
+ */ + top = dev->ld_ops->ldo_object_alloc(env, NULL, dev); + if (top == NULL) + RETURN(ERR_PTR(-ENOMEM)); + if (IS_ERR(top)) + RETURN(top); + /* + * This is the only place where object fid is assigned. It's constant + * after this point. + */ + top->lo_header->loh_fid = *f; + layers = &top->lo_header->loh_layers; + do { + /* + * Call ->loo_object_init() repeatedly, until no more new + * object slices are created. + */ + clean = 1; + list_for_each_entry(scan, layers, lo_linkage) { + if (scan->lo_flags & LU_OBJECT_ALLOCATED) + continue; + clean = 0; + scan->lo_header = top->lo_header; + result = scan->lo_ops->loo_object_init(env, scan, conf); + if (result != 0) { + lu_object_free(env, top); + RETURN(ERR_PTR(result)); + } + scan->lo_flags |= LU_OBJECT_ALLOCATED; + } + } while (!clean); + + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_start != NULL) { + result = scan->lo_ops->loo_object_start(env, scan); + if (result != 0) { + lu_object_free(env, top); + RETURN(ERR_PTR(result)); + } + } + } + + lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); + RETURN(top); +} + +/** + * Free an object. + */ +static void lu_object_free(const struct lu_env *env, struct lu_object *o) +{ + struct lu_site_bkt_data *bkt; + struct lu_site *site; + struct lu_object *scan; + struct list_head *layers; + struct list_head splice; + + site = o->lo_dev->ld_site; + layers = &o->lo_header->loh_layers; + bkt = lu_site_bkt_from_fid(site, &o->lo_header->loh_fid); + /* + * First call ->loo_object_delete() method to release all resources. + */ + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_delete != NULL) + scan->lo_ops->loo_object_delete(env, scan); + } + + /* + * Then, splice object layers into stand-alone list, and call + * ->loo_object_free() on all layers to free memory. Splice is + * necessary, because lu_object_header is freed together with the + * top-level slice. 
+ */ + INIT_LIST_HEAD(&splice); + list_splice_init(layers, &splice); + while (!list_empty(&splice)) { + /* + * Free layers in bottom-to-top order, so that object header + * lives as long as possible and ->loo_object_free() methods + * can look at its contents. + */ + o = container_of0(splice.prev, struct lu_object, lo_linkage); + list_del_init(&o->lo_linkage); + LASSERT(o->lo_ops->loo_object_free != NULL); + o->lo_ops->loo_object_free(env, o); + } + + if (waitqueue_active(&bkt->lsb_marche_funebre)) + wake_up_all(&bkt->lsb_marche_funebre); +} + +/** + * Free \a nr objects from the cold end of the site LRU list. + */ +int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr) +{ + struct lu_object_header *h; + struct lu_object_header *temp; + struct lu_site_bkt_data *bkt; + cfs_hash_bd_t bd; + cfs_hash_bd_t bd2; + struct list_head dispose; + int did_sth; + int start; + int count; + int bnr; + int i; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU)) + RETURN(0); + + INIT_LIST_HEAD(&dispose); + /* + * Under LRU list lock, scan LRU list and move unreferenced objects to + * the dispose list, removing them from LRU and hash table. + */ + start = s->ls_purge_start; + bnr = (nr == ~0) ? 
-1 : nr / CFS_HASH_NBKT(s->ls_obj_hash) + 1; + again: + did_sth = 0; + cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { + if (i < start) + continue; + count = bnr; + cfs_hash_bd_lock(s->ls_obj_hash, &bd, 1); + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + + list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) { + LASSERT(atomic_read(&h->loh_ref) == 0); + + cfs_hash_bd_get(s->ls_obj_hash, &h->loh_fid, &bd2); + LASSERT(bd.bd_bucket == bd2.bd_bucket); + + cfs_hash_bd_del_locked(s->ls_obj_hash, + &bd2, &h->loh_hash); + list_move(&h->loh_lru, &dispose); + if (did_sth == 0) + did_sth = 1; + + if (nr != ~0 && --nr == 0) + break; + + if (count > 0 && --count == 0) + break; + + } + cfs_hash_bd_unlock(s->ls_obj_hash, &bd, 1); + cond_resched(); + /* + * Free everything on the dispose list. This is safe against + * races due to the reasons described in lu_object_put(). + */ + while (!list_empty(&dispose)) { + h = container_of0(dispose.next, + struct lu_object_header, loh_lru); + list_del_init(&h->loh_lru); + lu_object_free(env, lu_object_top(h)); + lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED); + } + + if (nr == 0) + break; + } + + if (nr != 0 && did_sth && start != 0) { + start = 0; /* restart from the first bucket */ + goto again; + } + /* race on s->ls_purge_start, but nobody cares */ + s->ls_purge_start = i % CFS_HASH_NBKT(s->ls_obj_hash); + + return nr; +} +EXPORT_SYMBOL(lu_site_purge); + +/* + * Object printing. + * + * Code below has to jump through certain loops to output object description + * into libcfs_debug_msg-based log. The problem is that lu_object_print() + * composes object description from strings that are parts of _lines_ of + * output (i.e., strings that are not terminated by newline). This doesn't fit + * very well into libcfs_debug_msg() interface that assumes that each message + * supplied to it is a self-contained output line. 
+ * + * To work around this, strings are collected in a temporary buffer + * (implemented as a value of lu_cdebug_key key), until terminating newline + * character is detected. + * + */ + +enum { + /** + * Maximal line size. + * + * XXX overflow is not handled correctly. + */ + LU_CDEBUG_LINE = 512 +}; + +struct lu_cdebug_data { + /** + * Temporary buffer. + */ + char lck_area[LU_CDEBUG_LINE]; +}; + +/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */ +LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data); + +/** + * Key, holding temporary buffer. This key is registered very early by + * lu_global_init(). + */ +struct lu_context_key lu_global_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | + LCT_MG_THREAD | LCT_CL_THREAD, + .lct_init = lu_global_key_init, + .lct_fini = lu_global_key_fini +}; + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...) +{ + struct libcfs_debug_msg_data *msgdata = cookie; + struct lu_cdebug_data *key; + int used; + int complete; + va_list args; + + va_start(args, format); + + key = lu_context_key_get(&env->le_ctx, &lu_global_key); + LASSERT(key != NULL); + + used = strlen(key->lck_area); + complete = format[strlen(format) - 1] == '\n'; + /* + * Append new chunk to the buffer. + */ + vsnprintf(key->lck_area + used, + ARRAY_SIZE(key->lck_area) - used, format, args); + if (complete) { + if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys)) + libcfs_debug_msg(msgdata, "%s", key->lck_area); + key->lck_area[0] = 0; + } + va_end(args); + return 0; +} +EXPORT_SYMBOL(lu_cdebug_printer); + +/** + * Print object header. 
+ */ +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr) +{ + (*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]", + hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref), + PFID(&hdr->loh_fid), + hlist_unhashed(&hdr->loh_hash) ? "" : " hash", + list_empty((struct list_head *)&hdr->loh_lru) ? \ + "" : " lru", + hdr->loh_attr & LOHA_EXISTS ? " exist":""); +} +EXPORT_SYMBOL(lu_object_header_print); + +/** + * Print human readable representation of the \a o to the \a printer. + */ +void lu_object_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o) +{ + static const char ruler[] = "........................................"; + struct lu_object_header *top; + int depth; + + top = o->lo_header; + lu_object_header_print(env, cookie, printer, top); + (*printer)(env, cookie, "{ \n"); + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + depth = o->lo_depth + 4; + + /* + * print `.' \a depth times followed by type name and address + */ + (*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler, + o->lo_dev->ld_type->ldt_name, o); + if (o->lo_ops->loo_object_print != NULL) + o->lo_ops->loo_object_print(env, cookie, printer, o); + (*printer)(env, cookie, "\n"); + } + (*printer)(env, cookie, "} header@%p\n", top); +} +EXPORT_SYMBOL(lu_object_print); + +/** + * Check object consistency. 
+ */ +int lu_object_invariant(const struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_invariant != NULL && + !o->lo_ops->loo_object_invariant(o)) + return 0; + } + return 1; +} +EXPORT_SYMBOL(lu_object_invariant); + +static struct lu_object *htable_lookup(struct lu_site *s, + cfs_hash_bd_t *bd, + const struct lu_fid *f, + wait_queue_t *waiter, + __u64 *version) +{ + struct lu_site_bkt_data *bkt; + struct lu_object_header *h; + struct hlist_node *hnode; + __u64 ver = cfs_hash_bd_version_get(bd); + + if (*version == ver) + return NULL; + + *version = ver; + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd); + /* cfs_hash_bd_peek_locked is a somehow "internal" function + * of cfs_hash, it doesn't add refcount on object. */ + hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f); + if (hnode == NULL) { + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS); + return NULL; + } + + h = container_of0(hnode, struct lu_object_header, loh_hash); + if (likely(!lu_object_is_dying(h))) { + cfs_hash_get(s->ls_obj_hash, hnode); + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT); + list_del_init(&h->loh_lru); + return lu_object_top(h); + } + + /* + * Lookup found an object being destroyed this object cannot be + * returned (to assure that references to dying objects are eventually + * drained), and moreover, lookup has to wait until object is freed. + */ + + init_waitqueue_entry_current(waiter); + add_wait_queue(&bkt->lsb_marche_funebre, waiter); + set_current_state(TASK_UNINTERRUPTIBLE); + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_DEATH_RACE); + return ERR_PTR(-EAGAIN); +} + +/** + * Search cache for an object with the fid \a f. If such object is found, + * return it. Otherwise, create new object, insert it into cache and return + * it. In any case, additional reference is acquired on the returned object. 
+ */ +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + return lu_object_find_at(env, dev->ld_site->ls_top_dev, f, conf); +} +EXPORT_SYMBOL(lu_object_find); + +static struct lu_object *lu_object_new(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *o; + cfs_hash_t *hs; + cfs_hash_bd_t bd; + struct lu_site_bkt_data *bkt; + + o = lu_object_alloc(env, dev, f, conf); + if (unlikely(IS_ERR(o))) + return o; + + hs = dev->ld_site->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); + bkt = cfs_hash_bd_extra_get(hs, &bd); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + bkt->lsb_busy++; + cfs_hash_bd_unlock(hs, &bd, 1); + return o; +} + +/** + * Core logic of lu_object_find*() functions. + */ +static struct lu_object *lu_object_find_try(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf, + wait_queue_t *waiter) +{ + struct lu_object *o; + struct lu_object *shadow; + struct lu_site *s; + cfs_hash_t *hs; + cfs_hash_bd_t bd; + __u64 version = 0; + + /* + * This uses standard index maintenance protocol: + * + * - search index under lock, and return object if found; + * - otherwise, unlock index, allocate new object; + * - lock index and search again; + * - if nothing is found (usual case), insert newly created + * object into index; + * - otherwise (race: other thread inserted object), free + * object just allocated. + * - unlock index; + * - return object. + * + * For "LOC_F_NEW" case, we are sure the object is new established. + * It is unnecessary to perform lookup-alloc-lookup-insert, instead, + * just alloc and insert directly. + * + * If dying object is found during index search, add @waiter to the + * site wait-queue and return ERR_PTR(-EAGAIN). 
+ */ + if (conf != NULL && conf->loc_flags & LOC_F_NEW) + return lu_object_new(env, dev, f, conf); + + s = dev->ld_site; + hs = s->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); + o = htable_lookup(s, &bd, f, waiter, &version); + cfs_hash_bd_unlock(hs, &bd, 1); + if (o != NULL) + return o; + + /* + * Allocate new object. This may result in rather complicated + * operations, including fld queries, inode loading, etc. + */ + o = lu_object_alloc(env, dev, f, conf); + if (unlikely(IS_ERR(o))) + return o; + + LASSERT(lu_fid_eq(lu_object_fid(o), f)); + + cfs_hash_bd_lock(hs, &bd, 1); + + shadow = htable_lookup(s, &bd, f, waiter, &version); + if (likely(shadow == NULL)) { + struct lu_site_bkt_data *bkt; + + bkt = cfs_hash_bd_extra_get(hs, &bd); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + bkt->lsb_busy++; + cfs_hash_bd_unlock(hs, &bd, 1); + return o; + } + + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE); + cfs_hash_bd_unlock(hs, &bd, 1); + lu_object_free(env, o); + return shadow; +} + +/** + * Much like lu_object_find(), but top level device of object is specifically + * \a dev rather than top level device of the site. This interface allows + * objects of different "stacking" to be created within the same site. + */ +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_site_bkt_data *bkt; + struct lu_object *obj; + wait_queue_t wait; + + while (1) { + obj = lu_object_find_try(env, dev, f, conf, &wait); + if (obj != ERR_PTR(-EAGAIN)) + return obj; + /* + * lu_object_find_try() already added waiter into the + * wait queue. + */ + waitq_wait(&wait, TASK_UNINTERRUPTIBLE); + bkt = lu_site_bkt_from_fid(dev->ld_site, (void *)f); + remove_wait_queue(&bkt->lsb_marche_funebre, &wait); + } +} +EXPORT_SYMBOL(lu_object_find_at); + +/** + * Find object with given fid, and return its slice belonging to given device. 
+ */ +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *top; + struct lu_object *obj; + + top = lu_object_find(env, dev, f, conf); + if (!IS_ERR(top)) { + obj = lu_object_locate(top->lo_header, dev->ld_type); + if (obj == NULL) + lu_object_put(env, top); + } else + obj = top; + return obj; +} +EXPORT_SYMBOL(lu_object_find_slice); + +/** + * Global list of all device types. + */ +static LIST_HEAD(lu_device_types); + +int lu_device_type_init(struct lu_device_type *ldt) +{ + int result = 0; + + INIT_LIST_HEAD(&ldt->ldt_linkage); + if (ldt->ldt_ops->ldto_init) + result = ldt->ldt_ops->ldto_init(ldt); + if (result == 0) + list_add(&ldt->ldt_linkage, &lu_device_types); + return result; +} +EXPORT_SYMBOL(lu_device_type_init); + +void lu_device_type_fini(struct lu_device_type *ldt) +{ + list_del_init(&ldt->ldt_linkage); + if (ldt->ldt_ops->ldto_fini) + ldt->ldt_ops->ldto_fini(ldt); +} +EXPORT_SYMBOL(lu_device_type_fini); + +void lu_types_stop(void) +{ + struct lu_device_type *ldt; + + list_for_each_entry(ldt, &lu_device_types, ldt_linkage) { + if (ldt->ldt_device_nr == 0 && ldt->ldt_ops->ldto_stop) + ldt->ldt_ops->ldto_stop(ldt); + } +} +EXPORT_SYMBOL(lu_types_stop); + +/** + * Global list of all sites on this node + */ +static LIST_HEAD(lu_sites); +static DEFINE_MUTEX(lu_sites_guard); + +/** + * Global environment used by site shrinker. 
+ */ +static struct lu_env lu_shrink_env; + +struct lu_site_print_arg { + struct lu_env *lsp_env; + void *lsp_cookie; + lu_printer_t lsp_printer; +}; + +static int +lu_site_obj_print(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *data) +{ + struct lu_site_print_arg *arg = (struct lu_site_print_arg *)data; + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + if (!list_empty(&h->loh_layers)) { + const struct lu_object *o; + + o = lu_object_top(h); + lu_object_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, o); + } else { + lu_object_header_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, h); + } + return 0; +} + +/** + * Print all objects in \a s. + */ +void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, + lu_printer_t printer) +{ + struct lu_site_print_arg arg = { + .lsp_env = (struct lu_env *)env, + .lsp_cookie = cookie, + .lsp_printer = printer, + }; + + cfs_hash_for_each(s->ls_obj_hash, lu_site_obj_print, &arg); +} +EXPORT_SYMBOL(lu_site_print); + +enum { + LU_CACHE_PERCENT_MAX = 50, + LU_CACHE_PERCENT_DEFAULT = 20 +}; + +static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; +CFS_MODULE_PARM(lu_cache_percent, "i", int, 0644, + "Percentage of memory to be used as lu_object cache"); + +/** + * Return desired hash table order. + */ +static int lu_htable_order(void) +{ + unsigned long cache_size; + int bits; + + /* + * Calculate hash table size, assuming that we want reasonable + * performance when 20% of total memory is occupied by cache of + * lu_objects. + * + * Size of lu_object is (arbitrary) taken as 1K (together with inode). + */ + cache_size = num_physpages; + +#if BITS_PER_LONG == 32 + /* limit hashtable size for lowmem systems to low RAM */ + if (cache_size > 1 << (30 - PAGE_CACHE_SHIFT)) + cache_size = 1 << (30 - PAGE_CACHE_SHIFT) * 3 / 4; +#endif + + /* clear off unreasonable cache setting. 
*/ + if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) { + CWARN("obdclass: invalid lu_cache_percent: %u, it must be in" + " the range of (0, %u]. Will use default value: %u.\n", + lu_cache_percent, LU_CACHE_PERCENT_MAX, + LU_CACHE_PERCENT_DEFAULT); + + lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; + } + cache_size = cache_size / 100 * lu_cache_percent * + (PAGE_CACHE_SIZE / 1024); + + for (bits = 1; (1 << bits) < cache_size; ++bits) { + ; + } + return bits; +} + +static unsigned lu_obj_hop_hash(cfs_hash_t *hs, + const void *key, unsigned mask) +{ + struct lu_fid *fid = (struct lu_fid *)key; + __u32 hash; + + hash = fid_flatten32(fid); + hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ + hash = cfs_hash_long(hash, hs->hs_bkt_bits); + + /* give me another random factor */ + hash -= cfs_hash_long((unsigned long)hs, fid_oid(fid) % 11 + 3); + + hash <<= hs->hs_cur_bits - hs->hs_bkt_bits; + hash |= (fid_seq(fid) + fid_oid(fid)) & (CFS_HASH_NBKT(hs) - 1); + + return hash & mask; +} + +static void *lu_obj_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct lu_object_header, loh_hash); +} + +static void *lu_obj_hop_key(struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + return &h->loh_fid; +} + +static int lu_obj_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + return lu_fid_eq(&h->loh_fid, (struct lu_fid *)key); +} + +static void lu_obj_hop_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + if (atomic_add_return(1, &h->loh_ref) == 1) { + struct lu_site_bkt_data *bkt; + cfs_hash_bd_t bd; + + cfs_hash_bd_get(hs, &h->loh_fid, &bd); + bkt = cfs_hash_bd_extra_get(hs, &bd); + bkt->lsb_busy++; + } +} + +static void lu_obj_hop_put_locked(cfs_hash_t *hs, struct 
hlist_node *hnode) +{ + LBUG(); /* we should never called it */ +} + +cfs_hash_ops_t lu_site_hash_ops = { + .hs_hash = lu_obj_hop_hash, + .hs_key = lu_obj_hop_key, + .hs_keycmp = lu_obj_hop_keycmp, + .hs_object = lu_obj_hop_object, + .hs_get = lu_obj_hop_get, + .hs_put_locked = lu_obj_hop_put_locked, +}; + +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + if (list_empty(&d->ld_linkage)) + list_add(&d->ld_linkage, &s->ls_ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_add_linkage); + +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + list_del_init(&d->ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_del_linkage); + +/** + * Initialize site \a s, with \a d as the top level device. + */ +#define LU_SITE_BITS_MIN 12 +#define LU_SITE_BITS_MAX 24 +/** + * total 256 buckets, we don't want too many buckets because: + * - consume too much memory + * - avoid unbalanced LRU list + */ +#define LU_SITE_BKT_BITS 8 + +int lu_site_init(struct lu_site *s, struct lu_device *top) +{ + struct lu_site_bkt_data *bkt; + cfs_hash_bd_t bd; + char name[16]; + int bits; + int i; + ENTRY; + + memset(s, 0, sizeof *s); + bits = lu_htable_order(); + snprintf(name, 16, "lu_site_%s", top->ld_type->ldt_name); + for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX); + bits >= LU_SITE_BITS_MIN; bits--) { + s->ls_obj_hash = cfs_hash_create(name, bits, bits, + bits - LU_SITE_BKT_BITS, + sizeof(*bkt), 0, 0, + &lu_site_hash_ops, + CFS_HASH_SPIN_BKTLOCK | + CFS_HASH_NO_ITEMREF | + CFS_HASH_DEPTH | + CFS_HASH_ASSERT_EMPTY); + if (s->ls_obj_hash != NULL) + break; + } + + if (s->ls_obj_hash == NULL) { + CERROR("failed to create lu_site hash with bits: %d\n", bits); + return -ENOMEM; + } + + cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + INIT_LIST_HEAD(&bkt->lsb_lru); + 
init_waitqueue_head(&bkt->lsb_marche_funebre); + } + + s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0); + if (s->ls_stats == NULL) { + cfs_hash_putref(s->ls_obj_hash); + s->ls_obj_hash = NULL; + return -ENOMEM; + } + + lprocfs_counter_init(s->ls_stats, LU_SS_CREATED, + 0, "created", "created"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT, + 0, "cache_hit", "cache_hit"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS, + 0, "cache_miss", "cache_miss"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE, + 0, "cache_race", "cache_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE, + 0, "cache_death_race", "cache_death_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED, + 0, "lru_purged", "lru_purged"); + + INIT_LIST_HEAD(&s->ls_linkage); + s->ls_top_dev = top; + top->ld_site = s; + lu_device_get(top); + lu_ref_add(&top->ld_reference, "site-top", s); + + INIT_LIST_HEAD(&s->ls_ld_linkage); + spin_lock_init(&s->ls_ld_lock); + + lu_dev_add_linkage(s, top); + + RETURN(0); +} +EXPORT_SYMBOL(lu_site_init); + +/** + * Finalize \a s and release its resources. + */ +void lu_site_fini(struct lu_site *s) +{ + mutex_lock(&lu_sites_guard); + list_del_init(&s->ls_linkage); + mutex_unlock(&lu_sites_guard); + + if (s->ls_obj_hash != NULL) { + cfs_hash_putref(s->ls_obj_hash); + s->ls_obj_hash = NULL; + } + + if (s->ls_top_dev != NULL) { + s->ls_top_dev->ld_site = NULL; + lu_ref_del(&s->ls_top_dev->ld_reference, "site-top", s); + lu_device_put(s->ls_top_dev); + s->ls_top_dev = NULL; + } + + if (s->ls_stats != NULL) + lprocfs_free_stats(&s->ls_stats); +} +EXPORT_SYMBOL(lu_site_fini); + +/** + * Called when initialization of stack for this site is completed. 
+ */ +int lu_site_init_finish(struct lu_site *s) +{ + int result; + mutex_lock(&lu_sites_guard); + result = lu_context_refill(&lu_shrink_env.le_ctx); + if (result == 0) + list_add(&s->ls_linkage, &lu_sites); + mutex_unlock(&lu_sites_guard); + return result; +} +EXPORT_SYMBOL(lu_site_init_finish); + +/** + * Acquire additional reference on device \a d + */ +void lu_device_get(struct lu_device *d) +{ + atomic_inc(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_get); + +/** + * Release reference on device \a d. + */ +void lu_device_put(struct lu_device *d) +{ + LASSERT(atomic_read(&d->ld_ref) > 0); + atomic_dec(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_put); + +/** + * Initialize device \a d of type \a t. + */ +int lu_device_init(struct lu_device *d, struct lu_device_type *t) +{ + if (t->ldt_device_nr++ == 0 && t->ldt_ops->ldto_start != NULL) + t->ldt_ops->ldto_start(t); + memset(d, 0, sizeof *d); + atomic_set(&d->ld_ref, 0); + d->ld_type = t; + lu_ref_init(&d->ld_reference); + INIT_LIST_HEAD(&d->ld_linkage); + return 0; +} +EXPORT_SYMBOL(lu_device_init); + +/** + * Finalize device \a d. + */ +void lu_device_fini(struct lu_device *d) +{ + struct lu_device_type *t; + + t = d->ld_type; + if (d->ld_obd != NULL) { + d->ld_obd->obd_lu_dev = NULL; + d->ld_obd = NULL; + } + + lu_ref_fini(&d->ld_reference); + LASSERTF(atomic_read(&d->ld_ref) == 0, + "Refcount is %u\n", atomic_read(&d->ld_ref)); + LASSERT(t->ldt_device_nr > 0); + if (--t->ldt_device_nr == 0 && t->ldt_ops->ldto_stop != NULL) + t->ldt_ops->ldto_stop(t); +} +EXPORT_SYMBOL(lu_device_fini); + +/** + * Initialize object \a o that is part of compound object \a h and was created + * by device \a d. 
+ */ +int lu_object_init(struct lu_object *o, + struct lu_object_header *h, struct lu_device *d) +{ + memset(o, 0, sizeof *o); + o->lo_header = h; + o->lo_dev = d; + lu_device_get(d); + o->lo_dev_ref = lu_ref_add(&d->ld_reference, "lu_object", o); + INIT_LIST_HEAD(&o->lo_linkage); + return 0; +} +EXPORT_SYMBOL(lu_object_init); + +/** + * Finalize object and release its resources. + */ +void lu_object_fini(struct lu_object *o) +{ + struct lu_device *dev = o->lo_dev; + + LASSERT(list_empty(&o->lo_linkage)); + + if (dev != NULL) { + lu_ref_del_at(&dev->ld_reference, + o->lo_dev_ref , "lu_object", o); + lu_device_put(dev); + o->lo_dev = NULL; + } +} +EXPORT_SYMBOL(lu_object_fini); + +/** + * Add object \a o as first layer of compound object \a h + * + * This is typically called by the ->ldo_object_alloc() method of top-level + * device. + */ +void lu_object_add_top(struct lu_object_header *h, struct lu_object *o) +{ + list_move(&o->lo_linkage, &h->loh_layers); +} +EXPORT_SYMBOL(lu_object_add_top); + +/** + * Add object \a o as a layer of compound object, going after \a before. + * + * This is typically called by the ->ldo_object_alloc() method of \a + * before->lo_dev. + */ +void lu_object_add(struct lu_object *before, struct lu_object *o) +{ + list_move(&o->lo_linkage, &before->lo_linkage); +} +EXPORT_SYMBOL(lu_object_add); + +/** + * Initialize compound object. + */ +int lu_object_header_init(struct lu_object_header *h) +{ + memset(h, 0, sizeof *h); + atomic_set(&h->loh_ref, 1); + INIT_HLIST_NODE(&h->loh_hash); + INIT_LIST_HEAD(&h->loh_lru); + INIT_LIST_HEAD(&h->loh_layers); + lu_ref_init(&h->loh_reference); + return 0; +} +EXPORT_SYMBOL(lu_object_header_init); + +/** + * Finalize compound object. 
+ */ +void lu_object_header_fini(struct lu_object_header *h) +{ + LASSERT(list_empty(&h->loh_layers)); + LASSERT(list_empty(&h->loh_lru)); + LASSERT(hlist_unhashed(&h->loh_hash)); + lu_ref_fini(&h->loh_reference); +} +EXPORT_SYMBOL(lu_object_header_fini); + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype) +{ + struct lu_object *o; + + list_for_each_entry(o, &h->loh_layers, lo_linkage) { + if (o->lo_dev->ld_type == dtype) + return o; + } + return NULL; +} +EXPORT_SYMBOL(lu_object_locate); + + + +/** + * Finalize and free devices in the device stack. + * + * Finalize device stack by purging object cache, and calling + * lu_device_type_operations::ldto_device_fini() and + * lu_device_type_operations::ldto_device_free() on all devices in the stack. + */ +void lu_stack_fini(const struct lu_env *env, struct lu_device *top) +{ + struct lu_site *site = top->ld_site; + struct lu_device *scan; + struct lu_device *next; + + lu_site_purge(env, site, ~0); + for (scan = top; scan != NULL; scan = next) { + next = scan->ld_type->ldt_ops->ldto_device_fini(env, scan); + lu_ref_del(&scan->ld_reference, "lu-stack", &lu_site_init); + lu_device_put(scan); + } + + /* purge again. */ + lu_site_purge(env, site, ~0); + + for (scan = top; scan != NULL; scan = next) { + const struct lu_device_type *ldt = scan->ld_type; + struct obd_type *type; + + next = ldt->ldt_ops->ldto_device_free(env, scan); + type = ldt->ldt_obd_type; + if (type != NULL) { + type->typ_refcnt--; + class_put_type(type); + } + } +} +EXPORT_SYMBOL(lu_stack_fini); + +enum { + /** + * Maximal number of tld slots. + */ + LU_CONTEXT_KEY_NR = 40 +}; + +static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, }; + +static DEFINE_SPINLOCK(lu_keys_guard); + +/** + * Global counter incremented whenever key is registered, unregistered, + * revived or quiesced. 
This is used to void unnecessary calls to + * lu_context_refill(). No locking is provided, as initialization and shutdown + * are supposed to be externally serialized. + */ +static unsigned key_set_version = 0; + +/** + * Register new key. + */ +int lu_context_key_register(struct lu_context_key *key) +{ + int result; + int i; + + LASSERT(key->lct_init != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(key->lct_tags != 0); + LASSERT(key->lct_owner != NULL); + + result = -ENFILE; + spin_lock(&lu_keys_guard); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (lu_keys[i] == NULL) { + key->lct_index = i; + atomic_set(&key->lct_used, 1); + lu_keys[i] = key; + lu_ref_init(&key->lct_reference); + result = 0; + ++key_set_version; + break; + } + } + spin_unlock(&lu_keys_guard); + return result; +} +EXPORT_SYMBOL(lu_context_key_register); + +static void key_fini(struct lu_context *ctx, int index) +{ + if (ctx->lc_value != NULL && ctx->lc_value[index] != NULL) { + struct lu_context_key *key; + + key = lu_keys[index]; + LASSERT(key != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(atomic_read(&key->lct_used) > 1); + + key->lct_fini(ctx, key, ctx->lc_value[index]); + lu_ref_del(&key->lct_reference, "ctx", ctx); + atomic_dec(&key->lct_used); + + LASSERT(key->lct_owner != NULL); + if ((ctx->lc_tags & LCT_NOREF) == 0) { +#ifdef CONFIG_MODULE_UNLOAD + LINVRNT(module_refcount(key->lct_owner) > 0); +#endif + module_put(key->lct_owner); + } + ctx->lc_value[index] = NULL; + } +} + +/** + * Deregister key. 
+ */ +void lu_context_key_degister(struct lu_context_key *key) +{ + LASSERT(atomic_read(&key->lct_used) >= 1); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + + lu_context_key_quiesce(key); + + ++key_set_version; + spin_lock(&lu_keys_guard); + key_fini(&lu_shrink_env.le_ctx, key->lct_index); + if (lu_keys[key->lct_index]) { + lu_keys[key->lct_index] = NULL; + lu_ref_fini(&key->lct_reference); + } + spin_unlock(&lu_keys_guard); + + LASSERTF(atomic_read(&key->lct_used) == 1, + "key has instances: %d\n", + atomic_read(&key->lct_used)); +} +EXPORT_SYMBOL(lu_context_key_degister); + +/** + * Register a number of keys. This has to be called after all keys have been + * initialized by a call to LU_CONTEXT_KEY_INIT(). + */ +int lu_context_key_register_many(struct lu_context_key *k, ...) +{ + struct lu_context_key *key = k; + va_list args; + int result; + + va_start(args, k); + do { + result = lu_context_key_register(key); + if (result) + break; + key = va_arg(args, struct lu_context_key *); + } while (key != NULL); + va_end(args); + + if (result != 0) { + va_start(args, k); + while (k != key) { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key *); + } + va_end(args); + } + + return result; +} +EXPORT_SYMBOL(lu_context_key_register_many); + +/** + * De-register a number of keys. This is a dual to + * lu_context_key_register_many(). + */ +void lu_context_key_degister_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_degister_many); + +/** + * Revive a number of keys. + */ +void lu_context_key_revive_many(struct lu_context_key *k, ...) 
+{ + va_list args; + + va_start(args, k); + do { + lu_context_key_revive(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_revive_many); + +/** + * Quiescent a number of keys. + */ +void lu_context_key_quiesce_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_quiesce(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_quiesce_many); + +/** + * Return value associated with key \a key in context \a ctx. + */ +void *lu_context_key_get(const struct lu_context *ctx, + const struct lu_context_key *key) +{ + LINVRNT(ctx->lc_state == LCS_ENTERED); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + LASSERT(lu_keys[key->lct_index] == key); + return ctx->lc_value[key->lct_index]; +} +EXPORT_SYMBOL(lu_context_key_get); + +/** + * List of remembered contexts. XXX document me. + */ +static LIST_HEAD(lu_context_remembered); + +/** + * Destroy \a key in all remembered contexts. This is used to destroy key + * values in "shared" contexts (like service threads), when a module owning + * the key is about to be unloaded. + */ +void lu_context_key_quiesce(struct lu_context_key *key) +{ + struct lu_context *ctx; + + if (!(key->lct_tags & LCT_QUIESCENT)) { + /* + * XXX layering violation. + */ + key->lct_tags |= LCT_QUIESCENT; + /* + * XXX memory barrier has to go here. 
+ */ + spin_lock(&lu_keys_guard); + list_for_each_entry(ctx, &lu_context_remembered, + lc_remember) + key_fini(ctx, key->lct_index); + spin_unlock(&lu_keys_guard); + ++key_set_version; + } +} +EXPORT_SYMBOL(lu_context_key_quiesce); + +void lu_context_key_revive(struct lu_context_key *key) +{ + key->lct_tags &= ~LCT_QUIESCENT; + ++key_set_version; +} +EXPORT_SYMBOL(lu_context_key_revive); + +static void keys_fini(struct lu_context *ctx) +{ + int i; + + if (ctx->lc_value == NULL) + return; + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) + key_fini(ctx, i); + + OBD_FREE(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]); + ctx->lc_value = NULL; +} + +static int keys_fill(struct lu_context *ctx) +{ + int i; + + LINVRNT(ctx->lc_value != NULL); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (ctx->lc_value[i] == NULL && key != NULL && + (key->lct_tags & ctx->lc_tags) && + /* + * Don't create values for a LCT_QUIESCENT key, as this + * will pin module owning a key. + */ + !(key->lct_tags & LCT_QUIESCENT)) { + void *value; + + LINVRNT(key->lct_init != NULL); + LINVRNT(key->lct_index == i); + + value = key->lct_init(ctx, key); + if (unlikely(IS_ERR(value))) + return PTR_ERR(value); + + LASSERT(key->lct_owner != NULL); + if (!(ctx->lc_tags & LCT_NOREF)) + try_module_get(key->lct_owner); + lu_ref_add_atomic(&key->lct_reference, "ctx", ctx); + atomic_inc(&key->lct_used); + /* + * This is the only place in the code, where an + * element of ctx->lc_value[] array is set to non-NULL + * value. + */ + ctx->lc_value[i] = value; + if (key->lct_exit != NULL) + ctx->lc_tags |= LCT_HAS_EXIT; + } + ctx->lc_version = key_set_version; + } + return 0; +} + +static int keys_init(struct lu_context *ctx) +{ + OBD_ALLOC(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]); + if (likely(ctx->lc_value != NULL)) + return keys_fill(ctx); + + return -ENOMEM; +} + +/** + * Initialize context data-structure. 
Create values for all keys. + */ +int lu_context_init(struct lu_context *ctx, __u32 tags) +{ + int rc; + + memset(ctx, 0, sizeof *ctx); + ctx->lc_state = LCS_INITIALIZED; + ctx->lc_tags = tags; + if (tags & LCT_REMEMBER) { + spin_lock(&lu_keys_guard); + list_add(&ctx->lc_remember, &lu_context_remembered); + spin_unlock(&lu_keys_guard); + } else { + INIT_LIST_HEAD(&ctx->lc_remember); + } + + rc = keys_init(ctx); + if (rc != 0) + lu_context_fini(ctx); + + return rc; +} +EXPORT_SYMBOL(lu_context_init); + +/** + * Finalize context data-structure. Destroy key values. + */ +void lu_context_fini(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_FINALIZED; + + if ((ctx->lc_tags & LCT_REMEMBER) == 0) { + LASSERT(list_empty(&ctx->lc_remember)); + keys_fini(ctx); + + } else { /* could race with key degister */ + spin_lock(&lu_keys_guard); + keys_fini(ctx); + list_del_init(&ctx->lc_remember); + spin_unlock(&lu_keys_guard); + } +} +EXPORT_SYMBOL(lu_context_fini); + +/** + * Called before entering context. + */ +void lu_context_enter(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_ENTERED; +} +EXPORT_SYMBOL(lu_context_enter); + +/** + * Called after exiting from \a ctx + */ +void lu_context_exit(struct lu_context *ctx) +{ + int i; + + LINVRNT(ctx->lc_state == LCS_ENTERED); + ctx->lc_state = LCS_LEFT; + if (ctx->lc_tags & LCT_HAS_EXIT && ctx->lc_value != NULL) { + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (ctx->lc_value[i] != NULL) { + struct lu_context_key *key; + + key = lu_keys[i]; + LASSERT(key != NULL); + if (key->lct_exit != NULL) + key->lct_exit(ctx, + key, ctx->lc_value[i]); + } + } + } +} +EXPORT_SYMBOL(lu_context_exit); + +/** + * Allocate for context all missing keys that were registered after context + * creation. key_set_version is only changed in rare cases when modules + * are loaded and removed. 
+ */ +int lu_context_refill(struct lu_context *ctx) +{ + return likely(ctx->lc_version == key_set_version) ? 0 : keys_fill(ctx); +} +EXPORT_SYMBOL(lu_context_refill); + +/** + * lu_ctx_tags/lu_ses_tags will be updated if there are new types of + * obd being added. Currently, this is only used on client side, specifically + * for echo device client, for other stack (like ptlrpc threads), context are + * predefined when the lu_device type are registered, during the module probe + * phase. + */ +__u32 lu_context_tags_default = 0; +__u32 lu_session_tags_default = 0; + +void lu_context_tags_update(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_context_tags_default |= tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_context_tags_update); + +void lu_context_tags_clear(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_context_tags_default &= ~tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_context_tags_clear); + +void lu_session_tags_update(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_session_tags_default |= tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_session_tags_update); + +void lu_session_tags_clear(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_session_tags_default &= ~tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_session_tags_clear); + +int lu_env_init(struct lu_env *env, __u32 tags) +{ + int result; + + env->le_ses = NULL; + result = lu_context_init(&env->le_ctx, tags); + if (likely(result == 0)) + lu_context_enter(&env->le_ctx); + return result; +} +EXPORT_SYMBOL(lu_env_init); + +void lu_env_fini(struct lu_env *env) +{ + lu_context_exit(&env->le_ctx); + lu_context_fini(&env->le_ctx); + env->le_ses = NULL; +} +EXPORT_SYMBOL(lu_env_fini); + +int lu_env_refill(struct lu_env *env) +{ + int result; + + result = lu_context_refill(&env->le_ctx); + if (result == 0 && env->le_ses != NULL) + result = lu_context_refill(env->le_ses); + return 
result; +} +EXPORT_SYMBOL(lu_env_refill); + +/** + * Currently, this API will only be used by echo client. + * Because echo client and normal lustre client will share + * same cl_env cache. So echo client needs to refresh + * the env context after it get one from the cache, especially + * when normal client and echo client co-exist in the same client. + */ +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, + __u32 stags) +{ + int result; + + if ((env->le_ctx.lc_tags & ctags) != ctags) { + env->le_ctx.lc_version = 0; + env->le_ctx.lc_tags |= ctags; + } + + if (env->le_ses && (env->le_ses->lc_tags & stags) != stags) { + env->le_ses->lc_version = 0; + env->le_ses->lc_tags |= stags; + } + + result = lu_env_refill(env); + + return result; +} +EXPORT_SYMBOL(lu_env_refill_by_tags); + +static struct shrinker *lu_site_shrinker = NULL; + +typedef struct lu_site_stats{ + unsigned lss_populated; + unsigned lss_max_search; + unsigned lss_total; + unsigned lss_busy; +} lu_site_stats_t; + +static void lu_site_stats_get(cfs_hash_t *hs, + lu_site_stats_t *stats, int populated) +{ + cfs_hash_bd_t bd; + int i; + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct lu_site_bkt_data *bkt = cfs_hash_bd_extra_get(hs, &bd); + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, 1); + stats->lss_busy += bkt->lsb_busy; + stats->lss_total += cfs_hash_bd_count_get(&bd); + stats->lss_max_search = max((int)stats->lss_max_search, + cfs_hash_bd_depmax_get(&bd)); + if (!populated) { + cfs_hash_bd_unlock(hs, &bd, 1); + continue; + } + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + if (!hlist_empty(hhead)) + stats->lss_populated++; + } + cfs_hash_bd_unlock(hs, &bd, 1); + } +} + + +/* + * There exists a potential lock inversion deadlock scenario when using + * Lustre on top of ZFS. This occurs between one of ZFS's + * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. 
Essentially, + * thread A will take the lu_sites_guard lock and sleep on the ht_lock, + * while thread B will take the ht_lock and sleep on the lu_sites_guard + * lock. Obviously neither thread will wake and drop their respective hold + * on their lock. + * + * To prevent this from happening we must ensure the lu_sites_guard lock is + * not taken while down this code path. ZFS reliably does not set the + * __GFP_FS bit in its code paths, so this can be used to determine if it + * is safe to take the lu_sites_guard lock. + * + * Ideally we should accurately return the remaining number of cached + * objects without taking the lu_sites_guard lock, but this is not + * possible in the current implementation. + */ +static int lu_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + lu_site_stats_t stats; + struct lu_site *s; + struct lu_site *tmp; + int cached = 0; + int remain = shrink_param(sc, nr_to_scan); + LIST_HEAD(splice); + + if (!(shrink_param(sc, gfp_mask) & __GFP_FS)) { + if (remain != 0) + return -1; + else + /* We must not take the lu_sites_guard lock when + * __GFP_FS is *not* set because of the deadlock + * possibility detailed above. Additionally, + * since we cannot determine the number of + * objects in the cache without taking this + * lock, we're in a particularly tough spot. As + * a result, we'll just lie and say our cache is + * empty. This _should_ be ok, as we can't + * reclaim objects when __GFP_FS is *not* set + * anyways. + */ + return 0; + } + + CDEBUG(D_INODE, "Shrink %d objects\n", remain); + + mutex_lock(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) { + if (shrink_param(sc, nr_to_scan) != 0) { + remain = lu_site_purge(&lu_shrink_env, s, remain); + /* + * Move just shrunk site to the tail of site list to + * assure shrinking fairness. 
+ */ + list_move_tail(&s->ls_linkage, &splice); + } + + memset(&stats, 0, sizeof(stats)); + lu_site_stats_get(s->ls_obj_hash, &stats, 0); + cached += stats.lss_total - stats.lss_busy; + if (shrink_param(sc, nr_to_scan) && remain <= 0) + break; + } + list_splice(&splice, lu_sites.prev); + mutex_unlock(&lu_sites_guard); + + cached = (cached / 100) * sysctl_vfs_cache_pressure; + if (shrink_param(sc, nr_to_scan) == 0) + CDEBUG(D_INODE, "%d objects cached\n", cached); + return cached; +} + +/* + * Debugging stuff. + */ + +/** + * Environment to be used in debugger, contains all tags. + */ +struct lu_env lu_debugging_env; + +/** + * Debugging printer function using printk(). + */ +int lu_printk_printer(const struct lu_env *env, + void *unused, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vprintk(format, args); + va_end(args); + return 0; +} + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void) +{ + int result; + + CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys); + + result = lu_ref_global_init(); + if (result != 0) + return result; + + LU_CONTEXT_KEY_INIT(&lu_global_key); + result = lu_context_key_register(&lu_global_key); + if (result != 0) + return result; + + /* + * At this level, we don't know what tags are needed, so allocate them + * conservatively. This should not be too bad, because this + * environment is global. + */ + mutex_lock(&lu_sites_guard); + result = lu_env_init(&lu_shrink_env, LCT_SHRINKER); + mutex_unlock(&lu_sites_guard); + if (result != 0) + return result; + + /* + * seeks estimation: 3 seeks to read a record from oi, one to read + * inode, one for ea. Unfortunately setting this high value results in + * lu_object/inode cache consuming all the memory. + */ + lu_site_shrinker = set_shrinker(DEFAULT_SEEKS, lu_cache_shrink); + if (lu_site_shrinker == NULL) + return -ENOMEM; + + return result; +} + +/** + * Dual to lu_global_init(). 
+ */ +void lu_global_fini(void) +{ + if (lu_site_shrinker != NULL) { + remove_shrinker(lu_site_shrinker); + lu_site_shrinker = NULL; + } + + lu_context_key_degister(&lu_global_key); + + /* + * Tear shrinker environment down _after_ de-registering + * lu_global_key, because the latter has a value in the former. + */ + mutex_lock(&lu_sites_guard); + lu_env_fini(&lu_shrink_env); + mutex_unlock(&lu_sites_guard); + + lu_ref_global_fini(); +} + +static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx) +{ +#ifdef LPROCFS + struct lprocfs_counter ret; + + lprocfs_stats_collect(stats, idx, &ret); + return (__u32)ret.lc_count; +#else + return 0; +#endif +} + +/** + * Output site statistical counters into a buffer. Suitable for + * lprocfs_rd_*()-style functions. + */ +int lu_site_stats_print(const struct lu_site *s, struct seq_file *m) +{ + lu_site_stats_t stats; + + memset(&stats, 0, sizeof(stats)); + lu_site_stats_get(s->ls_obj_hash, &stats, 1); + + return seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d\n", + stats.lss_busy, + stats.lss_total, + stats.lss_populated, + CFS_HASH_NHLIST(s->ls_obj_hash), + stats.lss_max_search, + ls_stats_read(s->ls_stats, LU_SS_CREATED), + ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT), + ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS), + ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE), + ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE), + ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED)); +} +EXPORT_SYMBOL(lu_site_stats_print); + +/** + * Helper function to initialize a number of kmem slab caches at once. 
+ */ +int lu_kmem_init(struct lu_kmem_descr *caches) +{ + int result; + struct lu_kmem_descr *iter = caches; + + for (result = 0; iter->ckd_cache != NULL; ++iter) { + *iter->ckd_cache = kmem_cache_create(iter->ckd_name, + iter->ckd_size, + 0, 0, NULL); + if (*iter->ckd_cache == NULL) { + result = -ENOMEM; + /* free all previously allocated caches */ + lu_kmem_fini(caches); + break; + } + } + return result; +} +EXPORT_SYMBOL(lu_kmem_init); + +/** + * Helper function to finalize a number of kmem slab cached at once. Dual to + * lu_kmem_init(). + */ +void lu_kmem_fini(struct lu_kmem_descr *caches) +{ + for (; caches->ckd_cache != NULL; ++caches) { + if (*caches->ckd_cache != NULL) { + kmem_cache_destroy(*caches->ckd_cache); + *caches->ckd_cache = NULL; + } + } +} +EXPORT_SYMBOL(lu_kmem_fini); + +/** + * Temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid) +{ + struct lu_site *s = o->lo_dev->ld_site; + struct lu_fid *old = &o->lo_header->loh_fid; + struct lu_site_bkt_data *bkt; + struct lu_object *shadow; + wait_queue_t waiter; + cfs_hash_t *hs; + cfs_hash_bd_t bd; + __u64 version = 0; + + LASSERT(fid_is_zero(old)); + + hs = s->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)fid, &bd, 1); + shadow = htable_lookup(s, &bd, fid, &waiter, &version); + /* supposed to be unique */ + LASSERT(shadow == NULL); + *old = *fid; + bkt = cfs_hash_bd_extra_get(hs, &bd); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + bkt->lsb_busy++; + cfs_hash_bd_unlock(hs, &bd, 1); +} +EXPORT_SYMBOL(lu_object_assign_fid); + +/** + * allocates object with 0 (non-assiged) fid + * XXX: temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf) +{ + struct 
lu_fid fid; + struct lu_object *o; + + fid_zero(&fid); + o = lu_object_alloc(env, dev, &fid, conf); + + return o; +} +EXPORT_SYMBOL(lu_object_anon); + +struct lu_buf LU_BUF_NULL = { + .lb_buf = NULL, + .lb_len = 0 +}; +EXPORT_SYMBOL(LU_BUF_NULL); + +void lu_buf_free(struct lu_buf *buf) +{ + LASSERT(buf); + if (buf->lb_buf) { + LASSERT(buf->lb_len > 0); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + buf->lb_buf = NULL; + buf->lb_len = 0; + } +} +EXPORT_SYMBOL(lu_buf_free); + +void lu_buf_alloc(struct lu_buf *buf, int size) +{ + LASSERT(buf); + LASSERT(buf->lb_buf == NULL); + LASSERT(buf->lb_len == 0); + OBD_ALLOC_LARGE(buf->lb_buf, size); + if (likely(buf->lb_buf)) + buf->lb_len = size; +} +EXPORT_SYMBOL(lu_buf_alloc); + +void lu_buf_realloc(struct lu_buf *buf, int size) +{ + lu_buf_free(buf); + lu_buf_alloc(buf, size); +} +EXPORT_SYMBOL(lu_buf_realloc); + +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len) +{ + if (buf->lb_buf == NULL && buf->lb_len == 0) + lu_buf_alloc(buf, len); + + if ((len > buf->lb_len) && (buf->lb_buf != NULL)) + lu_buf_realloc(buf, len); + + return buf; +} +EXPORT_SYMBOL(lu_buf_check_and_alloc); + +/** + * Increase the size of the \a buf. 
+ * preserves old data in buffer + * old buffer remains unchanged on error + * \retval 0 or -ENOMEM + */ +int lu_buf_check_and_grow(struct lu_buf *buf, int len) +{ + char *ptr; + + if (len <= buf->lb_len) + return 0; + + OBD_ALLOC_LARGE(ptr, len); + if (ptr == NULL) + return -ENOMEM; + + /* Free the old buf */ + if (buf->lb_buf != NULL) { + memcpy(ptr, buf->lb_buf, buf->lb_len); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + } + + buf->lb_buf = ptr; + buf->lb_len = len; + return 0; +} +EXPORT_SYMBOL(lu_buf_check_and_grow); diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ref.c b/drivers/staging/lustre/lustre/obdclass/lu_ref.c new file mode 100644 index 000000000000..23a76f158356 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lu_ref.c @@ -0,0 +1,50 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_ref.c + * + * Lustre reference. + * + * Author: Nikita Danilov <nikita.danilov@sun.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +# include <linux/libcfs/libcfs.h> + +#include <obd.h> +#include <obd_class.h> +#include <obd_support.h> +#include <lu_ref.h> diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ucred.c b/drivers/staging/lustre/lustre/obdclass/lu_ucred.c new file mode 100644 index 000000000000..229db6c39b78 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lu_ucred.c @@ -0,0 +1,107 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/lu_object.c + * + * Lustre Object. + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + * + * Author: Nikita Danilov <nikita.danilov@sun.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <linux/libcfs/libcfs.h> +#include <obd_support.h> +#include <lu_object.h> +#include <md_object.h> + +/* context key constructor/destructor: lu_ucred_key_init, lu_ucred_key_fini */ +LU_KEY_INIT_FINI(lu_ucred, struct lu_ucred); + +static struct lu_context_key lu_ucred_key = { + .lct_tags = LCT_SESSION, + .lct_init = lu_ucred_key_init, + .lct_fini = lu_ucred_key_fini +}; + +/** + * Get ucred key if session exists and ucred key is allocated on it. + * Return NULL otherwise. + */ +struct lu_ucred *lu_ucred(const struct lu_env *env) +{ + if (!env->le_ses) + return NULL; + return lu_context_key_get(env->le_ses, &lu_ucred_key); +} +EXPORT_SYMBOL(lu_ucred); + +/** + * Get ucred key and check if it is properly initialized. + * Return NULL otherwise. + */ +struct lu_ucred *lu_ucred_check(const struct lu_env *env) +{ + struct lu_ucred *uc = lu_ucred(env); + if (uc && uc->uc_valid != UCRED_OLD && uc->uc_valid != UCRED_NEW) + return NULL; + return uc; +} +EXPORT_SYMBOL(lu_ucred_check); + +/** + * Get ucred key, which must exist and must be properly initialized. + * Assert otherwise. 
+ */ +struct lu_ucred *lu_ucred_assert(const struct lu_env *env) +{ + struct lu_ucred *uc = lu_ucred_check(env); + LASSERT(uc != NULL); + return uc; +} +EXPORT_SYMBOL(lu_ucred_assert); + +int lu_ucred_global_init(void) +{ + LU_CONTEXT_KEY_INIT(&lu_ucred_key); + return lu_context_key_register(&lu_ucred_key); +} + +void lu_ucred_global_fini(void) +{ + lu_context_key_degister(&lu_ucred_key); +} diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_handles.c b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c new file mode 100644 index 000000000000..69d6499ef731 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c @@ -0,0 +1,263 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/lustre_handles.c + * + * Author: Phil Schwan <phil@clusterfs.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <obd_support.h> +#include <lustre_handles.h> +#include <lustre_lib.h> + + +static __u64 handle_base; +#define HANDLE_INCR 7 +static spinlock_t handle_base_lock; + +static struct handle_bucket { + spinlock_t lock; + struct list_head head; +} *handle_hash; + +#define HANDLE_HASH_SIZE (1 << 16) +#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1) + +/* + * Generate a unique 64bit cookie (hash) for a handle and insert it into + * global (per-node) hash-table. + */ +void class_handle_hash(struct portals_handle *h, + struct portals_handle_ops *ops) +{ + struct handle_bucket *bucket; + ENTRY; + + LASSERT(h != NULL); + LASSERT(list_empty(&h->h_link)); + + /* + * This is fast, but simplistic cookie generation algorithm, it will + * need a re-do at some point in the future for security. + */ + spin_lock(&handle_base_lock); + handle_base += HANDLE_INCR; + + if (unlikely(handle_base == 0)) { + /* + * Cookie of zero is "dangerous", because in many places it's + * assumed that 0 means "unassigned" handle, not bound to any + * object. 
+ */ + CWARN("The universe has been exhausted: cookie wrap-around.\n"); + handle_base += HANDLE_INCR; + } + h->h_cookie = handle_base; + spin_unlock(&handle_base_lock); + + h->h_ops = ops; + spin_lock_init(&h->h_lock); + + bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK]; + spin_lock(&bucket->lock); + list_add_rcu(&h->h_link, &bucket->head); + h->h_in = 1; + spin_unlock(&bucket->lock); + + CDEBUG(D_INFO, "added object %p with handle "LPX64" to hash\n", + h, h->h_cookie); + EXIT; +} +EXPORT_SYMBOL(class_handle_hash); + +static void class_handle_unhash_nolock(struct portals_handle *h) +{ + if (list_empty(&h->h_link)) { + CERROR("removing an already-removed handle ("LPX64")\n", + h->h_cookie); + return; + } + + CDEBUG(D_INFO, "removing object %p with handle "LPX64" from hash\n", + h, h->h_cookie); + + spin_lock(&h->h_lock); + if (h->h_in == 0) { + spin_unlock(&h->h_lock); + return; + } + h->h_in = 0; + spin_unlock(&h->h_lock); + list_del_rcu(&h->h_link); +} + +void class_handle_unhash(struct portals_handle *h) +{ + struct handle_bucket *bucket; + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + class_handle_unhash_nolock(h); + spin_unlock(&bucket->lock); +} +EXPORT_SYMBOL(class_handle_unhash); + +void class_handle_hash_back(struct portals_handle *h) +{ + struct handle_bucket *bucket; + ENTRY; + + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + list_add_rcu(&h->h_link, &bucket->head); + h->h_in = 1; + spin_unlock(&bucket->lock); + + EXIT; +} +EXPORT_SYMBOL(class_handle_hash_back); + +void *class_handle2object(__u64 cookie) +{ + struct handle_bucket *bucket; + struct portals_handle *h; + void *retval = NULL; + ENTRY; + + LASSERT(handle_hash != NULL); + + /* Be careful when you want to change this code. See the + * rcu_read_lock() definition on top this file. 
- jxiong */ + bucket = handle_hash + (cookie & HANDLE_HASH_MASK); + + rcu_read_lock(); + list_for_each_entry_rcu(h, &bucket->head, h_link) { + if (h->h_cookie != cookie) + continue; + + spin_lock(&h->h_lock); + if (likely(h->h_in != 0)) { + h->h_ops->hop_addref(h); + retval = h; + } + spin_unlock(&h->h_lock); + break; + } + rcu_read_unlock(); + + RETURN(retval); +} +EXPORT_SYMBOL(class_handle2object); + +void class_handle_free_cb(cfs_rcu_head_t *rcu) +{ + struct portals_handle *h = RCU2HANDLE(rcu); + void *ptr = (void *)(unsigned long)h->h_cookie; + + if (h->h_ops->hop_free != NULL) + h->h_ops->hop_free(ptr, h->h_size); + else + OBD_FREE(ptr, h->h_size); +} +EXPORT_SYMBOL(class_handle_free_cb); + +int class_handle_init(void) +{ + struct handle_bucket *bucket; + struct timeval tv; + int seed[2]; + + LASSERT(handle_hash == NULL); + + OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE); + if (handle_hash == NULL) + return -ENOMEM; + + spin_lock_init(&handle_base_lock); + for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash; + bucket--) { + INIT_LIST_HEAD(&bucket->head); + spin_lock_init(&bucket->lock); + } + + /** bug 21430: add randomness to the initial base */ + cfs_get_random_bytes(seed, sizeof(seed)); + do_gettimeofday(&tv); + cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]); + + cfs_get_random_bytes(&handle_base, sizeof(handle_base)); + LASSERT(handle_base != 0ULL); + + return 0; +} + +static int cleanup_all_handles(void) +{ + int rc; + int i; + + for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) { + struct portals_handle *h; + + spin_lock(&handle_hash[i].lock); + list_for_each_entry_rcu(h, &(handle_hash[i].head), h_link) { + CERROR("force clean handle "LPX64" addr %p ops %p\n", + h->h_cookie, h, h->h_ops); + + class_handle_unhash_nolock(h); + rc++; + } + spin_unlock(&handle_hash[i].lock); + } + + return rc; +} + +void class_handle_cleanup(void) +{ + int count; + LASSERT(handle_hash != NULL); + + count = cleanup_all_handles(); + + 
OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE); + handle_hash = NULL; + + if (count != 0) + CERROR("handle_count at cleanup: %d\n", count); +} diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_peer.c b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c new file mode 100644 index 000000000000..2fa2589dc8eb --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c @@ -0,0 +1,218 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lprocfs_status.h>
+
+/* Upper bound on the number of NIDs remembered per UUID; adding more
+ * than this trips an LASSERT in class_add_uuid(). */
+#define NIDS_MAX	32
+
+/* One entry in the global uuid->nid translation list: maps a single
+ * obd_uuid to up to NIDS_MAX LNET NIDs. */
+struct uuid_nid_data {
+	struct list_head	un_list;		/* linkage on g_uuid_list */
+	struct obd_uuid		un_uuid;		/* the UUID being mapped */
+	int			un_nid_count;		/* number of valid slots in un_nids[] */
+	lnet_nid_t		un_nids[NIDS_MAX];	/* NIDs registered for this UUID */
+};
+
+/* FIXME: This should probably become more elegant than a global linked list */
+static struct list_head	g_uuid_list;	/* all uuid_nid_data entries */
+static spinlock_t	g_uuid_lock;	/* protects g_uuid_list and entry contents */
+
+/* One-time initialization of the global uuid->nid list and its lock. */
+void class_init_uuidlist(void)
+{
+	INIT_LIST_HEAD(&g_uuid_list);
+	spin_lock_init(&g_uuid_lock);
+}
+
+/* Tear down the global uuid->nid list by deleting every entry. */
+void class_exit_uuidlist(void)
+{
+	/* delete all */
+	class_del_uuid(NULL);
+}
+
+/* Look up the \a index-th NID registered for \a uuid.
+ *
+ * \retval 0 and *peer_nid set on success
+ * \retval -ENOENT if the uuid is unknown or \a index is past the last
+ *	   registered NID for it
+ */
+int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index)
+{
+	struct uuid_nid_data *data;
+	struct obd_uuid tmp;
+	int rc = -ENOENT;
+
+	obd_str2uuid(&tmp, uuid);
+	spin_lock(&g_uuid_lock);
+	list_for_each_entry(data, &g_uuid_list, un_list) {
+		if (obd_uuid_equals(&data->un_uuid, &tmp)) {
+			/* uuid found; fail with -ENOENT if index is
+			 * beyond the registered NID count */
+			if (index >= data->un_nid_count)
+				break;
+
+			rc = 0;
+			*peer_nid = data->un_nids[index];
+			break;
+		}
+	}
+	spin_unlock(&g_uuid_lock);
+	return rc;
+}
+EXPORT_SYMBOL(lustre_uuid_to_peer);
+
+/* Add a nid to a niduuid. Multiple nids can be added to a single uuid;
+   LNET will choose the best one.
 */
+int class_add_uuid(const char *uuid, __u64 nid)
+{
+	struct uuid_nid_data *data, *entry;
+	int found = 0;
+
+	LASSERT(nid != 0);  /* valid newconfig NID is never zero */
+
+	if (strlen(uuid) > UUID_MAX - 1)
+		return -EOVERFLOW;
+
+	/* Allocate the candidate entry before taking the spinlock, so no
+	 * allocation happens under the lock; it is freed below if the
+	 * uuid turns out to exist already. */
+	OBD_ALLOC_PTR(data);
+	if (data == NULL)
+		return -ENOMEM;
+
+	obd_str2uuid(&data->un_uuid, uuid);
+	data->un_nids[0] = nid;
+	data->un_nid_count = 1;
+
+	spin_lock(&g_uuid_lock);
+	list_for_each_entry(entry, &g_uuid_list, un_list) {
+		if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) {
+			int i;
+
+			found = 1;
+			/* check whether this nid is already registered */
+			for (i = 0; i < entry->un_nid_count; i++)
+				if (nid == entry->un_nids[i])
+					break;
+
+			if (i == entry->un_nid_count) {
+				/* NOTE(review): asserts instead of failing
+				 * gracefully when a uuid accumulates more
+				 * than NIDS_MAX nids */
+				LASSERT(entry->un_nid_count < NIDS_MAX);
+				entry->un_nids[entry->un_nid_count++] = nid;
+			}
+			break;
+		}
+	}
+	if (!found)
+		list_add(&data->un_list, &g_uuid_list);
+	spin_unlock(&g_uuid_lock);
+
+	if (found) {
+		/* NOTE(review): entry->un_nid_count is read here after the
+		 * lock was dropped, so the printed count may be stale;
+		 * debug-only output, but worth confirming upstream */
+		CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid,
+		       libcfs_nid2str(nid), entry->un_nid_count);
+		OBD_FREE(data, sizeof(*data));
+	} else {
+		CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid));
+	}
+	return 0;
+}
+EXPORT_SYMBOL(class_add_uuid);
+
+/* Delete the nids for one uuid if specified, otherwise delete all */
+int class_del_uuid(const char *uuid)
+{
+	LIST_HEAD(deathrow);
+	struct uuid_nid_data *data;
+
+	/* Move victims onto a private list under the lock, then free
+	 * them after dropping it. */
+	spin_lock(&g_uuid_lock);
+	if (uuid != NULL) {
+		struct obd_uuid tmp;
+
+		obd_str2uuid(&tmp, uuid);
+		list_for_each_entry(data, &g_uuid_list, un_list) {
+			if (obd_uuid_equals(&data->un_uuid, &tmp)) {
+				list_move(&data->un_list, &deathrow);
+				break;
+			}
+		}
+	} else
+		list_splice_init(&g_uuid_list, &deathrow);
+	spin_unlock(&g_uuid_lock);
+
+	if (uuid != NULL && list_empty(&deathrow)) {
+		CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid);
+		return -EINVAL;
+	}
+
+	while (!list_empty(&deathrow)) {
+		data = list_entry(deathrow.next, struct uuid_nid_data,
+				  un_list);
+		list_del(&data->un_list);
+
+		CDEBUG(D_INFO, "del uuid %s %s/%d\n",
+		       obd_uuid2str(&data->un_uuid),
+
libcfs_nid2str(data->un_nids[0]), + data->un_nid_count); + + OBD_FREE(data, sizeof(*data)); + } + + return 0; +} + +/* check if @nid exists in nid list of @uuid */ +int class_check_uuid(struct obd_uuid *uuid, __u64 nid) +{ + struct uuid_nid_data *entry; + int found = 0; + ENTRY; + + CDEBUG(D_INFO, "check if uuid %s has %s.\n", + obd_uuid2str(uuid), libcfs_nid2str(nid)); + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + int i; + + if (!obd_uuid_equals(&entry->un_uuid, uuid)) + continue; + + /* found the uuid, check if it has @nid */ + for (i = 0; i < entry->un_nid_count; i++) { + if (entry->un_nids[i] == nid) { + found = 1; + break; + } + } + break; + } + spin_unlock(&g_uuid_lock); + RETURN(found); +} +EXPORT_SYMBOL(class_check_uuid); diff --git a/drivers/staging/lustre/lustre/obdclass/md_attrs.c b/drivers/staging/lustre/lustre/obdclass/md_attrs.c new file mode 100644 index 000000000000..b71344a04c7e --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/md_attrs.c @@ -0,0 +1,202 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, Intel Corporation. 
+ * Use is subject to license terms.
+ *
+ * Author: Johann Lombardi <johann.lombardi@intel.com>
+ */
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+#include <md_object.h>
+
+/**
+ * Initialize new \a lma. Only fid is stored.
+ *
+ * \param lma - is the new LMA structure to be initialized
+ * \param fid - is the FID of the object this LMA belongs to
+ * \param incompat - features that MDS must understand to access object
+ */
+void lustre_lma_init(struct lustre_mdt_attrs *lma, const struct lu_fid *fid,
+		     __u32 incompat)
+{
+	lma->lma_compat = 0;
+	lma->lma_incompat = incompat;
+	lma->lma_self_fid = *fid;
+
+	/* If a field is added in struct lustre_mdt_attrs, zero it explicitly
+	 * and change the test below.  The compile-time size check guarantees
+	 * lma_self_fid is the last member, i.e. every member was set above. */
+	LASSERT(sizeof(*lma) ==
+		(offsetof(struct lustre_mdt_attrs, lma_self_fid) +
+		 sizeof(lma->lma_self_fid)));
+};	/* NOTE(review): stray ';' after function body -- harmless */
+EXPORT_SYMBOL(lustre_lma_init);
+
+/**
+ * Swab, if needed, LMA structure which is stored on-disk in little-endian order.
+ *
+ * \param lma - is a pointer to the LMA structure to be swabbed.
+ */
+void lustre_lma_swab(struct lustre_mdt_attrs *lma)
+{
+	/* Use LUSTRE_MSG_MAGIC to detect local endianness; the branch is
+	 * compile-time false on little-endian hosts. */
+	if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) {
+		__swab32s(&lma->lma_compat);
+		__swab32s(&lma->lma_incompat);
+		lustre_swab_lu_fid(&lma->lma_self_fid);
+	}
+};	/* NOTE(review): stray ';' after function body -- harmless */
+EXPORT_SYMBOL(lustre_lma_swab);
+
+/**
+ * Swab, if needed, SOM structure which is stored on-disk in little-endian
+ * order.
+ *
+ * \param attrs - is a pointer to the SOM structure to be swabbed.
+ */
+void lustre_som_swab(struct som_attrs *attrs)
+{
+	/* Use LUSTRE_MSG_MAGIC to detect local endianness. */
+	if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) {
+		__swab32s(&attrs->som_compat);
+		__swab32s(&attrs->som_incompat);
+		__swab64s(&attrs->som_ioepoch);
+		__swab64s(&attrs->som_size);
+		__swab64s(&attrs->som_blocks);
+		__swab64s(&attrs->som_mountid);
+	}
+};	/* NOTE(review): stray ';' after function body -- harmless */
+EXPORT_SYMBOL(lustre_som_swab);
+
+/*
+ * Swab and extract SOM attributes from on-disk xattr.
+ * + * \param buf - is a buffer containing the on-disk SOM extended attribute. + * \param rc - is the SOM xattr stored in \a buf + * \param msd - is the md_som_data structure where to extract SOM attributes. + */ +int lustre_buf2som(void *buf, int rc, struct md_som_data *msd) +{ + struct som_attrs *attrs = (struct som_attrs *)buf; + ENTRY; + + if (rc == 0 || rc == -ENODATA) + /* no SOM attributes */ + RETURN(-ENODATA); + + if (rc < 0) + /* error hit while fetching xattr */ + RETURN(rc); + + /* check SOM compatibility */ + if (attrs->som_incompat & ~cpu_to_le32(SOM_INCOMPAT_SUPP)) + RETURN(-ENODATA); + + /* unpack SOM attributes */ + lustre_som_swab(attrs); + + /* fill in-memory msd structure */ + msd->msd_compat = attrs->som_compat; + msd->msd_incompat = attrs->som_incompat; + msd->msd_ioepoch = attrs->som_ioepoch; + msd->msd_size = attrs->som_size; + msd->msd_blocks = attrs->som_blocks; + msd->msd_mountid = attrs->som_mountid; + + RETURN(0); +} +EXPORT_SYMBOL(lustre_buf2som); + +/** + * Swab, if needed, HSM structure which is stored on-disk in little-endian + * order. + * + * \param attrs - is a pointer to the HSM structure to be swabbed. + */ +void lustre_hsm_swab(struct hsm_attrs *attrs) +{ + /* Use LUSTRE_MSG_MAGIC to detect local endianess. */ + if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) { + __swab32s(&attrs->hsm_compat); + __swab32s(&attrs->hsm_flags); + __swab64s(&attrs->hsm_arch_id); + __swab64s(&attrs->hsm_arch_ver); + } +}; +EXPORT_SYMBOL(lustre_hsm_swab); + +/* + * Swab and extract HSM attributes from on-disk xattr. + * + * \param buf - is a buffer containing the on-disk HSM extended attribute. + * \param rc - is the HSM xattr stored in \a buf + * \param mh - is the md_hsm structure where to extract HSM attributes. 
+ */ +int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh) +{ + struct hsm_attrs *attrs = (struct hsm_attrs *)buf; + ENTRY; + + if (rc == 0 || rc == -ENODATA) + /* no HSM attributes */ + RETURN(-ENODATA); + + if (rc < 0) + /* error hit while fetching xattr */ + RETURN(rc); + + /* unpack HSM attributes */ + lustre_hsm_swab(attrs); + + /* fill md_hsm structure */ + mh->mh_compat = attrs->hsm_compat; + mh->mh_flags = attrs->hsm_flags; + mh->mh_arch_id = attrs->hsm_arch_id; + mh->mh_arch_ver = attrs->hsm_arch_ver; + + RETURN(0); +} +EXPORT_SYMBOL(lustre_buf2hsm); + +/* + * Pack HSM attributes. + * + * \param buf - is the output buffer where to pack the on-disk HSM xattr. + * \param mh - is the md_hsm structure to pack. + */ +void lustre_hsm2buf(void *buf, struct md_hsm *mh) +{ + struct hsm_attrs *attrs = (struct hsm_attrs *)buf; + ENTRY; + + /* copy HSM attributes */ + attrs->hsm_compat = mh->mh_compat; + attrs->hsm_flags = mh->mh_flags; + attrs->hsm_arch_id = mh->mh_arch_id; + attrs->hsm_arch_ver = mh->mh_arch_ver; + + /* pack xattr */ + lustre_hsm_swab(attrs); +} +EXPORT_SYMBOL(lustre_hsm2buf); diff --git a/drivers/staging/lustre/lustre/obdclass/mea.c b/drivers/staging/lustre/lustre/obdclass/mea.c new file mode 100644 index 000000000000..c4f0dbc23611 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/mea.c @@ -0,0 +1,112 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_class.h>
+#include <linux/kmod.h>   /* for request_module() */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+
+/* Stripe selection hash: use only the last character of the name,
+ * modulo the stripe count.  A NUL last character suggests the caller
+ * passed a wrong length, hence the warning.
+ * NOTE(review): name[namelen - 1] is a plain char; a high-bit byte
+ * sign-extends before the unsigned conversion -- presumably intended
+ * to match the on-disk hash, confirm against the MDT implementation. */
+static int mea_last_char_hash(int count, char *name, int namelen)
+{
+	unsigned int c;
+
+	c = name[namelen - 1];
+	if (c == 0)
+		CWARN("looks like wrong len is passed\n");
+	c = c % count;
+	return c;
+}
+
+/* Stripe selection hash: sum of all bytes of the name, modulo the
+ * stripe count. */
+static int mea_all_chars_hash(int count, char *name, int namelen)
+{
+	unsigned int c = 0;
+
+	while (--namelen >= 0)
+		c += name[namelen];
+	c = c % count;
+	return c;
+}
+
+/* Map a file name to a stripe index in [0, count) using \a hashtype.
+ * Volatile file names carry an explicit stripe index which overrides
+ * hashing when it is in range; unknown/unsupported hash types fall
+ * back to index 0 (after logging an error).
+ * NOTE(review): assumes count > 0 whenever the hash path is taken;
+ * LASSERT(c < count) compares unsigned c against (promoted) count. */
+int raw_name2idx(int hashtype, int count, const char *name, int namelen)
+{
+	unsigned int c = 0;
+	int idx;
+
+	LASSERT(namelen > 0);
+
+	/* volatile files embed a target stripe index in the name */
+	if (filename_is_volatile(name, namelen, &idx)) {
+		if ((idx >= 0) && (idx < count))
+			return idx;
+		goto hashchoice;
+	}
+
+	/* single stripe: only one possible answer */
+	if (count <= 1)
+		return 0;
+
+hashchoice:
+	switch (hashtype) {
+	case MEA_MAGIC_LAST_CHAR:
+		c = mea_last_char_hash(count, (char *)name, namelen);
+		break;
+	case MEA_MAGIC_ALL_CHARS:
+		c = mea_all_chars_hash(count, (char *)name, namelen);
+		break;
+	case MEA_MAGIC_HASH_SEGMENT:
+		CERROR("Unsupported hash type MEA_MAGIC_HASH_SEGMENT\n");
+		break;
+	default:
+		CERROR("Unknown hash type 0x%x\n", hashtype);
+	}
+
+	LASSERT(c < count);
+	return c;
+}
+EXPORT_SYMBOL(raw_name2idx); + +int mea_name2idx(struct lmv_stripe_md *mea, const char *name, int namelen) +{ + unsigned int c; + + LASSERT(mea && mea->mea_count); + + c = raw_name2idx(mea->mea_magic, mea->mea_count, name, namelen); + + LASSERT(c < mea->mea_count); + return c; +} +EXPORT_SYMBOL(mea_name2idx); diff --git a/drivers/staging/lustre/lustre/obdclass/obd_config.c b/drivers/staging/lustre/lustre/obdclass/obd_config.c new file mode 100644 index 000000000000..bbf06d009fd0 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/obd_config.c @@ -0,0 +1,1904 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/obd_config.c + * + * Config API + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#include <obd_class.h> +#include <linux/string.h> +#include <lustre_log.h> +#include <lprocfs_status.h> +#include <lustre_param.h> + +#include "llog_internal.h" + +static cfs_hash_ops_t uuid_hash_ops; +static cfs_hash_ops_t nid_hash_ops; +static cfs_hash_ops_t nid_stat_hash_ops; + +/*********** string parsing utils *********/ + +/* returns 0 if we find this key in the buffer, else 1 */ +int class_find_param(char *buf, char *key, char **valp) +{ + char *ptr; + + if (!buf) + return 1; + + if ((ptr = strstr(buf, key)) == NULL) + return 1; + + if (valp) + *valp = ptr + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_find_param); + +/** + * Check whether the proc parameter \a param is an old parameter or not from + * the array \a ptr which contains the mapping from old parameters to new ones. + * If it's an old one, then return the pointer to the cfg_interop_param struc- + * ture which contains both the old and new parameters. + * + * \param param proc parameter + * \param ptr an array which contains the mapping from + * old parameters to new ones + * + * \retval valid-pointer pointer to the cfg_interop_param structure + * which contains the old and new parameters + * \retval NULL \a param or \a ptr is NULL, + * or \a param is not an old parameter + */ +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr) +{ + char *value = NULL; + int name_len = 0; + + if (param == NULL || ptr == NULL) + RETURN(NULL); + + value = strchr(param, '='); + if (value == NULL) + name_len = strlen(param); + else + name_len = value - param; + + while (ptr->old_param != NULL) { + if (strncmp(param, ptr->old_param, name_len) == 0 && + name_len == strlen(ptr->old_param)) + RETURN(ptr); + ptr++; + } + + RETURN(NULL); +} +EXPORT_SYMBOL(class_find_old_param); + +/** + * Finds a parameter in \a params and copies it to \a copy. 
+ * + * Leading spaces are skipped. Next space or end of string is the + * parameter terminator with the exception that spaces inside single or double + * quotes get included into a parameter. The parameter is copied into \a copy + * which has to be allocated big enough by a caller, quotes are stripped in + * the copy and the copy is terminated by 0. + * + * On return \a params is set to next parameter or to NULL if last + * parameter is returned. + * + * \retval 0 if parameter is returned in \a copy + * \retval 1 otherwise + * \retval -EINVAL if unbalanced quota is found + */ +int class_get_next_param(char **params, char *copy) +{ + char *q1, *q2, *str; + int len; + + str = *params; + while (*str == ' ') + str++; + + if (*str == '\0') { + *params = NULL; + return 1; + } + + while (1) { + q1 = strpbrk(str, " '\""); + if (q1 == NULL) { + len = strlen(str); + memcpy(copy, str, len); + copy[len] = '\0'; + *params = NULL; + return 0; + } + len = q1 - str; + if (*q1 == ' ') { + memcpy(copy, str, len); + copy[len] = '\0'; + *params = str + len; + return 0; + } + + memcpy(copy, str, len); + copy += len; + + /* search for the matching closing quote */ + str = q1 + 1; + q2 = strchr(str, *q1); + if (q2 == NULL) { + CERROR("Unbalanced quota in parameters: \"%s\"\n", + *params); + return -EINVAL; + } + len = q2 - str; + memcpy(copy, str, len); + copy += len; + str = q2 + 1; + } + return 1; +} +EXPORT_SYMBOL(class_get_next_param); + +/* returns 0 if this is the first key in the buffer, else 1. + valp points to first char after key. 
*/ +int class_match_param(char *buf, char *key, char **valp) +{ + if (!buf) + return 1; + + if (memcmp(buf, key, strlen(key)) != 0) + return 1; + + if (valp) + *valp = buf + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_match_param); + +static int parse_nid(char *buf, void *value, int quiet) +{ + lnet_nid_t *nid = (lnet_nid_t *)value; + + *nid = libcfs_str2nid(buf); + if (*nid != LNET_NID_ANY) + return 0; + + if (!quiet) + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", buf); + return -EINVAL; +} + +static int parse_net(char *buf, void *value) +{ + __u32 *net = (__u32 *)value; + + *net = libcfs_str2net(buf); + CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*net)); + return 0; +} + +enum { + CLASS_PARSE_NID = 1, + CLASS_PARSE_NET, +}; + +/* 0 is good nid, + 1 not found + < 0 error + endh is set to next separator */ +static int class_parse_value(char *buf, int opc, void *value, char **endh, + int quiet) +{ + char *endp; + char tmp; + int rc = 0; + + if (!buf) + return 1; + while (*buf == ',' || *buf == ':') + buf++; + if (*buf == ' ' || *buf == '/' || *buf == '\0') + return 1; + + /* nid separators or end of nids */ + endp = strpbrk(buf, ",: /"); + if (endp == NULL) + endp = buf + strlen(buf); + + tmp = *endp; + *endp = '\0'; + switch (opc) { + default: + LBUG(); + case CLASS_PARSE_NID: + rc = parse_nid(buf, value, quiet); + break; + case CLASS_PARSE_NET: + rc = parse_net(buf, value); + break; + } + *endp = tmp; + if (rc != 0) + return rc; + if (endh) + *endh = endp; + return 0; +} + +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 0); +} +EXPORT_SYMBOL(class_parse_nid); + +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 1); +} +EXPORT_SYMBOL(class_parse_nid_quiet); + +int class_parse_net(char *buf, __u32 *net, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NET, (void *)net, 
			       endh, 0);
+}
+EXPORT_SYMBOL(class_parse_net);
+
+/* Check whether any NID listed after \a key in \a buf equals \a nid.
+ *
+ * 1 param contains key and match
+ * 0 param contains key and not match
+ * -1 param does not contain key
+ */
+int class_match_nid(char *buf, char *key, lnet_nid_t nid)
+{
+	lnet_nid_t tmp;
+	int rc = -1;
+
+	/* scan every occurrence of key in the buffer */
+	while (class_find_param(buf, key, &buf) == 0) {
+		/* please restrict to the nids pertaining to
+		 * the specified nids */
+		while (class_parse_nid(buf, &tmp, &buf) == 0) {
+			if (tmp == nid)
+				return 1;
+		}
+		/* key was present but this nid was not listed */
+		rc = 0;
+	}
+	return rc;
+}
+EXPORT_SYMBOL(class_match_nid);
+
+/* Same contract as class_match_nid(), but matching an LNET network
+ * number instead of a full NID. */
+int class_match_net(char *buf, char *key, __u32 net)
+{
+	__u32 tmp;
+	int rc = -1;
+
+	while (class_find_param(buf, key, &buf) == 0) {
+		/* please restrict to the nids pertaining to
+		 * the specified networks */
+		while (class_parse_net(buf, &tmp, &buf) == 0) {
+			if (tmp == net)
+				return 1;
+		}
+		rc = 0;
+	}
+	return rc;
+}
+EXPORT_SYMBOL(class_match_net);
+
+/********************** class fns **********************/
+
+/**
+ * Create a new obd device and set the type, name and uuid. If successful,
+ * the new device can be accessed by either name or uuid.
+ */ +int class_attach(struct lustre_cfg *lcfg) +{ + struct obd_device *obd = NULL; + char *typename, *name, *uuid; + int rc, len; + ENTRY; + + if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("No type passed!\n"); + RETURN(-EINVAL); + } + typename = lustre_cfg_string(lcfg, 1); + + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) { + CERROR("No name passed!\n"); + RETURN(-EINVAL); + } + name = lustre_cfg_string(lcfg, 0); + + if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) { + CERROR("No UUID passed!\n"); + RETURN(-EINVAL); + } + uuid = lustre_cfg_string(lcfg, 2); + + CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n", + MKSTR(typename), MKSTR(name), MKSTR(uuid)); + + obd = class_newdev(typename, name); + if (IS_ERR(obd)) { + /* Already exists or out of obds */ + rc = PTR_ERR(obd); + obd = NULL; + CERROR("Cannot create device %s of type %s : %d\n", + name, typename, rc); + GOTO(out, rc); + } + LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n", + name, typename); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08X != %08X\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, + "%p obd_name %s != %s\n", obd, obd->obd_name, name); + + rwlock_init(&obd->obd_pool_lock); + obd->obd_pool_limit = 0; + obd->obd_pool_slv = 0; + + INIT_LIST_HEAD(&obd->obd_exports); + INIT_LIST_HEAD(&obd->obd_unlinked_exports); + INIT_LIST_HEAD(&obd->obd_delayed_exports); + INIT_LIST_HEAD(&obd->obd_exports_timed); + INIT_LIST_HEAD(&obd->obd_nid_stats); + spin_lock_init(&obd->obd_nid_lock); + spin_lock_init(&obd->obd_dev_lock); + mutex_init(&obd->obd_dev_mutex); + spin_lock_init(&obd->obd_osfs_lock); + /* obd->obd_osfs_age must be set to a value in the distant + * past to guarantee a fresh statfs is fetched on mount. 
*/ + obd->obd_osfs_age = cfs_time_shift_64(-1000); + + /* XXX belongs in setup not attach */ + init_rwsem(&obd->obd_observer_link_sem); + /* recovery data */ + cfs_init_timer(&obd->obd_recovery_timer); + spin_lock_init(&obd->obd_recovery_task_lock); + init_waitqueue_head(&obd->obd_next_transno_waitq); + init_waitqueue_head(&obd->obd_evict_inprogress_waitq); + INIT_LIST_HEAD(&obd->obd_req_replay_queue); + INIT_LIST_HEAD(&obd->obd_lock_replay_queue); + INIT_LIST_HEAD(&obd->obd_final_req_queue); + INIT_LIST_HEAD(&obd->obd_evict_list); + + llog_group_init(&obd->obd_olg, FID_SEQ_LLOG); + + obd->obd_conn_inprogress = 0; + + len = strlen(uuid); + if (len >= sizeof(obd->obd_uuid)) { + CERROR("uuid must be < %d bytes long\n", + (int)sizeof(obd->obd_uuid)); + GOTO(out, rc = -EINVAL); + } + memcpy(obd->obd_uuid.uuid, uuid, len); + + /* do the attach */ + if (OBP(obd, attach)) { + rc = OBP(obd,attach)(obd, sizeof *lcfg, lcfg); + if (rc) + GOTO(out, rc = -EINVAL); + } + + /* Detach drops this */ + spin_lock(&obd->obd_dev_lock); + atomic_set(&obd->obd_refcount, 1); + spin_unlock(&obd->obd_dev_lock); + lu_ref_init(&obd->obd_reference); + lu_ref_add(&obd->obd_reference, "attach", obd); + + obd->obd_attached = 1; + CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", + obd->obd_minor, typename, atomic_read(&obd->obd_refcount)); + RETURN(0); + out: + if (obd != NULL) { + class_release_dev(obd); + } + return rc; +} +EXPORT_SYMBOL(class_attach); + +/** Create hashes, self-export, and call type-specific setup. + * Setup is effectively the "start this obd" call. 
+ */ +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + struct obd_export *exp; + ENTRY; + + LASSERT(obd != NULL); + LASSERTF(obd == class_num2obd(obd->obd_minor), + "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, class_num2obd(obd->obd_minor)); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + + /* have we attached a type to this device? */ + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + RETURN(-ENODEV); + } + + if (obd->obd_set_up) { + CERROR("Device %d already setup (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + RETURN(-EEXIST); + } + + /* is someone else setting us up right now? (attach inits spinlock) */ + spin_lock(&obd->obd_dev_lock); + if (obd->obd_starting) { + spin_unlock(&obd->obd_dev_lock); + CERROR("Device %d setup in progress (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + RETURN(-EEXIST); + } + /* just leave this on forever. I can't use obd_set_up here because + other fns check that status, and we're not actually set up yet. 
*/ + obd->obd_starting = 1; + obd->obd_uuid_hash = NULL; + obd->obd_nid_hash = NULL; + obd->obd_nid_stats_hash = NULL; + spin_unlock(&obd->obd_dev_lock); + + /* create an uuid-export lustre hash */ + obd->obd_uuid_hash = cfs_hash_create("UUID_HASH", + HASH_UUID_CUR_BITS, + HASH_UUID_MAX_BITS, + HASH_UUID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &uuid_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_uuid_hash) + GOTO(err_hash, err = -ENOMEM); + + /* create a nid-export lustre hash */ + obd->obd_nid_hash = cfs_hash_create("NID_HASH", + HASH_NID_CUR_BITS, + HASH_NID_MAX_BITS, + HASH_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_nid_hash) + GOTO(err_hash, err = -ENOMEM); + + /* create a nid-stats lustre hash */ + obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS", + HASH_NID_STATS_CUR_BITS, + HASH_NID_STATS_MAX_BITS, + HASH_NID_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_stat_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_nid_stats_hash) + GOTO(err_hash, err = -ENOMEM); + + exp = class_new_export(obd, &obd->obd_uuid); + if (IS_ERR(exp)) + GOTO(err_hash, err = PTR_ERR(exp)); + + obd->obd_self_export = exp; + list_del_init(&exp->exp_obd_chain_timed); + class_export_put(exp); + + err = obd_setup(obd, lcfg); + if (err) + GOTO(err_exp, err); + + obd->obd_set_up = 1; + + spin_lock(&obd->obd_dev_lock); + /* cleanup drops this */ + class_incref(obd, "setup", obd); + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + RETURN(0); +err_exp: + if (obd->obd_self_export) { + class_unlink_export(obd->obd_self_export); + obd->obd_self_export = NULL; + } +err_hash: + if (obd->obd_uuid_hash) { + cfs_hash_putref(obd->obd_uuid_hash); + obd->obd_uuid_hash = NULL; + } + if (obd->obd_nid_hash) { + cfs_hash_putref(obd->obd_nid_hash); + obd->obd_nid_hash = NULL; + } + if (obd->obd_nid_stats_hash) { + 
cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + obd->obd_starting = 0; + CERROR("setup %s failed (%d)\n", obd->obd_name, err); + return err; +} +EXPORT_SYMBOL(class_setup); + +/** We have finished using this obd and are ready to destroy it. + * There can be no more references to this obd. + */ +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + ENTRY; + + if (obd->obd_set_up) { + CERROR("OBD device %d still set up\n", obd->obd_minor); + RETURN(-EBUSY); + } + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_attached) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD device %d not attached\n", obd->obd_minor); + RETURN(-ENODEV); + } + obd->obd_attached = 0; + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + class_decref(obd, "attach", obd); + RETURN(0); +} +EXPORT_SYMBOL(class_detach); + +/** Start shutting down the obd. There may be in-progess ops when + * this is called. We tell them to start shutting down with a call + * to class_disconnect_exports(). + */ +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + char *flag; + ENTRY; + + OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS); + + if (!obd->obd_set_up) { + CERROR("Device %d not setup\n", obd->obd_minor); + RETURN(-ENODEV); + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD %d already stopping\n", obd->obd_minor); + RETURN(-ENODEV); + } + /* Leave this on forever */ + obd->obd_stopping = 1; + + /* wait for already-arrived-connections to finish. 
*/ + while (obd->obd_conn_inprogress > 0) { + spin_unlock(&obd->obd_dev_lock); + + cond_resched(); + + spin_lock(&obd->obd_dev_lock); + } + spin_unlock(&obd->obd_dev_lock); + + if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) { + for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++) + switch (*flag) { + case 'F': + obd->obd_force = 1; + break; + case 'A': + LCONSOLE_WARN("Failing over %s\n", + obd->obd_name); + obd->obd_fail = 1; + obd->obd_no_transno = 1; + obd->obd_no_recov = 1; + if (OBP(obd, iocontrol)) { + obd_iocontrol(OBD_IOC_SYNC, + obd->obd_self_export, + 0, NULL, NULL); + } + break; + default: + CERROR("Unrecognised flag '%c'\n", *flag); + } + } + + LASSERT(obd->obd_self_export); + + /* The three references that should be remaining are the + * obd_self_export and the attach and setup references. */ + if (atomic_read(&obd->obd_refcount) > 3) { + /* refcounf - 3 might be the number of real exports + (excluding self export). But class_incref is called + by other things as well, so don't count on it. */ + CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n", + obd->obd_name, atomic_read(&obd->obd_refcount) - 3); + dump_exports(obd, 0); + class_disconnect_exports(obd); + } + + /* Precleanup, we must make sure all exports get destroyed. 
*/ + err = obd_precleanup(obd, OBD_CLEANUP_EXPORTS); + if (err) + CERROR("Precleanup %s returned %d\n", + obd->obd_name, err); + + /* destroy an uuid-export hash body */ + if (obd->obd_uuid_hash) { + cfs_hash_putref(obd->obd_uuid_hash); + obd->obd_uuid_hash = NULL; + } + + /* destroy a nid-export hash body */ + if (obd->obd_nid_hash) { + cfs_hash_putref(obd->obd_nid_hash); + obd->obd_nid_hash = NULL; + } + + /* destroy a nid-stats hash body */ + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + + class_decref(obd, "setup", obd); + obd->obd_set_up = 0; + + RETURN(0); +} +EXPORT_SYMBOL(class_cleanup); + +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, const void *source) +{ + lu_ref_add_atomic(&obd->obd_reference, scope, source); + atomic_inc(&obd->obd_refcount); + CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount)); + + return obd; +} +EXPORT_SYMBOL(class_incref); + +void class_decref(struct obd_device *obd, const char *scope, const void *source) +{ + int err; + int refs; + + spin_lock(&obd->obd_dev_lock); + atomic_dec(&obd->obd_refcount); + refs = atomic_read(&obd->obd_refcount); + spin_unlock(&obd->obd_dev_lock); + lu_ref_del(&obd->obd_reference, scope, source); + + CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs); + + if ((refs == 1) && obd->obd_stopping) { + /* All exports have been destroyed; there should + be no more in-progress ops by this point.*/ + + spin_lock(&obd->obd_self_export->exp_lock); + obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd); + spin_unlock(&obd->obd_self_export->exp_lock); + + /* note that we'll recurse into class_decref again */ + class_unlink_export(obd->obd_self_export); + return; + } + + if (refs == 0) { + CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", + obd->obd_name, obd->obd_uuid.uuid); + LASSERT(!obd->obd_attached); + if (obd->obd_stopping) { + /* If we're 
not stopping, we were never set up */ + err = obd_cleanup(obd); + if (err) + CERROR("Cleanup %s returned %d\n", + obd->obd_name, err); + } + if (OBP(obd, detach)) { + err = OBP(obd, detach)(obd); + if (err) + CERROR("Detach returned %d\n", err); + } + class_release_dev(obd); + } +} +EXPORT_SYMBOL(class_decref); + +/** Add a failover nid location. + * Client obd types contact server obd types using this nid list. + */ +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + RETURN(-EINVAL); + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) { + CERROR("can't add connection on non-client dev\n"); + RETURN(-EINVAL); + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to add conn on immature client dev\n"); + RETURN(-EINVAL); + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num); + + RETURN(rc); +} +EXPORT_SYMBOL(class_add_conn); + +/** Remove a failover nid location. 
+ */ +int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + RETURN(-EINVAL); + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { + CERROR("can't del connection on non-client dev\n"); + RETURN(-EINVAL); + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to del conn on immature client dev\n"); + RETURN(-EINVAL); + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_del_conn(imp, &uuid); + + RETURN(rc); +} + +LIST_HEAD(lustre_profile_list); + +struct lustre_profile *class_get_profile(const char * prof) +{ + struct lustre_profile *lprof; + + ENTRY; + list_for_each_entry(lprof, &lustre_profile_list, lp_list) { + if (!strcmp(lprof->lp_profile, prof)) { + RETURN(lprof); + } + } + RETURN(NULL); +} +EXPORT_SYMBOL(class_get_profile); + +/** Create a named "profile". + * This defines the mdc and osc names to use for a client. + * This also is used to define the lov to be used by a mdt. 
+ */ +int class_add_profile(int proflen, char *prof, int osclen, char *osc, + int mdclen, char *mdc) +{ + struct lustre_profile *lprof; + int err = 0; + ENTRY; + + CDEBUG(D_CONFIG, "Add profile %s\n", prof); + + OBD_ALLOC(lprof, sizeof(*lprof)); + if (lprof == NULL) + RETURN(-ENOMEM); + INIT_LIST_HEAD(&lprof->lp_list); + + LASSERT(proflen == (strlen(prof) + 1)); + OBD_ALLOC(lprof->lp_profile, proflen); + if (lprof->lp_profile == NULL) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_profile, prof, proflen); + + LASSERT(osclen == (strlen(osc) + 1)); + OBD_ALLOC(lprof->lp_dt, osclen); + if (lprof->lp_dt == NULL) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_dt, osc, osclen); + + if (mdclen > 0) { + LASSERT(mdclen == (strlen(mdc) + 1)); + OBD_ALLOC(lprof->lp_md, mdclen); + if (lprof->lp_md == NULL) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_md, mdc, mdclen); + } + + list_add(&lprof->lp_list, &lustre_profile_list); + RETURN(err); + +out: + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, mdclen); + if (lprof->lp_dt) + OBD_FREE(lprof->lp_dt, osclen); + if (lprof->lp_profile) + OBD_FREE(lprof->lp_profile, proflen); + OBD_FREE(lprof, sizeof(*lprof)); + RETURN(err); +} + +void class_del_profile(const char *prof) +{ + struct lustre_profile *lprof; + ENTRY; + + CDEBUG(D_CONFIG, "Del profile %s\n", prof); + + lprof = class_get_profile(prof); + if (lprof) { + list_del(&lprof->lp_list); + OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1); + OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1); + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1); + OBD_FREE(lprof, sizeof *lprof); + } + EXIT; +} +EXPORT_SYMBOL(class_del_profile); + +/* COMPAT_146 */ +void class_del_profiles(void) +{ + struct lustre_profile *lprof, *n; + ENTRY; + + list_for_each_entry_safe(lprof, n, &lustre_profile_list, lp_list) { + list_del(&lprof->lp_list); + OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1); + OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1); + if (lprof->lp_md) + 
OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1); + OBD_FREE(lprof, sizeof *lprof); + } + EXIT; +} +EXPORT_SYMBOL(class_del_profiles); + +static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg) +{ + ENTRY; + if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0) + at_min = val; + else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0) + at_max = val; + else if (class_match_param(ptr, PARAM_AT_EXTRA, NULL) == 0) + at_extra = val; + else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0) + at_early_margin = val; + else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0) + at_history = val; + else if (class_match_param(ptr, PARAM_JOBID_VAR, NULL) == 0) + strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2), + JOBSTATS_JOBID_VAR_MAX_LEN + 1); + else + RETURN(-EINVAL); + + CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val); + RETURN(0); +} + + +/* We can't call ll_process_config or lquota_process_config directly because + * it lives in a module that must be loaded after this one. */ +static int (*client_process_config)(struct lustre_cfg *lcfg) = NULL; +static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL; + +void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)) +{ + client_process_config = cpc; +} +EXPORT_SYMBOL(lustre_register_client_process_config); + +/** + * Rename the proc parameter in \a cfg with a new name \a new_name. 
+ * + * \param cfg config structure which contains the proc parameter + * \param new_name new name of the proc parameter + * + * \retval valid-pointer pointer to the newly-allocated config structure + * which contains the renamed proc parameter + * \retval ERR_PTR(-EINVAL) if \a cfg or \a new_name is NULL, or \a cfg does + * not contain a proc parameter + * \retval ERR_PTR(-ENOMEM) if memory allocation failure occurs + */ +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name) +{ + struct lustre_cfg_bufs *bufs = NULL; + struct lustre_cfg *new_cfg = NULL; + char *param = NULL; + char *new_param = NULL; + char *value = NULL; + int name_len = 0; + int new_len = 0; + ENTRY; + + if (cfg == NULL || new_name == NULL) + RETURN(ERR_PTR(-EINVAL)); + + param = lustre_cfg_string(cfg, 1); + if (param == NULL) + RETURN(ERR_PTR(-EINVAL)); + + value = strchr(param, '='); + if (value == NULL) + name_len = strlen(param); + else + name_len = value - param; + + new_len = LUSTRE_CFG_BUFLEN(cfg, 1) + strlen(new_name) - name_len; + + OBD_ALLOC(new_param, new_len); + if (new_param == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + strcpy(new_param, new_name); + if (value != NULL) + strcat(new_param, value); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) { + OBD_FREE(new_param, new_len); + RETURN(ERR_PTR(-ENOMEM)); + } + + lustre_cfg_bufs_reset(bufs, NULL); + lustre_cfg_bufs_init(bufs, cfg); + lustre_cfg_bufs_set_string(bufs, 1, new_param); + + new_cfg = lustre_cfg_new(cfg->lcfg_command, bufs); + + OBD_FREE(new_param, new_len); + OBD_FREE_PTR(bufs); + if (new_cfg == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + new_cfg->lcfg_num = cfg->lcfg_num; + new_cfg->lcfg_flags = cfg->lcfg_flags; + new_cfg->lcfg_nid = cfg->lcfg_nid; + new_cfg->lcfg_nal = cfg->lcfg_nal; + + RETURN(new_cfg); +} +EXPORT_SYMBOL(lustre_cfg_rename); + +void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg)) +{ + quota_process_config = qpc; +} 
+EXPORT_SYMBOL(lustre_register_quota_process_config); + +/** Process configuration commands given in lustre_cfg form. + * These may come from direct calls (e.g. class_manual_cleanup) + * or processing the config llog, or ioctl from lctl. + */ +int class_process_config(struct lustre_cfg *lcfg) +{ + struct obd_device *obd; + int err; + + LASSERT(lcfg && !IS_ERR(lcfg)); + CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command); + + /* Commands that don't need a device */ + switch(lcfg->lcfg_command) { + case LCFG_ATTACH: { + err = class_attach(lcfg); + GOTO(out, err); + } + case LCFG_ADD_UUID: { + CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid "LPX64 + " (%s)\n", lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid, libcfs_nid2str(lcfg->lcfg_nid)); + + err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid); + GOTO(out, err); + } + case LCFG_DEL_UUID: { + CDEBUG(D_IOCTL, "removing mappings for uuid %s\n", + (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0) + ? "<all uuids>" : lustre_cfg_string(lcfg, 1)); + + err = class_del_uuid(lustre_cfg_string(lcfg, 1)); + GOTO(out, err); + } + case LCFG_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n", + lustre_cfg_string(lcfg, 1), + lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + /* set these mount options somewhere, so ll_fill_super + * can find them. 
*/ + err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1), + lustre_cfg_string(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 2), + lustre_cfg_string(lcfg, 2), + LUSTRE_CFG_BUFLEN(lcfg, 3), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err); + } + case LCFG_DEL_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s\n", + lustre_cfg_string(lcfg, 1)); + class_del_profile(lustre_cfg_string(lcfg, 1)); + GOTO(out, err = 0); + } + case LCFG_SET_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n", + obd_timeout, lcfg->lcfg_num); + obd_timeout = max(lcfg->lcfg_num, 1U); + obd_timeout_set = 1; + GOTO(out, err = 0); + } + case LCFG_SET_LDLM_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre ldlm_timeout from %d to %d\n", + ldlm_timeout, lcfg->lcfg_num); + ldlm_timeout = max(lcfg->lcfg_num, 1U); + if (ldlm_timeout >= obd_timeout) + ldlm_timeout = max(obd_timeout / 3, 1U); + ldlm_timeout_set = 1; + GOTO(out, err = 0); + } + case LCFG_SET_UPCALL: { + LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n"); + /* COMPAT_146 Don't fail on old configs */ + GOTO(out, err = 0); + } + case LCFG_MARKER: { + struct cfg_marker *marker; + marker = lustre_cfg_buf(lcfg, 1); + CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step, + marker->cm_flags, marker->cm_tgtname, marker->cm_comment); + GOTO(out, err = 0); + } + case LCFG_PARAM: { + char *tmp; + /* llite has no obd */ + if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_LLITE, 0) == 0) && + client_process_config) { + err = (*client_process_config)(lcfg); + GOTO(out, err); + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_SYS, &tmp) == 0)) { + /* Global param settings */ + err = class_set_global(tmp, lcfg->lcfg_num, lcfg); + /* + * Client or server should not fail to mount if + * it hits an unknown configuration parameter. 
+ */ + if (err != 0) + CWARN("Ignoring unknown param %s\n", tmp); + + GOTO(out, err = 0); + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_QUOTA, &tmp) == 0) && + quota_process_config) { + err = (*quota_process_config)(lcfg); + GOTO(out, err); + } + /* Fall through */ + break; + } + } + + /* Commands that require a device */ + obd = class_name2obd(lustre_cfg_string(lcfg, 0)); + if (obd == NULL) { + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) + CERROR("this lcfg command requires a device name\n"); + else + CERROR("no device for: %s\n", + lustre_cfg_string(lcfg, 0)); + + GOTO(out, err = -EINVAL); + } + + switch(lcfg->lcfg_command) { + case LCFG_SETUP: { + err = class_setup(obd, lcfg); + GOTO(out, err); + } + case LCFG_DETACH: { + err = class_detach(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_CLEANUP: { + err = class_cleanup(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_ADD_CONN: { + err = class_add_conn(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_DEL_CONN: { + err = class_del_conn(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_POOL_NEW: { + err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + break; + } + case LCFG_POOL_ADD: { + err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + break; + } + case LCFG_POOL_REM: { + err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + break; + } + case LCFG_POOL_DEL: { + err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + break; + } + default: { + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + GOTO(out, err); + + } + } +out: + if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) { + CWARN("Ignoring error %d on optional command %#x\n", err, + lcfg->lcfg_command); + err = 0; + } + return err; +} +EXPORT_SYMBOL(class_process_config); + +int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, + struct lustre_cfg *lcfg, void *data) 
+{ + struct lprocfs_vars *var; + struct file fakefile; + struct seq_file fake_seqfile; + char *key, *sval; + int i, keylen, vallen; + int matched = 0, j = 0; + int rc = 0; + int skip = 0; + ENTRY; + + if (lcfg->lcfg_command != LCFG_PARAM) { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + RETURN(-EINVAL); + } + + /* fake a seq file so that var->fops->write can work... */ + fakefile.private_data = &fake_seqfile; + fake_seqfile.private = data; + /* e.g. tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt + or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar + or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 */ + for (i = 1; i < lcfg->lcfg_bufcount; i++) { + key = lustre_cfg_buf(lcfg, i); + /* Strip off prefix */ + class_match_param(key, prefix, &key); + sval = strchr(key, '='); + if (!sval || (*(sval + 1) == 0)) { + CERROR("Can't parse param %s (missing '=')\n", key); + /* rc = -EINVAL; continue parsing other params */ + continue; + } + keylen = sval - key; + sval++; + vallen = strlen(sval); + matched = 0; + j = 0; + /* Search proc entries */ + while (lvars[j].name) { + var = &lvars[j]; + if (class_match_param(key, (char *)var->name, 0) == 0 && + keylen == strlen(var->name)) { + matched++; + rc = -EROFS; + if (var->fops && var->fops->write) { + mm_segment_t oldfs; + oldfs = get_fs(); + set_fs(KERNEL_DS); + rc = (var->fops->write)(&fakefile, sval, + vallen, NULL); + set_fs(oldfs); + } + break; + } + j++; + } + if (!matched) { + /* If the prefix doesn't match, return error so we + can pass it down the stack */ + if (strnchr(key, keylen, '.')) + RETURN(-ENOSYS); + CERROR("%s: unknown param %s\n", + (char *)lustre_cfg_string(lcfg, 0), key); + /* rc = -EINVAL; continue parsing other params */ + skip++; + } else if (rc < 0) { + CERROR("writing proc entry %s err %d\n", + var->name, rc); + rc = 0; + } else { + CDEBUG(D_CONFIG, "%s.%.*s: Set parameter %.*s=%s\n", + lustre_cfg_string(lcfg, 0), + (int)strlen(prefix) - 1, prefix, + (int)(sval - key - 1), 
key, sval); + } + } + + if (rc > 0) + rc = 0; + if (!rc && skip) + rc = skip; + RETURN(rc); +} +EXPORT_SYMBOL(class_process_proc_param); + +extern int lustre_check_exclusion(struct super_block *sb, char *svname); + +/** Parse a configuration llog, doing various manipulations on them + * for various reasons, (modifications for compatibility, skip obsolete + * records, change uuids, etc), then class_process_config() resulting + * net records. + */ +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct config_llog_instance *clli = data; + int cfg_len = rec->lrh_len; + char *cfg_buf = (char*) (rec + 1); + int rc = 0; + ENTRY; + + //class_config_dump_handler(handle, rec, data); + + switch (rec->lrh_type) { + case OBD_CFG_REC: { + struct lustre_cfg *lcfg, *lcfg_new; + struct lustre_cfg_bufs bufs; + char *inst_name = NULL; + int inst_len = 0; + int inst = 0, swab = 0; + + lcfg = (struct lustre_cfg *)cfg_buf; + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { + lustre_swab_lustre_cfg(lcfg); + swab = 1; + } + + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); + if (rc) + GOTO(out, rc); + + /* Figure out config state info */ + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + lustre_swab_cfg_marker(marker, swab, + LUSTRE_CFG_BUFLEN(lcfg, 1)); + CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n", + clli->cfg_flags, marker->cm_flags); + if (marker->cm_flags & CM_START) { + /* all previous flags off */ + clli->cfg_flags = CFG_F_MARKER; + if (marker->cm_flags & CM_SKIP) { + clli->cfg_flags |= CFG_F_SKIP; + CDEBUG(D_CONFIG, "SKIP #%d\n", + marker->cm_step); + } else if ((marker->cm_flags & CM_EXCLUDE) || + (clli->cfg_sb && + lustre_check_exclusion(clli->cfg_sb, + marker->cm_tgtname))) { + clli->cfg_flags |= CFG_F_EXCLUDE; + CDEBUG(D_CONFIG, "EXCLUDE %d\n", + marker->cm_step); + } + } else if (marker->cm_flags & CM_END) { + clli->cfg_flags 
= 0; + } + } + /* A config command without a start marker before it is + illegal (post 146) */ + if (!(clli->cfg_flags & CFG_F_COMPAT146) && + !(clli->cfg_flags & CFG_F_MARKER) && + (lcfg->lcfg_command != LCFG_MARKER)) { + CWARN("Config not inside markers, ignoring! " + "(inst: %p, uuid: %s, flags: %#x)\n", + clli->cfg_instance, + clli->cfg_uuid.uuid, clli->cfg_flags); + clli->cfg_flags |= CFG_F_SKIP; + } + if (clli->cfg_flags & CFG_F_SKIP) { + CDEBUG(D_CONFIG, "skipping %#x\n", + clli->cfg_flags); + rc = 0; + /* No processing! */ + break; + } + + /* + * For interoperability between 1.8 and 2.0, + * rename "mds" obd device type to "mdt". + */ + { + char *typename = lustre_cfg_string(lcfg, 1); + char *index = lustre_cfg_string(lcfg, 2); + + if ((lcfg->lcfg_command == LCFG_ATTACH && typename && + strcmp(typename, "mds") == 0)) { + CWARN("For 1.8 interoperability, rename obd " + "type from mds to mdt\n"); + typename[2] = 't'; + } + if ((lcfg->lcfg_command == LCFG_SETUP && index && + strcmp(index, "type") == 0)) { + CDEBUG(D_INFO, "For 1.8 interoperability, " + "set this index to '0'\n"); + index[0] = '0'; + index[1] = 0; + } + } + + + if ((clli->cfg_flags & CFG_F_EXCLUDE) && + (lcfg->lcfg_command == LCFG_LOV_ADD_OBD)) + /* Add inactive instead */ + lcfg->lcfg_command = LCFG_LOV_ADD_INA; + + lustre_cfg_bufs_init(&bufs, lcfg); + + if (clli && clli->cfg_instance && + LUSTRE_CFG_BUFLEN(lcfg, 0) > 0){ + inst = 1; + inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + + sizeof(clli->cfg_instance) * 2 + 4; + OBD_ALLOC(inst_name, inst_len); + if (inst_name == NULL) + GOTO(out, rc = -ENOMEM); + sprintf(inst_name, "%s-%p", + lustre_cfg_string(lcfg, 0), + clli->cfg_instance); + lustre_cfg_bufs_set_string(&bufs, 0, inst_name); + CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n", + lcfg->lcfg_command, inst_name); + } + + /* we override the llog's uuid for clients, to ensure they + are unique */ + if (clli && clli->cfg_instance != NULL && + lcfg->lcfg_command == LCFG_ATTACH) { + 
lustre_cfg_bufs_set_string(&bufs, 2, + clli->cfg_uuid.uuid); + } + /* + * sptlrpc config record, we expect 2 data segments: + * [0]: fs_name/target_name, + * [1]: rule string + * moving them to index [1] and [2], and insert MGC's + * obdname at index [0]. + */ + if (clli && clli->cfg_instance == NULL && + lcfg->lcfg_command == LCFG_SPTLRPC_CONF) { + lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1], + bufs.lcfg_buflen[1]); + lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0], + bufs.lcfg_buflen[0]); + lustre_cfg_bufs_set_string(&bufs, 0, + clli->cfg_obdname); + } + + lcfg_new = lustre_cfg_new(lcfg->lcfg_command, &bufs); + + lcfg_new->lcfg_num = lcfg->lcfg_num; + lcfg_new->lcfg_flags = lcfg->lcfg_flags; + + /* XXX Hack to try to remain binary compatible with + * pre-newconfig logs */ + if (lcfg->lcfg_nal != 0 && /* pre-newconfig log? */ + (lcfg->lcfg_nid >> 32) == 0) { + __u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff); + + lcfg_new->lcfg_nid = + LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr); + CWARN("Converted pre-newconfig NAL %d NID %x to %s\n", + lcfg->lcfg_nal, addr, + libcfs_nid2str(lcfg_new->lcfg_nid)); + } else { + lcfg_new->lcfg_nid = lcfg->lcfg_nid; + } + + lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */ + + rc = class_process_config(lcfg_new); + lustre_cfg_free(lcfg_new); + + if (inst) + OBD_FREE(inst_name, inst_len); + break; + } + default: + CERROR("Unknown llog record type %#x encountered\n", + rec->lrh_type); + break; + } +out: + if (rc) { + CERROR("%s: cfg command failed: rc = %d\n", + handle->lgh_ctxt->loc_obd->obd_name, rc); + class_config_dump_handler(NULL, handle, rec, data); + } + RETURN(rc); +} +EXPORT_SYMBOL(class_config_llog_handler); + +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg) +{ + struct llog_process_cat_data cd = {0, 0}; + struct llog_handle *llh; + llog_cb_t callback; + int rc; + ENTRY; + + CDEBUG(D_INFO, "looking up llog %s\n", name); + rc = 
llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(parse_out, rc); + + /* continue processing from where we last stopped to end-of-log */ + if (cfg) { + cd.lpcd_first_idx = cfg->cfg_last_idx; + callback = cfg->cfg_callback; + LASSERT(callback != NULL); + } else { + callback = class_config_llog_handler; + } + + cd.lpcd_last_idx = 0; + + rc = llog_process(env, llh, callback, cfg, &cd); + + CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name, + cd.lpcd_first_idx + 1, cd.lpcd_last_idx, rc); + if (cfg) + cfg->cfg_last_idx = cd.lpcd_last_idx; + +parse_out: + llog_close(env, llh); + RETURN(rc); +} +EXPORT_SYMBOL(class_config_parse_llog); + +/** + * parse config record and output dump in supplied buffer. + * This is separated from class_config_dump_handler() to use + * for ioctl needs as well + */ +int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + char *ptr = buf; + char *end = buf + size; + int rc = 0; + + ENTRY; + + LASSERT(rec->lrh_type == OBD_CFG_REC); + rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len); + if (rc < 0) + RETURN(rc); + + ptr += snprintf(ptr, end-ptr, "cmd=%05x ", lcfg->lcfg_command); + if (lcfg->lcfg_flags) + ptr += snprintf(ptr, end-ptr, "flags=%#08x ", + lcfg->lcfg_flags); + + if (lcfg->lcfg_num) + ptr += snprintf(ptr, end-ptr, "num=%#08x ", lcfg->lcfg_num); + + if (lcfg->lcfg_nid) + ptr += snprintf(ptr, end-ptr, "nid=%s("LPX64")\n ", + libcfs_nid2str(lcfg->lcfg_nid), + lcfg->lcfg_nid); + + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + + ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'", + marker->cm_step, marker->cm_flags, + marker->cm_tgtname, marker->cm_comment); + } else { + int i; + + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + ptr += snprintf(ptr, end-ptr, "%d:%s ", i, + 
lustre_cfg_string(lcfg, i)); + } + } + /* return consumed bytes */ + rc = ptr - buf; + RETURN(rc); +} + +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + char *outstr; + int rc = 0; + + ENTRY; + + OBD_ALLOC(outstr, 256); + if (outstr == NULL) + RETURN(-ENOMEM); + + if (rec->lrh_type == OBD_CFG_REC) { + class_config_parse_rec(rec, outstr, 256); + LCONSOLE(D_WARNING, " %s\n", outstr); + } else { + LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type); + rc = -EINVAL; + } + + OBD_FREE(outstr, 256); + RETURN(rc); +} + +int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg) +{ + struct llog_handle *llh; + int rc; + + ENTRY; + + LCONSOLE_INFO("Dumping config log %s\n", name); + + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(parse_out, rc); + + rc = llog_process(env, llh, class_config_dump_handler, cfg, NULL); +parse_out: + llog_close(env, llh); + + LCONSOLE_INFO("End config log %s\n", name); + RETURN(rc); +} +EXPORT_SYMBOL(class_config_dump_llog); + +/** Call class_cleanup and class_detach. + * "Manual" only in the sense that we're faking lcfg commands. 
+ */ +int class_manual_cleanup(struct obd_device *obd) +{ + char flags[3] = ""; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + int rc; + ENTRY; + + if (!obd) { + CERROR("empty cleanup\n"); + RETURN(-EALREADY); + } + + if (obd->obd_force) + strcat(flags, "F"); + if (obd->obd_fail) + strcat(flags, "A"); + + CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n", + obd->obd_name, flags); + + lustre_cfg_bufs_reset(&bufs, obd->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, flags); + lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); + if (!lcfg) + RETURN(-ENOMEM); + + rc = class_process_config(lcfg); + if (rc) { + CERROR("cleanup failed %d: %s\n", rc, obd->obd_name); + GOTO(out, rc); + } + + /* the lcfg is almost the same for both ops */ + lcfg->lcfg_command = LCFG_DETACH; + rc = class_process_config(lcfg); + if (rc) + CERROR("detach failed %d: %s\n", rc, obd->obd_name); +out: + lustre_cfg_free(lcfg); + RETURN(rc); +} +EXPORT_SYMBOL(class_manual_cleanup); + +/* + * uuid<->export lustre hash operations + */ + +static unsigned +uuid_hash(cfs_hash_t *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(((struct obd_uuid *)key)->uuid, + sizeof(((struct obd_uuid *)key)->uuid), mask); +} + +static void * +uuid_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + + return &exp->exp_client_uuid; +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +uuid_keycmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + + return obd_uuid_equals(key, &exp->exp_client_uuid) && + !exp->exp_failed; +} + +static void * +uuid_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_uuid_hash); +} + +static void +uuid_export_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + 
+ exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + class_export_get(exp); +} + +static void +uuid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + class_export_put(exp); +} + +static cfs_hash_ops_t uuid_hash_ops = { + .hs_hash = uuid_hash, + .hs_key = uuid_key, + .hs_keycmp = uuid_keycmp, + .hs_object = uuid_export_object, + .hs_get = uuid_export_get, + .hs_put_locked = uuid_export_put_locked, +}; + + +/* + * nid<->export hash operations + */ + +static unsigned +nid_hash(cfs_hash_t *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask); +} + +static void * +nid_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + + RETURN(&exp->exp_connection->c_peer.nid); +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +nid_kepcmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + + RETURN(exp->exp_connection->c_peer.nid == *(lnet_nid_t *)key && + !exp->exp_failed); +} + +static void * +nid_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_nid_hash); +} + +static void +nid_export_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + class_export_get(exp); +} + +static void +nid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + class_export_put(exp); +} + +static cfs_hash_ops_t nid_hash_ops = { + .hs_hash = nid_hash, + .hs_key = nid_key, + .hs_keycmp = nid_kepcmp, + .hs_object = nid_export_object, + .hs_get = nid_export_get, + .hs_put_locked = 
nid_export_put_locked, +}; + + +/* + * nid<->nidstats hash operations + */ + +static void * +nidstats_key(struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + + return &ns->nid; +} + +static int +nidstats_keycmp(const void *key, struct hlist_node *hnode) +{ + return *(lnet_nid_t *)nidstats_key(hnode) == *(lnet_nid_t *)key; +} + +static void * +nidstats_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nid_stat, nid_hash); +} + +static void +nidstats_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_getref(ns); +} + +static void +nidstats_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_putref(ns); +} + +static cfs_hash_ops_t nid_stat_hash_ops = { + .hs_hash = nid_hash, + .hs_key = nidstats_key, + .hs_keycmp = nidstats_keycmp, + .hs_object = nidstats_object, + .hs_get = nidstats_get, + .hs_put_locked = nidstats_put_locked, +}; diff --git a/drivers/staging/lustre/lustre/obdclass/obd_mount.c b/drivers/staging/lustre/lustre/obdclass/obd_mount.c new file mode 100644 index 000000000000..99adad9793c5 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/obd_mount.c @@ -0,0 +1,1321 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obd_mount.c + * + * Client mount routines + * + * Author: Nathan Rutman <nathan@clusterfs.com> + */ + + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */) +#define PRINT_CMD CDEBUG + +#include <obd.h> +#include <lvfs.h> +#include <lustre_fsfilt.h> +#include <obd_class.h> +#include <lustre/lustre_user.h> +#include <linux/version.h> +#include <lustre_log.h> +#include <lustre_disk.h> +#include <lustre_param.h> + +static int (*client_fill_super)(struct super_block *sb, + struct vfsmount *mnt); + +static void (*kill_super_cb)(struct super_block *sb); + +/**************** config llog ********************/ + +/** Get a config log from the MGS and process it. + * This func is called for both clients and servers. + * Continue to process new statements appended to the logs + * (whenever the config lock is revoked) until lustre_end_log + * is called. + * @param sb The superblock is used by the MGC to write to the local copy of + * the config log + * @param logname The name of the llog to replicate from the MGS + * @param cfg Since the same mgc may be used to follow multiple config logs + * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for + * this log, and is added to the mgc's list of logs to follow. 
+ */ +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs *bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + ENTRY; + + LASSERT(mgc); + LASSERT(cfg); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + RETURN(-ENOMEM); + + /* mgc_process_config */ + lustre_cfg_bufs_reset(bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(bufs, 1, logname); + lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg)); + lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb)); + lcfg = lustre_cfg_new(LCFG_LOG_START, bufs); + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + lustre_cfg_free(lcfg); + + OBD_FREE_PTR(bufs); + + if (rc == -EINVAL) + LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s'" + "failed from the MGS (%d). Make sure this " + "client and the MGS are running compatible " + "versions of Lustre.\n", + mgc->obd_name, logname, rc); + + if (rc) + LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' " + "failed (%d). This may be the result of " + "communication errors between this node and " + "the MGS, a bad configuration, or other " + "errors. 
See the syslog for more " + "information.\n", mgc->obd_name, logname, + rc); + + /* class_obd_list(); */ + RETURN(rc); +} +EXPORT_SYMBOL(lustre_process_log); + +/* Stop watching this config log for updates */ +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + ENTRY; + + if (!mgc) + RETURN(-ENOENT); + + /* mgc_process_config */ + lustre_cfg_bufs_reset(&bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, logname); + if (cfg) + lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg)); + lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs); + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + lustre_cfg_free(lcfg); + RETURN(rc); +} +EXPORT_SYMBOL(lustre_end_log); + +/**************** obd start *******************/ + +/** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from + * lctl (and do for echo cli/srv. + */ +int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, + char *s1, char *s2, char *s3, char *s4) +{ + struct lustre_cfg_bufs bufs; + struct lustre_cfg * lcfg = NULL; + int rc; + + CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname, + cmd, s1, s2, s3, s4); + + lustre_cfg_bufs_reset(&bufs, cfgname); + if (s1) + lustre_cfg_bufs_set_string(&bufs, 1, s1); + if (s2) + lustre_cfg_bufs_set_string(&bufs, 2, s2); + if (s3) + lustre_cfg_bufs_set_string(&bufs, 3, s3); + if (s4) + lustre_cfg_bufs_set_string(&bufs, 4, s4); + + lcfg = lustre_cfg_new(cmd, &bufs); + lcfg->lcfg_nid = nid; + rc = class_process_config(lcfg); + lustre_cfg_free(lcfg); + return(rc); +} +EXPORT_SYMBOL(do_lcfg); + +/** Call class_attach and class_setup. These methods in turn call + * obd type-specific methods. 
+ */ +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4) +{ + int rc; + CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type); + + rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, 0, 0); + if (rc) { + CERROR("%s attach error %d\n", obdname, rc); + return rc; + } + rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4); + if (rc) { + CERROR("%s setup error %d\n", obdname, rc); + do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0); + } + return rc; +} + +DEFINE_MUTEX(mgc_start_lock); + +/** Set up a mgc obd to process startup logs + * + * \param sb [in] super block of the mgc obd + * + * \retval 0 success, otherwise error code + */ +int lustre_start_mgc(struct super_block *sb) +{ + struct obd_connect_data *data = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + struct obd_export *exp; + struct obd_uuid *uuid; + class_uuid_t uuidc; + lnet_nid_t nid; + char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL; + char *ptr; + int recov_bk; + int rc = 0, i = 0, j, len; + ENTRY; + + LASSERT(lsi->lsi_lmd); + + /* Find the first non-lo MGS nid for our MGC name */ + if (IS_SERVER(lsi)) { + /* mount -o mgsnode=nid */ + ptr = lsi->lsi_lmd->lmd_mgs; + if (lsi->lsi_lmd->lmd_mgs && + (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) { + i++; + } else if (IS_MGS(lsi)) { + lnet_process_id_t id; + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + continue; + nid = id.nid; + i++; + break; + } + } + } else { /* client */ + /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + if (class_parse_nid(ptr, &nid, &ptr) == 0) + i++; + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + RETURN(-EINVAL); + } + + mutex_lock(&mgc_start_lock); + + len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1; + OBD_ALLOC(mgcname, len); + OBD_ALLOC(niduuid, len + 2); + if (!mgcname || !niduuid) + GOTO(out_free, 
rc = -ENOMEM); + sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid)); + + mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : ""; + + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out_free, rc = -ENOMEM); + + obd = class_name2obd(mgcname); + if (obd && !obd->obd_stopping) { + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + + /* Re-using an existing MGC */ + atomic_inc(&obd->u.cli.cl_mgc_refcount); + + /* IR compatibility check, only for clients */ + if (lmd_is_client(lsi->lsi_lmd)) { + int has_ir; + int vallen = sizeof(*data); + __u32 *flags = &lsi->lsi_lmd->lmd_flags; + + rc = obd_get_info(NULL, obd->obd_self_export, + strlen(KEY_CONN_DATA), KEY_CONN_DATA, + &vallen, data, NULL); + LASSERT(rc == 0); + has_ir = OCD_HAS_FLAG(data, IMP_RECOV); + if (has_ir ^ !(*flags & LMD_FLG_NOIR)) { + /* LMD_FLG_NOIR is for test purpose only */ + LCONSOLE_WARN( + "Trying to mount a client with IR setting " + "not compatible with current mgc. " + "Force to use current mgc setting that is " + "IR %s.\n", + has_ir ? "enabled" : "disabled"); + if (has_ir) + *flags &= ~LMD_FLG_NOIR; + else + *flags |= LMD_FLG_NOIR; + } + } + + recov_bk = 0; + /* If we are restarting the MGS, don't try to keep the MGC's + old connection, or registration will fail. */ + if (IS_MGS(lsi)) { + CDEBUG(D_MOUNT, "New MGS with live MGC\n"); + recov_bk = 1; + } + + /* Try all connections, but only once (again). + We don't want to block another target from starting + (using its local copy of the log), but we do want to connect + if at all possible. 
*/ + recov_bk++; + CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk); + rc = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_INIT_RECOV_BACKUP), + KEY_INIT_RECOV_BACKUP, + sizeof(recov_bk), &recov_bk, NULL); + GOTO(out, rc = 0); + } + + CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname); + + /* Add the primary nids for the MGS */ + i = 0; + sprintf(niduuid, "%s_%x", mgcname, i); + if (IS_SERVER(lsi)) { + ptr = lsi->lsi_lmd->lmd_mgs; + if (IS_MGS(lsi)) { + /* Use local nids (including LO) */ + lnet_process_id_t id; + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + rc = do_lcfg(mgcname, id.nid, + LCFG_ADD_UUID, niduuid, 0,0,0); + } + } else { + /* Use mgsnode= nids */ + /* mount -o mgsnode=nid */ + if (lsi->lsi_lmd->lmd_mgs) { + ptr = lsi->lsi_lmd->lmd_mgs; + } else if (class_find_param(ptr, PARAM_MGSNODE, + &ptr) != 0) { + CERROR("No MGS nids given.\n"); + GOTO(out_free, rc = -EINVAL); + } + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, 0,0,0); + i++; + } + } + } else { /* client */ + /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, 0,0,0); + i++; + /* Stop at the first failover nid */ + if (*ptr == ':') + break; + } + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + GOTO(out_free, rc = -EINVAL); + } + lsi->lsi_lmd->lmd_mgs_failnodes = 1; + + /* Random uuid for MGC allows easier reconnects */ + OBD_ALLOC_PTR(uuid); + ll_generate_random_uuid(uuidc); + class_uuid_unparse(uuidc, uuid); + + /* Start the MGC */ + rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, + (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, + niduuid, 0, 0); + OBD_FREE_PTR(uuid); + if (rc) + GOTO(out_free, rc); + + /* Add any failover MGS nids */ + i = 1; + while (ptr && ((*ptr == ':' || + class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) { + /* New failover node */ + 
sprintf(niduuid, "%s_%x", mgcname, i); + j = 0; + while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) { + j++; + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, 0,0,0); + if (*ptr == ':') + break; + } + if (j > 0) { + rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN, + niduuid, 0, 0, 0); + i++; + } else { + /* at ":/fsname" */ + break; + } + } + lsi->lsi_lmd->lmd_mgs_failnodes = i; + + obd = class_name2obd(mgcname); + if (!obd) { + CERROR("Can't find mgcobd %s\n", mgcname); + GOTO(out_free, rc = -ENOTCONN); + } + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + + /* Keep a refcount of servers/clients who started with "mount", + so we know when we can get rid of the mgc. */ + atomic_set(&obd->u.cli.cl_mgc_refcount, 1); + + /* Try all connections, but only once. */ + recov_bk = 1; + rc = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_INIT_RECOV_BACKUP), + KEY_INIT_RECOV_BACKUP, + sizeof(recov_bk), &recov_bk, NULL); + if (rc) + /* nonfatal */ + CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc); + + /* We connect to the MGS at setup, and don't disconnect until cleanup */ + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT | + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | + OBD_CONNECT_LVB_TYPE; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0) + data->ocd_connect_flags |= OBD_CONNECT_MNE_SWAB; +#else +#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab" +#endif + + if (lmd_is_client(lsi->lsi_lmd) && + lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) + data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV; + data->ocd_version = LUSTRE_VERSION_CODE; + rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL); + if (rc) { + CERROR("connect failed %d\n", rc); + GOTO(out, rc); + } + + obd->u.cli.cl_mgc_mgsexp = exp; + +out: + /* Keep the mgc info in the sb. 
Note that many lsi's can point + to the same mgc.*/ + lsi->lsi_mgc = obd; +out_free: + mutex_unlock(&mgc_start_lock); + + if (data) + OBD_FREE_PTR(data); + if (mgcname) + OBD_FREE(mgcname, len); + if (niduuid) + OBD_FREE(niduuid, len + 2); + RETURN(rc); +} + +static int lustre_stop_mgc(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + char *niduuid = 0, *ptr = 0; + int i, rc = 0, len = 0; + ENTRY; + + if (!lsi) + RETURN(-ENOENT); + obd = lsi->lsi_mgc; + if (!obd) + RETURN(-ENOENT); + lsi->lsi_mgc = NULL; + + mutex_lock(&mgc_start_lock); + LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0); + if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) { + /* This is not fatal, every client that stops + will call in here. */ + CDEBUG(D_MOUNT, "mgc still has %d references.\n", + atomic_read(&obd->u.cli.cl_mgc_refcount)); + GOTO(out, rc = -EBUSY); + } + + /* The MGC has no recoverable data in any case. + * force shotdown set in umount_begin */ + obd->obd_no_recov = 1; + + if (obd->u.cli.cl_mgc_mgsexp) { + /* An error is not fatal, if we are unable to send the + disconnect mgs ping evictor cleans up the export */ + rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp); + if (rc) + CDEBUG(D_MOUNT, "disconnect failed %d\n", rc); + } + + /* Save the obdname for cleaning the nid uuids, which are + obdname_XX */ + len = strlen(obd->obd_name) + 6; + OBD_ALLOC(niduuid, len); + if (niduuid) { + strcpy(niduuid, obd->obd_name); + ptr = niduuid + strlen(niduuid); + } + + rc = class_manual_cleanup(obd); + if (rc) + GOTO(out, rc); + + /* Clean the nid uuids */ + if (!niduuid) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) { + sprintf(ptr, "_%x", i); + rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID, + niduuid, 0, 0, 0); + if (rc) + CERROR("del MDC UUID %s failed: rc = %d\n", + niduuid, rc); + } +out: + if (niduuid) + OBD_FREE(niduuid, len); + + /* class_import_put will get rid of the additional connections 
*/ + mutex_unlock(&mgc_start_lock); + RETURN(rc); +} + +/***************** lustre superblock **************/ + +struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi; + ENTRY; + + OBD_ALLOC_PTR(lsi); + if (!lsi) + RETURN(NULL); + OBD_ALLOC_PTR(lsi->lsi_lmd); + if (!lsi->lsi_lmd) { + OBD_FREE_PTR(lsi); + RETURN(NULL); + } + + lsi->lsi_lmd->lmd_exclude_count = 0; + lsi->lsi_lmd->lmd_recovery_time_soft = 0; + lsi->lsi_lmd->lmd_recovery_time_hard = 0; + s2lsi_nocast(sb) = lsi; + /* we take 1 extra ref for our setup */ + atomic_set(&lsi->lsi_mounts, 1); + + /* Default umount style */ + lsi->lsi_flags = LSI_UMOUNT_FAILOVER; + + RETURN(lsi); +} + +static int lustre_free_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + LASSERT(lsi != NULL); + CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi); + + /* someone didn't call server_put_mount. */ + LASSERT(atomic_read(&lsi->lsi_mounts) == 0); + + if (lsi->lsi_lmd != NULL) { + if (lsi->lsi_lmd->lmd_dev != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_dev, + strlen(lsi->lsi_lmd->lmd_dev) + 1); + if (lsi->lsi_lmd->lmd_profile != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_profile, + strlen(lsi->lsi_lmd->lmd_profile) + 1); + if (lsi->lsi_lmd->lmd_mgssec != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgssec, + strlen(lsi->lsi_lmd->lmd_mgssec) + 1); + if (lsi->lsi_lmd->lmd_opts != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_opts, + strlen(lsi->lsi_lmd->lmd_opts) + 1); + if (lsi->lsi_lmd->lmd_exclude_count) + OBD_FREE(lsi->lsi_lmd->lmd_exclude, + sizeof(lsi->lsi_lmd->lmd_exclude[0]) * + lsi->lsi_lmd->lmd_exclude_count); + if (lsi->lsi_lmd->lmd_mgs != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgs, + strlen(lsi->lsi_lmd->lmd_mgs) + 1); + if (lsi->lsi_lmd->lmd_osd_type != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_osd_type, + strlen(lsi->lsi_lmd->lmd_osd_type) + 1); + if (lsi->lsi_lmd->lmd_params != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_params, 4096); + + OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd)); + } + + 
LASSERT(lsi->lsi_llsbi == NULL); + OBD_FREE(lsi, sizeof(*lsi)); + s2lsi_nocast(sb) = NULL; + + RETURN(0); +} + +/* The lsi has one reference for every server that is using the disk - + e.g. MDT, MGS, and potentially MGC */ +int lustre_put_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + LASSERT(lsi != NULL); + + CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts)); + if (atomic_dec_and_test(&lsi->lsi_mounts)) { + if (IS_SERVER(lsi) && lsi->lsi_osd_exp) { + obd_disconnect(lsi->lsi_osd_exp); + /* wait till OSD is gone */ + obd_zombie_barrier(); + } + lustre_free_lsi(sb); + RETURN(1); + } + RETURN(0); +} + +/** Get the fsname ("lustre") from the server name ("lustre-OST003F"). + * @param [in] svname server name including type and index + * @param [out] fsname Buffer to copy filesystem name prefix into. + * Must have at least 'strlen(fsname) + 1' chars. + * @param [out] endptr if endptr isn't NULL it is set to end of fsname + * rc < 0 on error + */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr) +{ + const char *dash = strrchr(svname, '-'); + if (!dash) { + dash = strrchr(svname, ':'); + if (!dash) + return -EINVAL; + } + + /* interpret <fsname>-MDTXXXXX-mdc as mdt, the better way is to pass + * in the fsname, then determine the server index */ + if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) { + dash--; + for (; dash > svname && *dash != '-' && *dash != ':'; dash--) + ; + if (dash == svname) + return -EINVAL; + } + + if (fsname != NULL) { + strncpy(fsname, svname, dash - svname); + fsname[dash - svname] = '\0'; + } + + if (endptr != NULL) + *endptr = dash; + + return 0; +} +EXPORT_SYMBOL(server_name2fsname); + +/** + * Get service name (svname) from string + * rc < 0 on error + * if endptr isn't NULL it is set to end of fsname * + */ +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize) +{ + int rc; + const const char *dash; + + /* We use 
server_name2fsname() just for parsing */ + rc = server_name2fsname(label, NULL, &dash); + if (rc != 0) + return rc; + + if (*dash != '-') + return -1; + + if (strlcpy(svname, dash + 1, svsize) >= svsize) + return -E2BIG; + + return 0; +} +EXPORT_SYMBOL(server_name2svname); + + +/* Get the index from the obd name. + rc = server type, or + rc < 0 on error + if endptr isn't NULL it is set to end of name */ +int server_name2index(const char *svname, __u32 *idx, const char **endptr) +{ + unsigned long index; + int rc; + const char *dash; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(svname, NULL, &dash); + if (rc != 0) + return rc; + + if (*dash != '-') + return -EINVAL; + + dash++; + + if (strncmp(dash, "MDT", 3) == 0) + rc = LDD_F_SV_TYPE_MDT; + else if (strncmp(dash, "OST", 3) == 0) + rc = LDD_F_SV_TYPE_OST; + else + return -EINVAL; + + dash += 3; + + if (strcmp(dash, "all") == 0) + return rc | LDD_F_SV_ALL; + + index = simple_strtoul(dash, (char **)endptr, 16); + *idx = index; + + return rc; +} +EXPORT_SYMBOL(server_name2index); + +/*************** mount common betweeen server and client ***************/ + +/* Common umount */ +int lustre_common_put_super(struct super_block *sb) +{ + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "dropping sb %p\n", sb); + + /* Drop a ref to the MGC */ + rc = lustre_stop_mgc(sb); + if (rc && (rc != -ENOENT)) { + if (rc != -EBUSY) { + CERROR("Can't stop MGC: %d\n", rc); + RETURN(rc); + } + /* BUSY just means that there's some other obd that + needs the mgc. Let him clean it up. 
*/ + CDEBUG(D_MOUNT, "MGC still in use\n"); + } + /* Drop a ref to the mounted disk */ + lustre_put_lsi(sb); + lu_types_stop(); + RETURN(rc); +} +EXPORT_SYMBOL(lustre_common_put_super); + +static void lmd_print(struct lustre_mount_data *lmd) +{ + int i; + + PRINT_CMD(D_MOUNT, " mount data:\n"); + if (lmd_is_client(lmd)) + PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile); + PRINT_CMD(D_MOUNT, "device: %s\n", lmd->lmd_dev); + PRINT_CMD(D_MOUNT, "flags: %x\n", lmd->lmd_flags); + + if (lmd->lmd_opts) + PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts); + + if (lmd->lmd_recovery_time_soft) + PRINT_CMD(D_MOUNT, "recovery time soft: %d\n", + lmd->lmd_recovery_time_soft); + + if (lmd->lmd_recovery_time_hard) + PRINT_CMD(D_MOUNT, "recovery time hard: %d\n", + lmd->lmd_recovery_time_hard); + + for (i = 0; i < lmd->lmd_exclude_count; i++) { + PRINT_CMD(D_MOUNT, "exclude %d: OST%04x\n", i, + lmd->lmd_exclude[i]); + } +} + +/* Is this server on the exclusion list */ +int lustre_check_exclusion(struct super_block *sb, char *svname) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct lustre_mount_data *lmd = lsi->lsi_lmd; + __u32 index; + int i, rc; + ENTRY; + + rc = server_name2index(svname, &index, NULL); + if (rc != LDD_F_SV_TYPE_OST) + /* Only exclude OSTs */ + RETURN(0); + + CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname, + index, lmd->lmd_exclude_count, lmd->lmd_dev); + + for(i = 0; i < lmd->lmd_exclude_count; i++) { + if (index == lmd->lmd_exclude[i]) { + CWARN("Excluding %s (on exclusion list)\n", svname); + RETURN(1); + } + } + RETURN(0); +} + +/* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */ +static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr) +{ + const char *s1 = ptr, *s2; + __u32 index, *exclude_list; + int rc = 0, devmax; + ENTRY; + + /* The shortest an ost name can be is 8 chars: -OST0000. + We don't actually know the fsname at this time, so in fact + a user could specify any fsname. 
*/ + devmax = strlen(ptr) / 8 + 1; + + /* temp storage until we figure out how many we have */ + OBD_ALLOC(exclude_list, sizeof(index) * devmax); + if (!exclude_list) + RETURN(-ENOMEM); + + /* we enter this fn pointing at the '=' */ + while (*s1 && *s1 != ' ' && *s1 != ',') { + s1++; + rc = server_name2index(s1, &index, &s2); + if (rc < 0) { + CERROR("Can't parse server name '%s'\n", s1); + break; + } + if (rc == LDD_F_SV_TYPE_OST) + exclude_list[lmd->lmd_exclude_count++] = index; + else + CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1); + s1 = s2; + /* now we are pointing at ':' (next exclude) + or ',' (end of excludes) */ + if (lmd->lmd_exclude_count >= devmax) + break; + } + if (rc >= 0) /* non-err */ + rc = 0; + + if (lmd->lmd_exclude_count) { + /* permanent, freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_exclude, sizeof(index) * + lmd->lmd_exclude_count); + if (lmd->lmd_exclude) { + memcpy(lmd->lmd_exclude, exclude_list, + sizeof(index) * lmd->lmd_exclude_count); + } else { + rc = -ENOMEM; + lmd->lmd_exclude_count = 0; + } + } + OBD_FREE(exclude_list, sizeof(index) * devmax); + RETURN(rc); +} + +static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr) +{ + char *tail; + int length; + + if (lmd->lmd_mgssec != NULL) { + OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1); + lmd->lmd_mgssec = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(lmd->lmd_mgssec, length + 1); + if (lmd->lmd_mgssec == NULL) + return -ENOMEM; + + memcpy(lmd->lmd_mgssec, ptr, length); + lmd->lmd_mgssec[length] = '\0'; + return 0; +} + +static int lmd_parse_string(char **handle, char *ptr) +{ + char *tail; + int length; + + if ((handle == NULL) || (ptr == NULL)) + return -EINVAL; + + if (*handle != NULL) { + OBD_FREE(*handle, strlen(*handle) + 1); + *handle = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(*handle, 
length + 1); + if (*handle == NULL) + return -ENOMEM; + + memcpy(*handle, ptr, length); + (*handle)[length] = '\0'; + + return 0; +} + +/* Collect multiple values for mgsnid specifiers */ +static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr) +{ + lnet_nid_t nid; + char *tail = *ptr; + char *mgsnid; + int length; + int oldlen = 0; + + /* Find end of nidlist */ + while (class_parse_nid_quiet(tail, &nid, &tail) == 0) {} + length = tail - *ptr; + if (length == 0) { + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr); + return -EINVAL; + } + + if (lmd->lmd_mgs != NULL) + oldlen = strlen(lmd->lmd_mgs) + 1; + + OBD_ALLOC(mgsnid, oldlen + length + 1); + if (mgsnid == NULL) + return -ENOMEM; + + if (lmd->lmd_mgs != NULL) { + /* Multiple mgsnid= are taken to mean failover locations */ + memcpy(mgsnid, lmd->lmd_mgs, oldlen); + mgsnid[oldlen - 1] = ':'; + OBD_FREE(lmd->lmd_mgs, oldlen); + } + memcpy(mgsnid + oldlen, *ptr, length); + mgsnid[oldlen + length] = '\0'; + lmd->lmd_mgs = mgsnid; + *ptr = tail; + + return 0; +} + +/** Parse mount line options + * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre + * dev is passed as device=uml1:/lustre by mount.lustre + */ +static int lmd_parse(char *options, struct lustre_mount_data *lmd) +{ + char *s1, *s2, *devname = NULL; + struct lustre_mount_data *raw = (struct lustre_mount_data *)options; + int rc = 0; + ENTRY; + + LASSERT(lmd); + if (!options) { + LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that " + "/sbin/mount.lustre is installed.\n"); + RETURN(-EINVAL); + } + + /* Options should be a string - try to detect old lmd data */ + if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) { + LCONSOLE_ERROR_MSG(0x163, "You're using an old version of " + "/sbin/mount.lustre. 
Please install " + "version %s\n", LUSTRE_VERSION_STRING); + RETURN(-EINVAL); + } + lmd->lmd_magic = LMD_MAGIC; + + OBD_ALLOC(lmd->lmd_params, 4096); + if (lmd->lmd_params == NULL) + RETURN(-ENOMEM); + lmd->lmd_params[0] = '\0'; + + /* Set default flags here */ + + s1 = options; + while (*s1) { + int clear = 0; + int time_min = OBD_RECOVERY_TIME_MIN; + + /* Skip whitespace and extra commas */ + while (*s1 == ' ' || *s1 == ',') + s1++; + + /* Client options are parsed in ll_options: eg. flock, + user_xattr, acl */ + + /* Parse non-ldiskfs options here. Rather than modifying + ldiskfs, we just zero these out here */ + if (strncmp(s1, "abort_recov", 11) == 0) { + lmd->lmd_flags |= LMD_FLG_ABORT_RECOV; + clear++; + } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) { + lmd->lmd_recovery_time_soft = max_t(int, + simple_strtoul(s1 + 19, NULL, 10), time_min); + clear++; + } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) { + lmd->lmd_recovery_time_hard = max_t(int, + simple_strtoul(s1 + 19, NULL, 10), time_min); + clear++; + } else if (strncmp(s1, "noir", 4) == 0) { + lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */ + clear++; + } else if (strncmp(s1, "nosvc", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSVC; + clear++; + } else if (strncmp(s1, "nomgs", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOMGS; + clear++; + } else if (strncmp(s1, "noscrub", 7) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSCRUB; + clear++; + } else if (strncmp(s1, PARAM_MGSNODE, + sizeof(PARAM_MGSNODE) - 1) == 0) { + s2 = s1 + sizeof(PARAM_MGSNODE) - 1; + /* Assume the next mount opt is the first + invalid nid we get to. 
*/ + rc = lmd_parse_mgs(lmd, &s2); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "writeconf", 9) == 0) { + lmd->lmd_flags |= LMD_FLG_WRITECONF; + clear++; + } else if (strncmp(s1, "update", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_UPDATE; + clear++; + } else if (strncmp(s1, "virgin", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_VIRGIN; + clear++; + } else if (strncmp(s1, "noprimnode", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE; + clear++; + } else if (strncmp(s1, "mgssec=", 7) == 0) { + rc = lmd_parse_mgssec(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + /* ost exclusion list */ + } else if (strncmp(s1, "exclude=", 8) == 0) { + rc = lmd_make_exclusion(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "mgs", 3) == 0) { + /* We are an MGS */ + lmd->lmd_flags |= LMD_FLG_MGS; + clear++; + } else if (strncmp(s1, "svname=", 7) == 0) { + rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "param=", 6) == 0) { + int length; + char *tail = strchr(s1 + 6, ','); + if (tail == NULL) + length = strlen(s1); + else + length = tail - s1; + length -= 6; + strncat(lmd->lmd_params, s1 + 6, length); + strcat(lmd->lmd_params, " "); + clear++; + } else if (strncmp(s1, "osd=", 4) == 0) { + rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4); + if (rc) + goto invalid; + clear++; + } + /* Linux 2.4 doesn't pass the device, so we stuck it at the + end of the options. */ + else if (strncmp(s1, "device=", 7) == 0) { + devname = s1 + 7; + /* terminate options right before device. device + must be the last one. 
*/ + *s1 = '\0'; + break; + } + + /* Find next opt */ + s2 = strchr(s1, ','); + if (s2 == NULL) { + if (clear) + *s1 = '\0'; + break; + } + s2++; + if (clear) + memmove(s1, s2, strlen(s2) + 1); + else + s1 = s2; + } + + if (!devname) { + LCONSOLE_ERROR_MSG(0x164, "Can't find the device name " + "(need mount option 'device=...')\n"); + goto invalid; + } + + s1 = strstr(devname, ":/"); + if (s1) { + ++s1; + lmd->lmd_flags |= LMD_FLG_CLIENT; + /* Remove leading /s from fsname */ + while (*++s1 == '/') ; + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8); + if (!lmd->lmd_profile) + RETURN(-ENOMEM); + sprintf(lmd->lmd_profile, "%s-client", s1); + } + + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1); + if (!lmd->lmd_dev) + RETURN(-ENOMEM); + strcpy(lmd->lmd_dev, devname); + + /* Save mount options */ + s1 = options + strlen(options) - 1; + while (s1 >= options && (*s1 == ',' || *s1 == ' ')) + *s1-- = 0; + if (*options != 0) { + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1); + if (!lmd->lmd_opts) + RETURN(-ENOMEM); + strcpy(lmd->lmd_opts, options); + } + + lmd_print(lmd); + lmd->lmd_magic = LMD_MAGIC; + + RETURN(rc); + +invalid: + CERROR("Bad mount options %s\n", options); + RETURN(-EINVAL); +} + +struct lustre_mount_data2 { + void *lmd2_data; + struct vfsmount *lmd2_mnt; +}; + +/** This is the entry point for the mount call into Lustre. + * This is called when a server or client is mounted, + * and this is where we start setting things up. + * @param data Mount options (e.g. 
-o flock,abort_recov) + */ +int lustre_fill_super(struct super_block *sb, void *data, int silent) +{ + struct lustre_mount_data *lmd; + struct lustre_mount_data2 *lmd2 = data; + struct lustre_sb_info *lsi; + int rc; + ENTRY; + + CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb); + + lsi = lustre_init_lsi(sb); + if (!lsi) + RETURN(-ENOMEM); + lmd = lsi->lsi_lmd; + + /* + * Disable lockdep during mount, because mount locking patterns are + * `special'. + */ + lockdep_off(); + + /* + * LU-639: the obd cleanup of last mount may not finish yet, wait here. + */ + obd_zombie_barrier(); + + /* Figure out the lmd from the mount options */ + if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) { + lustre_put_lsi(sb); + GOTO(out, rc = -EINVAL); + } + + if (lmd_is_client(lmd)) { + CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile); + if (!client_fill_super) { + LCONSOLE_ERROR_MSG(0x165, "Nothing registered for " + "client mount! Is the 'lustre' " + "module loaded?\n"); + lustre_put_lsi(sb); + rc = -ENODEV; + } else { + rc = lustre_start_mgc(sb); + if (rc) { + lustre_put_lsi(sb); + GOTO(out, rc); + } + /* Connect and start */ + /* (should always be ll_fill_super) */ + rc = (*client_fill_super)(sb, lmd2->lmd2_mnt); + /* c_f_s will call lustre_common_put_super on failure */ + } + } else { + CERROR("This is client-side-only module, " + "cannot handle server mount.\n"); + rc = -EINVAL; + } + + /* If error happens in fill_super() call, @lsi will be killed there. + * This is why we do not put it here. */ + GOTO(out, rc); +out: + if (rc) { + CERROR("Unable to mount %s (%d)\n", + s2lsi(sb) ? lmd->lmd_dev : "", rc); + } else { + CDEBUG(D_SUPER, "Mount %s complete\n", + lmd->lmd_dev); + } + lockdep_on(); + return rc; +} + + +/* We can't call ll_fill_super by name because it lives in a module that + must be loaded after this one. 
*/ +/* Stash the client module's fill_super hook; invoked by lustre_fill_super(). */ +void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb, + struct vfsmount *mnt)) +{ + client_fill_super = cfs; +} +EXPORT_SYMBOL(lustre_register_client_fill_super); + +void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb)) +{ + kill_super_cb = cfs; +} +EXPORT_SYMBOL(lustre_register_kill_super_cb); + +/***************** FS registration ******************/ +/* .mount entry for the lustre fs_type; bundles the raw mount data and (as yet + * unset) vfsmount so lustre_fill_super() can reach both. */ +struct dentry *lustre_mount(struct file_system_type *fs_type, int flags, + const char *devname, void *data) +{ + struct lustre_mount_data2 lmd2 = { data, NULL }; + + return mount_nodev(fs_type, flags, &lmd2, lustre_fill_super); +} + +/* Give the registered client callback a chance to clean up before the + * generic anonymous-super teardown. */ +void lustre_kill_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + if (kill_super_cb && lsi && !IS_SERVER(lsi)) + (*kill_super_cb)(sb); + + kill_anon_super(sb); +} + +/** Register the "lustre" fs type + */ +struct file_system_type lustre_fs_type = { + .owner = THIS_MODULE, + .name = "lustre", + .mount = lustre_mount, + .kill_sb = lustre_kill_super, + .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV | + FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE, +}; + +int lustre_register_fs(void) +{ + return register_filesystem(&lustre_fs_type); +} + +int lustre_unregister_fs(void) +{ + return unregister_filesystem(&lustre_fs_type); +} diff --git a/drivers/staging/lustre/lustre/obdclass/obdo.c b/drivers/staging/lustre/lustre/obdclass/obdo.c new file mode 100644 index 000000000000..01a0e1f83a68 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/obdo.c @@ -0,0 +1,362 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <obd_class.h> +#include <lustre/lustre_idl.h> + +/* Record the parent directory FID (seq/oid/ver) in @dst and flag the + * corresponding fields as valid. */ +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent) +{ + dst->o_parent_oid = fid_oid(parent); + dst->o_parent_seq = fid_seq(parent); + dst->o_parent_ver = fid_ver(parent); + dst->o_valid |= OBD_MD_FLGENER | OBD_MD_FLFID; +} +EXPORT_SYMBOL(obdo_set_parent_fid); + +/* WARNING: the file systems must take care not to tinker with + attributes they don't manage (such as blocks).
*/ +/* Fill @dst from @src inode for each attribute named in @valid; bits for the + * fields actually copied are accumulated into dst->o_valid. */ +void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid) +{ + obd_flag newvalid = 0; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n", + valid, LTIME_S(src->i_mtime), + LTIME_S(src->i_ctime)); + + if (valid & OBD_MD_FLATIME) { + dst->o_atime = LTIME_S(src->i_atime); + newvalid |= OBD_MD_FLATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->o_mtime = LTIME_S(src->i_mtime); + newvalid |= OBD_MD_FLMTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->o_ctime = LTIME_S(src->i_ctime); + newvalid |= OBD_MD_FLCTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->o_size = i_size_read(src); + newvalid |= OBD_MD_FLSIZE; + } + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = src->i_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ + dst->o_blksize = ll_inode_blksize(src); + newvalid |= OBD_MD_FLBLKSZ; + } + if (valid & OBD_MD_FLTYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (src->i_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (src->i_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & OBD_MD_FLUID) { + dst->o_uid = src->i_uid; + newvalid |= OBD_MD_FLUID; + } + if (valid & OBD_MD_FLGID) { + dst->o_gid = src->i_gid; + newvalid |= OBD_MD_FLGID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->o_flags = ll_inode_flags(src); + newvalid |= OBD_MD_FLFLAGS; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_inode); + +/* Copy the obdo fields selected by @valid from @src to @dst; @valid is + * OR-ed into dst->o_valid at the end. */ +void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid) +{ + CDEBUG(D_INODE, "src obdo "DOSTID" valid "LPX64", dst obdo "DOSTID"\n", + POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi)); + if (valid & OBD_MD_FLATIME) + dst->o_atime = src->o_atime; + if (valid & OBD_MD_FLMTIME) + dst->o_mtime = src->o_mtime; + if (valid & OBD_MD_FLCTIME) + dst->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + 
dst->o_size = src->o_size; + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + dst->o_blocks = src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + dst->o_blksize = src->o_blksize; + if (valid & OBD_MD_FLTYPE) + dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT); + if (valid & OBD_MD_FLMODE) + dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->o_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + dst->o_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + dst->o_flags = src->o_flags; + if (valid & OBD_MD_FLFID) { + dst->o_parent_seq = src->o_parent_seq; + dst->o_parent_ver = src->o_parent_ver; + } + if (valid & OBD_MD_FLGENER) + dst->o_parent_oid = src->o_parent_oid; + if (valid & OBD_MD_FLHANDLE) + dst->o_handle = src->o_handle; + if (valid & OBD_MD_FLCOOKIE) + dst->o_lcookie = src->o_lcookie; + + dst->o_valid |= valid; +} +EXPORT_SYMBOL(obdo_cpy_md); + +/* returns FALSE if comparison (by flags) is same, TRUE if changed */ +/* Only the fields named in @compare are examined; any mismatch makes the + * result non-zero. */ +int obdo_cmp_md(struct obdo *dst, struct obdo *src, obd_flag compare) +{ + int res = 0; + + if ( compare & OBD_MD_FLATIME ) + res = (res || (dst->o_atime != src->o_atime)); + if ( compare & OBD_MD_FLMTIME ) + res = (res || (dst->o_mtime != src->o_mtime)); + if ( compare & OBD_MD_FLCTIME ) + res = (res || (dst->o_ctime != src->o_ctime)); + if ( compare & OBD_MD_FLSIZE ) + res = (res || (dst->o_size != src->o_size)); + if ( compare & OBD_MD_FLBLOCKS ) /* allocation of space */ + res = (res || (dst->o_blocks != src->o_blocks)); + if ( compare & OBD_MD_FLBLKSZ ) + res = (res || (dst->o_blksize != src->o_blksize)); + if ( compare & OBD_MD_FLTYPE ) + res = (res || (((dst->o_mode ^ src->o_mode) & S_IFMT) != 0)); + if ( compare & OBD_MD_FLMODE ) + res = (res || (((dst->o_mode ^ src->o_mode) & ~S_IFMT) != 0)); + if ( compare & OBD_MD_FLUID ) + res = (res || (dst->o_uid != src->o_uid)); + if ( compare & OBD_MD_FLGID ) + res = (res || (dst->o_gid != src->o_gid)); + if ( compare & OBD_MD_FLFLAGS ) + res 
= (res || (dst->o_flags != src->o_flags)); + if ( compare & OBD_MD_FLNLINK ) + res = (res || (dst->o_nlink != src->o_nlink)); + if ( compare & OBD_MD_FLFID ) { + res = (res || (dst->o_parent_seq != src->o_parent_seq)); + res = (res || (dst->o_parent_ver != src->o_parent_ver)); + } + if ( compare & OBD_MD_FLGENER ) + res = (res || (dst->o_parent_oid != src->o_parent_oid)); + /* XXX Don't know if these should be included here - wasn't previously + if ( compare & OBD_MD_FLINLINE ) + res = (res || memcmp(dst->o_inline, src->o_inline)); + */ + return res; +} +EXPORT_SYMBOL(obdo_cmp_md); + +/* Pack the object id from @oa into a bulk I/O descriptor, forcing the MDT0 + * sequence when the group bit was never set. */ +void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj) +{ + ioobj->ioo_oid = oa->o_oi; + if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) + ostid_set_seq_mdt0(&ioobj->ioo_oid); + + /* Since 2.4 this does not contain o_mode in the low 16 bits. + * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs */ + ioobj->ioo_max_brw = 0; +} +EXPORT_SYMBOL(obdo_to_ioobj); + +/* Translate the VFS setattr fields selected by @ia_valid into obdo fields, + * setting the matching OBD_MD_* valid bits. */ +void obdo_from_iattr(struct obdo *oa, struct iattr *attr, unsigned int ia_valid) +{ + if (ia_valid & ATTR_ATIME) { + oa->o_atime = LTIME_S(attr->ia_atime); + oa->o_valid |= OBD_MD_FLATIME; + } + if (ia_valid & ATTR_MTIME) { + oa->o_mtime = LTIME_S(attr->ia_mtime); + oa->o_valid |= OBD_MD_FLMTIME; + } + if (ia_valid & ATTR_CTIME) { + oa->o_ctime = LTIME_S(attr->ia_ctime); + oa->o_valid |= OBD_MD_FLCTIME; + } + if (ia_valid & ATTR_SIZE) { + oa->o_size = attr->ia_size; + oa->o_valid |= OBD_MD_FLSIZE; + } + if (ia_valid & ATTR_MODE) { + oa->o_mode = attr->ia_mode; + oa->o_valid |= OBD_MD_FLTYPE | OBD_MD_FLMODE; + if (!current_is_in_group(oa->o_gid) && + !cfs_capable(CFS_CAP_FSETID)) + oa->o_mode &= ~S_ISGID; + } + if (ia_valid & ATTR_UID) { + oa->o_uid = attr->ia_uid; + oa->o_valid |= OBD_MD_FLUID; + } + if (ia_valid & ATTR_GID) { + oa->o_gid = attr->ia_gid; + oa->o_valid |= OBD_MD_FLGID; + } +} +EXPORT_SYMBOL(obdo_from_iattr); + +/* Inverse of obdo_from_iattr(): build a VFS iattr from the obdo fields that + * are both requested in @valid and marked valid in @oa. */ +void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid) +{ + valid &= 
oa->o_valid; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid "LPX64", new time "LPU64"/"LPU64"\n", + oa->o_valid, oa->o_mtime, oa->o_ctime); + + attr->ia_valid = 0; + if (valid & OBD_MD_FLATIME) { + LTIME_S(attr->ia_atime) = oa->o_atime; + attr->ia_valid |= ATTR_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + LTIME_S(attr->ia_mtime) = oa->o_mtime; + attr->ia_valid |= ATTR_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + LTIME_S(attr->ia_ctime) = oa->o_ctime; + attr->ia_valid |= ATTR_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + attr->ia_size = oa->o_size; + attr->ia_valid |= ATTR_SIZE; + } +#if 0 /* you shouldn't be able to change a file's type with setattr */ + if (valid & OBD_MD_FLTYPE) { + attr->ia_mode = (attr->ia_mode & ~S_IFMT)|(oa->o_mode & S_IFMT); + attr->ia_valid |= ATTR_MODE; + } +#endif + if (valid & OBD_MD_FLMODE) { + attr->ia_mode = (attr->ia_mode & S_IFMT)|(oa->o_mode & ~S_IFMT); + attr->ia_valid |= ATTR_MODE; + if (!current_is_in_group(oa->o_gid) && + !cfs_capable(CFS_CAP_FSETID)) + attr->ia_mode &= ~S_ISGID; + } + if (valid & OBD_MD_FLUID) { + attr->ia_uid = oa->o_uid; + attr->ia_valid |= ATTR_UID; + } + if (valid & OBD_MD_FLGID) { + attr->ia_gid = oa->o_gid; + attr->ia_valid |= ATTR_GID; + } +} +EXPORT_SYMBOL(iattr_from_obdo); + +/* As iattr_from_obdo(), but additionally carries blocks and flags from @oa + * into @op_data. */ +void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, obd_flag valid) +{ + iattr_from_obdo(&op_data->op_attr, oa, valid); + if (valid & OBD_MD_FLBLOCKS) { + op_data->op_attr_blocks = oa->o_blocks; + op_data->op_attr.ia_valid |= ATTR_BLOCKS; + } + if (valid & OBD_MD_FLFLAGS) { + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = + oa->o_flags; + op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG; + } +} +EXPORT_SYMBOL(md_from_obdo); + +/* Reverse direction of md_from_obdo(): fill @oa from @op_data's attrs, + * blocks and flags. Here @valid holds ATTR_* bits, not OBD_MD_* bits. */ +void obdo_from_md(struct obdo *oa, struct md_op_data *op_data, + unsigned int valid) +{ + obdo_from_iattr(oa, &op_data->op_attr, valid); + if (valid & ATTR_BLOCKS) { + oa->o_blocks = op_data->op_attr_blocks; + oa->o_valid |= OBD_MD_FLBLOCKS; + } + if (valid & 
ATTR_ATTR_FLAG) { + oa->o_flags = + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags; + oa->o_valid |= OBD_MD_FLFLAGS; + } +} +EXPORT_SYMBOL(obdo_from_md); + +/* Endianness converters for the fixed-width obdo fields. Note that o_oi and + * the o_parent_* fields are not converted by these two helpers. */ +void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo) +{ + dobdo->o_size = cpu_to_le64(sobdo->o_size); + dobdo->o_mtime = cpu_to_le64(sobdo->o_mtime); + dobdo->o_atime = cpu_to_le64(sobdo->o_atime); + dobdo->o_ctime = cpu_to_le64(sobdo->o_ctime); + dobdo->o_blocks = cpu_to_le64(sobdo->o_blocks); + dobdo->o_mode = cpu_to_le32(sobdo->o_mode); + dobdo->o_uid = cpu_to_le32(sobdo->o_uid); + dobdo->o_gid = cpu_to_le32(sobdo->o_gid); + dobdo->o_flags = cpu_to_le32(sobdo->o_flags); + dobdo->o_nlink = cpu_to_le32(sobdo->o_nlink); + dobdo->o_blksize = cpu_to_le32(sobdo->o_blksize); + dobdo->o_valid = cpu_to_le64(sobdo->o_valid); +} +EXPORT_SYMBOL(obdo_cpu_to_le); + +/* Inverse of obdo_cpu_to_le(). */ +void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo) +{ + dobdo->o_size = le64_to_cpu(sobdo->o_size); + dobdo->o_mtime = le64_to_cpu(sobdo->o_mtime); + dobdo->o_atime = le64_to_cpu(sobdo->o_atime); + dobdo->o_ctime = le64_to_cpu(sobdo->o_ctime); + dobdo->o_blocks = le64_to_cpu(sobdo->o_blocks); + dobdo->o_mode = le32_to_cpu(sobdo->o_mode); + dobdo->o_uid = le32_to_cpu(sobdo->o_uid); + dobdo->o_gid = le32_to_cpu(sobdo->o_gid); + dobdo->o_flags = le32_to_cpu(sobdo->o_flags); + dobdo->o_nlink = le32_to_cpu(sobdo->o_nlink); + dobdo->o_blksize = le32_to_cpu(sobdo->o_blksize); + dobdo->o_valid = le64_to_cpu(sobdo->o_valid); +} +EXPORT_SYMBOL(obdo_le_to_cpu); diff --git a/drivers/staging/lustre/lustre/obdclass/statfs_pack.c b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c new file mode 100644 index 000000000000..c3b7a78dba50 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c @@ -0,0 +1,75 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/statfs_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger <adilger@clusterfs.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + + +#include <lustre_export.h> +#include <lustre_net.h> +#include <obd_support.h> +#include <obd_class.h> + +/* Copy kernel statfs results into the wire obd_statfs; fields not listed + * here are zeroed by the memset. */ +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs) +{ + memset(osfs, 0, sizeof(*osfs)); + osfs->os_type = sfs->f_type; + osfs->os_blocks = sfs->f_blocks; + osfs->os_bfree = sfs->f_bfree; + osfs->os_bavail = sfs->f_bavail; + osfs->os_files = sfs->f_files; + osfs->os_ffree = sfs->f_ffree; + osfs->os_bsize = sfs->f_bsize; + osfs->os_namelen = sfs->f_namelen; +} +EXPORT_SYMBOL(statfs_pack); + +/* Reverse of statfs_pack(). */ +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs) +{ + memset(sfs, 0, sizeof(*sfs)); + sfs->f_type = osfs->os_type; + sfs->f_blocks = osfs->os_blocks; + sfs->f_bfree = osfs->os_bfree; + sfs->f_bavail = osfs->os_bavail; + sfs->f_files = osfs->os_files; + sfs->f_ffree = osfs->os_ffree; + sfs->f_bsize = osfs->os_bsize; + sfs->f_namelen = osfs->os_namelen; +} +EXPORT_SYMBOL(statfs_unpack); diff --git a/drivers/staging/lustre/lustre/obdclass/uuid.c b/drivers/staging/lustre/lustre/obdclass/uuid.c new file mode 100644 index 000000000000..af5f27f82bc5 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/uuid.c @@ -0,0 +1,82 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/uuid.c + * + * Public include file for the UUID library + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +# include <linux/libcfs/libcfs.h> + +#include <obd_support.h> +#include <obd_class.h> + + +/* Read @nob bytes from *ptr, most significant byte first (big-endian), + * advancing the cursor past the consumed bytes. */ +static inline __u32 consume(int nob, __u8 **ptr) +{ + __u32 value; + + LASSERT(nob <= sizeof value); + + for (value = 0; nob > 0; --nob) + value = (value << 8) | *((*ptr)++); + return value; +} + +#define CONSUME(val, ptr) (val) = consume(sizeof(val), (ptr)) + +/* Split the 16-byte binary uuid into @nr 16-bit words (filled from the + * highest index down, matching CONSUME's forward cursor). */ +static void uuid_unpack(class_uuid_t in, __u16 *uu, int nr) +{ + __u8 *ptr = in; + + LASSERT(nr * sizeof *uu == sizeof(class_uuid_t)); + + while (nr-- > 0) + CONSUME(uu[nr], &ptr); +} + +/* Render @uu into @out in the canonical 8-4-4-4-12 hex form. */ +void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) +{ + /* uu as an array of __u16's */ + __u16 uuid[sizeof(class_uuid_t) / sizeof(__u16)]; + + CLASSERT(ARRAY_SIZE(uuid) == 8); + + uuid_unpack(uu, uuid, ARRAY_SIZE(uuid)); + sprintf(out->uuid, "%04x%04x-%04x-%04x-%04x-%04x%04x%04x", + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7]); +} +EXPORT_SYMBOL(class_uuid_unparse); |