aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/staging/lustre/lustre/obdclass
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/staging/lustre/lustre/obdclass')
-rw-r--r--drivers/staging/lustre/lustre/obdclass/Makefile13
-rw-r--r--drivers/staging/lustre/lustre/obdclass/acl.c546
-rw-r--r--drivers/staging/lustre/lustre/obdclass/capa.c401
-rw-r--r--drivers/staging/lustre/lustre/obdclass/cl_internal.h121
-rw-r--r--drivers/staging/lustre/lustre/obdclass/cl_io.c1753
-rw-r--r--drivers/staging/lustre/lustre/obdclass/cl_lock.c2304
-rw-r--r--drivers/staging/lustre/lustre/obdclass/cl_object.c1148
-rw-r--r--drivers/staging/lustre/lustre/obdclass/cl_page.c1605
-rw-r--r--drivers/staging/lustre/lustre/obdclass/class_obd.c689
-rw-r--r--drivers/staging/lustre/lustre/obdclass/debug.c124
-rw-r--r--drivers/staging/lustre/lustre/obdclass/dt_object.c1055
-rw-r--r--drivers/staging/lustre/lustre/obdclass/genops.c1853
-rw-r--r--drivers/staging/lustre/lustre/obdclass/idmap.c474
-rw-r--r--drivers/staging/lustre/lustre/obdclass/linkea.c194
-rw-r--r--drivers/staging/lustre/lustre/obdclass/linux/linux-module.c408
-rw-r--r--drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c222
-rw-r--r--drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c445
-rw-r--r--drivers/staging/lustre/lustre/obdclass/llog.c966
-rw-r--r--drivers/staging/lustre/lustre/obdclass/llog_cat.c833
-rw-r--r--drivers/staging/lustre/lustre/obdclass/llog_internal.h98
-rw-r--r--drivers/staging/lustre/lustre/obdclass/llog_ioctl.c427
-rw-r--r--drivers/staging/lustre/lustre/obdclass/llog_lvfs.c862
-rw-r--r--drivers/staging/lustre/lustre/obdclass/llog_obd.c319
-rw-r--r--drivers/staging/lustre/lustre/obdclass/llog_osd.c1323
-rw-r--r--drivers/staging/lustre/lustre/obdclass/llog_swab.c407
-rw-r--r--drivers/staging/lustre/lustre/obdclass/llog_test.c1087
-rw-r--r--drivers/staging/lustre/lustre/obdclass/local_storage.c903
-rw-r--r--drivers/staging/lustre/lustre/obdclass/local_storage.h88
-rw-r--r--drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c562
-rw-r--r--drivers/staging/lustre/lustre/obdclass/lprocfs_status.c1985
-rw-r--r--drivers/staging/lustre/lustre/obdclass/lu_object.c2185
-rw-r--r--drivers/staging/lustre/lustre/obdclass/lu_ref.c50
-rw-r--r--drivers/staging/lustre/lustre/obdclass/lu_ucred.c107
-rw-r--r--drivers/staging/lustre/lustre/obdclass/lustre_handles.c263
-rw-r--r--drivers/staging/lustre/lustre/obdclass/lustre_peer.c218
-rw-r--r--drivers/staging/lustre/lustre/obdclass/md_attrs.c202
-rw-r--r--drivers/staging/lustre/lustre/obdclass/mea.c112
-rw-r--r--drivers/staging/lustre/lustre/obdclass/obd_config.c1904
-rw-r--r--drivers/staging/lustre/lustre/obdclass/obd_mount.c1321
-rw-r--r--drivers/staging/lustre/lustre/obdclass/obdo.c362
-rw-r--r--drivers/staging/lustre/lustre/obdclass/statfs_pack.c75
-rw-r--r--drivers/staging/lustre/lustre/obdclass/uuid.c82
42 files changed, 30096 insertions, 0 deletions
diff --git a/drivers/staging/lustre/lustre/obdclass/Makefile b/drivers/staging/lustre/lustre/obdclass/Makefile
new file mode 100644
index 000000000000..b80c13c6f5dd
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/Makefile
@@ -0,0 +1,13 @@
+# Build obdclass.o and llog_test.o when CONFIG_LUSTRE_FS is set.
+obj-$(CONFIG_LUSTRE_FS) += obdclass.o llog_test.o
+
+# Objects that are linked together into the composite obdclass.o.
+obdclass-y := linux/linux-module.o linux/linux-obdo.o linux/linux-sysctl.o \
+ llog.o llog_cat.o llog_obd.o llog_swab.o class_obd.o debug.o \
+ genops.o uuid.o llog_ioctl.o lprocfs_status.o \
+ lprocfs_jobstats.o lustre_handles.o lustre_peer.o llog_osd.o \
+ local_storage.o statfs_pack.o obdo.o obd_config.o obd_mount.o\
+ mea.o lu_object.o dt_object.o capa.o cl_object.o \
+ cl_page.o cl_lock.o cl_io.o lu_ref.o acl.o idmap.o \
+ lu_ucred.o
+
+
+# Add the include directory one level up to the header search path.
+ccflags-y := -I$(src)/../include
diff --git a/drivers/staging/lustre/lustre/obdclass/acl.c b/drivers/staging/lustre/lustre/obdclass/acl.c
new file mode 100644
index 000000000000..c2a6702c9f2c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/acl.c
@@ -0,0 +1,546 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/acl.c
+ *
+ * Lustre Access Control List.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+#include <lu_object.h>
+#include <lustre_acl.h>
+#include <lustre_eacl.h>
+#include <obd_support.h>
+
+#ifdef CONFIG_FS_POSIX_ACL
+
+#define CFS_ACL_XATTR_VERSION POSIX_ACL_XATTR_VERSION
+
+/*
+ * State recorded in the e_stat field of an extended ACL entry. The merge
+ * helpers in this file set and test it to tell how an entry relates to the
+ * ACL it is being merged with.
+ */
+enum {
+ ES_UNK = 0, /* unknown state */
+ ES_UNC = 1, /* ACL entry is not changed */
+ ES_MOD = 2, /* ACL entry is modified */
+ ES_ADD = 3, /* ACL entry is added */
+ ES_DEL = 4 /* ACL entry is deleted */
+};
+
+/*
+ * Convert one extended ACL entry from little-endian to CPU byte order.
+ * Each field is converted on its own, so d and s may point at the same entry.
+ */
+static inline void lustre_ext_acl_le_to_cpu(ext_acl_xattr_entry *d,
+					    ext_acl_xattr_entry *s)
+{
+	/* 32-bit members */
+	d->e_stat = le32_to_cpu(s->e_stat);
+	d->e_id = le32_to_cpu(s->e_id);
+	/* 16-bit members */
+	d->e_perm = le16_to_cpu(s->e_perm);
+	d->e_tag = le16_to_cpu(s->e_tag);
+}
+
+/*
+ * Convert one extended ACL entry from CPU to little-endian byte order.
+ * Each field is converted on its own, so d and s may point at the same entry.
+ */
+static inline void lustre_ext_acl_cpu_to_le(ext_acl_xattr_entry *d,
+					    ext_acl_xattr_entry *s)
+{
+	/* 32-bit members */
+	d->e_stat = cpu_to_le32(s->e_stat);
+	d->e_id = cpu_to_le32(s->e_id);
+	/* 16-bit members */
+	d->e_perm = cpu_to_le16(s->e_perm);
+	d->e_tag = cpu_to_le16(s->e_tag);
+}
+
+/*
+ * Convert one posix ACL entry from little-endian to CPU byte order.
+ * Each field is converted on its own, so d and s may point at the same entry.
+ */
+static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d,
+					      posix_acl_xattr_entry *s)
+{
+	d->e_id = le32_to_cpu(s->e_id);
+	d->e_perm = le16_to_cpu(s->e_perm);
+	d->e_tag = le16_to_cpu(s->e_tag);
+}
+
+/*
+ * Convert one posix ACL entry from CPU to little-endian byte order.
+ * Each field is converted on its own, so d and s may point at the same entry
+ * (lustre_acl_xattr_merge2posix() converts its probe entry in place).
+ */
+static inline void lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d,
+					      posix_acl_xattr_entry *s)
+{
+	d->e_id = cpu_to_le32(s->e_id);
+	d->e_perm = cpu_to_le16(s->e_perm);
+	d->e_tag = cpu_to_le16(s->e_tag);
+}
+
+
+/*
+ * Shrink a posix ACL buffer that was sized for old_count entries so that it
+ * holds only its first new_count entries.
+ *
+ * When new_count is not smaller than old_count nothing is done and the
+ * current size is returned. Otherwise a smaller buffer is allocated, the
+ * leading bytes are copied over, the old buffer is freed and *header is
+ * updated. Even for new_count == 0 the result is a header-only buffer,
+ * never NULL.
+ *
+ * Returns the size in bytes of the buffer *header points at, or -ENOMEM
+ * (in which case *header is left untouched).
+ */
+static int lustre_posix_acl_xattr_reduce_space(posix_acl_xattr_header **header,
+						int old_count, int new_count)
+{
+	posix_acl_xattr_header *shrunk;
+	int cur_size = CFS_ACL_XATTR_SIZE(old_count, posix_acl_xattr);
+	int want_size = CFS_ACL_XATTR_SIZE(new_count, posix_acl_xattr);
+
+	/* nothing to trim */
+	if (unlikely(new_count >= old_count))
+		return cur_size;
+
+	OBD_ALLOC(shrunk, want_size);
+	if (unlikely(shrunk == NULL))
+		return -ENOMEM;
+
+	memcpy(shrunk, *header, want_size);
+	OBD_FREE(*header, cur_size);
+	*header = shrunk;
+
+	return want_size;
+}
+
+/*
+ * Shrink an extended ACL buffer that was sized for old_count entries down to
+ * the number of entries recorded in its own a_count field.
+ *
+ * If a_count is not smaller than old_count the buffer is kept as is.
+ * Otherwise the contents are moved to a tighter allocation, the old buffer is
+ * freed and *header is updated. Even with a_count == 0 the result is a
+ * header-only buffer, never NULL.
+ *
+ * Returns 0 on success or -ENOMEM (in which case *header is left untouched).
+ */
+static int lustre_ext_acl_xattr_reduce_space(ext_acl_xattr_header **header,
+					      int old_count)
+{
+	ext_acl_xattr_header *shrunk;
+	int used = le32_to_cpu((*header)->a_count);
+	int used_size = CFS_ACL_XATTR_SIZE(used, ext_acl_xattr);
+	int alloc_size = CFS_ACL_XATTR_SIZE(old_count, ext_acl_xattr);
+
+	/* already as small as it can be */
+	if (unlikely(used >= old_count))
+		return 0;
+
+	OBD_ALLOC(shrunk, used_size);
+	if (unlikely(shrunk == NULL))
+		return -ENOMEM;
+
+	memcpy(shrunk, *header, used_size);
+	OBD_FREE(*header, alloc_size);
+	*header = shrunk;
+
+	return 0;
+}
+
+/*
+ * Build a new extended ACL from a posix ACL.
+ *
+ * \param header  posix ACL xattr; only read when \a size yields entries
+ * \param size    size of \a header in bytes; 0 gives an empty extended ACL
+ *
+ * Tag, permission and id are copied without byte swapping and every entry is
+ * marked ES_UNK.
+ *
+ * Returns the newly allocated extended ACL, ERR_PTR(-EINVAL) for a negative
+ * size or ERR_PTR(-ENOMEM) when the allocation fails.
+ */
+ext_acl_xattr_header *
+lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size)
+{
+	ext_acl_xattr_header *ext;
+	int nr = 0, bytes, idx;
+	ENTRY;
+
+	if (unlikely(size < 0))
+		RETURN(ERR_PTR(-EINVAL));
+
+	if (size != 0)
+		nr = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+
+	bytes = CFS_ACL_XATTR_SIZE(nr, ext_acl_xattr);
+	OBD_ALLOC(ext, bytes);
+	if (unlikely(ext == NULL))
+		RETURN(ERR_PTR(-ENOMEM));
+
+	ext->a_count = cpu_to_le32(nr);
+	for (idx = 0; idx < nr; idx++) {
+		posix_acl_xattr_entry *from = &header->a_entries[idx];
+		ext_acl_xattr_entry *to = &ext->a_entries[idx];
+
+		to->e_tag = from->e_tag;
+		to->e_perm = from->e_perm;
+		to->e_id = from->e_id;
+		to->e_stat = cpu_to_le32(ES_UNK);
+	}
+
+	RETURN(ext);
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_2ext);
+
+/*
+ * Filter out the "nobody" entries in the posix ACL.
+ *
+ * \param header  posix ACL xattr to filter (not modified)
+ * \param size    size of \a header in bytes
+ * \param out     receives the newly allocated, filtered ACL on success
+ *
+ * ACL_USER entries with id NOBODY_UID and ACL_GROUP entries with id
+ * NOBODY_GID are dropped. Base entries (USER_OBJ, GROUP_OBJ, MASK, OTHER)
+ * must carry ACL_UNDEFINED_ID.
+ *
+ * Returns the size in bytes of *out on success, 0 when \a size is 0 (*out is
+ * not written), -EINVAL for a negative size, -ENOMEM on allocation failure
+ * and -EIO for an unknown tag or a base entry with a defined id.
+ */
+int lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, int size,
+				  posix_acl_xattr_header **out)
+{
+	posix_acl_xattr_header *filtered;
+	int nr, src, dst, rc = 0;
+	ENTRY;
+
+	if (unlikely(size < 0))
+		RETURN(-EINVAL);
+	if (size == 0)
+		RETURN(0);
+
+	OBD_ALLOC(filtered, size);
+	if (unlikely(filtered == NULL))
+		RETURN(-ENOMEM);
+
+	filtered->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+	nr = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+	for (src = 0, dst = 0; src < nr; src++) {
+		posix_acl_xattr_entry *cur = &header->a_entries[src];
+		__u32 id = le32_to_cpu(cur->e_id);
+		int keep;
+
+		switch (le16_to_cpu(cur->e_tag)) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			/* base entries never name a specific id */
+			if (id != ACL_UNDEFINED_ID)
+				GOTO(_out, rc = -EIO);
+			keep = 1;
+			break;
+		case ACL_USER:
+			keep = (id != NOBODY_UID);
+			break;
+		case ACL_GROUP:
+			keep = (id != NOBODY_GID);
+			break;
+		default:
+			GOTO(_out, rc = -EIO);
+		}
+
+		if (keep)
+			memcpy(&filtered->a_entries[dst++], cur,
+			       sizeof(posix_acl_xattr_entry));
+	}
+
+	/* free unused space. */
+	rc = lustre_posix_acl_xattr_reduce_space(&filtered, nr, dst);
+	if (rc >= 0) {
+		size = rc;
+		*out = filtered;
+		rc = 0;
+	}
+	EXIT;
+
+_out:
+	/* any failure: drop the working copy and report the error code */
+	if (rc) {
+		OBD_FREE(filtered, size);
+		size = rc;
+	}
+	return size;
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_filter);
+
+/*
+ * Release the posix ACL space.
+ *
+ * \param header  buffer to free
+ * \param size    size in bytes passed on to OBD_FREE(); presumably the size
+ *                the buffer was allocated with (e.g. the value returned by
+ *                lustre_posix_acl_xattr_filter()) - confirm against callers
+ */
+void lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size)
+{
+ OBD_FREE(header, size);
+}
+EXPORT_SYMBOL(lustre_posix_acl_xattr_free);
+
+/*
+ * Release the extended ACL space.
+ *
+ * The size handed to OBD_FREE() is derived from the a_count field stored in
+ * the header itself, so \a header must be non-NULL and a_count must still
+ * match the allocation.
+ */
+void lustre_ext_acl_xattr_free(ext_acl_xattr_header *header)
+{
+	__u32 nr = le32_to_cpu(header->a_count);
+
+	OBD_FREE(header, CFS_ACL_XATTR_SIZE(nr, ext_acl_xattr));
+}
+EXPORT_SYMBOL(lustre_ext_acl_xattr_free);
+
+/*
+ * Look up the extended ACL entry with the same tag and id as \a entry.
+ *
+ * Tag and id are compared as stored (no byte swapping), so \a entry must be
+ * in the same byte order as the entries in \a header.
+ *
+ * \param pos  in: index at which to start searching; out: on a hit, the
+ *             index following the match (wrapping to 0 at the end) so the
+ *             next lookup can resume there. Unchanged on a miss.
+ *
+ * The scan covers [*pos, count) first and then wraps to [0, *pos).
+ * Returns the matching entry or NULL.
+ */
+static ext_acl_xattr_entry *
+lustre_ext_acl_xattr_search(ext_acl_xattr_header *header,
+			    posix_acl_xattr_entry *entry, int *pos)
+{
+	int nr = le32_to_cpu(header->a_count);
+	int first = *pos;
+	int lo = first, hi = nr;
+	int pass, idx;
+
+	for (pass = 0; pass < 2; pass++) {
+		for (idx = lo; idx < hi; idx++) {
+			ext_acl_xattr_entry *cand = &header->a_entries[idx];
+
+			if (cand->e_tag != entry->e_tag ||
+			    cand->e_id != entry->e_id)
+				continue;
+
+			*pos = (idx + 1 >= nr) ? 0 : idx + 1;
+			return cand;
+		}
+
+		/* wrap around: rescan the part before the starting point */
+		lo = 0;
+		hi = first;
+	}
+
+	return NULL;
+}
+
+/*
+ * Merge the posix ACL and the extended ACL into new posix ACL.
+ *
+ * \param posix_header  original posix ACL; only consulted when the extended
+ *                      ACL carries a live ACL_MASK entry
+ * \param size          size of \a posix_header in bytes
+ * \param ext_header    extended ACL (must be non-NULL, little-endian fields)
+ * \param out           receives the newly allocated posix ACL on success
+ *
+ * Two cases:
+ *  - The extended ACL has no ACL_MASK entry, or it is marked ES_DEL: the
+ *    result holds at most the three base entries (USER_OBJ, GROUP_OBJ, OTHER)
+ *    taken from the extended ACL. Any live MASK/USER/GROUP entry, an unknown
+ *    tag, a base entry with a defined id, or more than three live base
+ *    entries is rejected with -EIO.
+ *  - Otherwise: entries of the original posix ACL that have no counterpart in
+ *    the extended ACL are kept, followed by every extended entry that is not
+ *    marked ES_DEL.
+ *
+ * Returns the size in bytes of *out on success, or a negative errno
+ * (-EINVAL, -ENOMEM, -EIO); *out is not written on failure.
+ */
+int lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size,
+				 ext_acl_xattr_header *ext_header,
+				 posix_acl_xattr_header **out)
+{
+	int posix_count, posix_size, i, j;
+	int ext_count = le32_to_cpu(ext_header->a_count), pos = 0, rc = 0;
+	posix_acl_xattr_entry pe = {ACL_MASK, 0, ACL_UNDEFINED_ID};
+	posix_acl_xattr_header *new;
+	ext_acl_xattr_entry *ee, ae;
+	ENTRY;
+
+	/* the search compares raw little-endian fields, so convert the probe */
+	lustre_posix_acl_cpu_to_le(&pe, &pe);
+	ee = lustre_ext_acl_xattr_search(ext_header, &pe, &pos);
+	if (ee == NULL || le32_to_cpu(ee->e_stat) == ES_DEL) {
+		/* there are only base ACL entries at most. */
+		posix_count = 3;
+		posix_size = CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr);
+		OBD_ALLOC(new, posix_size);
+		if (unlikely(new == NULL))
+			RETURN(-ENOMEM);
+
+		new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+		for (i = 0, j = 0; i < ext_count; i++) {
+			lustre_ext_acl_le_to_cpu(&ae,
+						 &ext_header->a_entries[i]);
+			switch (ae.e_tag) {
+			case ACL_USER_OBJ:
+			case ACL_GROUP_OBJ:
+			case ACL_OTHER:
+				if (ae.e_id != ACL_UNDEFINED_ID)
+					GOTO(_out, rc = -EIO);
+
+				if (ae.e_stat != ES_DEL) {
+					/* only posix_count slots were
+					 * allocated; duplicated base entries
+					 * must not write past them. */
+					if (j >= posix_count)
+						GOTO(_out, rc = -EIO);
+
+					new->a_entries[j].e_tag =
+						ext_header->a_entries[i].e_tag;
+					new->a_entries[j].e_perm =
+						ext_header->a_entries[i].e_perm;
+					new->a_entries[j++].e_id =
+						ext_header->a_entries[i].e_id;
+				}
+				break;
+			case ACL_MASK:
+			case ACL_USER:
+			case ACL_GROUP:
+				if (ae.e_stat == ES_DEL)
+					break;
+				/* fall through: a live mask/user/group entry
+				 * is not valid in a base-only ACL. */
+			default:
+				GOTO(_out, rc = -EIO);
+			}
+		}
+	} else {
+		/* maybe there are valid ACL_USER or ACL_GROUP entries in the
+		 * original server-side ACL, they are regarded as ES_UNC stat.*/
+		int ori_posix_count;
+
+		if (unlikely(size < 0))
+			RETURN(-EINVAL);
+		else if (!size)
+			ori_posix_count = 0;
+		else
+			ori_posix_count =
+				CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+		/* worst case: nothing overlaps and nothing is deleted */
+		posix_count = ori_posix_count + ext_count;
+		posix_size =
+			CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr);
+		OBD_ALLOC(new, posix_size);
+		if (unlikely(new == NULL))
+			RETURN(-ENOMEM);
+
+		new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION);
+		/* 1. process the unchanged ACL entries
+		 *    in the original server-side ACL. */
+		pos = 0;
+		for (i = 0, j = 0; i < ori_posix_count; i++) {
+			ee = lustre_ext_acl_xattr_search(ext_header,
+					&posix_header->a_entries[i], &pos);
+			if (ee == NULL)
+				memcpy(&new->a_entries[j++],
+				       &posix_header->a_entries[i],
+				       sizeof(posix_acl_xattr_entry));
+		}
+
+		/* 2. process the non-deleted entries
+		 *    from client-side extended ACL. */
+		for (i = 0; i < ext_count; i++) {
+			/* e_stat is a 32-bit little-endian field */
+			if (le32_to_cpu(ext_header->a_entries[i].e_stat) !=
+			    ES_DEL) {
+				new->a_entries[j].e_tag =
+					ext_header->a_entries[i].e_tag;
+				new->a_entries[j].e_perm =
+					ext_header->a_entries[i].e_perm;
+				new->a_entries[j++].e_id =
+					ext_header->a_entries[i].e_id;
+			}
+		}
+	}
+
+	/* free unused space. */
+	rc = lustre_posix_acl_xattr_reduce_space(&new, posix_count, j);
+	if (rc >= 0) {
+		posix_size = rc;
+		*out = new;
+		rc = 0;
+	}
+	EXIT;
+
+_out:
+	if (rc) {
+		OBD_FREE(new, posix_size);
+		posix_size = rc;
+	}
+	return posix_size;
+}
+EXPORT_SYMBOL(lustre_acl_xattr_merge2posix);
+
+/*
+ * Merge the posix ACL and the extended ACL into new extended ACL.
+ *
+ * \param posix_header  posix ACL (little-endian entries)
+ * \param size          size of \a posix_header in bytes; 0 means no entries
+ * \param ext_header    extended ACL to compare against (must be non-NULL)
+ *
+ * Every posix entry that is not a "nobody" entry is copied to the result and
+ * tagged ES_MOD, ES_UNC or ES_ADD depending on whether a matching entry
+ * exists in \a ext_header and whether its permission differs. Entries of
+ * \a ext_header that are in state ES_UNK after that pass are appended as
+ * ES_DEL.
+ *
+ * Side effect: e_stat of every matched entry in \a ext_header is overwritten
+ * with ES_MOD/ES_UNC; this is what lets the second pass recognise the
+ * entries that were not matched.
+ *
+ * Returns the newly allocated extended ACL, or ERR_PTR(-EINVAL),
+ * ERR_PTR(-ENOMEM) or ERR_PTR(-EIO).
+ */
+ext_acl_xattr_header *
+lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size,
+ ext_acl_xattr_header *ext_header)
+{
+ int ori_ext_count, posix_count, ext_count, ext_size;
+ int i, j, pos = 0, rc = 0;
+ posix_acl_xattr_entry pae;
+ ext_acl_xattr_header *new;
+ ext_acl_xattr_entry *ee, eae;
+ ENTRY;
+
+ if (unlikely(size < 0))
+ RETURN(ERR_PTR(-EINVAL));
+ else if (!size)
+ posix_count = 0;
+ else
+ posix_count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr);
+ ori_ext_count = le32_to_cpu(ext_header->a_count);
+ /* worst case: no entry of one ACL matches an entry of the other */
+ ext_count = posix_count + ori_ext_count;
+ ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr);
+
+ OBD_ALLOC(new, ext_size);
+ if (unlikely(new == NULL))
+ RETURN(ERR_PTR(-ENOMEM));
+
+ for (i = 0, j = 0; i < posix_count; i++) {
+ /* pae is a CPU-order copy used only for the checks below; the
+ * values stored in "new" are taken from the original entry. */
+ lustre_posix_acl_le_to_cpu(&pae, &posix_header->a_entries[i]);
+ switch (pae.e_tag) {
+ case ACL_USER_OBJ:
+ case ACL_GROUP_OBJ:
+ case ACL_MASK:
+ case ACL_OTHER:
+ if (pae.e_id != ACL_UNDEFINED_ID)
+ GOTO(out, rc = -EIO);
+ /* no break: base entries continue into the ACL_USER code
+ * below, which copies and classifies the entry. */
+ case ACL_USER:
+ /* ignore "nobody" entry. */
+ if (pae.e_id == NOBODY_UID)
+ break;
+
+ new->a_entries[j].e_tag =
+ posix_header->a_entries[i].e_tag;
+ new->a_entries[j].e_perm =
+ posix_header->a_entries[i].e_perm;
+ new->a_entries[j].e_id =
+ posix_header->a_entries[i].e_id;
+ ee = lustre_ext_acl_xattr_search(ext_header,
+ &posix_header->a_entries[i], &pos);
+ if (ee) {
+ if (posix_header->a_entries[i].e_perm !=
+ ee->e_perm)
+ /* entry modified. */
+ ee->e_stat =
+ new->a_entries[j++].e_stat =
+ cpu_to_le32(ES_MOD);
+ else
+ /* entry unchanged. */
+ ee->e_stat =
+ new->a_entries[j++].e_stat =
+ cpu_to_le32(ES_UNC);
+ } else {
+ /* new entry. */
+ new->a_entries[j++].e_stat =
+ cpu_to_le32(ES_ADD);
+ }
+ break;
+ case ACL_GROUP:
+ /* ignore "nobody" entry. */
+ if (pae.e_id == NOBODY_GID)
+ break;
+ new->a_entries[j].e_tag =
+ posix_header->a_entries[i].e_tag;
+ new->a_entries[j].e_perm =
+ posix_header->a_entries[i].e_perm;
+ new->a_entries[j].e_id =
+ posix_header->a_entries[i].e_id;
+ ee = lustre_ext_acl_xattr_search(ext_header,
+ &posix_header->a_entries[i], &pos);
+ if (ee) {
+ if (posix_header->a_entries[i].e_perm !=
+ ee->e_perm)
+ /* entry modified. */
+ ee->e_stat =
+ new->a_entries[j++].e_stat =
+ cpu_to_le32(ES_MOD);
+ else
+ /* entry unchanged. */
+ ee->e_stat =
+ new->a_entries[j++].e_stat =
+ cpu_to_le32(ES_UNC);
+ } else {
+ /* new entry. */
+ new->a_entries[j++].e_stat =
+ cpu_to_le32(ES_ADD);
+ }
+ break;
+ default:
+ GOTO(out, rc = -EIO);
+ }
+ }
+
+ /* process deleted entries. */
+ for (i = 0; i < ori_ext_count; i++) {
+ lustre_ext_acl_le_to_cpu(&eae, &ext_header->a_entries[i]);
+ /* entries matched above were re-tagged ES_MOD/ES_UNC, so an
+ * entry in state ES_UNK here was not matched by the loop above */
+ if (eae.e_stat == ES_UNK) {
+ /* ignore "nobody" entry. */
+ if ((eae.e_tag == ACL_USER && eae.e_id == NOBODY_UID) ||
+ (eae.e_tag == ACL_GROUP && eae.e_id == NOBODY_GID))
+ continue;
+
+ new->a_entries[j].e_tag =
+ ext_header->a_entries[i].e_tag;
+ new->a_entries[j].e_perm =
+ ext_header->a_entries[i].e_perm;
+ new->a_entries[j].e_id = ext_header->a_entries[i].e_id;
+ new->a_entries[j++].e_stat = cpu_to_le32(ES_DEL);
+ }
+ }
+
+ /* a_count must be final before shrinking: reduce_space reads it */
+ new->a_count = cpu_to_le32(j);
+ /* free unused space. */
+ rc = lustre_ext_acl_xattr_reduce_space(&new, ext_count);
+ EXIT;
+
+out:
+ /* on any error "new" is still the ext_size-byte allocation */
+ if (rc) {
+ OBD_FREE(new, ext_size);
+ new = ERR_PTR(rc);
+ }
+ return new;
+}
+EXPORT_SYMBOL(lustre_acl_xattr_merge2ext);
+
+#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/capa.c b/drivers/staging/lustre/lustre/obdclass/capa.c
new file mode 100644
index 000000000000..3e532f5106e4
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/capa.c
@@ -0,0 +1,401 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/capa.c
+ *
+ * Lustre Capability Hash Management
+ *
+ * Author: Lai Siyao<lsy@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <asm/unistd.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <obd_class.h>
+#include <lustre_debug.h>
+#include <lustre/lustre_idl.h>
+
+#include <linux/list.h>
+#include <lustre_capa.h>
+
+/* number of hash buckets initialised, walked and indexed in a capa hash */
+#define NR_CAPAHASH 32
+/* capa_add() trims the list once the server-side capa count exceeds this */
+#define CAPA_HASH_SIZE 3000 /* for MDS & OSS */
+
+/* not used in this file beyond being defined and exported */
+struct kmem_cache *capa_cachep = NULL;
+
+/* lock for capa hash/capa_list/fo_capa_keys */
+DEFINE_SPINLOCK(capa_lock);
+
+/* one capa list per site; capa_add() appends at the tail and capa_lookup()
+ * moves a found capa back to the tail */
+struct list_head capa_list[CAPA_SITE_MAX];
+
+/* table of HMAC algorithms, indexed by capa_alg() in capa_hmac() */
+static struct capa_hmac_alg capa_hmac_algs[] = {
+ DEF_CAPA_HMAC_ALG("sha1", SHA1, 20, 20),
+};
+/* capa count */
+int capa_count[CAPA_SITE_MAX] = { 0, };
+
+EXPORT_SYMBOL(capa_cachep);
+EXPORT_SYMBOL(capa_list);
+EXPORT_SYMBOL(capa_lock);
+EXPORT_SYMBOL(capa_count);
+
+/*
+ * Allocate a capa hash table.
+ *
+ * PAGE_CACHE_SIZE bytes are allocated and the first NR_CAPAHASH list heads
+ * are initialised; the assertion checks that the allocation has room for
+ * them. Release the table with cleanup_capa_hash().
+ *
+ * Returns the table, or NULL if the allocation fails.
+ */
+struct hlist_head *init_capa_hash(void)
+{
+	struct hlist_head *table;
+	struct hlist_head *bucket;
+	int slots;
+
+	OBD_ALLOC(table, PAGE_CACHE_SIZE);
+	if (table == NULL)
+		return NULL;
+
+	slots = PAGE_CACHE_SIZE / sizeof(struct hlist_head);
+	LASSERT(slots > NR_CAPAHASH);
+
+	for (bucket = table; bucket < table + NR_CAPAHASH; bucket++)
+		INIT_HLIST_HEAD(bucket);
+
+	return table;
+}
+EXPORT_SYMBOL(init_capa_hash);
+
+/* Return non-zero when the capa's site is CAPA_SITE_SERVER. */
+static inline int capa_on_server(struct obd_capa *ocapa)
+{
+ return ocapa->c_site == CAPA_SITE_SERVER;
+}
+
+/*
+ * Remove a server-side capa from its hash chain and from its list, decrement
+ * the per-site counter and drop one reference.
+ *
+ * Both callers in this file (cleanup_capa_hash() and, via capa_add(),
+ * capa_delete_lru()) run with capa_lock held.
+ */
+static inline void capa_delete(struct obd_capa *ocapa)
+{
+ LASSERT(capa_on_server(ocapa));
+ hlist_del_init(&ocapa->u.tgt.c_hash);
+ list_del_init(&ocapa->c_list);
+ capa_count[ocapa->c_site]--;
+ /* release the ref when alloc */
+ capa_put(ocapa);
+}
+
+/*
+ * Tear down a capa hash table.
+ *
+ * Under capa_lock every capa still linked in one of the NR_CAPAHASH buckets
+ * is passed to capa_delete(); afterwards the PAGE_CACHE_SIZE-byte table
+ * itself is freed.
+ */
+void cleanup_capa_hash(struct hlist_head *hash)
+{
+	struct hlist_head *bucket;
+	struct hlist_node *tmp;
+	struct obd_capa *ocapa;
+
+	spin_lock(&capa_lock);
+	for (bucket = hash; bucket < hash + NR_CAPAHASH; bucket++) {
+		/* _safe variant: capa_delete() unlinks the current node */
+		hlist_for_each_entry_safe(ocapa, tmp, bucket, u.tgt.c_hash)
+			capa_delete(ocapa);
+	}
+	spin_unlock(&capa_lock);
+
+	OBD_FREE(hash, PAGE_CACHE_SIZE);
+}
+EXPORT_SYMBOL(cleanup_capa_hash);
+
+/*
+ * Hash a FID to a bucket index used with "hash + index".
+ *
+ * '*' and '%' have equal precedence and associate left to right, so this
+ * computes ((oid ^ ver) * (seq + 1)) % NR_CAPAHASH.
+ * NOTE(review): the result is presumably always in [0, NR_CAPAHASH) because
+ * the product is taken in unsigned long - confirm the fid_*() return types.
+ */
+static inline int capa_hashfn(struct lu_fid *fid)
+{
+ return (fid_oid(fid) ^ fid_ver(fid)) *
+ (unsigned long)(fid_seq(fid) + 1) % NR_CAPAHASH;
+}
+
+/* capa renewal time check is earlier than that on client, which is to prevent
+ * client renew right after obtaining it.
+ *
+ * Compares (c_expiry - 2/3 of the capa timeout) against the current time.
+ * NOTE(review): presumably cfs_time_before(a, b) is true when a precedes b,
+ * i.e. this returns non-zero once less than two thirds of the timeout is
+ * left before expiry - confirm against the cfs_time helpers. */
+static inline int capa_is_to_expire(struct obd_capa *oc)
+{
+ return cfs_time_before(cfs_time_sub(oc->c_expiry,
+ cfs_time_seconds(oc->c_capa.lc_timeout)*2/3),
+ cfs_time_current());
+}
+
+/*
+ * Search one hash chain for a capa equal to \a capa.
+ *
+ * \param capa   capa to look for
+ * \param head   hash chain to walk
+ * \param alive  when non-zero only the bytes preceding lc_keyid are
+ *               compared and capas that capa_is_to_expire() reports as
+ *               about to expire are skipped; when zero the whole
+ *               lustre_capa is compared
+ *
+ * No reference is taken on the result. Returns the capa or NULL.
+ */
+static struct obd_capa *find_capa(struct lustre_capa *capa,
+				  struct hlist_head *head, int alive)
+{
+	struct obd_capa *cur;
+	int cmp_len;
+
+	if (alive)
+		cmp_len = offsetof(struct lustre_capa, lc_keyid);
+	else
+		cmp_len = sizeof(*capa);
+
+	hlist_for_each_entry(cur, head, u.tgt.c_hash) {
+		if (memcmp(&cur->c_capa, capa, cmp_len) != 0)
+			continue;
+		/* don't return one that will expire soon in this case */
+		if (alive && capa_is_to_expire(cur))
+			continue;
+
+		LASSERT(capa_on_server(cur));
+
+		DEBUG_CAPA(D_SEC, &cur->c_capa, "found");
+		return cur;
+	}
+
+	return NULL;
+}
+
+/* maximum number of list entries examined by one capa_delete_lru() call */
+#define LRU_CAPA_DELETE_COUNT 12
+/*
+ * Trim the capa list from its head.
+ *
+ * Up to LRU_CAPA_DELETE_COUNT entries are examined, starting at the head of
+ * \a head; those whose reference count reads as zero are deleted, the others
+ * are skipped (but still count as examined). The walk stops early when it
+ * gets back to the list head, so a list shorter than LRU_CAPA_DELETE_COUNT
+ * never causes the head itself to be treated as a capa.
+ *
+ * The only caller in this file, capa_add(), holds capa_lock.
+ */
+static inline void capa_delete_lru(struct list_head *head)
+{
+	struct obd_capa *ocapa;
+	struct list_head *node = head->next;
+	int count = 0;
+
+	/* free LRU_CAPA_DELETE_COUNT unused capa from head */
+	while (node != head && count++ < LRU_CAPA_DELETE_COUNT) {
+		ocapa = list_entry(node, struct obd_capa, c_list);
+		/* advance first: capa_delete() unlinks ocapa from the list */
+		node = node->next;
+		if (atomic_read(&ocapa->c_refc))
+			continue;
+
+		DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free lru");
+		capa_delete(ocapa);
+	}
+}
+
+/*
+ * Add a capa to the server-side hash, or return the one already present.
+ *
+ * A new obd_capa is always allocated first. If an identical capa (full
+ * lustre_capa compare) is already hashed, a reference is taken on it, the
+ * fresh allocation is released again and the existing capa is returned.
+ * Otherwise the fresh capa is filled in, hashed, appended to the server list
+ * and given an extra reference; if that pushes the count above
+ * CAPA_HASH_SIZE the head of the list is trimmed.
+ *
+ * Returns the capa (carrying a reference taken with capa_get()), or NULL
+ * when the allocation fails.
+ */
+struct obd_capa *capa_add(struct hlist_head *hash, struct lustre_capa *capa)
+{
+	struct hlist_head *bucket = hash + capa_hashfn(&capa->lc_fid);
+	struct list_head *lru = &capa_list[CAPA_SITE_SERVER];
+	struct obd_capa *fresh, *found;
+
+	fresh = alloc_capa(CAPA_SITE_SERVER);
+	if (IS_ERR(fresh))
+		return NULL;
+
+	spin_lock(&capa_lock);
+	found = find_capa(capa, bucket, 0);
+	if (found != NULL) {
+		/* already known: hand out the existing one instead */
+		capa_get(found);
+		spin_unlock(&capa_lock);
+		capa_put(fresh);
+		return found;
+	}
+
+	fresh->c_capa = *capa;
+	set_capa_expiry(fresh);
+	hlist_add_head(&fresh->u.tgt.c_hash, bucket);
+	list_add_tail(&fresh->c_list, lru);
+	capa_get(fresh);
+	capa_count[CAPA_SITE_SERVER]++;
+	if (capa_count[CAPA_SITE_SERVER] > CAPA_HASH_SIZE)
+		capa_delete_lru(lru);
+	spin_unlock(&capa_lock);
+
+	return fresh;
+}
+EXPORT_SYMBOL(capa_add);
+
+/*
+ * Look up a capa in the server-side hash.
+ *
+ * \param alive  passed through to find_capa(): non-zero restricts the
+ *               compare to the bytes before lc_keyid and skips capas that
+ *               are about to expire
+ *
+ * A capa that is found is moved to the tail of the server list and a
+ * reference is taken on it before capa_lock is dropped.
+ *
+ * Returns the capa or NULL.
+ */
+struct obd_capa *capa_lookup(struct hlist_head *hash, struct lustre_capa *capa,
+			     int alive)
+{
+	struct hlist_head *bucket = hash + capa_hashfn(&capa->lc_fid);
+	struct obd_capa *hit;
+
+	spin_lock(&capa_lock);
+	hit = find_capa(capa, bucket, alive);
+	if (hit != NULL) {
+		list_move_tail(&hit->c_list, &capa_list[CAPA_SITE_SERVER]);
+		capa_get(hit);
+	}
+	spin_unlock(&capa_lock);
+
+	return hit;
+}
+EXPORT_SYMBOL(capa_lookup);
+
+/*
+ * Compute the HMAC of a capability.
+ *
+ * \param hmac  output buffer for the digest
+ * \param capa  capability; the bytes preceding its lc_hmac member are hashed
+ * \param key   HMAC key; its length is taken from the algorithm table
+ *
+ * Only CAPA_HMAC_ALG_SHA1 is accepted.
+ *
+ * Returns 0 on success, -EFAULT for an unknown algorithm, or a negative
+ * errno when the hash transform cannot be allocated.
+ * NOTE(review): the result of ll_crypto_hmac() is not checked, as before.
+ */
+int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key)
+{
+	struct ll_crypto_hash *tfm;
+	struct capa_hmac_alg *alg;
+	int keylen;
+	struct scatterlist sl;
+
+	if (capa_alg(capa) != CAPA_HMAC_ALG_SHA1) {
+		CERROR("unknown capability hmac algorithm!\n");
+		return -EFAULT;
+	}
+
+	alg = &capa_hmac_algs[capa_alg(capa)];
+
+	tfm = ll_crypto_alloc_hash(alg->ha_name, 0, 0);
+	/* The kernel crypto allocators report failure with ERR_PTR(), which
+	 * a plain NULL test misses; accept either convention here. */
+	if (tfm == NULL || IS_ERR(tfm)) {
+		CERROR("crypto_alloc_tfm failed, check whether your kernel "
+		       "has crypto support!\n");
+		return tfm == NULL ? -ENOMEM : (int)PTR_ERR(tfm);
+	}
+	keylen = alg->ha_keylen;
+
+	/* An on-stack scatterlist holds garbage; it must be initialised
+	 * (zeroed and end-marked) before sg_set_page() may be used on it. */
+	sg_init_table(&sl, 1);
+	sg_set_page(&sl, virt_to_page(capa),
+		    offsetof(struct lustre_capa, lc_hmac),
+		    (unsigned long)(capa) % PAGE_CACHE_SIZE);
+
+	ll_crypto_hmac(tfm, key, &keylen, &sl, sl.length, hmac);
+	ll_crypto_free_hash(tfm);
+
+	return 0;
+}
+EXPORT_SYMBOL(capa_hmac);
+
+/*
+ * Encrypt one 16-byte block with AES.
+ *
+ * \param d       destination, 16 bytes
+ * \param s       source, 16 bytes
+ * \param key     key material; only the cipher's minimum key size is used
+ * \param keylen  number of valid bytes in \a key
+ *
+ * Returns 0 on success, -EINVAL when \a keylen is negative or shorter than
+ * the cipher's minimum key size, or the error reported by the crypto layer.
+ */
+int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
+{
+	struct ll_crypto_cipher *tfm;
+	struct scatterlist sd;
+	struct scatterlist ss;
+	struct blkcipher_desc desc;
+	unsigned int min;
+	int rc;
+	char alg[CRYPTO_MAX_ALG_NAME+1] = "aes";
+	ENTRY;
+
+	/* passing "aes" in a variable instead of a constant string keeps gcc
+	 * 4.3.2 happy */
+	tfm = ll_crypto_alloc_blkcipher(alg, 0, 0);
+	if (IS_ERR(tfm)) {
+		CERROR("failed to load transform for aes\n");
+		RETURN(PTR_ERR(tfm));
+	}
+
+	min = ll_crypto_tfm_alg_min_keysize(tfm);
+	/* Test the sign explicitly: comparing a negative int with an
+	 * unsigned value converts it to a huge number that passes. */
+	if (keylen < 0 || (unsigned int)keylen < min) {
+		CERROR("keylen at least %u bits for aes\n", min * 8);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	rc = ll_crypto_blkcipher_setkey(tfm, key, min);
+	if (rc) {
+		CERROR("failed to setting key for aes\n");
+		GOTO(out, rc);
+	}
+
+	/* On-stack scatterlists hold garbage; initialise them before
+	 * sg_set_page() is used on them. */
+	sg_init_table(&sd, 1);
+	sg_set_page(&sd, virt_to_page(d), 16,
+		    (unsigned long)(d) % PAGE_CACHE_SIZE);
+
+	sg_init_table(&ss, 1);
+	sg_set_page(&ss, virt_to_page(s), 16,
+		    (unsigned long)(s) % PAGE_CACHE_SIZE);
+
+	desc.tfm = tfm;
+	desc.info = NULL;
+	desc.flags = 0;
+	rc = ll_crypto_blkcipher_encrypt(&desc, &sd, &ss, 16);
+	if (rc) {
+		CERROR("failed to encrypt for aes\n");
+		GOTO(out, rc);
+	}
+
+	EXIT;
+
+out:
+	ll_crypto_free_blkcipher(tfm);
+	return rc;
+}
+EXPORT_SYMBOL(capa_encrypt_id);
+
+/*
+ * Decrypt one 16-byte block with AES.
+ *
+ * \param d       destination, 16 bytes
+ * \param s       source, 16 bytes
+ * \param key     key material; only the cipher's minimum key size is used
+ * \param keylen  number of valid bytes in \a key
+ *
+ * Returns 0 on success, -EINVAL when \a keylen is negative or shorter than
+ * the cipher's minimum key size, or the error reported by the crypto layer.
+ */
+int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen)
+{
+	struct ll_crypto_cipher *tfm;
+	struct scatterlist sd;
+	struct scatterlist ss;
+	struct blkcipher_desc desc;
+	unsigned int min;
+	int rc;
+	char alg[CRYPTO_MAX_ALG_NAME+1] = "aes";
+	ENTRY;
+
+	/* passing "aes" in a variable instead of a constant string keeps gcc
+	 * 4.3.2 happy */
+	tfm = ll_crypto_alloc_blkcipher(alg, 0, 0);
+	if (IS_ERR(tfm)) {
+		CERROR("failed to load transform for aes\n");
+		RETURN(PTR_ERR(tfm));
+	}
+
+	min = ll_crypto_tfm_alg_min_keysize(tfm);
+	/* Test the sign explicitly: comparing a negative int with an
+	 * unsigned value converts it to a huge number that passes. */
+	if (keylen < 0 || (unsigned int)keylen < min) {
+		CERROR("keylen at least %u bits for aes\n", min * 8);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	rc = ll_crypto_blkcipher_setkey(tfm, key, min);
+	if (rc) {
+		CERROR("failed to setting key for aes\n");
+		GOTO(out, rc);
+	}
+
+	/* On-stack scatterlists hold garbage; initialise them before
+	 * sg_set_page() is used on them. */
+	sg_init_table(&sd, 1);
+	sg_set_page(&sd, virt_to_page(d), 16,
+		    (unsigned long)(d) % PAGE_CACHE_SIZE);
+
+	sg_init_table(&ss, 1);
+	sg_set_page(&ss, virt_to_page(s), 16,
+		    (unsigned long)(s) % PAGE_CACHE_SIZE);
+
+	desc.tfm = tfm;
+	desc.info = NULL;
+	desc.flags = 0;
+	rc = ll_crypto_blkcipher_decrypt(&desc, &sd, &ss, 16);
+	if (rc) {
+		CERROR("failed to decrypt for aes\n");
+		GOTO(out, rc);
+	}
+
+	EXIT;
+
+out:
+	ll_crypto_free_blkcipher(tfm);
+	return rc;
+}
+EXPORT_SYMBOL(capa_decrypt_id);
+
+/*
+ * Copy the lustre_capa embedded in \a ocapa into \a capa while holding
+ * ocapa->c_lock.
+ *
+ * \param capa   destination; written as a struct lustre_capa, so it must
+ *               point at storage at least that large
+ * \param ocapa  source capa
+ */
+void capa_cpy(void *capa, struct obd_capa *ocapa)
+{
+ spin_lock(&ocapa->c_lock);
+ *(struct lustre_capa *)capa = ocapa->c_capa;
+ spin_unlock(&ocapa->c_lock);
+}
+EXPORT_SYMBOL(capa_cpy);
+
+/*
+ * Log a debug message about a capability.
+ *
+ * \param c        capability whose fields are dumped
+ * \param msgdata  debug message descriptor handed to libcfs_debug_vmsg2()
+ * \param fmt      printf-style format for the caller's text, followed by
+ *                 its arguments
+ *
+ * The caller's format/arguments and a second format listing the capa's
+ * address, fid, opc, uid, gid, flags, alg, keyid, timeout and expiry are
+ * both passed to libcfs_debug_vmsg2().
+ * NOTE(review): presumably that helper emits them as one message, caller
+ * text first - confirm against libcfs.
+ */
+void _debug_capa(struct lustre_capa *c,
+ struct libcfs_debug_msg_data *msgdata,
+ const char *fmt, ... )
+{
+ va_list args;
+ va_start(args, fmt);
+ libcfs_debug_vmsg2(msgdata, fmt, args,
+ " capability@%p fid "DFID" opc "LPX64" uid "LPU64
+ " gid "LPU64" flags %u alg %d keyid %u timeout %u "
+ "expiry %u\n", c, PFID(capa_fid(c)), capa_opc(c),
+ capa_uid(c), capa_gid(c), capa_flags(c),
+ capa_alg(c), capa_keyid(c), capa_timeout(c),
+ capa_expiry(c));
+ va_end(args);
+}
+EXPORT_SYMBOL(_debug_capa);
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_internal.h b/drivers/staging/lustre/lustre/obdclass/cl_internal.h
new file mode 100644
index 000000000000..7eb0ad7b3644
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_internal.h
@@ -0,0 +1,121 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Internal cl interfaces.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+#ifndef _CL_INTERNAL_H
+#define _CL_INTERNAL_H
+
+/* number of cl_page pointers in the cl_thread_info::clt_pvec array */
+#define CLT_PVEC_SIZE (14)
+
+/**
+ * Possible levels of the nesting. Currently this is 2: there are "top"
+ * entities (files, extent locks), and "sub" entities (stripes and stripe
+ * locks). This is used only for debugging counters right now.
+ */
+enum clt_nesting_level {
+ /** "top" entities: files, extent locks. */
+ CNL_TOP,
+ /** "sub" entities: stripes and stripe locks. */
+ CNL_SUB,
+ /** Number of levels; sizes cl_thread_info::clt_counters. */
+ CNL_NR
+};
+
+/**
+ * Counters used to check correctness of cl_lock interface usage.
+ *
+ * struct cl_thread_info keeps one instance per nesting level
+ * (see enum clt_nesting_level).
+ */
+struct cl_thread_counters {
+ /**
+ * Number of outstanding calls to cl_lock_mutex_get() made by the
+ * current thread. For debugging.
+ */
+ int ctc_nr_locks_locked;
+ /** List of locked locks. */
+ struct lu_ref ctc_locks_locked;
+ /** Number of outstanding holds on locks. */
+ int ctc_nr_held;
+ /** Number of outstanding uses on locks. */
+ int ctc_nr_used;
+ /** Number of held extent locks. */
+ int ctc_nr_locks_acquired;
+};
+
+/**
+ * Thread local state internal for generic cl-code.
+ *
+ * Obtained for an environment through cl_env_info().
+ */
+struct cl_thread_info {
+ /*
+ * Common fields.
+ */
+ struct cl_io clt_io;
+ struct cl_2queue clt_queue;
+
+ /*
+ * Fields used by cl_lock.c
+ */
+ struct cl_lock_descr clt_descr;
+ struct cl_page_list clt_list;
+ /**
+ * \name debugging
+ *
+ * @{
+ */
+ /**
+ * Counters for every level of lock nesting.
+ */
+ struct cl_thread_counters clt_counters[CNL_NR];
+ /** @} debugging */
+
+ /*
+ * Fields used by cl_page.c
+ */
+ struct cl_page *clt_pvec[CLT_PVEC_SIZE];
+
+ /*
+ * Fields used by cl_io.c
+ */
+ /**
+ * Pointer to the topmost ongoing IO in this thread.
+ */
+ struct cl_io *clt_current_io;
+ /**
+ * Used for submitting a sync io.
+ */
+ struct cl_sync_io clt_anchor;
+ /**
+ * Fields used by cl_lock_discard_pages().
+ */
+ pgoff_t clt_next_index;
+ pgoff_t clt_fn_index; /* first non-overlapped index */
+};
+
+/**
+ * Returns the cl_thread_info associated with \a env.
+ * NOTE(review): defined outside this header; presumably never NULL for a
+ * properly set up environment - confirm at the definition.
+ */
+struct cl_thread_info *cl_env_info(const struct lu_env *env);
+
+#endif /* _CL_INTERNAL_H */
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_io.c b/drivers/staging/lustre/lustre/obdclass/cl_io.c
new file mode 100644
index 000000000000..75c9be8875e0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_io.c
@@ -0,0 +1,1753 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client IO.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/*****************************************************************************
+ *
+ * cl_io interface.
+ *
+ */
+
+/*
+ * Iterate over the slices (layers) of \a io. cl_io_slice_add() appends new
+ * slices to the tail of cl_io::ci_layers, so cl_io_for_each() walks the
+ * stack top-to-bottom and cl_io_for_each_reverse() bottom-to-top.
+ *
+ * Both macro arguments are parenthesized so that any pointer-valued
+ * expression can be passed as \a io.
+ */
+#define cl_io_for_each(slice, io) \
+	list_for_each_entry((slice), &(io)->ci_layers, cis_linkage)
+#define cl_io_for_each_reverse(slice, io) \
+	list_for_each_entry_reverse((slice), &(io)->ci_layers, cis_linkage)
+
+/* True iff \a type lies in the half-open range [CIT_READ, CIT_OP_NR). */
+static inline int cl_io_type_is_valid(enum cl_io_type type)
+{
+	if (type < CIT_READ)
+		return 0;
+	return type < CIT_OP_NR;
+}
+
+/* True iff \a io has a valid type other than CIT_MISC. */
+static inline int cl_io_is_loopable(const struct cl_io *io)
+{
+	if (!cl_io_type_is_valid(io->ci_type))
+		return 0;
+	return io->ci_type != CIT_MISC;
+}
+
+/**
+ * Tells whether the environment \a env has an IO in progress, that is,
+ * whether its thread info currently records a current IO.
+ */
+int cl_io_is_going(const struct lu_env *env)
+{
+	const struct cl_thread_info *info = cl_env_info(env);
+
+	return info->clt_current_io != NULL;
+}
+EXPORT_SYMBOL(cl_io_is_going);
+
+/**
+ * Invariant of cl_io that must hold whenever an exported cl_io_*() function
+ * is entered or left.
+ *
+ * An io may own pages only while it is ongoing. A sub-io (one that has a
+ * parent) may still be in CIS_LOCKED state while its top-io is already in
+ * CIS_IO_GOING.
+ */
+static int cl_io_invariant(const struct cl_io *io)
+{
+	const int going = io->ci_state == CIS_IO_GOING;
+	const int sub_locked = io->ci_state == CIS_LOCKED &&
+			       io->ci_parent != NULL;
+
+	return ergo(io->ci_owned_nr > 0, going || sub_locked);
+}
+
+/**
+ * Finalizes \a io: every slice is unlinked and its
+ * cl_io_operations::cio_fini() method (if any) is invoked, starting from
+ * the tail of cl_io::ci_layers, i.e. bottom-to-top. Afterwards the io is
+ * put into CIS_FINI state and is forgotten as the current io of the thread
+ * if it was one.
+ */
+void cl_io_fini(const struct lu_env *env, struct cl_io *io)
+{
+	struct cl_thread_info *info;
+	struct cl_io_slice *last;
+
+	LINVRNT(cl_io_type_is_valid(io->ci_type));
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	for (;;) {
+		if (list_empty(&io->ci_layers))
+			break;
+		last = list_entry(io->ci_layers.prev, struct cl_io_slice,
+				  cis_linkage);
+		list_del_init(&last->cis_linkage);
+		if (last->cis_iop->op[io->ci_type].cio_fini != NULL)
+			last->cis_iop->op[io->ci_type].cio_fini(env, last);
+		/*
+		 * Poison the back pointer to catch use after free. This
+		 * relies on slices being allocated within the session, so
+		 * that they can still be touched once ->cio_fini() returned.
+		 */
+		last->cis_io = NULL;
+	}
+	io->ci_state = CIS_FINI;
+
+	info = cl_env_info(env);
+	if (info->clt_current_io == io)
+		info->clt_current_io = NULL;
+
+	/* sanity check for layout change */
+	switch (io->ci_type) {
+	case CIT_SETATTR:
+	case CIT_MISC:
+		/* Check ignore layout change conf */
+		LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout,
+			     !io->ci_need_restart));
+		break;
+	case CIT_FAULT:
+	case CIT_FSYNC:
+		LASSERT(!io->ci_need_restart);
+		break;
+	case CIT_READ:
+	case CIT_WRITE:
+		/* nothing to check for read and write */
+		break;
+	default:
+		LBUG();
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_fini);
+
+/*
+ * Common part of cl_io_init() and cl_io_sub_init(): records the io type,
+ * resets the lock set and layer lists, and calls ->coo_io_init() of every
+ * layer of \a obj that provides one, stopping at the first failure. On
+ * success the io enters CIS_INIT state.
+ */
+static int cl_io_init0(const struct lu_env *env, struct cl_io *io,
+		       enum cl_io_type iot, struct cl_object *obj)
+{
+	struct cl_object *layer;
+	int rc = 0;
+
+	LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI);
+	LINVRNT(cl_io_type_is_valid(iot));
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	io->ci_type = iot;
+	INIT_LIST_HEAD(&io->ci_lockset.cls_todo);
+	INIT_LIST_HEAD(&io->ci_lockset.cls_curr);
+	INIT_LIST_HEAD(&io->ci_lockset.cls_done);
+	INIT_LIST_HEAD(&io->ci_layers);
+
+	cl_object_for_each(layer, obj) {
+		if (layer->co_ops->coo_io_init == NULL)
+			continue;
+		rc = layer->co_ops->coo_io_init(env, layer, io);
+		if (rc != 0)
+			break;
+	}
+	if (rc == 0)
+		io->ci_state = CIS_INIT;
+	RETURN(rc);
+}
+
+/**
+ * Initializes a sub-io by calling cl_io_operations::cio_init()
+ * top-to-bottom. If the thread has no current io yet, \a io becomes the
+ * current one.
+ *
+ * \pre obj != cl_object_top(obj)
+ */
+int cl_io_sub_init(const struct lu_env *env, struct cl_io *io,
+		   enum cl_io_type iot, struct cl_object *obj)
+{
+	struct cl_thread_info *tinfo;
+
+	tinfo = cl_env_info(env);
+	LASSERT(obj != cl_object_top(obj));
+	if (!tinfo->clt_current_io)
+		tinfo->clt_current_io = io;
+	return cl_io_init0(env, io, iot, obj);
+}
+EXPORT_SYMBOL(cl_io_sub_init);
+
+/**
+ * Initializes the top-level \a io by calling cl_io_operations::cio_init()
+ * top-to-bottom, and records it as the current io of the thread.
+ *
+ * Whatever this function returns, the caller must follow it with a call to
+ * cl_io_fini().
+ *
+ * \pre obj == cl_object_top(obj)
+ * \pre cl_io_type_is_valid(iot)
+ * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot
+ */
+int cl_io_init(const struct lu_env *env, struct cl_io *io,
+	       enum cl_io_type iot, struct cl_object *obj)
+{
+	struct cl_thread_info *tinfo;
+
+	tinfo = cl_env_info(env);
+	LASSERT(obj == cl_object_top(obj));
+	LASSERT(tinfo->clt_current_io == NULL);
+
+	tinfo->clt_current_io = io;
+	return cl_io_init0(env, io, iot, obj);
+}
+EXPORT_SYMBOL(cl_io_init);
+
+/**
+ * Initializes a read or write io covering \a count bytes starting at
+ * \a pos of io->ci_obj.
+ *
+ * \pre iot == CIT_READ || iot == CIT_WRITE
+ */
+int cl_io_rw_init(const struct lu_env *env, struct cl_io *io,
+		  enum cl_io_type iot, loff_t pos, size_t count)
+{
+	int rc;
+
+	LINVRNT(iot == CIT_READ || iot == CIT_WRITE);
+	LINVRNT(io->ci_obj != NULL);
+	ENTRY;
+
+	LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu,
+			 "io range: %u ["LPU64", "LPU64") %u %u\n",
+			 iot, (__u64)pos, (__u64)pos + count,
+			 io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append);
+	io->u.ci_rw.crw_pos = pos;
+	io->u.ci_rw.crw_count = count;
+	rc = cl_io_init(env, io, iot, io->ci_obj);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_rw_init);
+
+/* Returns the fid of the object that lock descriptor \a descr refers to. */
+static inline const struct lu_fid *
+cl_lock_descr_fid(const struct cl_lock_descr *descr)
+{
+	const struct cl_object *obj = descr->cld_obj;
+
+	return lu_object_fid(&obj->co_lu);
+}
+
+/*
+ * Ordering used by cl_io_locks_sort(): descriptors are compared by fid
+ * first and, for equal fids, by start offset.
+ */
+static int cl_lock_descr_sort(const struct cl_lock_descr *d0,
+			      const struct cl_lock_descr *d1)
+{
+	int rc;
+
+	rc = lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1));
+	if (rc != 0)
+		return rc;
+	return __diff_normalize(d0->cld_start, d1->cld_start);
+}
+
+/*
+ * Compares the extents of two lock descriptors.
+ *
+ * Descriptors on objects with different fids compare as lu_fid_cmp() says.
+ * For the same fid, \a d0 is "less" when it ends before \a d1 starts,
+ * "greater" when it starts after \a d1 ends, and equal when the
+ * [cld_start, cld_end] extents overlap.
+ *
+ * \retval 0    same fid and overlapping extents
+ * \retval -1   same fid, \a d0 lies entirely before \a d1
+ * \retval +1   same fid, \a d0 lies entirely after \a d1
+ * \retval other non-zero result of lu_fid_cmp() for different fids
+ */
+static int cl_lock_descr_cmp(const struct cl_lock_descr *d0,
+			     const struct cl_lock_descr *d1)
+{
+	int ret;
+
+	ret = lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1));
+	if (ret)
+		return ret;
+	if (d0->cld_end < d1->cld_start)
+		return -1;
+	/*
+	 * Compare against the end of the *other* extent. Testing d0 against
+	 * its own end could never be true for a well-formed extent, which
+	 * made disjoint extents with d0 after d1 look overlapping.
+	 */
+	if (d0->cld_start > d1->cld_end)
+		return 1;
+	return 0;
+}
+
+/*
+ * Widens \a d0 so that its extent also covers the extent of \a d1. When
+ * \a d1 is a CLM_WRITE or a CLM_GROUP lock, \a d0 takes over that mode.
+ */
+static void cl_lock_descr_merge(struct cl_lock_descr *d0,
+				const struct cl_lock_descr *d1)
+{
+	if (d1->cld_start < d0->cld_start)
+		d0->cld_start = d1->cld_start;
+	if (d1->cld_end > d0->cld_end)
+		d0->cld_end = d1->cld_end;
+
+	switch (d1->cld_mode) {
+	case CLM_WRITE:
+	case CLM_GROUP:
+		d0->cld_mode = d1->cld_mode;
+		break;
+	default:
+		break;
+	}
+}
+
+/*
+ * Sort locks in lexicographical order of their (fid, start-offset) pairs.
+ *
+ * Operates in place on io->ci_lockset.cls_todo, using cl_lock_descr_sort()
+ * as the comparison. The list is walked repeatedly until a complete pass
+ * makes no move.
+ */
+static void cl_io_locks_sort(struct cl_io *io)
+{
+ int done = 0;
+
+ ENTRY;
+ /* hidden treasure: bubble sort for now. */
+ do {
+ struct cl_io_lock_link *curr;
+ struct cl_io_lock_link *prev;
+ struct cl_io_lock_link *temp;
+
+ /* assume the list is sorted until a swap proves otherwise */
+ done = 1;
+ prev = NULL;
+
+ list_for_each_entry_safe(curr, temp,
+ &io->ci_lockset.cls_todo,
+ cill_linkage) {
+ if (prev != NULL) {
+ switch (cl_lock_descr_sort(&prev->cill_descr,
+ &curr->cill_descr)) {
+ case 0:
+ /*
+ * IMPOSSIBLE: Identical locks are
+ * already removed at
+ * this point.
+ */
+ default:
+ /*
+ * NOTE(review): relies on LBUG() not
+ * returning; otherwise control would
+ * fall through into the +1 case.
+ */
+ LBUG();
+ case +1:
+ /*
+ * Out of order: re-link curr just in
+ * front of prev and request another
+ * pass.
+ */
+ list_move_tail(&curr->cill_linkage,
+ &prev->cill_linkage);
+ done = 0;
+ continue; /* don't change prev: it's
+ * still "previous" */
+ case -1: /* already in order */
+ break;
+ }
+ }
+ prev = curr;
+ }
+ } while (!done);
+ EXIT;
+}
+
+/**
+ * Looks through \a queue for a lock link whose descriptor matches \a need,
+ * as decided by cl_lock_descr_match().
+ *
+ * \retval +ve there is a matching lock in the \a queue
+ * \retval 0 there are no matching locks in the \a queue
+ */
+int cl_queue_match(const struct list_head *queue,
+		   const struct cl_lock_descr *need)
+{
+	struct cl_io_lock_link *link;
+
+	ENTRY;
+	list_for_each_entry(link, queue, cill_linkage) {
+		if (!cl_lock_descr_match(&link->cill_descr, need))
+			continue;
+		RETURN(+1);
+	}
+	RETURN(0);
+}
+EXPORT_SYMBOL(cl_queue_match);
+
+/*
+ * Finds the first lock link in \a queue for which cl_lock_descr_cmp()
+ * against \a need returns 0 and merges \a need into its descriptor.
+ *
+ * \retval +1 \a need was merged into an existing link
+ * \retval 0 no link compared equal; nothing was changed
+ */
+static int cl_queue_merge(const struct list_head *queue,
+			  const struct cl_lock_descr *need)
+{
+	struct cl_io_lock_link *link;
+
+	ENTRY;
+	list_for_each_entry(link, queue, cill_linkage) {
+		if (cl_lock_descr_cmp(&link->cill_descr, need) == 0) {
+			cl_lock_descr_merge(&link->cill_descr, need);
+			CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n",
+			       link->cill_descr.cld_mode,
+			       link->cill_descr.cld_start,
+			       link->cill_descr.cld_end);
+			RETURN(+1);
+		}
+	}
+	RETURN(0);
+}
+
+/*
+ * True iff \a need is matched by a lock on the cls_curr or the cls_done
+ * queue of \a set.
+ */
+static int cl_lockset_match(const struct cl_lockset *set,
+			    const struct cl_lock_descr *need)
+{
+	if (cl_queue_match(&set->cls_curr, need))
+		return 1;
+	return cl_queue_match(&set->cls_done, need) != 0;
+}
+
+/*
+ * True iff \a need was merged into a lock on the cls_todo queue of \a set
+ * or, failing that, is matched by a lock on cls_curr or cls_done.
+ */
+static int cl_lockset_merge(const struct cl_lockset *set,
+			    const struct cl_lock_descr *need)
+{
+	if (cl_queue_merge(&set->cls_todo, need))
+		return 1;
+	return cl_lockset_match(set, need) != 0;
+}
+
+/*
+ * Requests the lock described by \a link. On success the link is moved to
+ * set->cls_curr; unless the lock was requested with CEF_ASYNC, it is then
+ * waited for and, once granted, the link is moved on to set->cls_done.
+ *
+ * Returns 0 on success, the error encoded in the pointer returned by
+ * cl_lock_request(), or the result of cl_wait().
+ */
+static int cl_lockset_lock_one(const struct lu_env *env,
+			       struct cl_io *io, struct cl_lockset *set,
+			       struct cl_io_lock_link *link)
+{
+	struct cl_lock *lock;
+	int result;
+
+	ENTRY;
+
+	lock = cl_lock_request(env, io, &link->cill_descr, "io", io);
+	if (IS_ERR(lock)) {
+		result = PTR_ERR(lock);
+		RETURN(result);
+	}
+
+	link->cill_lock = lock;
+	list_move(&link->cill_linkage, &set->cls_curr);
+	if (link->cill_descr.cld_enq_flags & CEF_ASYNC) {
+		result = 0;
+		RETURN(result);
+	}
+
+	result = cl_wait(env, lock);
+	if (result == 0)
+		list_move(&link->cill_linkage, &set->cls_done);
+	RETURN(result);
+}
+
+/*
+ * Takes \a link off whatever queue it is on, releases the lock attached to
+ * it (if any) and finally calls the ->cill_fini() destructor of the link,
+ * when one is set.
+ */
+static void cl_lock_link_fini(const struct lu_env *env, struct cl_io *io,
+			      struct cl_io_lock_link *link)
+{
+	struct cl_lock *held;
+
+	held = link->cill_lock;
+	ENTRY;
+	list_del_init(&link->cill_linkage);
+	if (held) {
+		cl_lock_release(env, held, "io", io);
+		link->cill_lock = NULL;
+	}
+	if (link->cill_fini)
+		link->cill_fini(env, link);
+	EXIT;
+}
+
+/*
+ * Acquires the locks queued on set->cls_todo. Links already covered by a
+ * lock on cls_curr/cls_done are finalized instead of being requested. Once
+ * every request was issued successfully, the locks left on cls_curr are
+ * waited for and moved to cls_done. Stops at, and returns, the first
+ * error.
+ */
+static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io,
+			   struct cl_lockset *set)
+{
+	struct cl_io_lock_link *link;
+	struct cl_io_lock_link *next;
+	int rc = 0;
+
+	ENTRY;
+	list_for_each_entry_safe(link, next, &set->cls_todo, cill_linkage) {
+		if (cl_lockset_match(set, &link->cill_descr)) {
+			cl_lock_link_fini(env, io, link);
+			continue;
+		}
+		/* XXX some locking to guarantee that locks aren't
+		 * expanded in between. */
+		rc = cl_lockset_lock_one(env, io, set, link);
+		if (rc != 0)
+			break;
+	}
+	if (rc != 0)
+		RETURN(rc);
+
+	list_for_each_entry_safe(link, next, &set->cls_curr, cill_linkage) {
+		rc = cl_wait(env, link->cill_lock);
+		if (rc != 0)
+			break;
+		list_move(&link->cill_linkage, &set->cls_done);
+	}
+	RETURN(rc);
+}
+
+/**
+ * Takes the locks necessary for the current iteration of io.
+ *
+ * cl_io_operations::cio_lock() is called top-to-bottom so that the layers
+ * can queue the locks they need for this iteration. The collected locks
+ * are then sorted (to avoid dead-locks) and acquired. On failure
+ * everything is undone through cl_io_unlock(); on success the io enters
+ * CIS_LOCKED state.
+ */
+int cl_io_lock(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *slice;
+	int rc = 0;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_IT_STARTED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+	cl_io_for_each(slice, io) {
+		if (slice->cis_iop->op[io->ci_type].cio_lock != NULL) {
+			rc = slice->cis_iop->op[io->ci_type].cio_lock(env,
+								      slice);
+			if (rc != 0)
+				break;
+		}
+	}
+	if (rc == 0) {
+		cl_io_locks_sort(io);
+		rc = cl_lockset_lock(env, io, &io->ci_lockset);
+	}
+	if (rc == 0)
+		io->ci_state = CIS_LOCKED;
+	else
+		cl_io_unlock(env, io);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_lock);
+
+/**
+ * Releases the locks taken by io.
+ *
+ * Links on the todo and curr queues are simply finalized; locks on the
+ * done queue are un-used first. Then cl_io_operations::cio_unlock() is
+ * called bottom-to-top and the io enters CIS_UNLOCKED state.
+ */
+void cl_io_unlock(const struct lu_env *env, struct cl_io *io)
+{
+	struct cl_lockset *set = &io->ci_lockset;
+	struct cl_io_lock_link *link;
+	struct cl_io_lock_link *next;
+	const struct cl_io_slice *slice;
+
+	LASSERT(cl_io_is_loopable(io));
+	LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+
+	list_for_each_entry_safe(link, next, &set->cls_todo, cill_linkage) {
+		cl_lock_link_fini(env, io, link);
+	}
+
+	list_for_each_entry_safe(link, next, &set->cls_curr, cill_linkage) {
+		cl_lock_link_fini(env, io, link);
+	}
+
+	list_for_each_entry_safe(link, next, &set->cls_done, cill_linkage) {
+		cl_unuse(env, link->cill_lock);
+		cl_lock_link_fini(env, io, link);
+	}
+
+	cl_io_for_each_reverse(slice, io) {
+		if (slice->cis_iop->op[io->ci_type].cio_unlock == NULL)
+			continue;
+		slice->cis_iop->op[io->ci_type].cio_unlock(env, slice);
+	}
+	io->ci_state = CIS_UNLOCKED;
+	LASSERT(!cl_env_info(env)->clt_counters[CNL_TOP].ctc_nr_locks_acquired);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_unlock);
+
+/**
+ * Prepares the next iteration of io.
+ *
+ * cl_io_operations::cio_iter_init() is called top-to-bottom, which gives
+ * layers a chance to modify io parameters, e.g., so that lov can restrict
+ * io to a single stripe. The first non-zero result stops the walk and is
+ * returned; on success the io enters CIS_IT_STARTED state.
+ */
+int cl_io_iter_init(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *slice;
+	int rc = 0;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+	cl_io_for_each(slice, io) {
+		if (slice->cis_iop->op[io->ci_type].cio_iter_init != NULL) {
+			rc = slice->cis_iop->op[io->ci_type].cio_iter_init(env,
+									   slice);
+			if (rc != 0)
+				break;
+		}
+	}
+	if (rc == 0)
+		io->ci_state = CIS_IT_STARTED;
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_iter_init);
+
+/**
+ * Finalizes an io iteration.
+ *
+ * cl_io_operations::cio_iter_fini() is called bottom-to-top, after which
+ * the io enters CIS_IT_ENDED state.
+ */
+void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *slice;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_UNLOCKED);
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+	cl_io_for_each_reverse(slice, io) {
+		if (slice->cis_iop->op[io->ci_type].cio_iter_fini == NULL)
+			continue;
+		slice->cis_iop->op[io->ci_type].cio_iter_fini(env, slice);
+	}
+	io->ci_state = CIS_IT_ENDED;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_iter_fini);
+
+/**
+ * Records that a read or write io moved \a nob bytes forward: the position
+ * is advanced, the remaining count is reduced, and every layer providing
+ * cl_io_operations::cio_advance() is notified, bottom-to-top.
+ */
+void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob)
+{
+	const struct cl_io_slice *slice;
+
+	LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE ||
+		nob == 0);
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(cl_io_invariant(io));
+
+	ENTRY;
+
+	io->u.ci_rw.crw_pos += nob;
+	io->u.ci_rw.crw_count -= nob;
+
+	/* layers have to be notified. */
+	cl_io_for_each_reverse(slice, io) {
+		if (slice->cis_iop->op[io->ci_type].cio_advance == NULL)
+			continue;
+		slice->cis_iop->op[io->ci_type].cio_advance(env, slice, nob);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_rw_advance);
+
+/**
+ * Adds a lock to the lockset of \a io.
+ *
+ * \retval +1 the descriptor of \a link was merged into, or is matched by,
+ *            a lock already in the lockset; \a link was not queued
+ * \retval 0  \a link was put on the todo queue of the lockset
+ */
+int cl_io_lock_add(const struct lu_env *env, struct cl_io *io,
+		   struct cl_io_lock_link *link)
+{
+	ENTRY;
+	if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr))
+		RETURN(+1);
+
+	list_add(&link->cill_linkage, &io->ci_lockset.cls_todo);
+	RETURN(0);
+}
+EXPORT_SYMBOL(cl_io_lock_add);
+
+/*
+ * ->cill_fini() destructor installed by cl_io_lock_alloc_add(): frees the
+ * link that was allocated there. \a env is unused.
+ */
+static void cl_free_io_lock_link(const struct lu_env *env,
+				 struct cl_io_lock_link *link)
+{
+	OBD_FREE_PTR(link);
+}
+
+/**
+ * Allocates a new lock link carrying a copy of \a descr and adds it to the
+ * lockset of \a io with cl_io_lock_add(). The link is freed again right
+ * away when cl_io_lock_add() reports that the lock was not queued.
+ *
+ * Returns the result of cl_io_lock_add(), or -ENOMEM when the link cannot
+ * be allocated.
+ */
+int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io,
+			 struct cl_lock_descr *descr)
+{
+	struct cl_io_lock_link *link;
+	int result;
+
+	ENTRY;
+	OBD_ALLOC_PTR(link);
+	if (link == NULL) {
+		result = -ENOMEM;
+		RETURN(result);
+	}
+
+	link->cill_descr = *descr;
+	link->cill_fini = cl_free_io_lock_link;
+	result = cl_io_lock_add(env, io, link);
+	if (result != 0) /* lock match */
+		link->cill_fini(env, link);
+
+	RETURN(result);
+}
+EXPORT_SYMBOL(cl_io_lock_alloc_add);
+
+/**
+ * Starts io by calling cl_io_operations::cio_start() top-to-bottom. The io
+ * is put into CIS_IO_GOING state before the layers are called. The walk
+ * stops at the first non-zero result; a positive result is reported to the
+ * caller as 0.
+ */
+int cl_io_start(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *slice;
+	int rc = 0;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_LOCKED);
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	io->ci_state = CIS_IO_GOING;
+	cl_io_for_each(slice, io) {
+		if (slice->cis_iop->op[io->ci_type].cio_start != NULL) {
+			rc = slice->cis_iop->op[io->ci_type].cio_start(env,
+								       slice);
+			if (rc != 0)
+				break;
+		}
+	}
+	if (rc > 0)
+		rc = 0;
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_start);
+
+/**
+ * Waits until the current io iteration is finished by calling
+ * cl_io_operations::cio_end() bottom-to-top, then puts the io into
+ * CIS_IO_FINISHED state.
+ */
+void cl_io_end(const struct lu_env *env, struct cl_io *io)
+{
+	const struct cl_io_slice *slice;
+
+	LINVRNT(cl_io_is_loopable(io));
+	LINVRNT(io->ci_state == CIS_IO_GOING);
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	cl_io_for_each_reverse(slice, io) {
+		/* TODO: error handling. */
+		if (slice->cis_iop->op[io->ci_type].cio_end == NULL)
+			continue;
+		slice->cis_iop->op[io->ci_type].cio_end(env, slice);
+	}
+	io->ci_state = CIS_IO_FINISHED;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_end);
+
+/*
+ * Returns the slice of \a page that belongs to the same device type as the
+ * object of io slice \a ios.
+ */
+static const struct cl_page_slice *
+cl_io_slice_page(const struct cl_io_slice *ios, struct cl_page *page)
+{
+	const struct cl_page_slice *pslice;
+
+	pslice = cl_page_at(page, ios->cis_obj->co_lu.lo_dev->ld_type);
+	LINVRNT(pslice != NULL);
+	return pslice;
+}
+
+/**
+ * True iff \a page is within \a io range.
+ *
+ * For read and write the byte extent of the page has to overlap
+ * [pos, pos + count) of the io; an append io accepts any page. For fault
+ * the page index has to be the faulting index. Other io types are a bug.
+ */
+static int cl_page_in_io(const struct cl_page *page, const struct cl_io *io)
+{
+	const pgoff_t idx = page->cp_index;
+
+	switch (io->ci_type) {
+	case CIT_READ:
+	case CIT_WRITE: {
+		const struct cl_io_rw_common *crw = &io->u.ci_rw;
+		loff_t first;
+		loff_t past;
+
+		if (cl_io_is_append(io))
+			return 1;
+		/*
+		 * check that [first, past) and [pos, pos + count) extents
+		 * overlap.
+		 */
+		first = cl_offset(page->cp_obj, idx);
+		past = cl_offset(page->cp_obj, idx + 1);
+		return crw->crw_pos < past &&
+		       first < crw->crw_pos + crw->crw_count;
+	}
+	case CIT_FAULT:
+		return io->u.ci_fault.ft_index == idx;
+	default:
+		LBUG();
+	}
+	return 1;
+}
+
+/**
+ * Called by read io, when page has to be read from the server.
+ *
+ * Every layer providing ->cio_read_page() is called top-to-bottom with its
+ * slice of \a page; when all of them succeed, the pages gathered in
+ * io->ci_queue are submitted for reading. Pages left on the incoming list
+ * are disowned in any case.
+ *
+ * \see cl_io_operations::cio_read_page()
+ */
+int cl_io_read_page(const struct lu_env *env, struct cl_io *io,
+		    struct cl_page *page)
+{
+	struct cl_2queue *queue = &io->ci_queue;
+	const struct cl_io_slice *scan;
+	int rc = 0;
+
+	LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT);
+	LINVRNT(cl_page_is_owned(page, io));
+	LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+	LINVRNT(cl_page_in_io(page, io));
+	LINVRNT(cl_io_invariant(io));
+	ENTRY;
+
+	cl_2queue_init(queue);
+	/*
+	 * The ->cio_read_page() methods called below are expected never to
+	 * block waiting for the network (the only subtle point is the
+	 * creation of new pages for read-ahead that might result in cache
+	 * shrinking, but currently only clean pages are shrunk and this
+	 * requires no network io).
+	 *
+	 * Should this ever start blocking, a retry loop would be needed for
+	 * "parallel io" (see CLO_REPEAT loops in cl_lock.c).
+	 */
+	cl_io_for_each(scan, io) {
+		const struct cl_page_slice *slice;
+
+		if (scan->cis_iop->cio_read_page == NULL)
+			continue;
+		slice = cl_io_slice_page(scan, page);
+		LINVRNT(slice != NULL);
+		rc = scan->cis_iop->cio_read_page(env, scan, slice);
+		if (rc != 0)
+			break;
+	}
+	if (rc == 0)
+		rc = cl_io_submit_rw(env, io, CRT_READ, queue);
+	/*
+	 * Unlock unsent pages in case of error.
+	 */
+	cl_page_list_disown(env, io, &queue->c2_qin);
+	cl_2queue_fini(env, queue);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_read_page);
+
+/**
+ * Called by write io to prepare page to receive data from user buffer.
+ *
+ * Layers providing ->cio_prepare_write() are called bottom-to-top with
+ * their slice of \a page and the [\a from, \a to) range; the first
+ * non-zero result stops the walk and is returned.
+ *
+ * \see cl_io_operations::cio_prepare_write()
+ */
+int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io,
+			struct cl_page *page, unsigned from, unsigned to)
+{
+	const struct cl_io_slice *scan;
+	int rc = 0;
+
+	LINVRNT(io->ci_type == CIT_WRITE);
+	LINVRNT(cl_page_is_owned(page, io));
+	LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+	LINVRNT(cl_io_invariant(io));
+	LASSERT(cl_page_in_io(page, io));
+	ENTRY;
+
+	cl_io_for_each_reverse(scan, io) {
+		const struct cl_page_slice *slice;
+
+		if (scan->cis_iop->cio_prepare_write == NULL)
+			continue;
+		slice = cl_io_slice_page(scan, page);
+		rc = scan->cis_iop->cio_prepare_write(env, scan, slice,
+						      from, to);
+		if (rc != 0)
+			break;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_prepare_write);
+
+/**
+ * Called by write io after user data were copied into a page.
+ *
+ * Layers providing ->cio_commit_write() are called top-to-bottom with
+ * their slice of \a page and the [\a from, \a to) range; the first
+ * non-zero result stops the walk and is returned.
+ *
+ * \see cl_io_operations::cio_commit_write()
+ */
+int cl_io_commit_write(const struct lu_env *env, struct cl_io *io,
+		       struct cl_page *page, unsigned from, unsigned to)
+{
+	const struct cl_io_slice *scan;
+	int rc = 0;
+
+	LINVRNT(io->ci_type == CIT_WRITE);
+	LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED);
+	LINVRNT(cl_io_invariant(io));
+	/*
+	 * XXX Uh... not nice. Top level cl_io_commit_write() call (vvp->lov)
+	 * already called cl_page_cache_add(), moving page into CPS_CACHED
+	 * state. Better (and more general) way of dealing with such situation
+	 * is needed.
+	 */
+	LASSERT(cl_page_is_owned(page, io) || page->cp_parent != NULL);
+	LASSERT(cl_page_in_io(page, io));
+	ENTRY;
+
+	cl_io_for_each(scan, io) {
+		const struct cl_page_slice *slice;
+
+		if (scan->cis_iop->cio_commit_write == NULL)
+			continue;
+		slice = cl_io_slice_page(scan, page);
+		rc = scan->cis_iop->cio_commit_write(env, scan, slice,
+						     from, to);
+		if (rc != 0)
+			break;
+	}
+	LINVRNT(rc <= 0);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_commit_write);
+
+/**
+ * Submits a list of pages for immediate io.
+ *
+ * When the function returns, the submitted pages have been moved to the
+ * queue->c2_qout queue, while queue->c2_qin holds both the pages that did
+ * not need to be submitted and the pages that failed to be submitted.
+ *
+ * \returns 0 if at least one page was submitted, error code otherwise.
+ * \see cl_io_operations::cio_submit()
+ */
+int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io,
+		    enum cl_req_type crt, struct cl_2queue *queue)
+{
+	const struct cl_io_slice *scan;
+	int rc = 0;
+
+	LINVRNT(crt < ARRAY_SIZE(scan->cis_iop->req_op));
+	ENTRY;
+
+	cl_io_for_each(scan, io) {
+		if (scan->cis_iop->req_op[crt].cio_submit != NULL) {
+			rc = scan->cis_iop->req_op[crt].cio_submit(env, scan,
+								   crt, queue);
+			if (rc != 0)
+				break;
+		}
+	}
+	/*
+	 * If ->cio_submit() failed, no pages were sent.
+	 */
+	LASSERT(ergo(rc != 0, list_empty(&queue->c2_qout.pl_pages)));
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_io_submit_rw);
+
+/**
+ * Submit a sync_io and wait for the IO to be finished, or error happens.
+ * If \a timeout is zero, it means to wait for the IO unconditionally.
+ *
+ * The per-thread anchor cl_thread_info::clt_anchor is attached to every
+ * page on queue->c2_qin and initialized with the number of those pages
+ * before the queue is handed to cl_io_submit_rw().
+ *
+ * Returns the error of cl_io_submit_rw() when submission fails, otherwise
+ * the result of cl_sync_io_wait().
+ */
+int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io,
+ enum cl_req_type iot, struct cl_2queue *queue,
+ long timeout)
+{
+ struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor;
+ struct cl_page *pg;
+ int rc;
+
+ /* tie every page to be submitted to the anchor */
+ cl_page_list_for_each(pg, &queue->c2_qin) {
+ LASSERT(pg->cp_sync_io == NULL);
+ pg->cp_sync_io = anchor;
+ }
+
+ cl_sync_io_init(anchor, queue->c2_qin.pl_nr);
+ rc = cl_io_submit_rw(env, io, iot, queue);
+ if (rc == 0) {
+ /*
+ * If some pages weren't sent for any reason (e.g.,
+ * read found up-to-date pages in the cache, or write found
+ * clean pages), count them as completed to avoid infinite
+ * wait.
+ */
+ cl_page_list_for_each(pg, &queue->c2_qin) {
+ pg->cp_sync_io = NULL;
+ cl_sync_io_note(anchor, +1);
+ }
+
+ /* wait for the IO to be finished. */
+ rc = cl_sync_io_wait(env, io, &queue->c2_qout,
+ anchor, timeout);
+ } else {
+ /* nothing was sent: just detach the anchor from all pages */
+ LASSERT(list_empty(&queue->c2_qout.pl_pages));
+ cl_page_list_for_each(pg, &queue->c2_qin)
+ pg->cp_sync_io = NULL;
+ }
+ return rc;
+}
+EXPORT_SYMBOL(cl_io_submit_sync);
+
+/**
+ * Cancels an IO which has been submitted by cl_io_submit_rw: every page on
+ * \a queue is passed to cl_page_cancel().
+ *
+ * Returns the first non-zero result of cl_page_cancel(), or 0 when all
+ * cancellations succeeded.
+ */
+int cl_io_cancel(const struct lu_env *env, struct cl_io *io,
+		 struct cl_page_list *queue)
+{
+	struct cl_page *pg;
+	int first_err = 0;
+
+	CERROR("Canceling ongoing page trasmission\n");
+	cl_page_list_for_each(pg, queue) {
+		int rc;
+
+		LINVRNT(cl_page_in_io(pg, io));
+		rc = cl_page_cancel(env, pg);
+		if (first_err == 0)
+			first_err = rc;
+	}
+	return first_err;
+}
+EXPORT_SYMBOL(cl_io_cancel);
+
+/**
+ * Main io loop.
+ *
+ * Pumps io through iterations calling
+ *
+ * - cl_io_iter_init()
+ *
+ * - cl_io_lock()
+ *
+ * - cl_io_start()
+ *
+ * - cl_io_end()
+ *
+ * - cl_io_unlock()
+ *
+ * - cl_io_iter_fini()
+ *
+ * repeatedly until there is no more io to do, i.e. until an iteration
+ * fails or leaves io->ci_continue unset.
+ *
+ * Returns the negative error of the failed step or, when all steps
+ * succeeded, io->ci_result if that is negative; 0 otherwise.
+ */
+int cl_io_loop(const struct lu_env *env, struct cl_io *io)
+{
+ int result = 0;
+
+ LINVRNT(cl_io_is_loopable(io));
+ ENTRY;
+
+ do {
+ size_t nob;
+
+ /* layers set ci_continue again to request one more iteration */
+ io->ci_continue = 0;
+ result = cl_io_iter_init(env, io);
+ if (result == 0) {
+ /* remember progress so far, to advance by the delta below */
+ nob = io->ci_nob;
+ result = cl_io_lock(env, io);
+ if (result == 0) {
+ /*
+ * Notify layers that locks have been taken,
+ * and do actual i/o.
+ *
+ * - llite: kms, short read;
+ * - llite: generic_file_read();
+ */
+ result = cl_io_start(env, io);
+ /*
+ * Send any remaining pending
+ * io, etc.
+ *
+ * - llite: ll_rw_stats_tally.
+ */
+ cl_io_end(env, io);
+ cl_io_unlock(env, io);
+ cl_io_rw_advance(env, io, io->ci_nob - nob);
+ }
+ }
+ /*
+ * NOTE(review): reached on every pass, including the one where
+ * cl_io_iter_init() failed, although cl_io_iter_fini() states
+ * CIS_UNLOCKED as its entry invariant -- confirm this is intended.
+ */
+ cl_io_iter_fini(env, io);
+ } while (result == 0 && io->ci_continue);
+ if (result == 0)
+ result = io->ci_result;
+ RETURN(result < 0 ? result : 0);
+}
+EXPORT_SYMBOL(cl_io_loop);
+
+/**
+ * Adds io slice to the cl_io.
+ *
+ * cl_object_operations::coo_io_init() methods call this to attach their
+ * per-layer state to the io. The slice goes to the end of the
+ * cl_io::ci_layers list, that is, to the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add()
+ */
+void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice,
+		     struct cl_object *obj,
+		     const struct cl_io_operations *ops)
+{
+	struct list_head *node = &slice->cis_linkage;
+
+	/* the slice must be either zero-filled or unlinked */
+	LASSERT((node->prev == NULL && node->next == NULL) ||
+		list_empty(node));
+	ENTRY;
+
+	list_add_tail(node, &io->ci_layers);
+	slice->cis_iop = ops;
+	slice->cis_obj = obj;
+	slice->cis_io = io;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_io_slice_add);
+
+
+/**
+ * Initializes page list \a plist as an empty list owned by the calling
+ * task.
+ */
+void cl_page_list_init(struct cl_page_list *plist)
+{
+	ENTRY;
+	INIT_LIST_HEAD(&plist->pl_pages);
+	plist->pl_nr = 0;
+	plist->pl_owner = current;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_init);
+
+/**
+ * Adds a page to the tail of a page list.
+ *
+ * page->cp_mutex is taken here (with lockdep switched off around the
+ * acquisition) and is not released by this function. The list takes a
+ * "queue" lu_ref and a reference on the page.
+ */
+void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page)
+{
+	ENTRY;
+	/* it would be better to check that page is owned by "current" io, but
+	 * it is not passed here. */
+	LASSERT(page->cp_owner != NULL);
+	LINVRNT(plist->pl_owner == current);
+
+	lockdep_off();
+	mutex_lock(&page->cp_mutex);
+	lockdep_on();
+
+	LASSERT(list_empty(&page->cp_batch));
+	list_add_tail(&page->cp_batch, &plist->pl_pages);
+	plist->pl_nr++;
+	page->cp_queue_ref = lu_ref_add(&page->cp_reference, "queue", plist);
+	cl_page_get(page);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_add);
+
+/**
+ * Removes a page from a page list: the page is unlinked, page->cp_mutex is
+ * released (with lockdep switched off around the release), and the "queue"
+ * lu_ref and the page reference held by the list are dropped.
+ */
+void cl_page_list_del(const struct lu_env *env,
+		      struct cl_page_list *plist, struct cl_page *page)
+{
+	LASSERT(plist->pl_nr > 0);
+	LINVRNT(plist->pl_owner == current);
+
+	ENTRY;
+	list_del_init(&page->cp_batch);
+
+	lockdep_off();
+	mutex_unlock(&page->cp_mutex);
+	lockdep_on();
+
+	plist->pl_nr--;
+	lu_ref_del_at(&page->cp_reference, page->cp_queue_ref, "queue", plist);
+	cl_page_put(env, page);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_del);
+
+/**
+ * Moves a page from page list \a src to the tail of page list \a dst,
+ * adjusting both page counts and re-pointing the "queue" lu_ref of the
+ * page from \a src to \a dst.
+ */
+void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src,
+		       struct cl_page *page)
+{
+	LASSERT(src->pl_nr > 0);
+	LINVRNT(dst->pl_owner == current);
+	LINVRNT(src->pl_owner == current);
+
+	ENTRY;
+	list_move_tail(&page->cp_batch, &dst->pl_pages);
+	src->pl_nr--;
+	dst->pl_nr++;
+	lu_ref_set_at(&page->cp_reference,
+		      page->cp_queue_ref, "queue", src, dst);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_move);
+
+/**
+ * Splices page lists, like the list_head splice does: every page of
+ * \a list is moved, in order, to the tail of \a head.
+ */
+void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head)
+{
+	struct cl_page *pg;
+	struct cl_page *next;
+
+	LINVRNT(list->pl_owner == current);
+	LINVRNT(head->pl_owner == current);
+
+	ENTRY;
+	cl_page_list_for_each_safe(pg, next, list) {
+		cl_page_list_move(head, list, pg);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_splice);
+
+/*
+ * Forward declaration of cl_page_disown0(), which is defined outside of
+ * this file and is used by cl_page_list_disown() below.
+ */
+void cl_page_disown0(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg);
+
+/**
+ * Disowns pages in a queue.
+ *
+ * Every page is taken off \a plist, its cp_mutex (held since
+ * cl_page_list_add()) is released, the page is disowned on behalf of
+ * \a io, and the "queue" lu_ref and the page reference held by the list
+ * are dropped. The list is empty on return.
+ */
+void cl_page_list_disown(const struct lu_env *env,
+			 struct cl_io *io, struct cl_page_list *plist)
+{
+	struct cl_page *page;
+	struct cl_page *temp;
+
+	LINVRNT(plist->pl_owner == current);
+
+	ENTRY;
+	cl_page_list_for_each_safe(page, temp, plist) {
+		LASSERT(plist->pl_nr > 0);
+
+		list_del_init(&page->cp_batch);
+		lockdep_off();
+		mutex_unlock(&page->cp_mutex);
+		lockdep_on();
+		--plist->pl_nr;
+		/*
+		 * cl_page_disown0 rather than usual cl_page_disown() is used,
+		 * because pages are possibly in CPS_FREEING state already due
+		 * to the call to cl_page_list_discard().
+		 */
+		/*
+		 * XXX cl_page_disown0() will fail if page is not locked.
+		 */
+		cl_page_disown0(env, io, page);
+		/*
+		 * Drop the reference through the link remembered in
+		 * cp_queue_ref, exactly as cl_page_list_del() does, instead
+		 * of looking it up again by scope and source.
+		 */
+		lu_ref_del_at(&page->cp_reference, page->cp_queue_ref,
+			      "queue", plist);
+		cl_page_put(env, page);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_disown);
+
+/**
+ * Releases pages from queue: every page is removed with
+ * cl_page_list_del(), leaving \a plist empty.
+ */
+void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist)
+{
+	struct cl_page *pg;
+	struct cl_page *next;
+
+	LINVRNT(plist->pl_owner == current);
+
+	ENTRY;
+	cl_page_list_for_each_safe(pg, next, plist) {
+		cl_page_list_del(env, plist, pg);
+	}
+	LASSERT(plist->pl_nr == 0);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_fini);
+
+/**
+ * Owns all pages in a queue.
+ *
+ * Pages are expected in non-decreasing cp_index order. A page that cannot
+ * be owned is removed from \a plist. Returns the first non-zero cp_error
+ * found among the pages that were owned, or 0.
+ */
+int cl_page_list_own(const struct lu_env *env,
+		     struct cl_io *io, struct cl_page_list *plist)
+{
+	struct cl_page *pg;
+	struct cl_page *next;
+	pgoff_t last_index = 0;
+	int rc = 0;
+
+	LINVRNT(plist->pl_owner == current);
+
+	ENTRY;
+	cl_page_list_for_each_safe(pg, next, plist) {
+		LASSERT(last_index <= pg->cp_index);
+		last_index = pg->cp_index;
+		if (cl_page_own(env, io, pg) != 0) {
+			cl_page_list_del(env, plist, pg);
+			continue;
+		}
+		if (rc == 0)
+			rc = pg->cp_error;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_page_list_own);
+
+/**
+ * Assumes all pages in a queue: cl_page_assume() is applied to every page
+ * of \a plist on behalf of \a io.
+ */
+void cl_page_list_assume(const struct lu_env *env,
+			 struct cl_io *io, struct cl_page_list *plist)
+{
+	struct cl_page *pg;
+
+	LINVRNT(plist->pl_owner == current);
+
+	cl_page_list_for_each(pg, plist) {
+		cl_page_assume(env, io, pg);
+	}
+}
+EXPORT_SYMBOL(cl_page_list_assume);
+
+/**
+ * Discards all pages in a queue: cl_page_discard() is applied to every
+ * page of \a plist on behalf of \a io.
+ */
+void cl_page_list_discard(const struct lu_env *env, struct cl_io *io,
+			  struct cl_page_list *plist)
+{
+	struct cl_page *pg;
+
+	LINVRNT(plist->pl_owner == current);
+	ENTRY;
+	cl_page_list_for_each(pg, plist) {
+		cl_page_discard(env, io, pg);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_page_list_discard);
+
+/**
+ * Unmaps the pages of \a plist from user virtual memory, one by one.
+ *
+ * Stops at the first page for which cl_page_unmap() fails and returns that
+ * error; returns 0 when every page was unmapped (or the queue is empty).
+ */
+int cl_page_list_unmap(const struct lu_env *env, struct cl_io *io,
+		       struct cl_page_list *plist)
+{
+	struct cl_page *pg;
+	int rc;
+
+	LINVRNT(plist->pl_owner == current);
+	ENTRY;
+	cl_page_list_for_each(pg, plist) {
+		rc = cl_page_unmap(env, io, pg);
+		if (rc != 0)
+			RETURN(rc);
+	}
+	RETURN(0);
+}
+EXPORT_SYMBOL(cl_page_list_unmap);
+
+/**
+ * Initializes both page lists (incoming and outgoing) of a dual page queue.
+ */
+void cl_2queue_init(struct cl_2queue *queue)
+{
+	struct cl_page_list *qin = &queue->c2_qin;
+	struct cl_page_list *qout = &queue->c2_qout;
+
+	ENTRY;
+	cl_page_list_init(qin);
+	cl_page_list_init(qout);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_init);
+
+/**
+ * Adds \a page to the incoming page list (cl_2queue::c2_qin) of \a queue
+ * through cl_page_list_add(). The outgoing list is not touched.
+ */
+void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page)
+{
+ ENTRY;
+ cl_page_list_add(&queue->c2_qin, page);
+ EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_add);
+
+/**
+ * Disowns the pages of a 2-queue: first the incoming list, then the
+ * outgoing one.
+ */
+void cl_2queue_disown(const struct lu_env *env,
+		      struct cl_io *io, struct cl_2queue *queue)
+{
+	struct cl_page_list *qin = &queue->c2_qin;
+	struct cl_page_list *qout = &queue->c2_qout;
+
+	ENTRY;
+	cl_page_list_disown(env, io, qin);
+	cl_page_list_disown(env, io, qout);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_disown);
+
+/**
+ * Discards (truncates) the pages of a 2-queue: first the incoming list,
+ * then the outgoing one.
+ */
+void cl_2queue_discard(const struct lu_env *env,
+		       struct cl_io *io, struct cl_2queue *queue)
+{
+	struct cl_page_list *qin = &queue->c2_qin;
+	struct cl_page_list *qout = &queue->c2_qout;
+
+	ENTRY;
+	cl_page_list_discard(env, io, qin);
+	cl_page_list_discard(env, io, qout);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_discard);
+
+/**
+ * Assumes ownership of the pages of a 2-queue: first the incoming list,
+ * then the outgoing one.
+ */
+void cl_2queue_assume(const struct lu_env *env,
+		      struct cl_io *io, struct cl_2queue *queue)
+{
+	struct cl_page_list *qin = &queue->c2_qin;
+	struct cl_page_list *qout = &queue->c2_qout;
+
+	cl_page_list_assume(env, io, qin);
+	cl_page_list_assume(env, io, qout);
+}
+EXPORT_SYMBOL(cl_2queue_assume);
+
+/**
+ * Finalizes both page lists of a 2-queue. Note the order: the outgoing
+ * list is finalized before the incoming one.
+ */
+void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue)
+{
+	struct cl_page_list *qout = &queue->c2_qout;
+	struct cl_page_list *qin = &queue->c2_qin;
+
+	ENTRY;
+	cl_page_list_fini(env, qout);
+	cl_page_list_fini(env, qin);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_fini);
+
+/**
+ * Initializes a 2-queue so that it contains exactly \a page in its incoming
+ * page list: both lists are set up with cl_2queue_init() and \a page is then
+ * queued with cl_2queue_add().
+ */
+void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page)
+{
+ ENTRY;
+ cl_2queue_init(queue);
+ cl_2queue_add(queue, page);
+ EXIT;
+}
+EXPORT_SYMBOL(cl_2queue_init_page);
+
+/**
+ * Returns the top-level io of \a io, i.e., the ancestor reached by
+ * following cl_io::ci_parent until it is NULL (possibly \a io itself).
+ *
+ * \see cl_object_top(), cl_page_top().
+ */
+struct cl_io *cl_io_top(struct cl_io *io)
+{
+	struct cl_io *top;
+
+	ENTRY;
+	for (top = io; top->ci_parent != NULL; top = top->ci_parent)
+		; /* climb towards the root */
+	RETURN(top);
+}
+EXPORT_SYMBOL(cl_io_top);
+
+/**
+ * Placeholder for printing a human readable representation of \a io
+ * (presumably through \a printer with \a cookie -- to be confirmed once
+ * implemented). The body is empty, so nothing is printed at present.
+ */
+void cl_io_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer, const struct cl_io *io)
+{
+}
+
+/**
+ * Attaches a per-layer slice to a compound transfer request.
+ *
+ * Device layers call this from cl_device_operations::cdo_req_init() to hang
+ * their private state off \a req. The slice is appended to the tail of
+ * cl_req::crq_layers, that is, it becomes the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_page_slice_add(), cl_io_slice_add()
+ */
+void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice,
+		      struct cl_device *dev,
+		      const struct cl_req_operations *ops)
+{
+	ENTRY;
+	/* fill the slice in, then link it */
+	slice->crs_req = req;
+	slice->crs_dev = dev;
+	slice->crs_ops = ops;
+	list_add_tail(&slice->crs_linkage, &req->crq_layers);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_slice_add);
+
+/*
+ * Frees a request that has no pages and no layers left: drops the reference
+ * held on every object recorded in cl_req::crq_o, then releases the object
+ * table and the request itself.
+ */
+static void cl_req_free(const struct lu_env *env, struct cl_req *req)
+{
+	unsigned idx;
+
+	LASSERT(list_empty(&req->crq_pages));
+	LASSERT(req->crq_nrpages == 0);
+	LINVRNT(list_empty(&req->crq_layers));
+	LINVRNT(equi(req->crq_nrobjs > 0, req->crq_o != NULL));
+	ENTRY;
+
+	if (req->crq_o != NULL) {
+		for (idx = 0; idx < req->crq_nrobjs; ++idx) {
+			struct cl_object *obj = req->crq_o[idx].ro_obj;
+
+			/* slots that were never filled have nothing to drop */
+			if (obj == NULL)
+				continue;
+			lu_object_ref_del_at(&obj->co_lu,
+					     req->crq_o[idx].ro_obj_ref,
+					     "cl_req", req);
+			cl_object_put(env, obj);
+		}
+		OBD_FREE(req->crq_o, req->crq_nrobjs * sizeof req->crq_o[0]);
+	}
+	OBD_FREE_PTR(req);
+	EXIT;
+}
+
+/*
+ * Gives every device layer under \a page a chance to attach its state to
+ * \a req: starting from the top-level page and descending through
+ * cl_page::cp_child, cl_device_operations::cdo_req_init() is called for
+ * each page slice whose device implements it. Stops at, and returns, the
+ * first non-zero result; returns 0 otherwise.
+ */
+static int cl_req_init(const struct lu_env *env, struct cl_req *req,
+		       struct cl_page *page)
+{
+	struct cl_page_slice *pslice;
+	struct cl_device *cldev;
+	int rc = 0;
+
+	ENTRY;
+	page = cl_page_top(page);
+	do {
+		list_for_each_entry(pslice, &page->cp_layers, cpl_linkage) {
+			cldev = lu2cl_dev(pslice->cpl_obj->co_lu.lo_dev);
+			if (cldev->cd_ops->cdo_req_init == NULL)
+				continue;
+			rc = cldev->cd_ops->cdo_req_init(env, cldev, req);
+			if (rc != 0)
+				break;
+		}
+		page = page->cp_child;
+	} while (page != NULL && rc == 0);
+	RETURN(rc);
+}
+
+/**
+ * Runs the per-request transfer completion call-backs
+ * (cl_req_operations::cro_completion()) bottom-to-top, passing \a rc to each
+ * of them, and then frees the request.
+ */
+void cl_req_completion(const struct lu_env *env, struct cl_req *req, int rc)
+{
+	struct list_head *layers = &req->crq_layers;
+	struct cl_req_slice *last;
+
+	ENTRY;
+	/*
+	 * Peel slices off the tail one at a time; each slice is unlinked
+	 * before its call-back runs.
+	 */
+	while (!list_empty(layers)) {
+		last = list_entry(layers->prev,
+				  struct cl_req_slice, crs_linkage);
+		list_del_init(&last->crs_linkage);
+		if (last->crs_ops->cro_completion != NULL)
+			last->crs_ops->cro_completion(env, last, rc);
+	}
+	cl_req_free(env, req);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_completion);
+
+/**
+ * Allocates new transfer request.
+ *
+ * \param page       page whose layers are asked to attach their state to the
+ *                   request (see cl_req_init())
+ * \param crt        request type, stored in cl_req::crq_type
+ * \param nr_objects number of slots to allocate in cl_req::crq_o
+ *
+ * \retval request on success
+ * \retval ERR_PTR(-ENOMEM) when the request or its object table cannot be
+ *         allocated
+ * \retval ERR_PTR(rc) when a layer's cdo_req_init() method fails
+ */
+struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page,
+			    enum cl_req_type crt, int nr_objects)
+{
+	struct cl_req *req;
+
+	LINVRNT(nr_objects > 0);
+	ENTRY;
+
+	OBD_ALLOC_PTR(req);
+	if (req != NULL) {
+		int result;
+
+		/*
+		 * Set the list heads up before anything can fail: the error
+		 * path below goes through cl_req_completion(), which walks
+		 * crq_layers, and cl_req_free(), which asserts that crq_pages
+		 * is empty. Both need properly initialized heads.
+		 */
+		req->crq_type = crt;
+		INIT_LIST_HEAD(&req->crq_pages);
+		INIT_LIST_HEAD(&req->crq_layers);
+
+		OBD_ALLOC(req->crq_o, nr_objects * sizeof req->crq_o[0]);
+		if (req->crq_o != NULL) {
+			req->crq_nrobjs = nr_objects;
+			result = cl_req_init(env, req, page);
+		} else
+			result = -ENOMEM;
+		if (result != 0) {
+			/* also frees req */
+			cl_req_completion(env, req, result);
+			req = ERR_PTR(result);
+		}
+	} else
+		req = ERR_PTR(-ENOMEM);
+	RETURN(req);
+}
+EXPORT_SYMBOL(cl_req_alloc);
+
+/**
+ * Adds a page to a request.
+ *
+ * The top-level page of \a page is linked onto cl_req::crq_pages and marked
+ * as belonging to \a req. The page's top-level object is recorded in the
+ * first free slot of cl_req::crq_o (taking an object reference) unless it is
+ * already present there.
+ *
+ * The scan of cl_req::crq_o is bounded by cl_req::crq_nrobjs: if the caller
+ * of cl_req_alloc() under-counted the objects, the assertion at the end
+ * fires without the scan first reading past the end of the table.
+ */
+void cl_req_page_add(const struct lu_env *env,
+		     struct cl_req *req, struct cl_page *page)
+{
+	struct cl_object *obj;
+	struct cl_req_obj *rqo;
+	int i;
+
+	ENTRY;
+	page = cl_page_top(page);
+
+	LASSERT(list_empty(&page->cp_flight));
+	LASSERT(page->cp_req == NULL);
+
+	CL_PAGE_DEBUG(D_PAGE, env, page, "req %p, %d, %u\n",
+		      req, req->crq_type, req->crq_nrpages);
+
+	list_add_tail(&page->cp_flight, &req->crq_pages);
+	++req->crq_nrpages;
+	page->cp_req = req;
+	obj = cl_object_top(page->cp_obj);
+	for (i = 0, rqo = req->crq_o;
+	     i < req->crq_nrobjs && obj != rqo->ro_obj; ++i, ++rqo) {
+		if (rqo->ro_obj == NULL) {
+			/* first page of this object: claim the free slot */
+			rqo->ro_obj = obj;
+			cl_object_get(obj);
+			rqo->ro_obj_ref = lu_object_ref_add(&obj->co_lu,
+							    "cl_req", req);
+			break;
+		}
+	}
+	/* i == crq_nrobjs means the table is full of other objects */
+	LASSERT(i < req->crq_nrobjs);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_page_add);
+
+/**
+ * Removes a page from a request.
+ *
+ * Undoes the page part of cl_req_page_add(): the top-level page of \a page
+ * is unlinked from the request's page list, the request's page count is
+ * decremented and the page's back pointer to the request is cleared. The
+ * object references taken by cl_req_page_add() are not touched here.
+ */
+void cl_req_page_done(const struct lu_env *env, struct cl_page *page)
+{
+	struct cl_req *req;
+
+	ENTRY;
+	page = cl_page_top(page);
+	/*
+	 * cl_req_page_add() stores cp_req on the top-level page, so it has to
+	 * be read from there as well, not from the page that was passed in.
+	 */
+	req = page->cp_req;
+
+	LASSERT(!list_empty(&page->cp_flight));
+	LASSERT(req != NULL);
+	LASSERT(req->crq_nrpages > 0);
+
+	list_del_init(&page->cp_flight);
+	--req->crq_nrpages;
+	page->cp_req = NULL;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_page_done);
+
+/**
+ * Tells the layers that the request is about to depart, by calling
+ * cl_req_operations::cro_prep() top-to-bottom.
+ *
+ * Stops at, and returns, the first non-zero result of a cro_prep() method;
+ * returns 0 when all of them succeed.
+ */
+int cl_req_prep(const struct lu_env *env, struct cl_req *req)
+{
+	const struct cl_req_slice *slice;
+	int idx;
+	int rc = 0;
+
+	ENTRY;
+	/*
+	 * Every slot of the object table must be in use by now, otherwise the
+	 * caller of cl_req_alloc() lied about the number of objects.
+	 */
+	for (idx = 0; idx < req->crq_nrobjs; ++idx)
+		LASSERT(req->crq_o[idx].ro_obj != NULL);
+
+	list_for_each_entry(slice, &req->crq_layers, crs_linkage) {
+		if (slice->crs_ops->cro_prep == NULL)
+			continue;
+		rc = slice->crs_ops->cro_prep(env, slice);
+		if (rc != 0)
+			break;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cl_req_prep);
+
+/**
+ * Fills in the attributes that are sent to the server together with the
+ * transfer. Only attributes selected by \a flags may be touched. May be
+ * called more than once for the same request.
+ *
+ * \a attr is indexed per object: for every slot of cl_req::crq_o each layer
+ * implementing cl_req_operations::cro_attr_set() is invoked on the matching
+ * \a attr element. The request must have at least one page.
+ */
+void cl_req_attr_set(const struct lu_env *env, struct cl_req *req,
+		     struct cl_req_attr *attr, obd_valid flags)
+{
+	const struct cl_req_slice *slice;
+	struct cl_page *model;
+	int idx;
+
+	LASSERT(!list_empty(&req->crq_pages));
+	ENTRY;
+
+	/* Any page will do as a model; take the first one of the request. */
+	model = list_entry(req->crq_pages.next, struct cl_page, cp_flight);
+
+	for (idx = 0; idx < req->crq_nrobjs; ++idx) {
+		struct cl_req_attr *cur = attr + idx;
+
+		list_for_each_entry(slice, &req->crq_layers, crs_linkage) {
+			const struct cl_page_slice *pslice;
+			const struct cl_object *obj;
+
+			/* the model page's slice for this layer's device */
+			pslice = cl_page_at(model,
+					    slice->crs_dev->cd_lu_dev.ld_type);
+			LASSERT(pslice != NULL);
+			obj = pslice->cpl_obj;
+			if (slice->crs_ops->cro_attr_set == NULL)
+				continue;
+			slice->crs_ops->cro_attr_set(env, slice, obj,
+						     cur, flags);
+		}
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_req_attr_set);
+
+/* XXX complete(), init_completion(), and wait_for_completion(), until they are
+ * implemented in libcfs. */
+# include <linux/sched.h>
+
+/**
+ * Prepares a synchronous io wait anchor for a transfer of \a nrpages pages:
+ * the pending-page counter starts at \a nrpages, the recorded result at 0,
+ * and the barrier is raised only when there is at least one page.
+ */
+void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages)
+{
+	int armed = nrpages > 0;
+
+	ENTRY;
+	init_waitqueue_head(&anchor->csi_waitq);
+	atomic_set(&anchor->csi_sync_nr, nrpages);
+	atomic_set(&anchor->csi_barrier, armed);
+	anchor->csi_sync_rc = 0;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_sync_io_init);
+
+/**
+ * Waits until the whole transfer completes. The transfer completion routine
+ * has to call cl_sync_io_note() for every page.
+ *
+ * The caller sleeps on cl_sync_io::csi_waitq until cl_sync_io::csi_sync_nr
+ * drops to zero; \a timeout (must be >= 0) is converted with
+ * cfs_time_seconds() and handed to the wait. If the wait itself fails, the
+ * io is cancelled for \a queue and the function waits again with a zeroed
+ * l_wait_info (presumably without a timeout -- confirm against
+ * l_wait_event()). Afterwards the pages of \a queue are assumed with
+ * cl_page_list_assume() and \a anchor is poisoned, so the anchor must not be
+ * used after this call without re-initialization.
+ *
+ * \retval negative error returned by the first wait, if it failed
+ * \retval cl_sync_io::csi_sync_rc otherwise, i.e., 0 or the first error
+ *         reported through cl_sync_io_note()
+ */
+int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io,
+ struct cl_page_list *queue, struct cl_sync_io *anchor,
+ long timeout)
+{
+ struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout),
+ NULL, NULL, NULL);
+ int rc;
+ ENTRY;
+
+ LASSERT(timeout >= 0);
+
+ rc = l_wait_event(anchor->csi_waitq,
+ atomic_read(&anchor->csi_sync_nr) == 0,
+ &lwi);
+ if (rc < 0) {
+ CERROR("SYNC IO failed with error: %d, try to cancel "
+ "%d remaining pages\n",
+ rc, atomic_read(&anchor->csi_sync_nr));
+
+ (void)cl_io_cancel(env, io, queue); /* result deliberately ignored */
+
+ lwi = (struct l_wait_info) { 0 };
+ (void)l_wait_event(anchor->csi_waitq,
+ atomic_read(&anchor->csi_sync_nr) == 0,
+ &lwi);
+ } else {
+ rc = anchor->csi_sync_rc;
+ }
+ LASSERT(atomic_read(&anchor->csi_sync_nr) == 0);
+ cl_page_list_assume(env, io, queue);
+
+ /*
+ * Spin until cl_sync_io_note() has finished its wake-up and cleared
+ * csi_barrier; only then may the anchor be poisoned below.
+ */
+ while (unlikely(atomic_read(&anchor->csi_barrier) != 0)) {
+ cpu_relax();
+ }
+
+ POISON(anchor, 0x5a, sizeof *anchor);
+ RETURN(rc);
+}
+EXPORT_SYMBOL(cl_sync_io_wait);
+
+/**
+ * Indicates that the transfer of a single page completed with result
+ * \a ioret.
+ *
+ * The first negative \a ioret is remembered in cl_sync_io::csi_sync_rc. The
+ * pending-page counter is decremented; the call that brings it to zero wakes
+ * up all waiters and then clears cl_sync_io::csi_barrier. cl_sync_io_wait()
+ * spins on that barrier before poisoning the anchor, so the barrier has to
+ * be the last field touched here.
+ */
+void cl_sync_io_note(struct cl_sync_io *anchor, int ioret)
+{
+ ENTRY;
+ if (anchor->csi_sync_rc == 0 && ioret < 0)
+ anchor->csi_sync_rc = ioret;
+ /*
+ * Synchronous IO done without releasing page lock (e.g., as a part of
+ * ->{prepare,commit}_write()). Completion is used to signal the end of
+ * IO.
+ */
+ LASSERT(atomic_read(&anchor->csi_sync_nr) > 0);
+ if (atomic_dec_and_test(&anchor->csi_sync_nr)) {
+ wake_up_all(&anchor->csi_waitq);
+ /* it's safe to nuke or reuse anchor now */
+ atomic_set(&anchor->csi_barrier, 0);
+ }
+ EXIT;
+}
+EXPORT_SYMBOL(cl_sync_io_note);
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_lock.c b/drivers/staging/lustre/lustre/obdclass/cl_lock.c
new file mode 100644
index 000000000000..d34e044fc854
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_lock.c
@@ -0,0 +1,2304 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Extent Lock.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+#include <cl_object.h>
+#include "cl_internal.h"
+
+/** Lock class of cl_lock::cll_guard */
+static struct lock_class_key cl_lock_guard_class;
+static struct kmem_cache *cl_lock_kmem; /* slab cache struct cl_lock is allocated from */
+
+static struct lu_kmem_descr cl_lock_caches[] = { /* descriptors of the caches used by this file */
+ {
+ .ckd_cache = &cl_lock_kmem,
+ .ckd_name = "cl_lock_kmem",
+ .ckd_size = sizeof (struct cl_lock)
+ },
+ {
+ .ckd_cache = NULL /* presumably the end-of-table marker -- confirm against lu_kmem_descr users */
+ }
+};
+
+#define CS_LOCK_INC(o, item) /* statistics hooks: all four expand to nothing here */
+#define CS_LOCK_DEC(o, item)
+#define CS_LOCKSTATE_INC(o, state)
+#define CS_LOCKSTATE_DEC(o, state)
+
+/**
+ * Basic lock invariant, expected to hold at all times. The caller either
+ * holds a reference to \a lock or otherwise guarantees it cannot be freed.
+ *
+ * Checked: a CLS_FREEING lock has no holds; references >= holds >= users;
+ * holds, users and mutex depth are all non-negative.
+ *
+ * \retval 1 the invariant holds
+ * \retval 0 the invariant is broken
+ *
+ * \see cl_lock_invariant()
+ */
+static int cl_lock_invariant_trusted(const struct lu_env *env,
+				     const struct cl_lock *lock)
+{
+	if (!ergo(lock->cll_state == CLS_FREEING, lock->cll_holds == 0))
+		return 0;
+	if (atomic_read(&lock->cll_ref) < lock->cll_holds)
+		return 0;
+	if (lock->cll_holds < lock->cll_users)
+		return 0;
+	return lock->cll_holds >= 0 &&
+	       lock->cll_users >= 0 &&
+	       lock->cll_depth >= 0;
+}
+
+/**
+ * Stronger lock invariant: on top of cl_lock_invariant_trusted() it requires
+ * a positive reference count, i.e., that somebody holds a reference.
+ *
+ * When the check fails and \a env is not NULL, the lock is dumped at D_ERROR
+ * level.
+ *
+ * \see cl_lock_invariant_trusted()
+ */
+static int cl_lock_invariant(const struct lu_env *env,
+			     const struct cl_lock *lock)
+{
+	int ok;
+
+	ok = atomic_read(&lock->cll_ref) > 0 &&
+	     cl_lock_invariant_trusted(env, lock);
+	if (ok)
+		return ok;
+	if (env != NULL) {
+		CL_LOCK_DEBUG(D_ERROR, env, lock, "invariant broken");
+	}
+	return ok;
+}
+
+/**
+ * Returns lock "nesting", taken from the header of the object the lock
+ * belongs to: 0 for a top-lock and 1 for a sub-lock.
+ */
+static enum clt_nesting_level cl_lock_nesting(const struct cl_lock *lock)
+{
+	struct cl_object_header *hdr;
+
+	hdr = cl_object_header(lock->cll_descr.cld_obj);
+	return hdr->coh_nesting;
+}
+
+/**
+ * Returns the set of per-thread counters that applies to \a lock, selected
+ * by the lock's nesting level.
+ */
+static struct cl_thread_counters *cl_lock_counters(const struct lu_env *env,
+						   const struct cl_lock *lock)
+{
+	struct cl_thread_info *info = cl_env_info(env);
+	enum clt_nesting_level level = cl_lock_nesting(lock);
+
+	/* the nesting level indexes clt_counters directly */
+	LASSERT(level < ARRAY_SIZE(info->clt_counters));
+	return &info->clt_counters[level];
+}
+
+static void cl_lock_trace0(int level, const struct lu_env *env,
+ const char *prefix, const struct cl_lock *lock,
+ const char *func, const int line)
+{ /* Emits one CDEBUG record at \a level describing \a lock. The first
+ * group prints: reference count, guarding thread, mutex depth, state,
+ * error, holds, users, flags. The second group prints: env, the
+ * object's nesting level and the number of lock mutexes held through
+ * this env. \a func and \a line identify the call site. */
+ struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj);
+ CDEBUG(level, "%s: %p@(%d %p %d %d %d %d %d %lx)"
+ "(%p/%d/%d) at %s():%d\n",
+ prefix, lock, atomic_read(&lock->cll_ref),
+ lock->cll_guarder, lock->cll_depth,
+ lock->cll_state, lock->cll_error, lock->cll_holds,
+ lock->cll_users, lock->cll_flags,
+ env, h->coh_nesting, cl_lock_nr_mutexed(env),
+ func, line);
+} /* cl_lock_trace() below passes the caller's function name and line */
+#define cl_lock_trace(level, env, prefix, lock) \
+ cl_lock_trace0(level, env, prefix, lock, __FUNCTION__, __LINE__)
+
+#define RETIP ((unsigned long)__builtin_return_address(0))
+
+#ifdef CONFIG_LOCKDEP /* real lockdep annotations; empty stubs in the #else branch */
+static struct lock_class_key cl_lock_key;
+
+static void cl_lock_lockdep_init(struct cl_lock *lock)
+{
+ lockdep_set_class_and_name(lock, &cl_lock_key, "EXT"); /* all cl_locks share one class, named "EXT" */
+}
+
+static void cl_lock_lockdep_acquire(const struct lu_env *env,
+ struct cl_lock *lock, __u32 enqflags)
+{ /* \a enqflags is not used by this implementation */
+ cl_lock_counters(env, lock)->ctc_nr_locks_acquired++; /* per-thread, per-nesting-level count */
+ lock_map_acquire(&lock->dep_map);
+}
+
+static void cl_lock_lockdep_release(const struct lu_env *env,
+ struct cl_lock *lock)
+{ /* undoes cl_lock_lockdep_acquire() */
+ cl_lock_counters(env, lock)->ctc_nr_locks_acquired--;
+ lock_release(&lock->dep_map, 0, RETIP);
+}
+
+#else /* !CONFIG_LOCKDEP */
+
+static void cl_lock_lockdep_init(struct cl_lock *lock)
+{}
+static void cl_lock_lockdep_acquire(const struct lu_env *env,
+ struct cl_lock *lock, __u32 enqflags)
+{}
+static void cl_lock_lockdep_release(const struct lu_env *env,
+ struct cl_lock *lock)
+{}
+
+#endif /* !CONFIG_LOCKDEP */
+
+/**
+ * Attaches a per-layer slice to a compound lock.
+ *
+ * Object layers call this from cl_object_operations::coo_lock_init() to hang
+ * their private state off \a lock. The slice is appended to the tail of
+ * cl_lock::cll_layers, that is, it becomes the bottom of the stack.
+ *
+ * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add()
+ */
+void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice,
+		       struct cl_object *obj,
+		       const struct cl_lock_operations *ops)
+{
+	ENTRY;
+	/* fill the slice in, then link it */
+	slice->cls_lock = lock;
+	slice->cls_obj = obj;
+	slice->cls_ops = ops;
+	list_add_tail(&slice->cls_linkage, &lock->cll_layers);
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_slice_add);
+
+/**
+ * Returns true iff a lock with the mode \a has provides at least the same
+ * guarantees as a lock with the mode \a need.
+ *
+ * Modes are ordered CLM_PHANTOM < CLM_READ < CLM_WRITE < CLM_GROUP. A
+ * non-group lock satisfies every mode up to its own; a group lock satisfies
+ * group requests only.
+ */
+int cl_lock_mode_match(enum cl_lock_mode has, enum cl_lock_mode need)
+{
+	LINVRNT(need == CLM_READ || need == CLM_WRITE ||
+		need == CLM_PHANTOM || need == CLM_GROUP);
+	LINVRNT(has == CLM_READ || has == CLM_WRITE ||
+		has == CLM_PHANTOM || has == CLM_GROUP);
+	/* the comparison below relies on this ordering of the enum */
+	CLASSERT(CLM_PHANTOM < CLM_READ);
+	CLASSERT(CLM_READ < CLM_WRITE);
+	CLASSERT(CLM_WRITE < CLM_GROUP);
+
+	return has != CLM_GROUP ? need <= has : need == has;
+}
+EXPORT_SYMBOL(cl_lock_mode_match);
+
+/**
+ * Returns true iff the extent portions of the lock descriptions match:
+ * \a has covers the [cld_start, cld_end] range of \a need, its mode is
+ * sufficient (see cl_lock_mode_match()), and, for group locks, the group
+ * ids are equal.
+ */
+int cl_lock_ext_match(const struct cl_lock_descr *has,
+		      const struct cl_lock_descr *need)
+{
+	if (has->cld_start > need->cld_start)
+		return 0;
+	if (has->cld_end < need->cld_end)
+		return 0;
+	if (!cl_lock_mode_match(has->cld_mode, need->cld_mode))
+		return 0;
+	return has->cld_mode != CLM_GROUP || has->cld_gid == need->cld_gid;
+}
+EXPORT_SYMBOL(cl_lock_ext_match);
+
+/**
+ * Returns true iff a lock with the description \a has provides at least the
+ * same guarantees as a lock with the description \a need: both refer to the
+ * same object and their extent portions match.
+ */
+int cl_lock_descr_match(const struct cl_lock_descr *has,
+			const struct cl_lock_descr *need)
+{
+	if (!cl_object_same(has->cld_obj, need->cld_obj))
+		return 0;
+	return cl_lock_ext_match(has, need) != 0;
+}
+EXPORT_SYMBOL(cl_lock_descr_match);
+
+/*
+ * Destroys \a lock: finalizes and unlinks every slice top-to-bottom, drops
+ * the reference on the lock's object, and returns the lock to its slab
+ * cache. The lock must not be mutexed by the caller. May sleep.
+ */
+static void cl_lock_free(const struct lu_env *env, struct cl_lock *lock)
+{
+	struct cl_object *obj = lock->cll_descr.cld_obj;
+	struct list_head *layers = &lock->cll_layers;
+
+	LINVRNT(!cl_lock_is_mutexed(lock));
+
+	ENTRY;
+	cl_lock_trace(D_DLMTRACE, env, "free lock", lock);
+	might_sleep();
+	while (!list_empty(layers)) {
+		struct cl_lock_slice *first;
+
+		first = list_entry(layers->next,
+				   struct cl_lock_slice, cls_linkage);
+		/* unlink before clo_fini() gets a chance to free the slice */
+		list_del_init(&first->cls_linkage);
+		first->cls_ops->clo_fini(env, first);
+	}
+	CS_LOCK_DEC(obj, total);
+	CS_LOCKSTATE_DEC(obj, lock->cll_state);
+	lu_object_ref_del_at(&obj->co_lu, lock->cll_obj_ref, "cl_lock", lock);
+	cl_object_put(env, obj);
+	lu_ref_fini(&lock->cll_reference);
+	lu_ref_fini(&lock->cll_holders);
+	mutex_destroy(&lock->cll_guard);
+	OBD_SLAB_FREE_PTR(lock, cl_lock_kmem);
+	EXIT;
+}
+
+/**
+ * Releases a reference on a lock.
+ *
+ * When the last reference goes away the lock simply stays in the cache,
+ * unless it is in cl_lock_state::CLS_FREEING state, in which case it is
+ * destroyed right away.
+ *
+ * \see cl_object_put(), cl_page_put()
+ */
+void cl_lock_put(const struct lu_env *env, struct cl_lock *lock)
+{
+	struct cl_object *obj;
+	int last;
+
+	LINVRNT(cl_lock_invariant(env, lock));
+	ENTRY;
+	obj = lock->cll_descr.cld_obj;
+	LINVRNT(obj != NULL);
+
+	CDEBUG(D_TRACE, "releasing reference: %d %p %lu\n",
+	       atomic_read(&lock->cll_ref), lock, RETIP);
+
+	last = atomic_dec_and_test(&lock->cll_ref);
+	/* the lock may only be looked at again if ours was the last ref */
+	if (last && lock->cll_state == CLS_FREEING) {
+		LASSERT(list_empty(&lock->cll_linkage));
+		cl_lock_free(env, lock);
+	}
+	if (last) {
+		CS_LOCK_DEC(obj, busy);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lock_put);
+
+/**
+ * Acquires an additional reference to a lock.
+ *
+ * This can be called only by a caller already possessing a reference to
+ * \a lock: the lock invariant, which requires a positive reference count,
+ * is checked before the count is incremented.
+ *
+ * \see cl_object_get(), cl_page_get()
+ */
+void cl_lock_get(struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_invariant(NULL, lock));
+ CDEBUG(D_TRACE, "acquiring reference: %d %p %lu\n",
+ atomic_read(&lock->cll_ref), lock, RETIP);
+ atomic_inc(&lock->cll_ref);
+}
+EXPORT_SYMBOL(cl_lock_get);
+
+/**
+ * Acquires a reference to a lock without requiring that the caller already
+ * holds one.
+ *
+ * Unlike cl_lock_get(), this can be used to take the initial reference on a
+ * cached lock. The caller has to deal with all possible races. Use with
+ * care!
+ *
+ * \see cl_page_get_trust()
+ */
+void cl_lock_get_trust(struct cl_lock *lock)
+{
+	int refs;
+
+	CDEBUG(D_TRACE, "acquiring trusted reference: %d %p %lu\n",
+	       atomic_read(&lock->cll_ref), lock, RETIP);
+	refs = atomic_inc_return(&lock->cll_ref);
+	if (refs == 1) {
+		/* first reference: account the lock as busy */
+		CS_LOCK_INC(lock->cll_descr.cld_obj, busy);
+	}
+}
+EXPORT_SYMBOL(cl_lock_get_trust);
+
+/**
+ * Helper function destroying the lock that wasn't completely initialized.
+ *
+ * Other threads can acquire references to the top-lock through its
+ * sub-locks. Hence, it cannot be cl_lock_free()-ed immediately. Instead the
+ * lock is cancelled and deleted under its mutex, and then the caller's
+ * reference is dropped with cl_lock_put().
+ */
+static void cl_lock_finish(const struct lu_env *env, struct cl_lock *lock)
+{
+ cl_lock_mutex_get(env, lock);
+ cl_lock_cancel(env, lock);
+ cl_lock_delete(env, lock);
+ cl_lock_mutex_put(env, lock);
+ cl_lock_put(env, lock);
+}
+
+/*
+ * Allocates a new lock described by \a descr on \a obj and lets every layer
+ * of the object attach its slice via cl_object_operations::coo_lock_init().
+ *
+ * Returns the new lock (CLS_NEW state, one reference held) on success,
+ * ERR_PTR(-ENOMEM) when the slab allocation fails, or ERR_PTR(err) when a
+ * layer fails to initialize; in the latter case the half-built lock is
+ * disposed of with cl_lock_finish().
+ */
+static struct cl_lock *cl_lock_alloc(const struct lu_env *env,
+				     struct cl_object *obj,
+				     const struct cl_io *io,
+				     const struct cl_lock_descr *descr)
+{
+	struct cl_lock *lock;
+	struct lu_object_header *head;
+
+	ENTRY;
+	OBD_SLAB_ALLOC_PTR_GFP(lock, cl_lock_kmem, __GFP_IO);
+	if (lock == NULL) {
+		lock = ERR_PTR(-ENOMEM);
+		RETURN(lock);
+	}
+
+	atomic_set(&lock->cll_ref, 1);
+	lock->cll_descr = *descr;
+	lock->cll_state = CLS_NEW;
+	cl_object_get(obj);
+	lock->cll_obj_ref = lu_object_ref_add(&obj->co_lu, "cl_lock", lock);
+	INIT_LIST_HEAD(&lock->cll_layers);
+	INIT_LIST_HEAD(&lock->cll_linkage);
+	INIT_LIST_HEAD(&lock->cll_inclosure);
+	lu_ref_init(&lock->cll_reference);
+	lu_ref_init(&lock->cll_holders);
+	mutex_init(&lock->cll_guard);
+	lockdep_set_class(&lock->cll_guard, &cl_lock_guard_class);
+	init_waitqueue_head(&lock->cll_wq);
+	head = obj->co_lu.lo_header;
+	CS_LOCKSTATE_INC(obj, CLS_NEW);
+	CS_LOCK_INC(obj, total);
+	CS_LOCK_INC(obj, create);
+	cl_lock_lockdep_init(lock);
+	/* from here on obj walks the layers of the object, top-to-bottom */
+	list_for_each_entry(obj, &head->loh_layers, co_lu.lo_linkage) {
+		int rc = obj->co_ops->coo_lock_init(env, obj, lock, io);
+
+		if (rc == 0)
+			continue;
+		cl_lock_finish(env, lock);
+		lock = ERR_PTR(rc);
+		break;
+	}
+	RETURN(lock);
+}
+
+/**
+ * Transfers the lock into INTRANSIT state and returns the original state,
+ * so that the caller can restore it later with cl_lock_extransit().
+ *
+ * The calling thread is recorded as cl_lock::cll_intransit_owner and an
+ * "intransit" hold is added to the lock. The lock mutex must be held.
+ *
+ * \pre state: CLS_CACHED, CLS_HELD or CLS_ENQUEUED
+ * \post state: CLS_INTRANSIT
+ * \see CLS_INTRANSIT
+ */
+enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
+ struct cl_lock *lock)
+{
+ enum cl_lock_state state = lock->cll_state;
+
+ LASSERT(cl_lock_is_mutexed(lock));
+ LASSERT(state != CLS_INTRANSIT);
+ LASSERTF(state >= CLS_ENQUEUED && state <= CLS_CACHED,
+ "Malformed lock state %d.\n", state);
+
+ cl_lock_state_set(env, lock, CLS_INTRANSIT);
+ lock->cll_intransit_owner = current;
+ cl_lock_hold_add(env, lock, "intransit", current);
+ return state;
+}
+EXPORT_SYMBOL(cl_lock_intransit);
+
+/**
+ * Exits the intransit state and restores the lock state to \a state,
+ * normally the value returned by the matching cl_lock_intransit() call.
+ *
+ * Must be called, with the lock mutex held, by the thread that put the lock
+ * into transit. The intransit owner is cleared and the "intransit" hold is
+ * released.
+ */
+void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
+ enum cl_lock_state state)
+{
+ LASSERT(cl_lock_is_mutexed(lock));
+ LASSERT(lock->cll_state == CLS_INTRANSIT);
+ LASSERT(state != CLS_INTRANSIT);
+ LASSERT(lock->cll_intransit_owner == current);
+
+ lock->cll_intransit_owner = NULL;
+ cl_lock_state_set(env, lock, state);
+ cl_lock_unhold(env, lock, "intransit", current);
+}
+EXPORT_SYMBOL(cl_lock_extransit);
+
+/**
+ * Checks whether the lock is in the intransit state on behalf of another
+ * thread: returns true iff the state is CLS_INTRANSIT and the intransit
+ * owner is not the current thread. The lock mutex must be held.
+ */
+int cl_lock_is_intransit(struct cl_lock *lock)
+{
+ LASSERT(cl_lock_is_mutexed(lock));
+ return lock->cll_state == CLS_INTRANSIT &&
+ lock->cll_intransit_owner != current;
+}
+EXPORT_SYMBOL(cl_lock_is_intransit);
+/**
+ * Returns true iff lock is "suitable" for given io. E.g., locks acquired by
+ * truncate and O_APPEND cannot be reused for read/non-append-write, as they
+ * cover multiple stripes and can trigger cascading timeouts.
+ *
+ * Every layer implementing cl_lock_operations::clo_fits_into() is asked in
+ * turn; the first layer that objects makes the answer 0. Layers without the
+ * method never object.
+ */
+static int cl_lock_fits_into(const struct lu_env *env,
+			     const struct cl_lock *lock,
+			     const struct cl_lock_descr *need,
+			     const struct cl_io *io)
+{
+	const struct cl_lock_slice *slice;
+
+	LINVRNT(cl_lock_invariant_trusted(env, lock));
+	ENTRY;
+	list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+		if (slice->cls_ops->clo_fits_into == NULL)
+			continue;
+		if (!slice->cls_ops->clo_fits_into(env, slice, need, io))
+			RETURN(0);
+	}
+	RETURN(1);
+}
+
+/*
+ * Searches the per-object lock list of \a obj for a lock usable for \a need
+ * and \a io. The caller must hold cl_object_header::coh_lock_guard.
+ *
+ * A candidate has to match the extent, be in a state before CLS_FREEING,
+ * carry no error, not be cancelled, and be accepted by every layer (see
+ * cl_lock_fits_into()). The first such lock is returned with an extra
+ * (trusted) reference; NULL is returned when there is none.
+ */
+static struct cl_lock *cl_lock_lookup(const struct lu_env *env,
+				      struct cl_object *obj,
+				      const struct cl_io *io,
+				      const struct cl_lock_descr *need)
+{
+	struct cl_object_header *hdr;
+	struct cl_lock *cand;
+
+	ENTRY;
+
+	hdr = cl_object_header(obj);
+	LINVRNT(spin_is_locked(&hdr->coh_lock_guard));
+	CS_LOCK_INC(obj, lookup);
+	list_for_each_entry(cand, &hdr->coh_locks, cll_linkage) {
+		int usable;
+
+		usable = cl_lock_ext_match(&cand->cll_descr, need) &&
+			 cand->cll_state < CLS_FREEING &&
+			 cand->cll_error == 0 &&
+			 !(cand->cll_flags & CLF_CANCELLED) &&
+			 cl_lock_fits_into(env, cand, need, io);
+		CDEBUG(D_DLMTRACE, "has: "DDESCR"(%d) need: "DDESCR": %d\n",
+		       PDESCR(&cand->cll_descr), cand->cll_state, PDESCR(need),
+		       usable);
+		if (!usable)
+			continue;
+		cl_lock_get_trust(cand);
+		CS_LOCK_INC(obj, hit);
+		RETURN(cand);
+	}
+	RETURN(NULL);
+}
+
+/**
+ * Returns a lock matching description \a need.
+ *
+ * This is the main entry point into the cl_lock caching interface. First, a
+ * cache (implemented as a per-object linked list) is consulted. If lock is
+ * found there, it is returned immediately. Otherwise new lock is allocated
+ * and returned. In any case, additional reference to lock is acquired.
+ *
+ * The allocation happens outside of cl_object_header::coh_lock_guard, so the
+ * cache is searched a second time before the new lock is inserted. If
+ * another thread inserted a matching lock in the meantime ("ghost"), the
+ * freshly allocated lock is disposed of and the ghost is returned instead.
+ *
+ * \retval lock on success
+ * \retval ERR_PTR() value returned by cl_lock_alloc() on failure
+ *
+ * \see cl_object_find(), cl_page_find()
+ */
+static struct cl_lock *cl_lock_find(const struct lu_env *env,
+ const struct cl_io *io,
+ const struct cl_lock_descr *need)
+{
+ struct cl_object_header *head;
+ struct cl_object *obj;
+ struct cl_lock *lock;
+
+ ENTRY;
+
+ obj = need->cld_obj;
+ head = cl_object_header(obj);
+
+ spin_lock(&head->coh_lock_guard);
+ lock = cl_lock_lookup(env, obj, io, need);
+ spin_unlock(&head->coh_lock_guard);
+
+ if (lock == NULL) {
+ lock = cl_lock_alloc(env, obj, io, need);
+ if (!IS_ERR(lock)) {
+ struct cl_lock *ghost;
+
+ spin_lock(&head->coh_lock_guard);
+ ghost = cl_lock_lookup(env, obj, io, need); /* did somebody beat us to it? */
+ if (ghost == NULL) {
+ list_add_tail(&lock->cll_linkage,
+ &head->coh_locks);
+ spin_unlock(&head->coh_lock_guard);
+ CS_LOCK_INC(obj, busy);
+ } else {
+ spin_unlock(&head->coh_lock_guard);
+ /*
+ * Other threads can acquire references to the
+ * top-lock through its sub-locks. Hence, it
+ * cannot be cl_lock_free()-ed immediately.
+ */
+ cl_lock_finish(env, lock);
+ lock = ghost;
+ }
+ }
+ }
+ RETURN(lock);
+}
+
+/**
+ * Returns existing lock matching given description. This is similar to
+ * cl_lock_find() except that no new lock is created, and returned lock is
+ * guaranteed to be in enum cl_lock_state::CLS_HELD state.
+ *
+ * The cache lookup is repeated for as long as the lock found turns out to be
+ * in CLS_FREEING state. A hold (identified by \a scope and \a source) and a
+ * user are then added to the lock, and a CLS_CACHED lock is given a chance
+ * to become held through cl_use_try(). If the lock does not end up in
+ * CLS_HELD state, it is unused and unheld again and NULL is returned.
+ *
+ * \retval lock in CLS_HELD state, with the hold added here still in place
+ * \retval NULL no matching lock is cached, or it could not be made held
+ */
+struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io,
+ const struct cl_lock_descr *need,
+ const char *scope, const void *source)
+{
+ struct cl_object_header *head;
+ struct cl_object *obj;
+ struct cl_lock *lock;
+
+ obj = need->cld_obj;
+ head = cl_object_header(obj);
+
+ do {
+ spin_lock(&head->coh_lock_guard);
+ lock = cl_lock_lookup(env, obj, io, need);
+ spin_unlock(&head->coh_lock_guard);
+ if (lock == NULL)
+ return NULL;
+
+ cl_lock_mutex_get(env, lock);
+ if (lock->cll_state == CLS_INTRANSIT)
+ /* The return value is deliberately ignored. */
+ cl_lock_state_wait(env, lock);
+ if (lock->cll_state == CLS_FREEING) {
+ cl_lock_mutex_put(env, lock);
+ cl_lock_put(env, lock);
+ lock = NULL; /* retry the lookup */
+ }
+ } while (lock == NULL);
+
+ cl_lock_hold_add(env, lock, scope, source);
+ cl_lock_user_add(env, lock);
+ if (lock->cll_state == CLS_CACHED)
+ cl_use_try(env, lock, 1);
+ if (lock->cll_state == CLS_HELD) {
+ cl_lock_mutex_put(env, lock);
+ cl_lock_lockdep_acquire(env, lock, 0);
+ cl_lock_put(env, lock); /* drops the reference taken by the lookup */
+ } else {
+ cl_unuse_try(env, lock);
+ cl_lock_unhold(env, lock, scope, source);
+ cl_lock_mutex_put(env, lock);
+ cl_lock_put(env, lock);
+ lock = NULL;
+ }
+
+ return lock;
+}
+EXPORT_SYMBOL(cl_lock_peek);
+
+/**
+ * Returns the slice of \a lock that belongs to the layer whose device is of
+ * type \a dtype, or NULL when the lock has no such slice.
+ *
+ * \see cl_page_at()
+ */
+const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock,
+				       const struct lu_device_type *dtype)
+{
+	const struct cl_lock_slice *slice;
+
+	LINVRNT(cl_lock_invariant_trusted(NULL, lock));
+	ENTRY;
+
+	list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+		if (slice->cls_obj->co_lu.lo_dev->ld_type != dtype)
+			continue;
+		RETURN(slice);
+	}
+	RETURN(NULL);
+}
+EXPORT_SYMBOL(cl_lock_at);
+
+/*
+ * Book-keeping shared by cl_lock_mutex_get() and cl_lock_mutex_try(), run
+ * once the current thread is known to own the lock mutex: bumps the
+ * recursion depth of \a lock and the per-thread count of mutexed locks.
+ */
+static void cl_lock_mutex_tail(const struct lu_env *env, struct cl_lock *lock)
+{
+	struct cl_thread_counters *ctc = cl_lock_counters(env, lock);
+
+	lock->cll_depth++;
+	ctc->ctc_nr_locks_locked++;
+	lu_ref_add(&ctc->ctc_locks_locked, "cll_guard", lock);
+	cl_lock_trace(D_TRACE, env, "got mutex", lock);
+}
+
+/**
+ * Locks cl_lock object.
+ *
+ * This is used to manipulate cl_lock fields, and to serialize state
+ * transitions in the lock state machine.
+ *
+ * The mutex is recursive for its owner: if the current thread already
+ * guards \a lock, only the recursion depth is increased. Otherwise the
+ * underlying mutex is taken, using the nesting level of the lock's object
+ * as the lockdep subclass.
+ *
+ * \post cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_mutex_put()
+ */
+void cl_lock_mutex_get(const struct lu_env *env, struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ if (lock->cll_guarder == current) {
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(lock->cll_depth > 0);
+ } else {
+ struct cl_object_header *hdr;
+ struct cl_thread_info *info;
+ int i;
+
+ LINVRNT(lock->cll_guarder != current);
+ hdr = cl_object_header(lock->cll_descr.cld_obj);
+ /*
+ * Check that mutexes are taken in the bottom-to-top order:
+ * no lock of a lower nesting level may be mutexed by this
+ * thread at this point.
+ */
+ info = cl_env_info(env);
+ for (i = 0; i < hdr->coh_nesting; ++i)
+ LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0);
+ mutex_lock_nested(&lock->cll_guard, hdr->coh_nesting);
+ lock->cll_guarder = current;
+ LINVRNT(lock->cll_depth == 0);
+ }
+ cl_lock_mutex_tail(env, lock);
+}
+EXPORT_SYMBOL(cl_lock_mutex_get);
+
+/**
+ * Try-locks cl_lock object.
+ *
+ * Like cl_lock_mutex_get(), the mutex is recursive for its owner: a thread
+ * that already guards \a lock always succeeds.
+ *
+ * \retval 0 \a lock was successfully locked
+ *
+ * \retval -EBUSY \a lock cannot be locked right now
+ *
+ * \post ergo(result == 0, cl_lock_is_mutexed(lock))
+ *
+ * \see cl_lock_mutex_get()
+ */
+int cl_lock_mutex_try(const struct lu_env *env, struct cl_lock *lock)
+{
+	LINVRNT(cl_lock_invariant_trusted(env, lock));
+	ENTRY;
+
+	if (lock->cll_guarder != current) {
+		/* not ours yet: one non-blocking attempt */
+		if (!mutex_trylock(&lock->cll_guard))
+			RETURN(-EBUSY);
+		LINVRNT(lock->cll_depth == 0);
+		lock->cll_guarder = current;
+	} else {
+		LINVRNT(lock->cll_depth > 0);
+	}
+	cl_lock_mutex_tail(env, lock);
+	RETURN(0);
+}
+EXPORT_SYMBOL(cl_lock_mutex_try);
+
+/**
+ * Unlocks cl_lock object.
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_mutex_get()
+ */
+void cl_lock_mutex_put(const struct lu_env *env, struct cl_lock *lock)
+{
+	struct cl_thread_counters *ctc;
+
+	LINVRNT(cl_lock_invariant(env, lock));
+	LINVRNT(cl_lock_is_mutexed(lock));
+	LINVRNT(lock->cll_guarder == current);
+	LINVRNT(lock->cll_depth > 0);
+
+	ctc = cl_lock_counters(env, lock);
+	LINVRNT(ctc->ctc_nr_locks_locked > 0);
+
+	/* undo the book-keeping done when the mutex was taken */
+	cl_lock_trace(D_TRACE, env, "put mutex", lock);
+	lu_ref_del(&ctc->ctc_locks_locked, "cll_guard", lock);
+	ctc->ctc_nr_locks_locked--;
+	lock->cll_depth--;
+	if (lock->cll_depth == 0) {
+		/* outermost put: give the underlying mutex up */
+		lock->cll_guarder = NULL;
+		mutex_unlock(&lock->cll_guard);
+	}
+}
+EXPORT_SYMBOL(cl_lock_mutex_put);
+
+/**
+ * Returns true iff lock's mutex is owned by the current thread.
+ */
+int cl_lock_is_mutexed(struct cl_lock *lock)
+{
+ return lock->cll_guarder == current;
+}
+EXPORT_SYMBOL(cl_lock_is_mutexed);
+
+/**
+ * Returns number of cl_lock mutices held by the current thread (environment).
+ */
+int cl_lock_nr_mutexed(const struct lu_env *env)
+{
+ struct cl_thread_info *info;
+ int i;
+ int locked;
+
+ /*
+ * NOTE: if summation across all nesting levels (currently 2) proves
+ * too expensive, a summary counter can be added to
+ * struct cl_thread_info.
+ */
+ info = cl_env_info(env);
+ for (i = 0, locked = 0; i < ARRAY_SIZE(info->clt_counters); ++i)
+ locked += info->clt_counters[i].ctc_nr_locks_locked;
+ return locked;
+}
+EXPORT_SYMBOL(cl_lock_nr_mutexed);
+
+static void cl_lock_cancel0(const struct lu_env *env, struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ ENTRY;
+ if (!(lock->cll_flags & CLF_CANCELLED)) {
+ const struct cl_lock_slice *slice;
+
+ lock->cll_flags |= CLF_CANCELLED;
+ list_for_each_entry_reverse(slice, &lock->cll_layers,
+ cls_linkage) {
+ if (slice->cls_ops->clo_cancel != NULL)
+ slice->cls_ops->clo_cancel(env, slice);
+ }
+ }
+ EXIT;
+}
+
+static void cl_lock_delete0(const struct lu_env *env, struct cl_lock *lock)
+{
+ struct cl_object_header *head;
+ const struct cl_lock_slice *slice;
+
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ ENTRY;
+ if (lock->cll_state < CLS_FREEING) {
+ LASSERT(lock->cll_state != CLS_INTRANSIT);
+ cl_lock_state_set(env, lock, CLS_FREEING);
+
+ head = cl_object_header(lock->cll_descr.cld_obj);
+
+ spin_lock(&head->coh_lock_guard);
+ list_del_init(&lock->cll_linkage);
+ spin_unlock(&head->coh_lock_guard);
+
+ /*
+ * From now on, no new references to this lock can be acquired
+ * by cl_lock_lookup().
+ */
+ list_for_each_entry_reverse(slice, &lock->cll_layers,
+ cls_linkage) {
+ if (slice->cls_ops->clo_delete != NULL)
+ slice->cls_ops->clo_delete(env, slice);
+ }
+ /*
+ * From now on, no new references to this lock can be acquired
+ * by layer-specific means (like a pointer from struct
+ * ldlm_lock in osc, or a pointer from top-lock to sub-lock in
+ * lov).
+ *
+ * Lock will be finally freed in cl_lock_put() when last of
+ * existing references goes away.
+ */
+ }
+ EXIT;
+}
+
+/**
+ * Mod(ifie)s cl_lock::cll_holds counter for a given lock. Also, for a
+ * top-lock (nesting == 0) accounts for this modification in the per-thread
+ * debugging counters. Sub-lock holds can be released by a thread different
+ * from one that acquired it.
+ */
+static void cl_lock_hold_mod(const struct lu_env *env, struct cl_lock *lock,
+ int delta)
+{
+ struct cl_thread_counters *counters;
+ enum clt_nesting_level nesting;
+
+ lock->cll_holds += delta;
+ nesting = cl_lock_nesting(lock);
+ if (nesting == CNL_TOP) {
+ counters = &cl_env_info(env)->clt_counters[CNL_TOP];
+ counters->ctc_nr_held += delta;
+ LASSERT(counters->ctc_nr_held >= 0);
+ }
+}
+
+/**
+ * Mod(ifie)s cl_lock::cll_users counter for a given lock. See
+ * cl_lock_hold_mod() for the explanation of the debugging code.
+ */
+static void cl_lock_used_mod(const struct lu_env *env, struct cl_lock *lock,
+ int delta)
+{
+ struct cl_thread_counters *counters;
+ enum clt_nesting_level nesting;
+
+ lock->cll_users += delta;
+ nesting = cl_lock_nesting(lock);
+ if (nesting == CNL_TOP) {
+ counters = &cl_env_info(env)->clt_counters[CNL_TOP];
+ counters->ctc_nr_used += delta;
+ LASSERT(counters->ctc_nr_used >= 0);
+ }
+}
+
+void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_holds > 0);
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "hold release lock", lock);
+ lu_ref_del(&lock->cll_holders, scope, source);
+ cl_lock_hold_mod(env, lock, -1);
+ if (lock->cll_holds == 0) {
+ CL_LOCK_ASSERT(lock->cll_state != CLS_HELD, env, lock);
+ if (lock->cll_descr.cld_mode == CLM_PHANTOM ||
+ lock->cll_descr.cld_mode == CLM_GROUP ||
+ lock->cll_state != CLS_CACHED)
+ /*
+ * If lock is still phantom or grouplock when user is
+ * done with it---destroy the lock.
+ */
+ lock->cll_flags |= CLF_CANCELPEND|CLF_DOOMED;
+ if (lock->cll_flags & CLF_CANCELPEND) {
+ lock->cll_flags &= ~CLF_CANCELPEND;
+ cl_lock_cancel0(env, lock);
+ }
+ if (lock->cll_flags & CLF_DOOMED) {
+ /* no longer doomed: it's dead... Jim. */
+ lock->cll_flags &= ~CLF_DOOMED;
+ cl_lock_delete0(env, lock);
+ }
+ }
+ EXIT;
+}
+EXPORT_SYMBOL(cl_lock_hold_release);
+
+/**
+ * Waits until lock state is changed.
+ *
+ * This function is called with cl_lock mutex locked, atomically releases
+ * mutex and goes to sleep, waiting for a lock state change (signaled by
+ * cl_lock_signal()), and re-acquires the mutex before return.
+ *
+ * This function is used to wait until lock state machine makes some progress
+ * and to emulate synchronous operations on top of asynchronous lock
+ * interface.
+ *
+ * \retval -ERESTARTSYS wait was interrupted
+ *
+ * \retval 0 wait wasn't interrupted
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_signal()
+ */
+int cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+ wait_queue_t waiter;
+ sigset_t blocked;
+ int result;
+
+ ENTRY;
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_depth == 1);
+ LASSERT(lock->cll_state != CLS_FREEING); /* too late to wait */
+
+ cl_lock_trace(D_DLMTRACE, env, "state wait lock", lock);
+ result = lock->cll_error;
+ if (result == 0) {
+ /* To avoid being interrupted by the 'non-fatal' signals
+ * (SIGCHLD, for instance), we'd block them temporarily.
+ * LU-305 */
+ blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS);
+
+ init_waitqueue_entry_current(&waiter);
+ add_wait_queue(&lock->cll_wq, &waiter);
+ set_current_state(TASK_INTERRUPTIBLE);
+ cl_lock_mutex_put(env, lock);
+
+ LASSERT(cl_lock_nr_mutexed(env) == 0);
+
+ /* Returning ERESTARTSYS instead of EINTR so syscalls
+ * can be restarted if signals are pending here */
+ result = -ERESTARTSYS;
+ if (likely(!OBD_FAIL_CHECK(OBD_FAIL_LOCK_STATE_WAIT_INTR))) {
+ waitq_wait(&waiter, TASK_INTERRUPTIBLE);
+ if (!cfs_signal_pending())
+ result = 0;
+ }
+
+ cl_lock_mutex_get(env, lock);
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&lock->cll_wq, &waiter);
+
+ /* Restore old blocked signals */
+ cfs_restore_sigs(blocked);
+ }
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_state_wait);
+
+static void cl_lock_state_signal(const struct lu_env *env, struct cl_lock *lock,
+ enum cl_lock_state state)
+{
+ const struct cl_lock_slice *slice;
+
+ ENTRY;
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage)
+ if (slice->cls_ops->clo_state != NULL)
+ slice->cls_ops->clo_state(env, slice, state);
+ wake_up_all(&lock->cll_wq);
+ EXIT;
+}
+
+/**
+ * Notifies waiters that lock state changed.
+ *
+ * Wakes up all waiters sleeping in cl_lock_state_wait(), also notifies all
+ * layers about state change by calling cl_lock_operations::clo_state()
+ * top-to-bottom.
+ */
+void cl_lock_signal(const struct lu_env *env, struct cl_lock *lock)
+{
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "state signal lock", lock);
+ cl_lock_state_signal(env, lock, lock->cll_state);
+ EXIT;
+}
+EXPORT_SYMBOL(cl_lock_signal);
+
+/**
+ * Changes lock state.
+ *
+ * This function is invoked to notify layers that lock state changed, possibly
+ * as a result of an asynchronous event such as call-back reception.
+ *
+ * \post lock->cll_state == state
+ *
+ * \see cl_lock_operations::clo_state()
+ */
+void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock,
+ enum cl_lock_state state)
+{
+ ENTRY;
+ LASSERT(lock->cll_state <= state ||
+ (lock->cll_state == CLS_CACHED &&
+ (state == CLS_HELD || /* lock found in cache */
+ state == CLS_NEW || /* sub-lock canceled */
+ state == CLS_INTRANSIT)) ||
+ /* lock is in transit state */
+ lock->cll_state == CLS_INTRANSIT);
+
+ if (lock->cll_state != state) {
+ CS_LOCKSTATE_DEC(lock->cll_descr.cld_obj, lock->cll_state);
+ CS_LOCKSTATE_INC(lock->cll_descr.cld_obj, state);
+
+ cl_lock_state_signal(env, lock, state);
+ lock->cll_state = state;
+ }
+ EXIT;
+}
+EXPORT_SYMBOL(cl_lock_state_set);
+
+static int cl_unuse_try_internal(const struct lu_env *env, struct cl_lock *lock)
+{
+ const struct cl_lock_slice *slice;
+ int result;
+
+ do {
+ result = 0;
+
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_state == CLS_INTRANSIT);
+
+ result = -ENOSYS;
+ list_for_each_entry_reverse(slice, &lock->cll_layers,
+ cls_linkage) {
+ if (slice->cls_ops->clo_unuse != NULL) {
+ result = slice->cls_ops->clo_unuse(env, slice);
+ if (result != 0)
+ break;
+ }
+ }
+ LASSERT(result != -ENOSYS);
+ } while (result == CLO_REPEAT);
+
+ return result;
+}
+
+/**
+ * Yanks lock from the cache (cl_lock_state::CLS_CACHED state) by calling
+ * cl_lock_operations::clo_use() top-to-bottom to notify layers.
+ * If @atomic is set and a layer fails, the lock is unused again to roll it
+ * back, keeping the use operation atomic.
+ */
+int cl_use_try(const struct lu_env *env, struct cl_lock *lock, int atomic)
+{
+ const struct cl_lock_slice *slice;
+ int result;
+ enum cl_lock_state state;
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "use lock", lock);
+
+ LASSERT(lock->cll_state == CLS_CACHED);
+ if (lock->cll_error)
+ RETURN(lock->cll_error);
+
+ result = -ENOSYS;
+ state = cl_lock_intransit(env, lock);
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_use != NULL) {
+ result = slice->cls_ops->clo_use(env, slice);
+ if (result != 0)
+ break;
+ }
+ }
+ LASSERT(result != -ENOSYS);
+
+ LASSERTF(lock->cll_state == CLS_INTRANSIT, "Wrong state %d.\n",
+ lock->cll_state);
+
+ if (result == 0) {
+ state = CLS_HELD;
+ } else {
+ if (result == -ESTALE) {
+ /*
+ * ESTALE means sublock being cancelled
+ * at this time, and set lock state to
+ * be NEW here and ask the caller to repeat.
+ */
+ state = CLS_NEW;
+ result = CLO_REPEAT;
+ }
+
+ /* @atomic means back-off-on-failure. */
+ if (atomic) {
+ int rc;
+ rc = cl_unuse_try_internal(env, lock);
+ /* Vet the results. */
+ if (rc < 0 && result > 0)
+ result = rc;
+ }
+
+ }
+ cl_lock_extransit(env, lock, state);
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_use_try);
+
+/**
+ * Helper for cl_enqueue_try() that calls ->clo_enqueue() across all layers
+ * top-to-bottom.
+ */
+static int cl_enqueue_kick(const struct lu_env *env,
+ struct cl_lock *lock,
+ struct cl_io *io, __u32 flags)
+{
+ int result;
+ const struct cl_lock_slice *slice;
+
+ ENTRY;
+ result = -ENOSYS;
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_enqueue != NULL) {
+ result = slice->cls_ops->clo_enqueue(env,
+ slice, io, flags);
+ if (result != 0)
+ break;
+ }
+ }
+ LASSERT(result != -ENOSYS);
+ RETURN(result);
+}
+
+/**
+ * Tries to enqueue a lock.
+ *
+ * This function is called repeatedly by cl_enqueue() until either lock is
+ * enqueued, or error occurs. This function does not block waiting for
+ * networking communication to complete.
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ * lock->cll_state == CLS_HELD)
+ *
+ * \see cl_enqueue() cl_lock_operations::clo_enqueue()
+ * \see cl_lock_state::CLS_ENQUEUED
+ */
+int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_io *io, __u32 flags)
+{
+ int result;
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "enqueue lock", lock);
+ do {
+ LINVRNT(cl_lock_is_mutexed(lock));
+
+ result = lock->cll_error;
+ if (result != 0)
+ break;
+
+ switch (lock->cll_state) {
+ case CLS_NEW:
+ cl_lock_state_set(env, lock, CLS_QUEUING);
+ /* fall-through */
+ case CLS_QUEUING:
+ /* kick layers. */
+ result = cl_enqueue_kick(env, lock, io, flags);
+ /* For AGL case, the cl_lock::cll_state may
+ * become CLS_HELD already. */
+ if (result == 0 && lock->cll_state == CLS_QUEUING)
+ cl_lock_state_set(env, lock, CLS_ENQUEUED);
+ break;
+ case CLS_INTRANSIT:
+ LASSERT(cl_lock_is_intransit(lock));
+ result = CLO_WAIT;
+ break;
+ case CLS_CACHED:
+ /* yank lock from the cache. */
+ result = cl_use_try(env, lock, 0);
+ break;
+ case CLS_ENQUEUED:
+ case CLS_HELD:
+ result = 0;
+ break;
+ default:
+ case CLS_FREEING:
+ /*
+ * impossible, only held locks with increased
+ * ->cll_holds can be enqueued, and they cannot be
+ * freed.
+ */
+ LBUG();
+ }
+ } while (result == CLO_REPEAT);
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_enqueue_try);
+
+/**
+ * Cancel the conflicting lock found during previous enqueue.
+ *
+ * \retval 0 conflicting lock has been canceled.
+ * \retval -ve error code.
+ */
+int cl_lock_enqueue_wait(const struct lu_env *env,
+ struct cl_lock *lock,
+ int keep_mutex)
+{
+ struct cl_lock *conflict;
+ int rc = 0;
+ ENTRY;
+
+ LASSERT(cl_lock_is_mutexed(lock));
+ LASSERT(lock->cll_state == CLS_QUEUING);
+ LASSERT(lock->cll_conflict != NULL);
+
+ conflict = lock->cll_conflict;
+ lock->cll_conflict = NULL;
+
+ cl_lock_mutex_put(env, lock);
+ LASSERT(cl_lock_nr_mutexed(env) == 0);
+
+ cl_lock_mutex_get(env, conflict);
+ cl_lock_trace(D_DLMTRACE, env, "enqueue wait", conflict);
+ cl_lock_cancel(env, conflict);
+ cl_lock_delete(env, conflict);
+
+ while (conflict->cll_state != CLS_FREEING) {
+ rc = cl_lock_state_wait(env, conflict);
+ if (rc != 0)
+ break;
+ }
+ cl_lock_mutex_put(env, conflict);
+ lu_ref_del(&conflict->cll_reference, "cancel-wait", lock);
+ cl_lock_put(env, conflict);
+
+ if (keep_mutex)
+ cl_lock_mutex_get(env, lock);
+
+ LASSERT(rc <= 0);
+ RETURN(rc);
+}
+EXPORT_SYMBOL(cl_lock_enqueue_wait);
+
+static int cl_enqueue_locked(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_io *io, __u32 enqflags)
+{
+ int result;
+
+ ENTRY;
+
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_holds > 0);
+
+ cl_lock_user_add(env, lock);
+ do {
+ result = cl_enqueue_try(env, lock, io, enqflags);
+ if (result == CLO_WAIT) {
+ if (lock->cll_conflict != NULL)
+ result = cl_lock_enqueue_wait(env, lock, 1);
+ else
+ result = cl_lock_state_wait(env, lock);
+ if (result == 0)
+ continue;
+ }
+ break;
+ } while (1);
+ if (result != 0)
+ cl_unuse_try(env, lock);
+ LASSERT(ergo(result == 0 && !(enqflags & CEF_AGL),
+ lock->cll_state == CLS_ENQUEUED ||
+ lock->cll_state == CLS_HELD));
+ RETURN(result);
+}
+
+/**
+ * Enqueues a lock.
+ *
+ * \pre current thread or io owns a hold on lock.
+ *
+ * \post ergo(result == 0, lock->users increased)
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ * lock->cll_state == CLS_HELD)
+ */
+int cl_enqueue(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_io *io, __u32 enqflags)
+{
+ int result;
+
+ ENTRY;
+
+ cl_lock_lockdep_acquire(env, lock, enqflags);
+ cl_lock_mutex_get(env, lock);
+ result = cl_enqueue_locked(env, lock, io, enqflags);
+ cl_lock_mutex_put(env, lock);
+ if (result != 0)
+ cl_lock_lockdep_release(env, lock);
+ LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ lock->cll_state == CLS_HELD));
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_enqueue);
+
+/**
+ * Tries to unlock a lock.
+ *
+ * This function is called to release underlying resource:
+ * 1. for top lock, the resource is sublocks it held;
+ * 2. for sublock, the resource is the reference to dlmlock.
+ *
+ * cl_unuse_try is a one-shot operation, so it must NOT return CLO_WAIT.
+ *
+ * \see cl_unuse() cl_lock_operations::clo_unuse()
+ * \see cl_lock_state::CLS_CACHED
+ */
+int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock)
+{
+ int result;
+ enum cl_lock_state state = CLS_NEW;
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "unuse lock", lock);
+
+ if (lock->cll_users > 1) {
+ cl_lock_user_del(env, lock);
+ RETURN(0);
+ }
+
+ /* Only if the lock is in CLS_HELD or CLS_ENQUEUED state, it can hold
+ * underlying resources. */
+ if (!(lock->cll_state == CLS_HELD || lock->cll_state == CLS_ENQUEUED)) {
+ cl_lock_user_del(env, lock);
+ RETURN(0);
+ }
+
+ /*
+ * New lock users (->cll_users) are not protecting unlocking
+ * from proceeding. From this point, lock eventually reaches
+ * CLS_CACHED, is reinitialized to CLS_NEW or fails into
+ * CLS_FREEING.
+ */
+ state = cl_lock_intransit(env, lock);
+
+ result = cl_unuse_try_internal(env, lock);
+ LASSERT(lock->cll_state == CLS_INTRANSIT);
+ LASSERT(result != CLO_WAIT);
+ cl_lock_user_del(env, lock);
+ if (result == 0 || result == -ESTALE) {
+ /*
+ * Return lock back to the cache. This is the only
+ * place where lock is moved into CLS_CACHED state.
+ *
+ * If one of ->clo_unuse() methods returned -ESTALE, lock
+ * cannot be placed into cache and has to be
+ * re-initialized. This happens e.g., when a sub-lock was
+ * canceled while unlocking was in progress.
+ */
+ if (state == CLS_HELD && result == 0)
+ state = CLS_CACHED;
+ else
+ state = CLS_NEW;
+ cl_lock_extransit(env, lock, state);
+
+ /*
+ * Hide -ESTALE error.
+ * If the lock is a glimpse lock, and it has multiple
+ * stripes. Assuming that one of its sublock returned -ENAVAIL,
+ * and other sublocks are matched write locks. In this case,
+ * we can't set this lock to error because otherwise some of
+ * its sublocks may not be canceled. This causes some dirty
+ * pages won't be written to OSTs. -jay
+ */
+ result = 0;
+ } else {
+ CERROR("result = %d, this is unlikely!\n", result);
+ state = CLS_NEW;
+ cl_lock_extransit(env, lock, state);
+ }
+ RETURN(result ?: lock->cll_error);
+}
+EXPORT_SYMBOL(cl_unuse_try);
+
+static void cl_unuse_locked(const struct lu_env *env, struct cl_lock *lock)
+{
+ int result;
+ ENTRY;
+
+ result = cl_unuse_try(env, lock);
+ if (result)
+ CL_LOCK_DEBUG(D_ERROR, env, lock, "unuse return %d\n", result);
+
+ EXIT;
+}
+
+/**
+ * Unlocks a lock.
+ */
+void cl_unuse(const struct lu_env *env, struct cl_lock *lock)
+{
+ ENTRY;
+ cl_lock_mutex_get(env, lock);
+ cl_unuse_locked(env, lock);
+ cl_lock_mutex_put(env, lock);
+ cl_lock_lockdep_release(env, lock);
+ EXIT;
+}
+EXPORT_SYMBOL(cl_unuse);
+
+/**
+ * Tries to wait for a lock.
+ *
+ * This function is called repeatedly by cl_wait() until either lock is
+ * granted, or error occurs. This function does not block waiting for network
+ * communication to complete.
+ *
+ * \see cl_wait() cl_lock_operations::clo_wait()
+ * \see cl_lock_state::CLS_HELD
+ */
+int cl_wait_try(const struct lu_env *env, struct cl_lock *lock)
+{
+ const struct cl_lock_slice *slice;
+ int result;
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "wait lock try", lock);
+ do {
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERTF(lock->cll_state == CLS_QUEUING ||
+ lock->cll_state == CLS_ENQUEUED ||
+ lock->cll_state == CLS_HELD ||
+ lock->cll_state == CLS_INTRANSIT,
+ "lock state: %d\n", lock->cll_state);
+ LASSERT(lock->cll_users > 0);
+ LASSERT(lock->cll_holds > 0);
+
+ result = lock->cll_error;
+ if (result != 0)
+ break;
+
+ if (cl_lock_is_intransit(lock)) {
+ result = CLO_WAIT;
+ break;
+ }
+
+ if (lock->cll_state == CLS_HELD)
+ /* nothing to do */
+ break;
+
+ result = -ENOSYS;
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_wait != NULL) {
+ result = slice->cls_ops->clo_wait(env, slice);
+ if (result != 0)
+ break;
+ }
+ }
+ LASSERT(result != -ENOSYS);
+ if (result == 0) {
+ LASSERT(lock->cll_state != CLS_INTRANSIT);
+ cl_lock_state_set(env, lock, CLS_HELD);
+ }
+ } while (result == CLO_REPEAT);
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_wait_try);
+
+/**
+ * Waits until enqueued lock is granted.
+ *
+ * \pre current thread or io owns a hold on the lock
+ * \pre ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ * lock->cll_state == CLS_HELD)
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_HELD)
+ */
+int cl_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+ int result;
+
+ ENTRY;
+ cl_lock_mutex_get(env, lock);
+
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERTF(lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD,
+ "Wrong state %d \n", lock->cll_state);
+ LASSERT(lock->cll_holds > 0);
+
+ do {
+ result = cl_wait_try(env, lock);
+ if (result == CLO_WAIT) {
+ result = cl_lock_state_wait(env, lock);
+ if (result == 0)
+ continue;
+ }
+ break;
+ } while (1);
+ if (result < 0) {
+ cl_unuse_try(env, lock);
+ cl_lock_lockdep_release(env, lock);
+ }
+ cl_lock_trace(D_DLMTRACE, env, "wait lock", lock);
+ cl_lock_mutex_put(env, lock);
+ LASSERT(ergo(result == 0, lock->cll_state == CLS_HELD));
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_wait);
+
+/**
+ * Executes cl_lock_operations::clo_weigh(), and sums results to estimate lock
+ * value.
+ */
+unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock)
+{
+ const struct cl_lock_slice *slice;
+ unsigned long pound;
+ unsigned long ounce;
+
+ ENTRY;
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ pound = 0;
+ list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_weigh != NULL) {
+ ounce = slice->cls_ops->clo_weigh(env, slice);
+ pound += ounce;
+ if (pound < ounce) /* over-weight^Wflow */
+ pound = ~0UL;
+ }
+ }
+ RETURN(pound);
+}
+EXPORT_SYMBOL(cl_lock_weigh);
+
+/**
+ * Notifies layers that lock description changed.
+ *
+ * The server can grant client a lock different from one that was requested
+ * (e.g., larger in extent). This method is called when the actually granted
+ * lock description becomes known, to let layers accommodate the changed lock
+ * description.
+ *
+ * \see cl_lock_operations::clo_modify()
+ */
+int cl_lock_modify(const struct lu_env *env, struct cl_lock *lock,
+ const struct cl_lock_descr *desc)
+{
+ const struct cl_lock_slice *slice;
+ struct cl_object *obj = lock->cll_descr.cld_obj;
+ struct cl_object_header *hdr = cl_object_header(obj);
+ int result;
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "modify lock", lock);
+ /* don't allow object to change */
+ LASSERT(obj == desc->cld_obj);
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_modify != NULL) {
+ result = slice->cls_ops->clo_modify(env, slice, desc);
+ if (result != 0)
+ RETURN(result);
+ }
+ }
+ CL_LOCK_DEBUG(D_DLMTRACE, env, lock, " -> "DDESCR"@"DFID"\n",
+ PDESCR(desc), PFID(lu_object_fid(&desc->cld_obj->co_lu)));
+ /*
+ * Just replace description in place. Nothing more is needed for
+ * now. If locks were indexed according to their extent and/or mode,
+ * that index would have to be updated here.
+ */
+ spin_lock(&hdr->coh_lock_guard);
+ lock->cll_descr = *desc;
+ spin_unlock(&hdr->coh_lock_guard);
+ RETURN(0);
+}
+EXPORT_SYMBOL(cl_lock_modify);
+
+/**
+ * Initializes lock closure with a given origin.
+ *
+ * \see cl_lock_closure
+ */
+void cl_lock_closure_init(const struct lu_env *env,
+ struct cl_lock_closure *closure,
+ struct cl_lock *origin, int wait)
+{
+ LINVRNT(cl_lock_is_mutexed(origin));
+ LINVRNT(cl_lock_invariant(env, origin));
+
+ INIT_LIST_HEAD(&closure->clc_list);
+ closure->clc_origin = origin;
+ closure->clc_wait = wait;
+ closure->clc_nr = 0;
+}
+EXPORT_SYMBOL(cl_lock_closure_init);
+
+/**
+ * Builds a closure of \a lock.
+ *
+ * Building of a closure consists of adding initial lock (\a lock) into it,
+ * and calling cl_lock_operations::clo_closure() methods of \a lock. These
+ * methods might call cl_lock_closure_build() recursively again, adding more
+ * locks to the closure, etc.
+ *
+ * \see cl_lock_closure
+ */
+int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_lock_closure *closure)
+{
+ const struct cl_lock_slice *slice;
+ int result;
+
+ ENTRY;
+ LINVRNT(cl_lock_is_mutexed(closure->clc_origin));
+ LINVRNT(cl_lock_invariant(env, closure->clc_origin));
+
+ result = cl_lock_enclosure(env, lock, closure);
+ if (result == 0) {
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_closure != NULL) {
+ result = slice->cls_ops->clo_closure(env, slice,
+ closure);
+ if (result != 0)
+ break;
+ }
+ }
+ }
+ if (result != 0)
+ cl_lock_disclosure(env, closure);
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_closure_build);
+
+/**
+ * Adds new lock to a closure.
+ *
+ * Try-locks \a lock and if succeeded, adds it to the closure (never more than
+ * once). If try-lock failed, returns CLO_REPEAT, after optionally waiting
+ * until next try-lock is likely to succeed.
+ */
+int cl_lock_enclosure(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_lock_closure *closure)
+{
+ int result = 0;
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "enclosure lock", lock);
+ if (!cl_lock_mutex_try(env, lock)) {
+ /*
+ * If lock->cll_inclosure is not empty, lock is already in
+ * this closure.
+ */
+ if (list_empty(&lock->cll_inclosure)) {
+ cl_lock_get_trust(lock);
+ lu_ref_add(&lock->cll_reference, "closure", closure);
+ list_add(&lock->cll_inclosure, &closure->clc_list);
+ closure->clc_nr++;
+ } else
+ cl_lock_mutex_put(env, lock);
+ result = 0;
+ } else {
+ cl_lock_disclosure(env, closure);
+ if (closure->clc_wait) {
+ cl_lock_get_trust(lock);
+ lu_ref_add(&lock->cll_reference, "closure-w", closure);
+ cl_lock_mutex_put(env, closure->clc_origin);
+
+ LASSERT(cl_lock_nr_mutexed(env) == 0);
+ cl_lock_mutex_get(env, lock);
+ cl_lock_mutex_put(env, lock);
+
+ cl_lock_mutex_get(env, closure->clc_origin);
+ lu_ref_del(&lock->cll_reference, "closure-w", closure);
+ cl_lock_put(env, lock);
+ }
+ result = CLO_REPEAT;
+ }
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_enclosure);
+
+/** Releases mutices of enclosed locks. */
+void cl_lock_disclosure(const struct lu_env *env,
+ struct cl_lock_closure *closure)
+{
+ struct cl_lock *scan;
+ struct cl_lock *temp;
+
+ cl_lock_trace(D_DLMTRACE, env, "disclosure lock", closure->clc_origin);
+ list_for_each_entry_safe(scan, temp, &closure->clc_list,
+ cll_inclosure){
+ list_del_init(&scan->cll_inclosure);
+ cl_lock_mutex_put(env, scan);
+ lu_ref_del(&scan->cll_reference, "closure", closure);
+ cl_lock_put(env, scan);
+ closure->clc_nr--;
+ }
+ LASSERT(closure->clc_nr == 0);
+}
+EXPORT_SYMBOL(cl_lock_disclosure);
+
+/** Finalizes a closure. */
+void cl_lock_closure_fini(struct cl_lock_closure *closure)
+{
+ LASSERT(closure->clc_nr == 0);
+ LASSERT(list_empty(&closure->clc_list));
+}
+EXPORT_SYMBOL(cl_lock_closure_fini);
+
+/**
+ * Destroys this lock. Notifies layers (bottom-to-top) that lock is being
+ * destroyed, then destroy the lock. If there are holds on the lock, postpone
+ * destruction until all holds are released. This is called when a decision is
+ * made to destroy the lock in the future. E.g., when a blocking AST is
+ * received on it, or fatal communication error happens.
+ *
+ * Caller must have a reference on this lock to prevent a situation, when
+ * deleted lock lingers in memory for indefinite time, because nobody calls
+ * cl_lock_put() to finish it.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ * \pre ergo(cl_lock_nesting(lock) == CNL_TOP,
+ * cl_lock_nr_mutexed(env) == 1)
+ * [i.e., if a top-lock is deleted, mutices of no other locks can be
+ * held, as deletion of sub-locks might require releasing a top-lock
+ * mutex]
+ *
+ * \see cl_lock_operations::clo_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(ergo(cl_lock_nesting(lock) == CNL_TOP,
+ cl_lock_nr_mutexed(env) == 1));
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "delete lock", lock);
+ if (lock->cll_holds == 0)
+ cl_lock_delete0(env, lock);
+ else
+ lock->cll_flags |= CLF_DOOMED;
+ EXIT;
+}
+EXPORT_SYMBOL(cl_lock_delete);
+
+/**
+ * Mark lock as irrecoverably failed, and mark it for destruction. This
+ * happens when, e.g., server fails to grant a lock to us, or networking
+ * time-out happens.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ *
+ * \see cl_lock_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_error(const struct lu_env *env, struct cl_lock *lock, int error)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ ENTRY;
+ if (lock->cll_error == 0 && error != 0) {
+ cl_lock_trace(D_DLMTRACE, env, "set lock error", lock);
+ lock->cll_error = error;
+ cl_lock_signal(env, lock);
+ cl_lock_cancel(env, lock);
+ cl_lock_delete(env, lock);
+ }
+ EXIT;
+}
+EXPORT_SYMBOL(cl_lock_error);
+
+/**
+ * Cancels this lock. Notifies layers
+ * (bottom-to-top) that lock is being cancelled, then destroy the lock. If
+ * there are holds on the lock, postpone cancellation until
+ * all holds are released.
+ *
+ * Cancellation notification is delivered to layers at most once.
+ *
+ * \see cl_lock_operations::clo_cancel()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock);
+ if (lock->cll_holds == 0)
+ cl_lock_cancel0(env, lock);
+ else
+ lock->cll_flags |= CLF_CANCELPEND;
+ EXIT;
+}
+EXPORT_SYMBOL(cl_lock_cancel);
+
+/**
+ * Finds an existing lock covering given index and optionally different from a
+ * given \a except lock.
+ */
+struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env,
+ struct cl_object *obj, pgoff_t index,
+ struct cl_lock *except,
+ int pending, int canceld)
+{
+ struct cl_object_header *head;
+ struct cl_lock *scan;
+ struct cl_lock *lock;
+ struct cl_lock_descr *need;
+
+ ENTRY;
+
+ head = cl_object_header(obj);
+ need = &cl_env_info(env)->clt_descr;
+ lock = NULL;
+
+ need->cld_mode = CLM_READ; /* CLM_READ matches both READ & WRITE, but
+ * not PHANTOM */
+ need->cld_start = need->cld_end = index;
+ need->cld_enq_flags = 0;
+
+ spin_lock(&head->coh_lock_guard);
+ /* It is fine to match any group lock since there could be only one
+ * with a unique gid and it conflicts with all other lock modes too */
+ list_for_each_entry(scan, &head->coh_locks, cll_linkage) {
+ if (scan != except &&
+ (scan->cll_descr.cld_mode == CLM_GROUP ||
+ cl_lock_ext_match(&scan->cll_descr, need)) &&
+ scan->cll_state >= CLS_HELD &&
+ scan->cll_state < CLS_FREEING &&
+ /*
+ * This check is racy as the lock can be canceled right
+ * after it is done, but this is fine, because page exists
+ * already.
+ */
+ (canceld || !(scan->cll_flags & CLF_CANCELLED)) &&
+ (pending || !(scan->cll_flags & CLF_CANCELPEND))) {
+ /* Don't increase cs_hit here since this
+ * is just a helper function. */
+ cl_lock_get_trust(scan);
+ lock = scan;
+ break;
+ }
+ }
+ spin_unlock(&head->coh_lock_guard);
+ RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_at_pgoff);
+
+/**
+ * Calculate the page offset at the layer of @lock.
+ * At the time of this writing, @page is top page and @lock is sub lock.
+ */
+static pgoff_t pgoff_at_lock(struct cl_page *page, struct cl_lock *lock)
+{
+ struct lu_device_type *dtype;
+ const struct cl_page_slice *slice;
+
+ dtype = lock->cll_descr.cld_obj->co_lu.lo_dev->ld_type;
+ slice = cl_page_at(page, dtype);
+ LASSERT(slice != NULL);
+ return slice->cpl_page->cp_index;
+}
+
+/**
+ * Gang-lookup callback: keep @page if another lock covers it, else discard it.
+ */
+static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page, void *cbdata)
+{
+ struct cl_thread_info *info = cl_env_info(env);
+ struct cl_lock *lock = cbdata; /* lock whose pages are being scanned */
+ pgoff_t index = pgoff_at_lock(page, lock);
+
+ if (index >= info->clt_fn_index) { /* not known to be covered yet */
+ struct cl_lock *tmp;
+
+ /* refresh non-overlapped index */
+ tmp = cl_lock_at_pgoff(env, lock->cll_descr.cld_obj, index,
+ lock, 1, 0); /* except = lock, pending = 1, canceld = 0 */
+ if (tmp != NULL) {
+ /* Cache the first-non-overlapped index so as to skip
+ * all pages within [index, clt_fn_index). This
+ * is safe because if tmp lock is canceled, it will
+ * discard these pages. */
+ info->clt_fn_index = tmp->cll_descr.cld_end + 1;
+ if (tmp->cll_descr.cld_end == CL_PAGE_EOF) /* presumably avoids "+ 1" wrap */
+ info->clt_fn_index = CL_PAGE_EOF;
+ cl_lock_put(env, tmp); /* drop reference taken by cl_lock_at_pgoff() */
+ } else if (cl_page_own(env, io, page) == 0) {
+ /* no other lock covers the page: own it, then discard it */
+ cl_page_unmap(env, io, page);
+ cl_page_discard(env, io, page);
+ cl_page_disown(env, io, page);
+ } else {
+ LASSERT(page->cp_state == CPS_FREEING); /* own failed: page is being freed */
+ }
+ }
+
+ info->clt_next_index = index + 1; /* where the next lookup pass resumes */
+ return CLP_GANG_OKAY;
+}
+
+static int discard_cb(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page, void *cbdata)
+{
+ struct cl_thread_info *info = cl_env_info(env);
+ struct cl_lock *lock = cbdata;
+ /* Gang-lookup callback for non-read locks: discard @page unconditionally. */
+ LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE);
+ KLASSERT(ergo(page->cp_type == CPT_CACHEABLE,
+ !PageWriteback(cl_page_vmpage(env, page))));
+ KLASSERT(ergo(page->cp_type == CPT_CACHEABLE,
+ !PageDirty(cl_page_vmpage(env, page))));
+
+ info->clt_next_index = pgoff_at_lock(page, lock) + 1; /* resume point */
+ if (cl_page_own(env, io, page) == 0) {
+ /* discard the page */
+ cl_page_unmap(env, io, page);
+ cl_page_discard(env, io, page);
+ cl_page_disown(env, io, page);
+ } else {
+ LASSERT(page->cp_state == CPS_FREEING); /* own failed: page is being freed */
+ }
+
+ return CLP_GANG_OKAY;
+}
+
+/**
+ * Discard pages protected by the given lock. This function traverses radix
+ * tree to find all covering pages and discard them. If a page is being covered
+ * by other locks, it should remain in cache (checked for CLM_READ locks only).
+ * Returns the cl_io_init() status; page lookup results are not propagated.
+ * If error happens on any step, the process continues anyway (the reasoning
+ * behind this being that lock cancellation cannot be delayed indefinitely).
+ */
+int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock)
+{
+ struct cl_thread_info *info = cl_env_info(env);
+ struct cl_io *io = &info->clt_io; /* per-environment scratch io */
+ struct cl_lock_descr *descr = &lock->cll_descr;
+ cl_page_gang_cb_t cb;
+ int res;
+ int result;
+
+ LINVRNT(cl_lock_invariant(env, lock));
+ ENTRY;
+
+ io->ci_obj = cl_object_top(descr->cld_obj);
+ io->ci_ignore_layout = 1;
+ result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
+ if (result != 0)
+ GOTO(out, result); /* cl_io_fini() is still called at out: */
+
+ cb = descr->cld_mode == CLM_READ ? check_and_discard_cb : discard_cb;
+ info->clt_fn_index = info->clt_next_index = descr->cld_start;
+ do {
+ res = cl_page_gang_lookup(env, descr->cld_obj, io,
+ info->clt_next_index, descr->cld_end,
+ cb, (void *)lock);
+ if (info->clt_next_index > descr->cld_end) /* callbacks passed lock end */
+ break;
+
+ if (res == CLP_GANG_RESCHED)
+ cond_resched();
+ } while (res != CLP_GANG_OKAY); /* resume from clt_next_index */
+out:
+ cl_io_fini(env, io);
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_discard_pages);
+
+/**
+ * Eliminate all locks for a given object.
+ *
+ * Caller has to guarantee that no lock is in active use.
+ *
+ * \param cancel when this is set, cl_locks_prune() cancels locks before
+ * destroying.
+ */
+void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int cancel)
+{
+ struct cl_object_header *head;
+ struct cl_lock *lock;
+
+ ENTRY;
+ head = cl_object_header(obj);
+ /*
+ * If locks are destroyed without cancellation, all pages must be
+ * already destroyed (as otherwise they will be left unprotected).
+ */
+ LASSERT(ergo(!cancel,
+ head->coh_tree.rnode == NULL && head->coh_pages == 0));
+
+ spin_lock(&head->coh_lock_guard);
+ while (!list_empty(&head->coh_locks)) {
+ lock = container_of(head->coh_locks.next,
+ struct cl_lock, cll_linkage); /* always take the first lock */
+ cl_lock_get_trust(lock); /* pin the lock before dropping the guard */
+ spin_unlock(&head->coh_lock_guard);
+ lu_ref_add(&lock->cll_reference, "prune", current);
+
+again:
+ cl_lock_mutex_get(env, lock);
+ if (lock->cll_state < CLS_FREEING) {
+ LASSERT(lock->cll_users <= 1);
+ if (unlikely(lock->cll_users == 1)) {
+ struct l_wait_info lwi = { 0 };
+
+ cl_lock_mutex_put(env, lock); /* do not sleep under the mutex */
+ l_wait_event(lock->cll_wq,
+ lock->cll_users == 0,
+ &lwi);
+ goto again; /* re-take the mutex and re-check the state */
+ }
+
+ if (cancel)
+ cl_lock_cancel(env, lock);
+ cl_lock_delete(env, lock); /* presumably unlinks it from coh_locks */
+ }
+ cl_lock_mutex_put(env, lock);
+ lu_ref_del(&lock->cll_reference, "prune", current);
+ cl_lock_put(env, lock); /* pairs with cl_lock_get_trust() above */
+ spin_lock(&head->coh_lock_guard); /* guard held for list_empty() */
+ }
+ spin_unlock(&head->coh_lock_guard);
+ EXIT;
+}
+EXPORT_SYMBOL(cl_locks_prune);
+
+static struct cl_lock *cl_lock_hold_mutex(const struct lu_env *env,
+ const struct cl_io *io,
+ const struct cl_lock_descr *need,
+ const char *scope, const void *source)
+{
+ struct cl_lock *lock;
+ /* Find a usable lock; return it held and with its mutex taken, or ERR_PTR. */
+ ENTRY;
+
+ while (1) {
+ lock = cl_lock_find(env, io, need);
+ if (IS_ERR(lock)) /* lookup failed: hand the error pointer back */
+ break;
+ cl_lock_mutex_get(env, lock);
+ if (lock->cll_state < CLS_FREEING &&
+ !(lock->cll_flags & CLF_CANCELLED)) {
+ cl_lock_hold_mod(env, lock, +1);
+ lu_ref_add(&lock->cll_holders, scope, source);
+ lu_ref_add(&lock->cll_reference, scope, source);
+ break; /* NB: mutex is still held on return */
+ }
+ cl_lock_mutex_put(env, lock); /* freeing or cancelled lock: */
+ cl_lock_put(env, lock); /* drop it and look again */
+ }
+ RETURN(lock);
+}
+
+/**
+ * Returns a lock matching \a need description with a reference and a hold on
+ * it, or an error pointer if the lookup failed.
+ *
+ * This is much like cl_lock_find(), except that cl_lock_hold() additionally
+ * guarantees that lock is not in the CLS_FREEING state on return.
+ */
+struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io,
+ const struct cl_lock_descr *need,
+ const char *scope, const void *source)
+{
+ struct cl_lock *lock;
+
+ ENTRY;
+
+ lock = cl_lock_hold_mutex(env, io, need, scope, source);
+ if (!IS_ERR(lock))
+ cl_lock_mutex_put(env, lock); /* keep the hold, release the mutex */
+ RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_hold);
+
+/**
+ * Main high-level entry point of cl_lock interface that finds existing or
+ * enqueues new lock matching given description (NULL is returned for CEF_AGL).
+ */
+struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io,
+ const struct cl_lock_descr *need,
+ const char *scope, const void *source)
+{
+ struct cl_lock *lock;
+ int rc;
+ __u32 enqflags = need->cld_enq_flags;
+
+ ENTRY;
+ do {
+ lock = cl_lock_hold_mutex(env, io, need, scope, source);
+ if (IS_ERR(lock))
+ break;
+
+ rc = cl_enqueue_locked(env, lock, io, enqflags);
+ if (rc == 0) {
+ if (cl_lock_fits_into(env, lock, need, io)) {
+ if (!(enqflags & CEF_AGL)) {
+ cl_lock_mutex_put(env, lock);
+ cl_lock_lockdep_acquire(env, lock,
+ enqflags);
+ break; /* success: lock is returned held */
+ }
+ rc = 1; /* AGL enqueue done: the lock itself is not returned */
+ }
+ cl_unuse_locked(env, lock);
+ }
+ cl_lock_trace(D_DLMTRACE, env,
+ rc <= 0 ? "enqueue failed" : "agl succeed", lock);
+ cl_lock_hold_release(env, lock, scope, source); /* undo the hold */
+ cl_lock_mutex_put(env, lock);
+ lu_ref_del(&lock->cll_reference, scope, source);
+ cl_lock_put(env, lock);
+ if (rc > 0) {
+ LASSERT(enqflags & CEF_AGL);
+ lock = NULL;
+ } else if (rc != 0) {
+ lock = ERR_PTR(rc);
+ }
+ } while (rc == 0); /* enqueued lock did not fit into @need: retry */
+ RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_request);
+
+/**
+ * Adds a hold (and a reference) to a known lock; caller holds the lock mutex.
+ */
+void cl_lock_hold_add(const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_state != CLS_FREEING);
+
+ ENTRY;
+ cl_lock_hold_mod(env, lock, +1);
+ cl_lock_get(lock); /* the new hold carries its own reference */
+ lu_ref_add(&lock->cll_holders, scope, source);
+ lu_ref_add(&lock->cll_reference, scope, source);
+ EXIT;
+}
+EXPORT_SYMBOL(cl_lock_hold_add);
+
+/**
+ * Releases a hold and a reference on a lock, on which caller acquired a
+ * mutex. Unlike cl_lock_release(), the mutex is not taken here.
+ */
+void cl_lock_unhold(const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source)
+{
+ LINVRNT(cl_lock_invariant(env, lock));
+ ENTRY;
+ cl_lock_hold_release(env, lock, scope, source);
+ lu_ref_del(&lock->cll_reference, scope, source);
+ cl_lock_put(env, lock); /* reference that came with the hold */
+ EXIT;
+}
+EXPORT_SYMBOL(cl_lock_unhold);
+
+/**
+ * Releases a hold and a reference on a lock, obtained by cl_lock_hold().
+ */
+void cl_lock_release(const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source)
+{
+ LINVRNT(cl_lock_invariant(env, lock));
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "release lock", lock);
+ cl_lock_mutex_get(env, lock); /* hold is released under the lock mutex */
+ cl_lock_hold_release(env, lock, scope, source);
+ cl_lock_mutex_put(env, lock);
+ lu_ref_del(&lock->cll_reference, scope, source);
+ cl_lock_put(env, lock); /* reference that came with the hold */
+ EXIT;
+}
+EXPORT_SYMBOL(cl_lock_release);
+
+void cl_lock_user_add(const struct lu_env *env, struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ /* Account one more user of @lock; the caller holds the lock mutex. */
+ ENTRY;
+ cl_lock_used_mod(env, lock, +1);
+ EXIT;
+}
+EXPORT_SYMBOL(cl_lock_user_add);
+
+void cl_lock_user_del(const struct lu_env *env, struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_users > 0);
+ /* Drop one user of @lock; the caller holds the lock mutex. */
+ ENTRY;
+ cl_lock_used_mod(env, lock, -1);
+ if (lock->cll_users == 0)
+ wake_up_all(&lock->cll_wq); /* e.g. cl_locks_prune() waits for this */
+ EXIT;
+}
+EXPORT_SYMBOL(cl_lock_user_del);
+
+const char *cl_lock_mode_name(const enum cl_lock_mode mode)
+{
+ static const char *names[] = { /* one-letter tag per lock mode */
+ [CLM_PHANTOM] = "P",
+ [CLM_READ] = "R",
+ [CLM_WRITE] = "W",
+ [CLM_GROUP] = "G"
+ };
+ if (0 <= mode && mode < ARRAY_SIZE(names))
+ return names[mode];
+ else
+ return "U"; /* mode outside of the table */
+}
+EXPORT_SYMBOL(cl_lock_mode_name);
+
+/**
+ * Prints human readable representation of a lock description via \a printer.
+ */
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer,
+ const struct cl_lock_descr *descr)
+{
+ const struct lu_fid *fid;
+ /* Output format: the description, then "@", then the FID of its object. */
+ fid = lu_object_fid(&descr->cld_obj->co_lu);
+ (*printer)(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid));
+}
+EXPORT_SYMBOL(cl_lock_descr_print);
+
+/**
+ * Prints human readable representation of \a lock through \a printer.
+ */
+void cl_lock_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer, const struct cl_lock *lock)
+{
+ const struct cl_lock_slice *slice;
+ (*printer)(env, cookie, "lock@%p[%d %d %d %d %d %08lx] ",
+ lock, atomic_read(&lock->cll_ref), /* ref, state, error, */
+ lock->cll_state, lock->cll_error, lock->cll_holds,
+ lock->cll_users, lock->cll_flags); /* holds, users, flags */
+ cl_lock_descr_print(env, cookie, printer, &lock->cll_descr);
+ (*printer)(env, cookie, " {\n");
+ /* one line per layer; clo_print() is optional */
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ (*printer)(env, cookie, " %s@%p: ",
+ slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name,
+ slice);
+ if (slice->cls_ops->clo_print != NULL)
+ slice->cls_ops->clo_print(env, cookie, printer, slice);
+ (*printer)(env, cookie, "\n");
+ }
+ (*printer)(env, cookie, "} lock@%p\n", lock);
+}
+EXPORT_SYMBOL(cl_lock_print);
+
+int cl_lock_init(void)
+{
+ return lu_kmem_init(cl_lock_caches); /* set up caches listed in cl_lock_caches */
+}
+
+void cl_lock_fini(void)
+{
+ lu_kmem_fini(cl_lock_caches); /* dual to cl_lock_init() */
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_object.c b/drivers/staging/lustre/lustre/obdclass/cl_object.c
new file mode 100644
index 000000000000..cdb5fba04591
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_object.c
@@ -0,0 +1,1148 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Lustre Object.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+/*
+ * Locking.
+ *
+ * i_mutex
+ * PG_locked
+ * ->coh_page_guard
+ * ->coh_lock_guard
+ * ->coh_attr_guard
+ * ->ls_guard
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+/* class_put_type() */
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+#include <linux/libcfs/libcfs_hash.h> /* for cfs_hash stuff */
+#include <cl_object.h>
+#include "cl_internal.h"
+
+static struct kmem_cache *cl_env_kmem;
+
+/** Lock class of cl_object_header::coh_page_guard */
+static struct lock_class_key cl_page_guard_class;
+/** Lock class of cl_object_header::coh_lock_guard */
+static struct lock_class_key cl_lock_guard_class;
+/** Lock class of cl_object_header::coh_attr_guard */
+static struct lock_class_key cl_attr_guard_class;
+
+extern __u32 lu_context_tags_default;
+extern __u32 lu_session_tags_default;
+/**
+ * Initialize cl_object_header; returns the lu_object_header_init() result.
+ */
+int cl_object_header_init(struct cl_object_header *h)
+{
+ int result;
+
+ ENTRY;
+ result = lu_object_header_init(&h->coh_lu);
+ if (result == 0) { /* cl-level fields are set up only on success */
+ spin_lock_init(&h->coh_page_guard);
+ spin_lock_init(&h->coh_lock_guard);
+ spin_lock_init(&h->coh_attr_guard);
+ lockdep_set_class(&h->coh_page_guard, &cl_page_guard_class);
+ lockdep_set_class(&h->coh_lock_guard, &cl_lock_guard_class);
+ lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class);
+ h->coh_pages = 0;
+ /* XXX hard coded GFP_* mask. */
+ INIT_RADIX_TREE(&h->coh_tree, GFP_ATOMIC);
+ INIT_LIST_HEAD(&h->coh_locks);
+ h->coh_page_bufsize = ALIGN(sizeof(struct cl_page), 8); /* rounded up to 8 */
+ }
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_header_init);
+
+/**
+ * Finalize cl_object_header. The list of locks must already be empty.
+ */
+void cl_object_header_fini(struct cl_object_header *h)
+{
+ LASSERT(list_empty(&h->coh_locks));
+ lu_object_header_fini(&h->coh_lu); /* dual to lu_object_header_init() */
+}
+EXPORT_SYMBOL(cl_object_header_fini);
+
+/**
+ * Returns a cl_object with a given \a fid.
+ *
+ * Returns either cached or newly created object. Additional reference on the
+ * returned object is acquired.
+ *
+ * \see lu_object_find(), cl_page_find(), cl_lock_find()
+ */
+struct cl_object *cl_object_find(const struct lu_env *env,
+ struct cl_device *cd, const struct lu_fid *fid,
+ const struct cl_object_conf *c)
+{
+ might_sleep(); /* the lookup may block */
+ return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu));
+}
+EXPORT_SYMBOL(cl_object_find);
+
+/**
+ * Releases a reference on \a o.
+ *
+ * When last reference is released object is returned to the cache, unless
+ * lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in its header.
+ *
+ * \see cl_page_put(), cl_lock_put().
+ */
+void cl_object_put(const struct lu_env *env, struct cl_object *o)
+{
+ lu_object_put(env, &o->co_lu); /* reference counting lives in lu_object */
+}
+EXPORT_SYMBOL(cl_object_put);
+
+/**
+ * Acquire an additional reference to the object \a o.
+ *
+ * This can only be used to acquire _additional_ reference, i.e., caller
+ * already has to possess at least one reference to \a o before calling this.
+ *
+ * \see cl_page_get(), cl_lock_get().
+ */
+void cl_object_get(struct cl_object *o)
+{
+ lu_object_get(&o->co_lu); /* reference counting lives in lu_object */
+}
+EXPORT_SYMBOL(cl_object_get);
+
+/**
+ * Returns the top-object for a given \a o.
+ *
+ * \see cl_page_top(), cl_io_top()
+ */
+struct cl_object *cl_object_top(struct cl_object *o)
+{
+ struct cl_object_header *hdr = cl_object_header(o);
+ struct cl_object *top;
+
+ while (hdr->coh_parent != NULL) /* climb to the header without a parent */
+ hdr = hdr->coh_parent;
+
+ top = lu2cl(lu_object_top(&hdr->coh_lu)); /* top layer of that header */
+ CDEBUG(D_TRACE, "%p -> %p\n", o, top);
+ return top;
+}
+EXPORT_SYMBOL(cl_object_top);
+
+/**
+ * Returns pointer to the lock protecting data-attributes for the given object
+ * \a o.
+ *
+ * Data-attributes are protected by the cl_object_header::coh_attr_guard
+ * spin-lock in the top-object.
+ *
+ * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get().
+ */
+static spinlock_t *cl_object_attr_guard(struct cl_object *o)
+{
+ return &cl_object_header(cl_object_top(o))->coh_attr_guard; /* one guard per stack */
+}
+
+/**
+ * Locks data-attributes.
+ *
+ * Prevents data-attributes from changing, until lock is released by
+ * cl_object_attr_unlock(). This has to be called before calls to
+ * cl_object_attr_get(), cl_object_attr_set().
+ */
+void cl_object_attr_lock(struct cl_object *o)
+{
+ spin_lock(cl_object_attr_guard(o)); /* spin-lock: no sleeping while held */
+}
+EXPORT_SYMBOL(cl_object_attr_lock);
+
+/**
+ * Releases data-attributes lock, acquired by cl_object_attr_lock().
+ */
+void cl_object_attr_unlock(struct cl_object *o)
+{
+ spin_unlock(cl_object_attr_guard(o)); /* same guard cl_object_attr_lock() took */
+}
+EXPORT_SYMBOL(cl_object_attr_unlock);
+
+/**
+ * Returns data-attributes of an object \a obj.
+ *
+ * Every layer is asked (by calling cl_object_operations::coo_attr_get())
+ * top-to-bottom to fill in parts of \a attr that this layer is responsible
+ * for. A layer stops the walk by returning non-zero; positive is reported as 0.
+ */
+int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj,
+ struct cl_attr *attr)
+{
+ struct lu_object_header *top;
+ int result;
+
+ LASSERT(spin_is_locked(cl_object_attr_guard(obj))); /* cl_object_attr_lock() */
+ ENTRY;
+
+ top = obj->co_lu.lo_header;
+ result = 0;
+ list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) {
+ if (obj->co_ops->coo_attr_get != NULL) { /* method is optional */
+ result = obj->co_ops->coo_attr_get(env, obj, attr);
+ if (result != 0) {
+ if (result > 0) /* "stop here", not an error */
+ result = 0;
+ break;
+ }
+ }
+ }
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_attr_get);
+
+/**
+ * Updates data-attributes of an object \a obj.
+ *
+ * Only attributes, mentioned in a validness bit-mask \a v are
+ * updated. Calls cl_object_operations::coo_attr_set() on every layer, bottom
+ * to top, until a layer returns non-zero (a positive value is reported as 0).
+ */
+int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_attr *attr, unsigned v)
+{
+ struct lu_object_header *top;
+ int result;
+
+ LASSERT(spin_is_locked(cl_object_attr_guard(obj))); /* cl_object_attr_lock() */
+ ENTRY;
+
+ top = obj->co_lu.lo_header;
+ result = 0;
+ list_for_each_entry_reverse(obj, &top->loh_layers,
+ co_lu.lo_linkage) {
+ if (obj->co_ops->coo_attr_set != NULL) { /* method is optional */
+ result = obj->co_ops->coo_attr_set(env, obj, attr, v);
+ if (result != 0) {
+ if (result > 0) /* "stop here", not an error */
+ result = 0;
+ break;
+ }
+ }
+ }
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_attr_set);
+
+/**
+ * Notifies layers (bottom-to-top) that glimpse AST was received.
+ *
+ * Layers have to fill \a lvb fields with information that will be shipped
+ * back to glimpse issuer. The first non-zero layer result is returned.
+ *
+ * \see cl_lock_operations::clo_glimpse()
+ */
+int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj,
+ struct ost_lvb *lvb)
+{
+ struct lu_object_header *top;
+ int result;
+
+ ENTRY;
+ top = obj->co_lu.lo_header;
+ result = 0;
+ list_for_each_entry_reverse(obj, &top->loh_layers,
+ co_lu.lo_linkage) {
+ if (obj->co_ops->coo_glimpse != NULL) { /* method is optional */
+ result = obj->co_ops->coo_glimpse(env, obj, lvb);
+ if (result != 0) /* unlike attr_get/set, positive values pass through */
+ break;
+ }
+ }
+ LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top),
+ "size: "LPU64" mtime: "LPU64" atime: "LPU64" "
+ "ctime: "LPU64" blocks: "LPU64"\n",
+ lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime,
+ lvb->lvb_ctime, lvb->lvb_blocks);
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_glimpse);
+
+/**
+ * Updates a configuration of an object \a obj, layer by layer, top-to-bottom.
+ */
+int cl_conf_set(const struct lu_env *env, struct cl_object *obj,
+ const struct cl_object_conf *conf)
+{
+ struct lu_object_header *top;
+ int result;
+
+ ENTRY;
+ top = obj->co_lu.lo_header;
+ result = 0;
+ list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) {
+ if (obj->co_ops->coo_conf_set != NULL) { /* method is optional */
+ result = obj->co_ops->coo_conf_set(env, obj, conf);
+ if (result != 0) /* first non-zero result ends the walk */
+ break;
+ }
+ }
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_conf_set);
+
+/**
+ * Helper function removing all object locks, and marking object for
+ * deletion. All object pages must have been deleted at this point.
+ *
+ * This is called by cl_inode_fini() and lov_object_delete() to destroy top-
+ * and sub- objects respectively.
+ */
+void cl_object_kill(const struct lu_env *env, struct cl_object *obj)
+{
+ struct cl_object_header *hdr;
+
+ hdr = cl_object_header(obj);
+ LASSERT(hdr->coh_tree.rnode == NULL); /* no pages left in the tree */
+ LASSERT(hdr->coh_pages == 0);
+
+ set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags);
+ /*
+ * Destroy all locks. Object destruction (including cl_inode_fini())
+ * cannot cancel the locks, because in the case of a local client,
+ * where client and server share the same thread running
+ * prune_icache(), this can dead-lock with ldlm_cancel_handler()
+ * waiting on __wait_on_freeing_inode().
+ */
+ cl_locks_prune(env, obj, 0); /* cancel == 0: delete without cancelling */
+}
+EXPORT_SYMBOL(cl_object_kill);
+
+/**
+ * Prunes caches of pages and locks for this object.
+ */
+void cl_object_prune(const struct lu_env *env, struct cl_object *obj)
+{
+ ENTRY;
+ cl_pages_prune(env, obj); /* pages first, then the locks covering them */
+ cl_locks_prune(env, obj, 1); /* cancel == 1: cancel before deleting */
+ EXIT;
+}
+EXPORT_SYMBOL(cl_object_prune);
+
+/**
+ * Check if the object has locks: returns 1 if coh_locks is non-empty, else 0.
+ */
+int cl_object_has_locks(struct cl_object *obj)
+{
+ struct cl_object_header *head = cl_object_header(obj);
+ int has; /* NB: inverted sense, non-zero means the list is empty */
+
+ spin_lock(&head->coh_lock_guard);
+ has = list_empty(&head->coh_locks);
+ spin_unlock(&head->coh_lock_guard);
+
+ return (has == 0); /* list not empty => object has locks */
+}
+EXPORT_SYMBOL(cl_object_has_locks);
+
+void cache_stats_init(struct cache_stats *cs, const char *name)
+{
+ int i;
+ /* Name the counter set (pointer is stored, not copied), zero all counters. */
+ cs->cs_name = name;
+ for (i = 0; i < CS_NR; i++)
+ atomic_set(&cs->cs_stats[i], 0);
+}
+
+int cache_stats_print(const struct cache_stats *cs, struct seq_file *m, int h)
+{
+ int i;
+ /*
+ * lookup hit total cached create
+ * env: ...... ...... ...... ...... ......
+ */
+ if (h) { /* non-zero h: emit the column-name row first */
+ const char *names[CS_NR] = CS_NAMES;
+
+ seq_printf(m, "%6s", " ");
+ for (i = 0; i < CS_NR; i++)
+ seq_printf(m, "%8s", names[i]);
+ seq_printf(m, "\n");
+ }
+
+ seq_printf(m, "%5.5s:", cs->cs_name); /* name cut to 5 characters */
+ for (i = 0; i < CS_NR; i++)
+ seq_printf(m, "%8u", atomic_read(&cs->cs_stats[i]));
+ return 0; /* always 0; no trailing newline is written */
+}
+
+/**
+ * Initialize client site.
+ *
+ * Perform common initialization (lu_site_init()) and, on success, reset the
+ * page and lock statistics and all per-state counters. Returns its result.
+ */
+int cl_site_init(struct cl_site *s, struct cl_device *d)
+{
+ int i;
+ int result;
+
+ result = lu_site_init(&s->cs_lu, &d->cd_lu_dev);
+ if (result == 0) {
+ cache_stats_init(&s->cs_pages, "pages");
+ cache_stats_init(&s->cs_locks, "locks");
+ for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i)
+ atomic_set(&s->cs_pages_state[i], 0); /* every state, not just [0] */
+ for (i = 0; i < ARRAY_SIZE(s->cs_locks_state); ++i)
+ atomic_set(&s->cs_locks_state[i], 0);
+ }
+ return result;
+}
+EXPORT_SYMBOL(cl_site_init);
+
+/**
+ * Finalize client site. Dual to cl_site_init().
+ */
+void cl_site_fini(struct cl_site *s)
+{
+ lu_site_fini(&s->cs_lu); /* the statistics counters need no teardown */
+}
+EXPORT_SYMBOL(cl_site_fini);
+
+static struct cache_stats cl_env_stats = { /* printed by cl_site_stats_print() */
+ .cs_name = "envs",
+ .cs_stats = { ATOMIC_INIT(0), } /* remaining entries are zero-initialized */
+};
+
+/**
+ * Outputs client site statistical counters into seq_file \a m. Suitable for
+ * ll_rd_*()-style functions. Always returns 0.
+ */
+int cl_site_stats_print(const struct cl_site *site, struct seq_file *m)
+{
+ int i;
+ static const char *pstate[] = { /* one-letter tag per page state */
+ [CPS_CACHED] = "c",
+ [CPS_OWNED] = "o",
+ [CPS_PAGEOUT] = "w",
+ [CPS_PAGEIN] = "r",
+ [CPS_FREEING] = "f"
+ };
+ static const char *lstate[] = { /* one-letter tag per lock state */
+ [CLS_NEW] = "n",
+ [CLS_QUEUING] = "q",
+ [CLS_ENQUEUED] = "e",
+ [CLS_HELD] = "h",
+ [CLS_INTRANSIT] = "t",
+ [CLS_CACHED] = "c",
+ [CLS_FREEING] = "f"
+ };
+/*
+ lookup hit total busy create
+pages: ...... ...... ...... ...... ...... [...... ...... ...... ......]
+locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......]
+ env: ...... ...... ...... ...... ......
+ */
+ lu_site_stats_print(&site->cs_lu, m);
+ cache_stats_print(&site->cs_pages, m, 1); /* 1: with the header row */
+ seq_printf(m, " [");
+ for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i)
+ seq_printf(m, "%s: %u ", pstate[i], /* assumes pstate[] covers every state */
+ atomic_read(&site->cs_pages_state[i]));
+ seq_printf(m, "]\n");
+ cache_stats_print(&site->cs_locks, m, 0);
+ seq_printf(m, " [");
+ for (i = 0; i < ARRAY_SIZE(site->cs_locks_state); ++i)
+ seq_printf(m, "%s: %u ", lstate[i], /* assumes lstate[] covers every state */
+ atomic_read(&site->cs_locks_state[i]));
+ seq_printf(m, "]\n");
+ cache_stats_print(&cl_env_stats, m, 0); /* file-global, not per-site */
+ seq_printf(m, "\n");
+ return 0;
+}
+EXPORT_SYMBOL(cl_site_stats_print);
+
+/*****************************************************************************
+ *
+ * lu_env handling on client.
+ *
+ */
+
+/**
+ * The most efficient way is to store the cl_env pointer in task-specific
+ * structures. On Linux, it won't be easy to use task_struct->journal_info
+ * because Lustre code may call into other file systems which make certain
+ * assumptions about journal_info. Currently the following task_struct fields
+ * have been identified as usable for this purpose:
+ * - cl_env: for liblustre.
+ * - tux_info: only on RedHat kernels.
+ * - ...
+ * \note As long as we use task_struct to store cl_env, we assume that once
+ * called into Lustre, we'll never call into another part of the kernel
+ * which uses those fields in task_struct without explicitly exiting
+ * Lustre.
+ *
+ * If no space is available in task_struct, a hash table is used instead.
+ * bz20044, bz22683.
+ */
+
+struct cl_env {
+ void *ce_magic; /* set to &cl_env_init0 and checked by assertions */
+ struct lu_env ce_lu;
+ struct lu_context ce_ses; /* session context, installed as ce_lu.le_ses */
+
+ /**
+ * This allows cl_env to be entered into cl_env_hash which implements
+ * the current thread -> client environment lookup.
+ */
+ struct hlist_node ce_node;
+ /**
+ * Owner for the current cl_env.
+ *
+ * If LL_TASK_CL_ENV is defined, this points to the owning current,
+ * only for debugging purpose ;
+ * Otherwise hash is used, and this is the key for cfs_hash.
+ * Now current thread pid is stored. Note using thread pointer would
+ * lead to unbalanced hash because of its specific allocation locality
+ * and could be varied for different platforms and OSes, even different
+ * OS versions.
+ */
+ void *ce_owner;
+
+ /*
+ * Linkage into global list of all client environments. Used for
+ * garbage collection.
+ */
+ struct list_head ce_linkage;
+ /*
+ * Reference count; cl_env_put() destroys the environment at zero.
+ */
+ int ce_ref;
+ /*
+ * Debugging field: address of the caller who made original
+ * allocation.
+ */
+ void *ce_debug;
+};
+
+#define CL_ENV_INC(counter) /* expands to nothing: env counters are not kept */
+#define CL_ENV_DEC(counter) /* expands to nothing */
+
+static void cl_env_init0(struct cl_env *cle, void *debug)
+{
+ LASSERT(cle->ce_ref == 0);
+ LASSERT(cle->ce_magic == &cl_env_init0); /* set by cl_env_new() */
+ LASSERT(cle->ce_debug == NULL && cle->ce_owner == NULL);
+ /* Mark a fresh, unattached environment as in use. */
+ cle->ce_ref = 1; /* initial reference, returned to the creator */
+ cle->ce_debug = debug;
+ CL_ENV_INC(busy);
+}
+
+
+/*
+ * Hash-table based implementation of the thread -> cl_env association.
+ */
+
+static cfs_hash_t *cl_env_hash; /* created by cl_env_store_init() */
+
+static unsigned cl_env_hops_hash(cfs_hash_t *lh,
+ const void *key, unsigned mask)
+{
+#if BITS_PER_LONG == 64 /* the key is an integer stored in a pointer */
+ return cfs_hash_u64_hash((__u64)key, mask);
+#else
+ return cfs_hash_u32_hash((__u32)key, mask);
+#endif
+}
+
+static void *cl_env_hops_obj(struct hlist_node *hn)
+{
+ struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node);
+ LASSERT(cle->ce_magic == &cl_env_init0); /* sanity: really a cl_env */
+ return (void *)cle; /* hash node -> containing cl_env */
+}
+
+static int cl_env_hops_keycmp(const void *key, struct hlist_node *hn)
+{
+ struct cl_env *cle = cl_env_hops_obj(hn);
+
+ LASSERT(cle->ce_owner != NULL); /* hashed environments always have an owner */
+ return (key == cle->ce_owner); /* non-zero on match */
+}
+
+static void cl_env_hops_noop(cfs_hash_t *hs, struct hlist_node *hn)
+{
+ struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node);
+ LASSERT(cle->ce_magic == &cl_env_init0); /* sanity check only, no refcounting */
+}
+
+static cfs_hash_ops_t cl_env_hops = { /* operations of cl_env_hash */
+ .hs_hash = cl_env_hops_hash,
+ .hs_key = cl_env_hops_obj, /* NOTE(review): yields the cl_env, not ce_owner */
+ .hs_keycmp = cl_env_hops_keycmp,
+ .hs_object = cl_env_hops_obj,
+ .hs_get = cl_env_hops_noop, /* get/put do no reference counting */
+ .hs_put_locked = cl_env_hops_noop,
+};
+
+static inline struct cl_env *cl_env_fetch(void)
+{
+ struct cl_env *cle;
+ /* Look up the environment attached to the current thread; key is its pid. */
+ cle = cfs_hash_lookup(cl_env_hash, (void *) (long) current->pid);
+ LASSERT(ergo(cle, cle->ce_magic == &cl_env_init0));
+ return cle; /* NULL when nothing is attached */
+}
+
+static inline void cl_env_attach(struct cl_env *cle)
+{
+ if (cle) { /* NULL is accepted and ignored */
+ int rc;
+
+ LASSERT(cle->ce_owner == NULL); /* must not be attached already */
+ cle->ce_owner = (void *) (long) current->pid;
+ rc = cfs_hash_add_unique(cl_env_hash, cle->ce_owner,
+ &cle->ce_node);
+ LASSERT(rc == 0); /* the thread must not have another env attached */
+ }
+}
+
+static inline void cl_env_do_detach(struct cl_env *cle)
+{
+ void *cookie;
+ /* Only the owning thread may detach its environment. */
+ LASSERT(cle->ce_owner == (void *) (long) current->pid);
+ cookie = cfs_hash_del(cl_env_hash, cle->ce_owner,
+ &cle->ce_node);
+ LASSERT(cookie == cle);
+ cle->ce_owner = NULL; /* marks the environment as unattached */
+}
+
+static int cl_env_store_init(void) {
+ cl_env_hash = cfs_hash_create("cl_env",
+ HASH_CL_ENV_BITS, HASH_CL_ENV_BITS, /* same value passed twice */
+ HASH_CL_ENV_BKT_BITS, 0,
+ CFS_HASH_MIN_THETA,
+ CFS_HASH_MAX_THETA,
+ &cl_env_hops,
+ CFS_HASH_RW_BKTLOCK);
+ return cl_env_hash != NULL ? 0 :-ENOMEM; /* 0 on success, -ENOMEM otherwise */
+}
+
+static void cl_env_store_fini(void) {
+ cfs_hash_putref(cl_env_hash); /* drops the reference from cl_env_store_init() */
+}
+
+
+static inline struct cl_env *cl_env_detach(struct cl_env *cle)
+{
+ if (cle == NULL) /* NULL: operate on the current thread's environment */
+ cle = cl_env_fetch();
+
+ if (cle && cle->ce_owner) /* detach only if it is attached */
+ cl_env_do_detach(cle);
+
+ return cle; /* may be NULL if the thread had no environment */
+}
+
+static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug)
+{
+ struct lu_env *env;
+ struct cl_env *cle;
+ /* Allocate and set up a new environment; returns it or an ERR_PTR(). */
+ OBD_SLAB_ALLOC_PTR_GFP(cle, cl_env_kmem, __GFP_IO);
+ if (cle != NULL) {
+ int rc;
+
+ INIT_LIST_HEAD(&cle->ce_linkage);
+ cle->ce_magic = &cl_env_init0;
+ env = &cle->ce_lu;
+ rc = lu_env_init(env, LCT_CL_THREAD|ctx_tags);
+ if (rc == 0) {
+ rc = lu_context_init(&cle->ce_ses,
+ LCT_SESSION | ses_tags);
+ if (rc == 0) {
+ lu_context_enter(&cle->ce_ses);
+ env->le_ses = &cle->ce_ses;
+ cl_env_init0(cle, debug); /* sets ce_ref to 1 */
+ } else
+ lu_env_fini(env); /* undo lu_env_init() */
+ }
+ if (rc != 0) {
+ OBD_SLAB_FREE_PTR(cle, cl_env_kmem);
+ env = ERR_PTR(rc);
+ } else {
+ CL_ENV_INC(create);
+ CL_ENV_INC(total);
+ }
+ } else
+ env = ERR_PTR(-ENOMEM);
+ return env; /* NB: not attached to the current thread yet */
+}
+
+static void cl_env_fini(struct cl_env *cle)
+{
+ CL_ENV_DEC(total);
+ lu_context_fini(&cle->ce_lu.le_ctx); /* context of the lu_env itself */
+ lu_context_fini(&cle->ce_ses); /* session context */
+ OBD_SLAB_FREE_PTR(cle, cl_env_kmem); /* @cle is invalid after this */
+}
+
+static inline struct cl_env *cl_env_container(struct lu_env *env)
+{
+ return container_of(env, struct cl_env, ce_lu); /* @env must be embedded in a cl_env */
+}
+
+struct lu_env *cl_env_peek(int *refcheck)
+{
+ struct lu_env *env;
+ struct cl_env *cle;
+ /* Return the current thread's environment with a new reference, or NULL. */
+ CL_ENV_INC(lookup);
+
+ /* check that we don't go far from untrusted pointer */
+ CLASSERT(offsetof(struct cl_env, ce_magic) == 0);
+
+ env = NULL;
+ cle = cl_env_fetch();
+ if (cle != NULL) {
+ CL_ENV_INC(hit);
+ env = &cle->ce_lu;
+ *refcheck = ++cle->ce_ref; /* take a reference; *refcheck is set only here */
+ }
+ CDEBUG(D_OTHER, "%d@%p\n", cle ? cle->ce_ref : 0, cle);
+ return env;
+}
+EXPORT_SYMBOL(cl_env_peek);
+
+/**
+ * Returns lu_env: if there already is an environment associated with the
+ * current thread, it is returned, otherwise, new environment is allocated.
+ *
+ * \param refcheck pointer to a counter used to detect environment leaks. In
+ * the usual case cl_env_get() and cl_env_put() are called in the same lexical
+ * scope and pointer to the same integer is passed as \a refcheck. This is
+ * used to detect missed cl_env_put().
+ *
+ * \see cl_env_put()
+ */
+struct lu_env *cl_env_get(int *refcheck)
+{
+ struct lu_env *env;
+
+ env = cl_env_peek(refcheck);
+ if (env == NULL) { /* nothing attached to this thread yet */
+ env = cl_env_new(lu_context_tags_default,
+ lu_session_tags_default,
+ __builtin_return_address(0));
+
+ if (!IS_ERR(env)) {
+ struct cl_env *cle;
+
+ cle = cl_env_container(env);
+ cl_env_attach(cle); /* make it findable by cl_env_peek() */
+ *refcheck = cle->ce_ref;
+ CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+ }
+ }
+ return env; /* may be an ERR_PTR() from cl_env_new() */
+}
+EXPORT_SYMBOL(cl_env_get);
+
+/**
+ * Forces an allocation of a fresh environment with given tags.
+ *
+ * \see cl_env_get()
+ */
+struct lu_env *cl_env_alloc(int *refcheck, __u32 tags)
+{
+ struct lu_env *env;
+
+ LASSERT(cl_env_peek(refcheck) == NULL); /* thread must have no env attached */
+ env = cl_env_new(tags, tags, __builtin_return_address(0));
+ if (!IS_ERR(env)) {
+ struct cl_env *cle;
+
+ cle = cl_env_container(env); /* NB: not attached, unlike cl_env_get() */
+ *refcheck = cle->ce_ref;
+ CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+ }
+ return env; /* may be an ERR_PTR() from cl_env_new() */
+}
+EXPORT_SYMBOL(cl_env_alloc);
+
+/**
+ * Calls lu_context_exit() on both contexts of \a cle: first the one embedded
+ * in its lu_env, then the session context. The environment must have no
+ * owner.
+ */
+static void cl_env_exit(struct cl_env *cle)
+{
+	struct lu_context *ctx = &cle->ce_lu.le_ctx;
+	struct lu_context *ses = &cle->ce_ses;
+
+	LASSERT(cle->ce_owner == NULL);
+	lu_context_exit(ctx);
+	lu_context_exit(ses);
+}
+
+/**
+ * Release an environment.
+ *
+ * Decrements the reference counter of \a env. When the counter drops to 0
+ * the environment is detached, its contexts are exited and it is handed to
+ * cl_env_fini().
+ *
+ * \param refcheck when not NULL, must point to a value equal to the current
+ * reference count (asserted).
+ */
+void cl_env_put(struct lu_env *env, int *refcheck)
+{
+	struct cl_env *cle = cl_env_container(env);
+
+	LASSERT(cle->ce_ref > 0);
+	LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck));
+
+	CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+	if (--cle->ce_ref != 0)
+		return;
+
+	/* last reference is gone: tear the environment down */
+	CL_ENV_DEC(busy);
+	cl_env_detach(cle);
+	cle->ce_debug = NULL;
+	cl_env_exit(cle);
+	cl_env_fini(cle);
+}
+EXPORT_SYMBOL(cl_env_put);
+
+/**
+ * Declares a point of re-entrancy.
+ *
+ * Calls cl_env_detach(NULL) and returns its result as an opaque cookie,
+ * presumably to be handed back to cl_env_reexit().
+ *
+ * \see cl_env_reexit()
+ */
+void *cl_env_reenter(void)
+{
+	void *cookie = cl_env_detach(NULL);
+
+	return cookie;
+}
+EXPORT_SYMBOL(cl_env_reenter);
+
+/**
+ * Exits re-entrancy.
+ *
+ * Calls cl_env_detach(NULL) and then attaches \a cookie with
+ * cl_env_attach(). \a cookie is presumably the value previously returned by
+ * cl_env_reenter() (cl_env_nested_get() may also pass NULL) -- confirm
+ * against cl_env_attach().
+ *
+ * \see cl_env_reenter()
+ */
+void cl_env_reexit(void *cookie)
+{
+ cl_env_detach(NULL);
+ cl_env_attach(cookie);
+}
+EXPORT_SYMBOL(cl_env_reexit);
+
+/**
+ * Setup user-supplied \a env as a current environment. This is to be used to
+ * guarantee that environment exists even when cl_env_get() fails. It is up
+ * to user to ensure proper concurrency control.
+ *
+ * The environment must already be referenced (ce_ref > 0 is asserted). It is
+ * attached with cl_env_attach() and cl_env_get() is then called with
+ * \a refcheck; the pointer returned by cl_env_get() is not used here.
+ *
+ * \see cl_env_unplant()
+ */
+void cl_env_implant(struct lu_env *env, int *refcheck)
+{
+ struct cl_env *cle = cl_env_container(env);
+
+ LASSERT(cle->ce_ref > 0);
+
+ cl_env_attach(cle);
+ cl_env_get(refcheck);
+ CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+}
+EXPORT_SYMBOL(cl_env_implant);
+
+/**
+ * Detach environment installed earlier by cl_env_implant().
+ *
+ * Asserts that more than one reference is held (ce_ref > 1), detaches the
+ * environment with cl_env_detach() and drops one reference through
+ * cl_env_put(), passing \a refcheck along.
+ *
+ * \see cl_env_implant()
+ */
+void cl_env_unplant(struct lu_env *env, int *refcheck)
+{
+ struct cl_env *cle = cl_env_container(env);
+
+ LASSERT(cle->ce_ref > 1);
+
+ CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle);
+
+ cl_env_detach(cle);
+ cl_env_put(env, refcheck);
+}
+EXPORT_SYMBOL(cl_env_unplant);
+
+/**
+ * Obtains an environment on which no IO is going (per cl_io_is_going()).
+ *
+ * If cl_env_peek() returns an environment with no IO going, it is returned
+ * directly. If IO is going on it, the reference just taken is dropped, a
+ * re-entrancy point is declared with cl_env_reenter() (cookie saved in
+ * \a nest) and another environment is requested with cl_env_get().
+ *
+ * \retval ERR_PTR() value from cl_env_get() on failure; cl_env_reexit() has
+ * been called with the saved cookie in that case.
+ *
+ * \see cl_env_nested_put()
+ */
+struct lu_env *cl_env_nested_get(struct cl_env_nest *nest)
+{
+	int *refcheck = &nest->cen_refcheck;
+	struct lu_env *env;
+
+	nest->cen_cookie = NULL;
+	env = cl_env_peek(refcheck);
+	if (env != NULL) {
+		if (!cl_io_is_going(env))
+			return env;
+
+		/* environment is busy with IO: give it back and re-enter */
+		cl_env_put(env, refcheck);
+		nest->cen_cookie = cl_env_reenter();
+	}
+
+	env = cl_env_get(refcheck);
+	if (!IS_ERR(env)) {
+		LASSERT(!cl_io_is_going(env));
+	} else {
+		cl_env_reexit(nest->cen_cookie);
+	}
+	return env;
+}
+EXPORT_SYMBOL(cl_env_nested_get);
+
+/**
+ * Releases an environment obtained with cl_env_nested_get(): drops the
+ * reference with cl_env_put() and then calls cl_env_reexit() with the cookie
+ * stored in \a nest.
+ */
+void cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env)
+{
+	int *refcheck = &nest->cen_refcheck;
+
+	cl_env_put(env, refcheck);
+	cl_env_reexit(nest->cen_cookie);
+}
+EXPORT_SYMBOL(cl_env_nested_put);
+
+/**
+ * Copies size, blocks and the three timestamps (mtime, atime, ctime) from a
+ * struct cl_attr into a struct ost_lvb.
+ *
+ * \see cl_lvb2attr
+ */
+void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr)
+{
+	ENTRY;
+	/* size information */
+	lvb->lvb_size   = attr->cat_size;
+	lvb->lvb_blocks = attr->cat_blocks;
+	/* timestamps */
+	lvb->lvb_atime  = attr->cat_atime;
+	lvb->lvb_mtime  = attr->cat_mtime;
+	lvb->lvb_ctime  = attr->cat_ctime;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_attr2lvb);
+
+/**
+ * Copies size, blocks and the three timestamps (mtime, atime, ctime) from a
+ * struct ost_lvb into a struct cl_attr.
+ *
+ * \see cl_attr2lvb
+ */
+void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb)
+{
+	ENTRY;
+	/* size information */
+	attr->cat_size   = lvb->lvb_size;
+	attr->cat_blocks = lvb->lvb_blocks;
+	/* timestamps */
+	attr->cat_atime  = lvb->lvb_atime;
+	attr->cat_mtime  = lvb->lvb_mtime;
+	attr->cat_ctime  = lvb->lvb_ctime;
+	EXIT;
+}
+EXPORT_SYMBOL(cl_lvb2attr);
+
+/*****************************************************************************
+ *
+ * Temporary prototype thing: mirror obd-devices into cl devices.
+ *
+ */
+
+/**
+ * Allocates and initializes a device of type \a ldt on top of \a next.
+ *
+ * The device is allocated with ldto_device_alloc(), bound to \a site when
+ * one is given, and initialized with ldto_device_init(). After successful
+ * initialization a device reference and an "lu-stack" lu_ref are taken. If
+ * initialization fails, the device is released with ldto_device_free().
+ *
+ * \retval the device converted with lu2cl_dev(); on failure the converted
+ * value is the ERR_PTR() produced by allocation or initialization.
+ */
+struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site,
+				struct lu_device_type *ldt,
+				struct lu_device *next)
+{
+	const char *name;
+	struct lu_device *dev;
+	int rc;
+
+	LASSERT(ldt != NULL);
+
+	name = ldt->ldt_name;
+	dev = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL);
+	if (IS_ERR(dev)) {
+		CERROR("Cannot allocate device: '%s'\n", name);
+		return lu2cl_dev(dev);
+	}
+
+	if (site != NULL)
+		dev->ld_site = site;
+
+	rc = ldt->ldt_ops->ldto_device_init(env, dev, name, next);
+	if (rc != 0) {
+		ldt->ldt_ops->ldto_device_free(env, dev);
+		CERROR("can't init device '%s', %d\n", name, rc);
+		dev = ERR_PTR(rc);
+		return lu2cl_dev(dev);
+	}
+
+	lu_device_get(dev);
+	lu_ref_add(&dev->ld_reference, "lu-stack", &lu_site_init);
+	return lu2cl_dev(dev);
+}
+EXPORT_SYMBOL(cl_type_setup);
+
+/**
+ * Finalize device stack by calling lu_stack_fini().
+ *
+ * \a cl is converted to its lu_device with cl2lu_dev() and passed, together
+ * with \a env, straight to lu_stack_fini().
+ */
+void cl_stack_fini(const struct lu_env *env, struct cl_device *cl)
+{
+ lu_stack_fini(env, cl2lu_dev(cl));
+}
+EXPORT_SYMBOL(cl_stack_fini);
+
+int cl_lock_init(void);
+void cl_lock_fini(void);
+
+int cl_page_init(void);
+void cl_page_fini(void);
+
+static struct lu_context_key cl_key;
+
+/**
+ * Returns the cl_thread_info stored under cl_key in the context of \a env.
+ */
+struct cl_thread_info *cl_env_info(const struct lu_env *env)
+{
+	struct cl_thread_info *info;
+
+	info = lu_context_key_get(&env->le_ctx, &cl_key);
+	return info;
+}
+
+/* defines cl0_key_{init,fini}() */
+LU_KEY_INIT_FINI(cl0, struct cl_thread_info);
+
+/**
+ * lu_context_key::lct_init() method for cl_key: creates the value with
+ * cl0_key_init() and, when that succeeds, initializes the ctc_locks_locked
+ * lu_ref of every counter slot.
+ *
+ * \retval value returned by cl0_key_init() (possibly an ERR_PTR()).
+ */
+static void *cl_key_init(const struct lu_context *ctx,
+			 struct lu_context_key *key)
+{
+	struct cl_thread_info *info = cl0_key_init(ctx, key);
+	int idx;
+
+	if (IS_ERR(info))
+		return info;
+
+	for (idx = 0; idx < ARRAY_SIZE(info->clt_counters); idx++)
+		lu_ref_init(&info->clt_counters[idx].ctc_locks_locked);
+	return info;
+}
+
+/**
+ * lu_context_key::lct_fini() method for cl_key: finalizes the
+ * ctc_locks_locked lu_ref of every counter slot, then releases the value
+ * through cl0_key_fini().
+ */
+static void cl_key_fini(const struct lu_context *ctx,
+			struct lu_context_key *key, void *data)
+{
+	struct cl_thread_info *info = data;
+	int idx = 0;
+
+	while (idx < ARRAY_SIZE(info->clt_counters)) {
+		lu_ref_fini(&info->clt_counters[idx].ctc_locks_locked);
+		idx++;
+	}
+	cl0_key_fini(ctx, key, data);
+}
+
+/**
+ * lu_context_key::lct_exit() method for cl_key: asserts that every counter
+ * slot is back to zero, then re-initializes its ctc_locks_locked lu_ref
+ * (fini followed by init).
+ */
+static void cl_key_exit(const struct lu_context *ctx,
+			struct lu_context_key *key, void *data)
+{
+	struct cl_thread_info *info;
+	int i;
+
+	info = data;
+	i = 0;
+	while (i < ARRAY_SIZE(info->clt_counters)) {
+		LASSERT(info->clt_counters[i].ctc_nr_held == 0);
+		LASSERT(info->clt_counters[i].ctc_nr_used == 0);
+		LASSERT(info->clt_counters[i].ctc_nr_locks_acquired == 0);
+		LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0);
+		lu_ref_fini(&info->clt_counters[i].ctc_locks_locked);
+		lu_ref_init(&info->clt_counters[i].ctc_locks_locked);
+		++i;
+	}
+}
+
+/*
+ * Context key under which struct cl_thread_info is kept (see cl_env_info());
+ * registered by cl_global_init().
+ */
+static struct lu_context_key cl_key = {
+	.lct_tags = LCT_CL_THREAD,
+	.lct_init = cl_key_init,
+	.lct_fini = cl_key_fini,
+	.lct_exit = cl_key_exit,
+};
+
+/*
+ * Slab caches handed to lu_kmem_init()/lu_kmem_fini(); the entry with a NULL
+ * ckd_cache ends the array.
+ */
+static struct lu_kmem_descr cl_object_caches[] = {
+	{
+		.ckd_cache = &cl_env_kmem,
+		.ckd_name  = "cl_env_kmem",
+		.ckd_size  = sizeof(struct cl_env),
+	},
+	{
+		.ckd_cache = NULL,
+	},
+};
+
+/**
+ * Global initialization of cl-data. Create kmem caches, register
+ * lu_context_key's, etc.
+ *
+ * Steps, in order: cl_env_store_init(), lu_kmem_init(cl_object_caches),
+ * registration of cl_key, cl_lock_init(), cl_page_init(). When a step fails,
+ * the steps that already succeeded are undone in reverse order through the
+ * out_* labels and the failing step's result is returned.
+ *
+ * \retval 0 on success
+ * \retval non-zero result of the first failing step
+ *
+ * \see cl_global_fini()
+ */
+int cl_global_init(void)
+{
+ int result;
+
+ result = cl_env_store_init();
+ if (result)
+ return result;
+
+ result = lu_kmem_init(cl_object_caches);
+ if (result)
+ goto out_store;
+
+ LU_CONTEXT_KEY_INIT(&cl_key);
+ result = lu_context_key_register(&cl_key);
+ if (result)
+ goto out_kmem;
+
+ result = cl_lock_init();
+ if (result)
+ goto out_context;
+
+ result = cl_page_init();
+ if (result)
+ goto out_lock;
+
+ return 0;
+out_lock:
+ cl_lock_fini();
+out_context:
+ lu_context_key_degister(&cl_key);
+out_kmem:
+ lu_kmem_fini(cl_object_caches);
+out_store:
+ cl_env_store_fini();
+ return result;
+}
+
+/**
+ * Finalization of global cl-data. Dual to cl_global_init().
+ *
+ * Finalizes locks and pages, deregisters cl_key, destroys the kmem caches
+ * and finally the environment store.
+ *
+ * NOTE(review): cl_lock_fini() runs before cl_page_fini() here, whereas a
+ * strict reversal of cl_global_init() would finalize pages first -- confirm
+ * that the two are independent.
+ */
+void cl_global_fini(void)
+{
+ cl_lock_fini();
+ cl_page_fini();
+ lu_context_key_degister(&cl_key);
+ lu_kmem_fini(cl_object_caches);
+ cl_env_store_fini();
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/cl_page.c b/drivers/staging/lustre/lustre/obdclass/cl_page.c
new file mode 100644
index 000000000000..bb9335911c34
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/cl_page.c
@@ -0,0 +1,1605 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Lustre Page.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <linux/list.h>
+
+#include <cl_object.h>
+#include "cl_internal.h"
+
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+ int radix);
+
+# define PASSERT(env, page, expr) \
+ do { \
+ if (unlikely(!(expr))) { \
+ CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \
+ LASSERT(0); \
+ } \
+ } while (0) /* page assertion: when \a expr is false, the page is dumped
+ * with CL_PAGE_DEBUG() together with the text of the failed expression and
+ * LASSERT(0) is then triggered */
+
+# define PINVRNT(env, page, exp) \
+ ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp)) /* invariant
+ * check stub: the arguments only appear as sizeof operands, so in this
+ * definition they are type-checked but never evaluated */
+
+/*
+ * Page statistics are disabled by default due to the huge performance
+ * penalty: the four accounting macros below expand to nothing, so their
+ * arguments are not evaluated.
+ */
+#define CS_PAGE_INC(o, item)
+#define CS_PAGE_DEC(o, item)
+#define CS_PAGESTATE_INC(o, state)
+#define CS_PAGESTATE_DEC(o, state)
+
+/**
+ * Internal version of cl_page_top(): follows cl_page::cp_parent links until
+ * a page without a parent is reached and returns that page.
+ *
+ * It should be called only if the page is known not to be freed, e.g. the
+ * page is referenced, the radix tree lock is held, or the page is owned.
+ */
+static struct cl_page *cl_page_top_trusted(struct cl_page *page)
+{
+	struct cl_page *up;
+
+	for (up = page->cp_parent; up != NULL; up = page->cp_parent)
+		page = up;
+	return page;
+}
+
+/**
+ * Internal version of cl_page_get().
+ *
+ * This function can be used to obtain initial reference to previously
+ * unreferenced cached object. It can be called only if concurrent page
+ * reclamation is somehow prevented, e.g., by locking page radix-tree
+ * (cl_object_header::hdr->coh_page_guard), or by keeping a lock on a VM page,
+ * associated with \a page.
+ *
+ * Asserts that cl_page::cp_ref is already positive and then increments it
+ * atomically.
+ *
+ * Use with care! Not exported.
+ */
+static void cl_page_get_trust(struct cl_page *page)
+{
+ LASSERT(atomic_read(&page->cp_ref) > 0);
+ atomic_inc(&page->cp_ref);
+}
+
+/**
+ * Returns a slice within a page, corresponding to the given layer in the
+ * device stack.
+ *
+ * The search starts at the top page of \a page and descends along
+ * cl_page::cp_child; on every page the cp_layers list is scanned for the
+ * first slice whose object belongs to a device of type \a dtype.
+ *
+ * \retval NULL if no page in the chain has a slice of that device type.
+ *
+ * \see cl_lock_at()
+ */
+static const struct cl_page_slice *
+cl_page_at_trusted(const struct cl_page *page,
+ const struct lu_device_type *dtype)
+{
+ const struct cl_page_slice *slice;
+ ENTRY;
+
+ page = cl_page_top_trusted((struct cl_page *)page);
+ do {
+ list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+ if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype)
+ RETURN(slice);
+ }
+ page = page->cp_child;
+ } while (page != NULL);
+ RETURN(NULL);
+}
+
+/**
+ * Returns a page with given index in the given object, or NULL if no page is
+ * found. Acquires a reference on \a page.
+ *
+ * Locking: called under cl_object_header::coh_page_guard spin-lock.
+ *
+ * \param hdr   object header whose radix tree (coh_tree) is searched
+ * \param index page index to look up
+ *
+ * \retval page found at \a index, with an extra reference taken
+ * \retval NULL when the tree has no entry at \a index
+ */
+struct cl_page *cl_page_lookup(struct cl_object_header *hdr, pgoff_t index)
+{
+	struct cl_page *page;
+
+	/*
+	 * Use assert_spin_locked() instead of LASSERT(spin_is_locked()):
+	 * spin_is_locked() always evaluates to 0 on uniprocessor builds
+	 * without spinlock debugging, which would make the assertion fire
+	 * even though the caller follows the locking rule.
+	 */
+	assert_spin_locked(&hdr->coh_page_guard);
+
+	page = radix_tree_lookup(&hdr->coh_tree, index);
+	if (page != NULL)
+		cl_page_get_trust(page);
+	return page;
+}
+EXPORT_SYMBOL(cl_page_lookup);
+
+/**
+ * Invokes \a cb on the cached pages of \a obj whose index lies in
+ * [start, end].
+ *
+ * Pages are fetched from the object's radix tree in batches of at most
+ * CLT_PVEC_SIZE under cl_object_header::coh_page_guard; pages in
+ * CPS_FREEING state are skipped. For every remaining page, the slice
+ * belonging to the device type of the top object is located and the page of
+ * that slice is referenced. The spin-lock is then dropped and \a cb is
+ * called for each collected page with \a io and \a cbdata. Once \a cb
+ * returns something other than CLP_GANG_OKAY it is not called again, but
+ * the references on the remaining pages of the batch are still released.
+ *
+ * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely
+ * crucial in the face of [offset, EOF] locks.
+ *
+ * \retval CLP_GANG_OKAY    every page was processed
+ * \retval CLP_GANG_RESCHED need_resched() became true after a full batch
+ * and more pages may remain; the caller should implement retry logic
+ * \retval other            first value other than CLP_GANG_OKAY returned by
+ * \a cb
+ */
+int cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
+ struct cl_io *io, pgoff_t start, pgoff_t end,
+ cl_page_gang_cb_t cb, void *cbdata)
+{
+ struct cl_object_header *hdr;
+ struct cl_page *page;
+ struct cl_page **pvec;
+ const struct cl_page_slice *slice;
+ const struct lu_device_type *dtype;
+ pgoff_t idx;
+ unsigned int nr;
+ unsigned int i;
+ unsigned int j;
+ int res = CLP_GANG_OKAY;
+ int tree_lock = 1; /* non-zero while coh_page_guard is held */
+ ENTRY;
+
+ idx = start;
+ hdr = cl_object_header(obj);
+ pvec = cl_env_info(env)->clt_pvec;
+ dtype = cl_object_top(obj)->co_lu.lo_dev->ld_type;
+ spin_lock(&hdr->coh_page_guard);
+ while ((nr = radix_tree_gang_lookup(&hdr->coh_tree, (void **)pvec,
+ idx, CLT_PVEC_SIZE)) > 0) {
+ int end_of_region = 0;
+ idx = pvec[nr - 1]->cp_index + 1; /* next batch starts after the last page found */
+ for (i = 0, j = 0; i < nr; ++i) {
+ page = pvec[i];
+ pvec[i] = NULL;
+
+ LASSERT(page->cp_type == CPT_CACHEABLE);
+ if (page->cp_index > end) {
+ end_of_region = 1;
+ break;
+ }
+ if (page->cp_state == CPS_FREEING)
+ continue;
+
+ slice = cl_page_at_trusted(page, dtype);
+ /*
+ * Pages for lsm-less file has no underneath sub-page
+ * for osc, in case of ...
+ */
+ PASSERT(env, page, slice != NULL);
+
+ page = slice->cpl_page;
+ /*
+ * Can safely call cl_page_get_trust() under
+ * radix-tree spin-lock.
+ *
+ * XXX not true, because @page is from object another
+ * than @hdr and protected by different tree lock.
+ */
+ cl_page_get_trust(page);
+ lu_ref_add_atomic(&page->cp_reference,
+ "gang_lookup", current);
+ pvec[j++] = page; /* compact referenced pages to the front of pvec */
+ }
+
+ /*
+ * Here a delicate locking dance is performed. Current thread
+ * holds a reference to a page, but has to own it before it
+ * can be placed into queue. Owning implies waiting, so
+ * radix-tree lock is to be released. After a wait one has to
+ * check that pages weren't truncated (cl_page_own() returns
+ * error in the latter case).
+ */
+ spin_unlock(&hdr->coh_page_guard);
+ tree_lock = 0;
+
+ for (i = 0; i < j; ++i) {
+ page = pvec[i];
+ if (res == CLP_GANG_OKAY)
+ res = (*cb)(env, io, page, cbdata);
+ lu_ref_del(&page->cp_reference,
+ "gang_lookup", current);
+ cl_page_put(env, page);
+ }
+ if (nr < CLT_PVEC_SIZE || end_of_region)
+ break;
+
+ if (res == CLP_GANG_OKAY && need_resched())
+ res = CLP_GANG_RESCHED;
+ if (res != CLP_GANG_OKAY)
+ break;
+
+ spin_lock(&hdr->coh_page_guard);
+ tree_lock = 1;
+ }
+ if (tree_lock)
+ spin_unlock(&hdr->coh_page_guard);
+ RETURN(res);
+}
+EXPORT_SYMBOL(cl_page_gang_lookup);
+
+/**
+ * Destroys \a page and frees its memory.
+ *
+ * The page must be in CPS_FREEING state with no owner, request or parent
+ * and must not be on a batch list (all asserted). Each slice is unlinked
+ * from cl_page::cp_layers and finalized through its cpo_fini() method, then
+ * the reference on the page's object is dropped and the buffer is freed.
+ * May sleep.
+ */
+static void cl_page_free(const struct lu_env *env, struct cl_page *page)
+{
+	struct cl_object *obj = page->cp_obj;
+	/* buffer size is read up front, before the object reference is put */
+	int bufsize = cl_object_header(obj)->coh_page_bufsize;
+	struct list_head *layers = &page->cp_layers;
+
+	PASSERT(env, page, list_empty(&page->cp_batch));
+	PASSERT(env, page, page->cp_owner == NULL);
+	PASSERT(env, page, page->cp_req == NULL);
+	PASSERT(env, page, page->cp_parent == NULL);
+	PASSERT(env, page, page->cp_state == CPS_FREEING);
+
+	ENTRY;
+	might_sleep();
+	while (!list_empty(layers)) {
+		struct list_head *first = layers->next;
+		struct cl_page_slice *slice;
+
+		slice = list_entry(first, struct cl_page_slice, cpl_linkage);
+		list_del_init(first);
+		slice->cpl_ops->cpo_fini(env, slice);
+	}
+	CS_PAGE_DEC(obj, total);
+	CS_PAGESTATE_DEC(obj, page->cp_state);
+	lu_object_ref_del_at(&obj->co_lu, page->cp_obj_ref, "cl_page", page);
+	cl_object_put(env, obj);
+	lu_ref_fini(&page->cp_reference);
+	OBD_FREE(page, bufsize);
+	EXIT;
+}
+
+/**
+ * Helper function updating page state. This is the only place in the code
+ * where cl_page::cp_state field is mutated.
+ *
+ * No transition checking is done here; the new \a state is stored
+ * unconditionally (see cl_page_state_set0() for the checked variant).
+ */
+static inline void cl_page_state_set_trust(struct cl_page *page,
+ enum cl_page_state state)
+{
+ /* bypass const. */
+ *(enum cl_page_state *)&page->cp_state = state;
+}
+
+/**
+ * Allocates and initializes a new cl_page for index \a ind of object \a o.
+ *
+ * The page starts in CPS_CACHED state with one reference, plus one more
+ * "for radix tree" when \a type is CPT_CACHEABLE. A reference on \a o is
+ * taken, and the coo_page_init() method of every layer of the object that
+ * defines one is called with \a vmpage. If a layer fails, the page is
+ * deleted and freed again.
+ *
+ * \retval new page on success
+ * \retval ERR_PTR(-ENOMEM) if the page buffer cannot be allocated
+ * \retval ERR_PTR(rc) where rc is the failure returned by coo_page_init()
+ */
+static struct cl_page *cl_page_alloc(const struct lu_env *env,
+		struct cl_object *o, pgoff_t ind, struct page *vmpage,
+		enum cl_page_type type)
+{
+	struct cl_page *page;
+	struct cl_object *layer;
+	struct lu_object_header *head;
+
+	ENTRY;
+	OBD_ALLOC_GFP(page, cl_object_header(o)->coh_page_bufsize,
+			__GFP_IO);
+	if (page != NULL) {
+		int result = 0;
+		atomic_set(&page->cp_ref, 1);
+		if (type == CPT_CACHEABLE) /* for radix tree */
+			atomic_inc(&page->cp_ref);
+		page->cp_obj = o;
+		cl_object_get(o);
+		page->cp_obj_ref = lu_object_ref_add(&o->co_lu, "cl_page",page);
+		page->cp_index = ind;
+		cl_page_state_set_trust(page, CPS_CACHED);
+		page->cp_type = type;
+		INIT_LIST_HEAD(&page->cp_layers);
+		INIT_LIST_HEAD(&page->cp_batch);
+		INIT_LIST_HEAD(&page->cp_flight);
+		mutex_init(&page->cp_mutex);
+		lu_ref_init(&page->cp_reference);
+		head = o->co_lu.lo_header;
+		/*
+		 * Iterate with a dedicated cursor: reusing \a o as the cursor
+		 * leaves it pointing at no valid object once the loop runs to
+		 * completion, yet \a o is still passed to the statistics
+		 * macros below.
+		 */
+		list_for_each_entry(layer, &head->loh_layers,
+				    co_lu.lo_linkage) {
+			if (layer->co_ops->coo_page_init != NULL) {
+				result = layer->co_ops->coo_page_init(env,
+						layer, page, vmpage);
+				if (result != 0) {
+					/* undo the partial initialization */
+					cl_page_delete0(env, page, 0);
+					cl_page_free(env, page);
+					page = ERR_PTR(result);
+					break;
+				}
+			}
+		}
+		if (result == 0) {
+			CS_PAGE_INC(o, total);
+			CS_PAGE_INC(o, create);
+			/*
+			 * NOTE(review): a new page enters CPS_CACHED, so an
+			 * increment looks more natural than a decrement
+			 * here -- confirm before enabling page statistics.
+			 */
+			CS_PAGESTATE_DEC(o, CPS_CACHED);
+		}
+	} else {
+		page = ERR_PTR(-ENOMEM);
+	}
+	RETURN(page);
+}
+
+/**
+ * Returns a cl_page with index \a idx at the object \a o, and associated with
+ * the VM page \a vmpage.
+ *
+ * This is the main entry point into the cl_page caching interface. For a
+ * CPT_CACHEABLE request the page already attached to \a vmpage is looked up
+ * first with cl_vmpage_page(); if one is found, it is returned immediately
+ * (with the reference taken by that lookup). Otherwise a new page is
+ * allocated with cl_page_alloc().
+ *
+ * A new CPT_TRANSIENT page is only linked to \a parent (when given) and
+ * returned. A new CPT_CACHEABLE page is additionally inserted into the
+ * per-object radix tree under cl_object_header::coh_page_guard; if the
+ * insertion fails, the new page is deleted and freed.
+ *
+ * \param parent optional parent page; when not NULL the new page and
+ * \a parent are linked through cp_parent/cp_child
+ *
+ * \retval page on success
+ * \retval ERR_PTR() from cl_page_alloc(), or ERR_PTR(err) where err is the
+ * radix_tree_insert() failure
+ *
+ * \see cl_object_find(), cl_lock_find()
+ */
+static struct cl_page *cl_page_find0(const struct lu_env *env,
+ struct cl_object *o,
+ pgoff_t idx, struct page *vmpage,
+ enum cl_page_type type,
+ struct cl_page *parent)
+{
+ struct cl_page *page = NULL;
+ struct cl_page *ghost = NULL; /* new page that failed to enter the tree */
+ struct cl_object_header *hdr;
+ int err;
+
+ LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT);
+ might_sleep();
+
+ ENTRY;
+
+ hdr = cl_object_header(o);
+ CS_PAGE_INC(o, lookup);
+
+ CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n",
+ idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type);
+ /* fast path. */
+ if (type == CPT_CACHEABLE) {
+ /* vmpage lock is used to protect the child/parent
+ * relationship */
+ KLASSERT(PageLocked(vmpage));
+ /*
+ * cl_vmpage_page() can be called here without any locks as
+ *
+ * - "vmpage" is locked (which prevents ->private from
+ * concurrent updates), and
+ *
+ * - "o" cannot be destroyed while current thread holds a
+ * reference on it.
+ */
+ page = cl_vmpage_page(vmpage, o);
+ PINVRNT(env, page,
+ ergo(page != NULL,
+ cl_page_vmpage(env, page) == vmpage &&
+ (void *)radix_tree_lookup(&hdr->coh_tree,
+ idx) == page));
+ }
+
+ if (page != NULL) {
+ CS_PAGE_INC(o, hit);
+ RETURN(page);
+ }
+
+ /* allocate and initialize cl_page */
+ page = cl_page_alloc(env, o, idx, vmpage, type);
+ if (IS_ERR(page))
+ RETURN(page);
+
+ if (type == CPT_TRANSIENT) {
+ /* transient pages are never inserted into the radix tree */
+ if (parent) {
+ LASSERT(page->cp_parent == NULL);
+ page->cp_parent = parent;
+ parent->cp_child = page;
+ }
+ RETURN(page);
+ }
+
+ /*
+ * XXX optimization: use radix_tree_preload() here, and change tree
+ * gfp mask to GFP_KERNEL in cl_object_header_init().
+ */
+ spin_lock(&hdr->coh_page_guard);
+ err = radix_tree_insert(&hdr->coh_tree, idx, page);
+ if (err != 0) {
+ ghost = page;
+ /*
+ * Noted by Jay: a lock on \a vmpage protects cl_page_find()
+ * from this race, but
+ *
+ * 0. it's better to have cl_page interface "locally
+ * consistent" so that its correctness can be reasoned
+ * about without appealing to the (obscure world of) VM
+ * locking.
+ *
+ * 1. handling this race allows ->coh_tree to remain
+ * consistent even when VM locking is somehow busted,
+ * which is very useful during diagnosing and debugging.
+ */
+ page = ERR_PTR(err);
+ CL_PAGE_DEBUG(D_ERROR, env, ghost,
+ "fail to insert into radix tree: %d\n", err);
+ } else {
+ if (parent) {
+ LASSERT(page->cp_parent == NULL);
+ page->cp_parent = parent;
+ parent->cp_child = page;
+ }
+ hdr->coh_pages++;
+ }
+ spin_unlock(&hdr->coh_page_guard);
+
+ if (unlikely(ghost != NULL)) {
+ /* tear the rejected page down outside of the spin-lock */
+ cl_page_delete0(env, ghost, 0);
+ cl_page_free(env, ghost);
+ }
+ RETURN(page);
+}
+
+/**
+ * Finds or creates the page with index \a idx of object \a o, associated
+ * with \a vmpage; the page gets no parent.
+ *
+ * \see cl_page_find0()
+ */
+struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *o,
+			     pgoff_t idx, struct page *vmpage,
+			     enum cl_page_type type)
+{
+	struct cl_page *page;
+
+	page = cl_page_find0(env, o, idx, vmpage, type, NULL);
+	return page;
+}
+EXPORT_SYMBOL(cl_page_find);
+
+
+/**
+ * Finds or creates a sub-page of \a parent at index \a idx of object \a o.
+ * The page type is inherited from \a parent.
+ *
+ * \see cl_page_find0()
+ */
+struct cl_page *cl_page_find_sub(const struct lu_env *env, struct cl_object *o,
+				 pgoff_t idx, struct page *vmpage,
+				 struct cl_page *parent)
+{
+	enum cl_page_type type = parent->cp_type;
+
+	return cl_page_find0(env, o, idx, vmpage, type, parent);
+}
+EXPORT_SYMBOL(cl_page_find_sub);
+
+static inline int cl_page_invariant(const struct cl_page *pg)
+{
+ struct cl_object_header *header;
+ struct cl_page *parent;
+ struct cl_page *child;
+ struct cl_io *owner;
+
+ /*
+ * Page invariant is protected by a VM lock.
+ *
+ * The function returns non-zero iff every condition below holds: the
+ * page is in use; parent/child links are mutually consistent; a page
+ * and its parent/child belong to different objects; owners of linked
+ * pages are related through cl_io::ci_parent; and a cacheable page
+ * that is not being freed is either present in the radix tree of its
+ * object or has no parent and no child.
+ */
+ LINVRNT(cl_page_is_vmlocked(NULL, pg));
+
+ header = cl_object_header(pg->cp_obj);
+ parent = pg->cp_parent;
+ child = pg->cp_child;
+ owner = pg->cp_owner;
+
+ return cl_page_in_use(pg) &&
+ ergo(parent != NULL, parent->cp_child == pg) &&
+ ergo(child != NULL, child->cp_parent == pg) &&
+ ergo(child != NULL, pg->cp_obj != child->cp_obj) &&
+ ergo(parent != NULL, pg->cp_obj != parent->cp_obj) &&
+ ergo(owner != NULL && parent != NULL,
+ parent->cp_owner == pg->cp_owner->ci_parent) &&
+ ergo(owner != NULL && child != NULL,
+ child->cp_owner->ci_parent == owner) &&
+ /*
+ * Either page is early in initialization (has neither child
+ * nor parent yet), or it is in the object radix tree.
+ */
+ ergo(pg->cp_state < CPS_FREEING && pg->cp_type == CPT_CACHEABLE,
+ (void *)radix_tree_lookup(&header->coh_tree,
+ pg->cp_index) == pg ||
+ (child == NULL && parent == NULL));
+}
+
+static void cl_page_state_set0(const struct lu_env *env,
+ struct cl_page *page, enum cl_page_state state)
+{
+ enum cl_page_state old;
+
+ /*
+ * Matrix of allowed state transitions [old][new], for sanity
+ * checking.
+ *
+ * A non-zero entry marks a permitted transition. No transition out
+ * of CPS_FREEING is permitted, and no state may transition to
+ * itself.
+ */
+ static const int allowed_transitions[CPS_NR][CPS_NR] = {
+ [CPS_CACHED] = {
+ [CPS_CACHED] = 0,
+ [CPS_OWNED] = 1, /* io finds existing cached page */
+ [CPS_PAGEIN] = 0,
+ [CPS_PAGEOUT] = 1, /* write-out from the cache */
+ [CPS_FREEING] = 1, /* eviction on the memory pressure */
+ },
+ [CPS_OWNED] = {
+ [CPS_CACHED] = 1, /* release to the cache */
+ [CPS_OWNED] = 0,
+ [CPS_PAGEIN] = 1, /* start read immediately */
+ [CPS_PAGEOUT] = 1, /* start write immediately */
+ [CPS_FREEING] = 1, /* lock invalidation or truncate */
+ },
+ [CPS_PAGEIN] = {
+ [CPS_CACHED] = 1, /* io completion */
+ [CPS_OWNED] = 0,
+ [CPS_PAGEIN] = 0,
+ [CPS_PAGEOUT] = 0,
+ [CPS_FREEING] = 0,
+ },
+ [CPS_PAGEOUT] = {
+ [CPS_CACHED] = 1, /* io completion */
+ [CPS_OWNED] = 0,
+ [CPS_PAGEIN] = 0,
+ [CPS_PAGEOUT] = 0,
+ [CPS_FREEING] = 0,
+ },
+ [CPS_FREEING] = {
+ [CPS_CACHED] = 0,
+ [CPS_OWNED] = 0,
+ [CPS_PAGEIN] = 0,
+ [CPS_PAGEOUT] = 0,
+ [CPS_FREEING] = 0,
+ }
+ };
+
+ ENTRY;
+ old = page->cp_state;
+ PASSERT(env, page, allowed_transitions[old][state]);
+ CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state);
+ for (; page != NULL; page = page->cp_child) { /* the page and all pages below it */
+ PASSERT(env, page, page->cp_state == old);
+ PASSERT(env, page,
+ equi(state == CPS_OWNED, page->cp_owner != NULL));
+
+ CS_PAGESTATE_DEC(page->cp_obj, page->cp_state);
+ CS_PAGESTATE_INC(page->cp_obj, state);
+ cl_page_state_set_trust(page, state);
+ }
+ EXIT;
+}
+
+/**
+ * Moves \a page (and the pages below it) into \a state; a thin wrapper
+ * around cl_page_state_set0(), which validates the transition.
+ */
+static void cl_page_state_set(const struct lu_env *env,
+			      struct cl_page *page,
+			      enum cl_page_state state)
+{
+	cl_page_state_set0(env, page, state);
+}
+
+/**
+ * Acquires an additional reference to a page.
+ *
+ * This can be called only by caller already possessing a reference to \a
+ * page.
+ *
+ * Implemented by cl_page_get_trust(), which asserts that the reference
+ * count is already positive.
+ *
+ * \see cl_object_get(), cl_lock_get().
+ */
+void cl_page_get(struct cl_page *page)
+{
+ ENTRY;
+ cl_page_get_trust(page);
+ EXIT;
+}
+EXPORT_SYMBOL(cl_page_get);
+
+/**
+ * Releases a reference to a page.
+ *
+ * When the last reference is released the page is destroyed with
+ * cl_page_free(); at that point the page is asserted to be in
+ * cl_page_state::CPS_FREEING state, without an owner and not on a batch
+ * list.
+ *
+ * On entry the reference count must be positive, and greater than one for
+ * a page that has a parent (asserted).
+ *
+ * \see cl_object_put(), cl_lock_put().
+ */
+void cl_page_put(const struct lu_env *env, struct cl_page *page)
+{
+ PASSERT(env, page, atomic_read(&page->cp_ref) > !!page->cp_parent);
+
+ ENTRY;
+ CL_PAGE_HEADER(D_TRACE, env, page, "%d\n",
+ atomic_read(&page->cp_ref));
+
+ if (atomic_dec_and_test(&page->cp_ref)) {
+ LASSERT(page->cp_state == CPS_FREEING);
+
+ LASSERT(atomic_read(&page->cp_ref) == 0);
+ PASSERT(env, page, page->cp_owner == NULL);
+ PASSERT(env, page, list_empty(&page->cp_batch));
+ /*
+ * Page is no longer reachable by other threads. Tear
+ * it down.
+ */
+ cl_page_free(env, page);
+ }
+
+ EXIT;
+}
+EXPORT_SYMBOL(cl_page_put);
+
+/**
+ * Returns a VM page associated with a given cl_page.
+ *
+ * Starting from the top page and descending along cl_page::cp_child, the
+ * first slice that defines a cpo_vmpage() method is asked for the VM page.
+ * If no slice in the whole stack defines the method, LBUG() is hit.
+ *
+ * NOTE(review): RETURN() is used here without a matching ENTRY -- confirm
+ * whether that is intentional.
+ */
+struct page *cl_page_vmpage(const struct lu_env *env, struct cl_page *page)
+{
+ const struct cl_page_slice *slice;
+
+ /*
+ * Find uppermost layer with ->cpo_vmpage() method, and return its
+ * result.
+ */
+ page = cl_page_top(page);
+ do {
+ list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+ if (slice->cpl_ops->cpo_vmpage != NULL)
+ RETURN(slice->cpl_ops->cpo_vmpage(env, slice));
+ }
+ page = page->cp_child;
+ } while (page != NULL);
+ LBUG(); /* ->cpo_vmpage() has to be defined somewhere in the stack */
+}
+EXPORT_SYMBOL(cl_page_vmpage);
+
+/**
+ * Returns a cl_page associated with a VM page, and given cl_object.
+ *
+ * The cl_page stored in \a vmpage->private is taken as the top page and its
+ * cp_child chain is searched for the page whose object is the same as
+ * \a obj (per cl_object_same()). A reference is acquired on the page that
+ * is returned; the page is asserted to be CPT_CACHEABLE.
+ *
+ * \a vmpage must be locked (KLASSERT).
+ *
+ * \retval NULL if \a vmpage has no cl_page attached, or no page in the
+ * chain belongs to \a obj.
+ */
+struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj)
+{
+ struct cl_page *top;
+ struct cl_page *page;
+
+ ENTRY;
+ KLASSERT(PageLocked(vmpage));
+
+ /*
+ * NOTE: absence of races and liveness of data are guaranteed by page
+ * lock on a "vmpage". That works because object destruction has
+ * bottom-to-top pass.
+ */
+
+ /*
+ * This loop assumes that ->private points to the top-most page. This
+ * can be rectified easily.
+ */
+ top = (struct cl_page *)vmpage->private;
+ if (top == NULL)
+ RETURN(NULL);
+
+ for (page = top; page != NULL; page = page->cp_child) {
+ if (cl_object_same(page->cp_obj, obj)) {
+ cl_page_get_trust(page);
+ break;
+ }
+ }
+ LASSERT(ergo(page, page->cp_type == CPT_CACHEABLE));
+ RETURN(page);
+}
+EXPORT_SYMBOL(cl_vmpage_page);
+
+/**
+ * Returns the top-page for a given page.
+ *
+ * Exported wrapper around cl_page_top_trusted(), which follows
+ * cl_page::cp_parent until a page without a parent is reached.
+ *
+ * \see cl_object_top(), cl_io_top()
+ */
+struct cl_page *cl_page_top(struct cl_page *page)
+{
+ return cl_page_top_trusted(page);
+}
+EXPORT_SYMBOL(cl_page_top);
+
+/**
+ * Returns the slice of \a page that corresponds to device type \a dtype, or
+ * NULL if there is none. Exported wrapper around cl_page_at_trusted().
+ */
+const struct cl_page_slice *cl_page_at(const struct cl_page *page,
+				       const struct lu_device_type *dtype)
+{
+	const struct cl_page_slice *slice;
+
+	slice = cl_page_at_trusted(page, dtype);
+	return slice;
+}
+EXPORT_SYMBOL(cl_page_at);
+
+#define CL_PAGE_OP(opname) offsetof(struct cl_page_operations, opname) /* byte offset of method opname inside struct cl_page_operations */
+
+#define CL_PAGE_INVOKE(_env, _page, _op, _proto, ...) \
+({ \
+ const struct lu_env *__env = (_env); \
+ struct cl_page *__page = (_page); \
+ const struct cl_page_slice *__scan; \
+ int __result; \
+ ptrdiff_t __op = (_op); \
+ int (*__method)_proto; \
+ /* top-to-bottom call of the method at offset _op; stops at the first non-zero result */ \
+ __result = 0; \
+ __page = cl_page_top(__page); \
+ do { \
+ list_for_each_entry(__scan, &__page->cp_layers, \
+ cpl_linkage) { \
+ __method = *(void **)((char *)__scan->cpl_ops + \
+ __op); \
+ if (__method != NULL) { \
+ __result = (*__method)(__env, __scan, \
+ ## __VA_ARGS__); \
+ if (__result != 0) \
+ break; \
+ } \
+ } \
+ __page = __page->cp_child; \
+ } while (__page != NULL && __result == 0); \
+ if (__result > 0) /* positive results are reported as success */ \
+ __result = 0; \
+ __result; \
+})
+
+#define CL_PAGE_INVOID(_env, _page, _op, _proto, ...) \
+do { \
+ const struct lu_env *__env = (_env); \
+ struct cl_page *__page = (_page); \
+ const struct cl_page_slice *__scan; \
+ ptrdiff_t __op = (_op); \
+ void (*__method)_proto; \
+ /* top-to-bottom call of the void method at offset _op on every slice that defines it */ \
+ __page = cl_page_top(__page); \
+ do { \
+ list_for_each_entry(__scan, &__page->cp_layers, \
+ cpl_linkage) { \
+ __method = *(void **)((char *)__scan->cpl_ops + \
+ __op); \
+ if (__method != NULL) \
+ (*__method)(__env, __scan, \
+ ## __VA_ARGS__); \
+ } \
+ __page = __page->cp_child; \
+ } while (__page != NULL); \
+} while (0)
+
+#define CL_PAGE_INVOID_REVERSE(_env, _page, _op, _proto, ...) \
+do { \
+ const struct lu_env *__env = (_env); \
+ struct cl_page *__page = (_page); \
+ const struct cl_page_slice *__scan; \
+ ptrdiff_t __op = (_op); \
+ void (*__method)_proto; \
+ /* bottom-to-top variant of CL_PAGE_INVOID: slices are visited in reverse list order */ \
+ /* get to the bottom page. */ \
+ while (__page->cp_child != NULL) \
+ __page = __page->cp_child; \
+ do { \
+ list_for_each_entry_reverse(__scan, &__page->cp_layers, \
+ cpl_linkage) { \
+ __method = *(void **)((char *)__scan->cpl_ops + \
+ __op); \
+ if (__method != NULL) \
+ (*__method)(__env, __scan, \
+ ## __VA_ARGS__); \
+ } \
+ __page = __page->cp_parent; \
+ } while (__page != NULL); \
+} while (0)
+
+/**
+ * Calls the page operation found at byte offset \a op (see CL_PAGE_OP()) on
+ * the slices of \a page, passing \a io as the extra argument.
+ *
+ * \retval result of CL_PAGE_INVOKE(): 0, or the first negative value
+ * returned by a slice method.
+ */
+static int cl_page_invoke(const struct lu_env *env,
+			  struct cl_io *io, struct cl_page *page, ptrdiff_t op)
+{
+	int rc;
+
+	PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj));
+	ENTRY;
+	rc = CL_PAGE_INVOKE(env, page, op,
+			    (const struct lu_env *,
+			     const struct cl_page_slice *, struct cl_io *),
+			    io);
+	RETURN(rc);
+}
+
+/**
+ * Calls the void page operation found at byte offset \a op (see
+ * CL_PAGE_OP()) on every slice of \a page that defines it, passing \a io as
+ * the extra argument.
+ */
+static void cl_page_invoid(const struct lu_env *env,
+			   struct cl_io *io, struct cl_page *page,
+			   ptrdiff_t op)
+{
+	PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj));
+	ENTRY;
+	CL_PAGE_INVOID(env, page, op,
+		       (const struct lu_env *,
+			const struct cl_page_slice *, struct cl_io *),
+		       io);
+	EXIT;
+}
+
+/**
+ * Clears ownership on \a page's whole stack: for the top page and every page
+ * below it that has an owner, the owner's ci_owned_nr is decremented and the
+ * page's cp_owner and cp_task are reset to NULL.
+ */
+static void cl_page_owner_clear(struct cl_page *page)
+{
+	ENTRY;
+	page = cl_page_top(page);
+	while (page != NULL) {
+		if (page->cp_owner != NULL) {
+			LASSERT(page->cp_owner->ci_owned_nr > 0);
+			page->cp_owner->ci_owned_nr--;
+			page->cp_owner = NULL;
+			page->cp_task = NULL;
+		}
+		page = page->cp_child;
+	}
+	EXIT;
+}
+
+/**
+ * Accounts ownership on \a page's whole stack: for the top page and every
+ * page below it, the owner (asserted to be set) gets its ci_owned_nr
+ * incremented.
+ */
+static void cl_page_owner_set(struct cl_page *page)
+{
+	ENTRY;
+	page = cl_page_top(page);
+	while (page != NULL) {
+		LASSERT(page->cp_owner != NULL);
+		page->cp_owner->ci_owned_nr++;
+		page = page->cp_child;
+	}
+	EXIT;
+}
+
+void cl_page_disown0(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg)
+{
+ enum cl_page_state state;
+
+ ENTRY;
+ state = pg->cp_state; /* sampled before ownership is cleared */
+ PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING);
+ PINVRNT(env, pg, cl_page_invariant(pg));
+ cl_page_owner_clear(pg);
+
+ if (state == CPS_OWNED) /* a page being freed keeps its CPS_FREEING state */
+ cl_page_state_set(env, pg, CPS_CACHED);
+ /*
+ * Completion call-backs are executed in the bottom-up order, so that
+ * uppermost layer (llite), responsible for VFS/VM interaction runs
+ * last and can release locks safely.
+ *
+ * The call-back invoked on every slice is cpo_disown(), with \a io
+ * as its extra argument.
+ */
+ CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_disown),
+ (const struct lu_env *,
+ const struct cl_page_slice *, struct cl_io *),
+ io);
+ EXIT;
+}
+
+/**
+ * Returns true iff the page is owned by the given io, i.e. the page is in
+ * CPS_OWNED state and its cp_owner is \a io.
+ */
+int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io)
+{
+ LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj));
+ ENTRY;
+ RETURN(pg->cp_state == CPS_OWNED && pg->cp_owner == io);
+}
+EXPORT_SYMBOL(cl_page_is_owned);
+
+/**
+ * Try to own a page by IO.
+ *
+ * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it
+ * into cl_page_state::CPS_OWNED state.
+ *
+ * Both \a pg and \a io are first replaced by their top-level counterparts.
+ * The cpo_own() method of every slice is invoked with \a nonblock; on
+ * success the owner and owning task are recorded. If the page turns out to
+ * be in CPS_FREEING state after cpo_own() succeeded, it is disowned again
+ * and -ENOENT is returned.
+ *
+ * \param nonblock passed through to cl_page_operations::cpo_own()
+ *
+ * \pre !cl_page_is_owned(pg, io)
+ * \post result == 0 iff cl_page_is_owned(pg, io)
+ *
+ * \retval 0 success
+ *
+ * \retval -ve failure, e.g., page was destroyed (and landed in
+ * cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED).
+ * or, page was owned by another thread, or in IO.
+ *
+ * \see cl_page_disown()
+ * \see cl_page_operations::cpo_own()
+ * \see cl_page_own_try()
+ * \see cl_page_own
+ */
+static int cl_page_own0(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg, int nonblock)
+{
+ int result;
+
+ PINVRNT(env, pg, !cl_page_is_owned(pg, io));
+
+ ENTRY;
+ pg = cl_page_top(pg);
+ io = cl_io_top(io);
+
+ if (pg->cp_state == CPS_FREEING) {
+ result = -ENOENT;
+ } else {
+ result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(cpo_own),
+ (const struct lu_env *,
+ const struct cl_page_slice *,
+ struct cl_io *, int),
+ io, nonblock);
+ if (result == 0) {
+ PASSERT(env, pg, pg->cp_owner == NULL);
+ PASSERT(env, pg, pg->cp_req == NULL);
+ pg->cp_owner = io;
+ pg->cp_task = current;
+ cl_page_owner_set(pg);
+ if (pg->cp_state != CPS_FREEING) { /* state is re-checked after cpo_own() */
+ cl_page_state_set(env, pg, CPS_OWNED);
+ } else {
+ cl_page_disown0(env, io, pg);
+ result = -ENOENT;
+ }
+ }
+ }
+ PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg)));
+ RETURN(result);
+}
+
+/**
+ * Own a page, might be blocked.
+ *
+ * Thin wrapper: calls cl_page_own0() with its \a nonblock argument set to 0
+ * and returns whatever cl_page_own0() returns.
+ *
+ * \see cl_page_own0()
+ */
+int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg)
+{
+ return cl_page_own0(env, io, pg, 0);
+}
+EXPORT_SYMBOL(cl_page_own);
+
+/**
+ * Nonblock version of cl_page_own().
+ *
+ * Thin wrapper: calls cl_page_own0() with its \a nonblock argument set to 1
+ * and returns whatever cl_page_own0() returns.
+ *
+ * \see cl_page_own0()
+ */
+int cl_page_own_try(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg)
+{
+ return cl_page_own0(env, io, pg, 1);
+}
+EXPORT_SYMBOL(cl_page_own_try);
+
+
+/**
+ * Assume page ownership.
+ *
+ * Called when page is already locked by the hosting VM.
+ *
+ * Works on the top page and the top io: every layer is notified through
+ * cpo_assume(), then \a io and the current task are recorded as the owner
+ * and the page is switched into cl_page_state::CPS_OWNED. Unlike
+ * cl_page_own0() this function cannot fail and does not call cpo_own().
+ *
+ * \pre !cl_page_is_owned(pg, io)
+ * \post cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_assume()
+ */
+void cl_page_assume(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg)
+{
+ PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj));
+
+ ENTRY;
+ pg = cl_page_top(pg);
+ io = cl_io_top(io);
+
+ cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume));
+ PASSERT(env, pg, pg->cp_owner == NULL);
+ pg->cp_owner = io;
+ pg->cp_task = current;
+ cl_page_owner_set(pg);
+ cl_page_state_set(env, pg, CPS_OWNED);
+ EXIT;
+}
+EXPORT_SYMBOL(cl_page_assume);
+
+/**
+ * Releases page ownership without unlocking the page.
+ *
+ * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the
+ * underlying VM page (as VM is supposed to do this itself).
+ *
+ * Works on the top page and the top io. The owner is cleared and the state
+ * changed first; only then are the layers notified through cpo_unassume(),
+ * using the reverse (CL_PAGE_INVOID_REVERSE) layer order.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_assume()
+ */
+void cl_page_unassume(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg)
+{
+ PINVRNT(env, pg, cl_page_is_owned(pg, io));
+ PINVRNT(env, pg, cl_page_invariant(pg));
+
+ ENTRY;
+ pg = cl_page_top(pg);
+ io = cl_io_top(io);
+ cl_page_owner_clear(pg);
+ cl_page_state_set(env, pg, CPS_CACHED);
+ CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_unassume),
+ (const struct lu_env *,
+ const struct cl_page_slice *, struct cl_io *),
+ io);
+ EXIT;
+}
+EXPORT_SYMBOL(cl_page_unassume);
+
+/**
+ * Releases page ownership.
+ *
+ * Moves page into cl_page_state::CPS_CACHED.
+ *
+ * Resolves \a pg and \a io to the top page and the top io and delegates the
+ * actual work to cl_page_disown0().
+ *
+ * \pre cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_own()
+ * \see cl_page_operations::cpo_disown()
+ */
+void cl_page_disown(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg)
+{
+ PINVRNT(env, pg, cl_page_is_owned(pg, io));
+
+ ENTRY;
+ pg = cl_page_top(pg);
+ io = cl_io_top(io);
+ cl_page_disown0(env, io, pg);
+ EXIT;
+}
+EXPORT_SYMBOL(cl_page_disown);
+
+/**
+ * Called when page is to be removed from the object, e.g., as a result of
+ * truncate.
+ *
+ * Calls cl_page_operations::cpo_discard() top-to-bottom.
+ *
+ * The function itself changes neither the page state nor its ownership; it
+ * only forwards the request to the layers through cl_page_invoid().
+ *
+ * \pre cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_discard()
+ */
+void cl_page_discard(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg)
+{
+ PINVRNT(env, pg, cl_page_is_owned(pg, io));
+ PINVRNT(env, pg, cl_page_invariant(pg));
+
+ cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_discard));
+}
+EXPORT_SYMBOL(cl_page_discard);
+
+/**
+ * Version of cl_page_delete() that can be called for not fully constructed
+ * pages, e.g., in an error handling cl_page_find()->cl_page_delete0()
+ * path. Doesn't check page invariant.
+ *
+ * \param pg    top page to delete; must not already be in CPS_FREEING
+ * \param radix non-zero if \a pg itself has been inserted into its object's
+ *              radix tree; when zero only the pages below \a pg (reached
+ *              through cl_page::cp_child) are removed from their trees
+ *
+ * For every page removed from a radix tree the object's coh_pages counter is
+ * decremented and one page reference is dropped with cl_page_put().
+ */
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+ int radix)
+{
+ struct cl_page *tmp = pg;
+ ENTRY;
+
+ PASSERT(env, pg, pg == cl_page_top(pg));
+ PASSERT(env, pg, pg->cp_state != CPS_FREEING);
+
+ /*
+ * Sever all ways to obtain new pointers to @pg.
+ */
+ cl_page_owner_clear(pg);
+
+ /*
+ * Unexport the page first, before freeing it, so that
+ * the page content is considered to be invalid.
+ * We have to do this because a CPS_FREEING cl_page may
+ * NOT be under the protection of a cl_lock.
+ * Afterwards, if this page is found by other threads, then this
+ * page will be forced to reread.
+ */
+ cl_page_export(env, pg, 0);
+ cl_page_state_set0(env, pg, CPS_FREEING);
+
+ CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_delete),
+ (const struct lu_env *, const struct cl_page_slice *));
+
+ if (tmp->cp_type == CPT_CACHEABLE) {
+ if (!radix)
+ /* !radix means that @pg is not yet in the radix tree,
+ * skip removing it.
+ */
+ tmp = pg->cp_child;
+ for (; tmp != NULL; tmp = tmp->cp_child) {
+ void *value;
+ struct cl_object_header *hdr;
+
+ hdr = cl_object_header(tmp->cp_obj);
+ spin_lock(&hdr->coh_page_guard);
+ value = radix_tree_delete(&hdr->coh_tree,
+ tmp->cp_index);
+ PASSERT(env, tmp, value == tmp);
+ PASSERT(env, tmp, hdr->coh_pages > 0);
+ hdr->coh_pages--;
+ spin_unlock(&hdr->coh_page_guard);
+ cl_page_put(env, tmp); /* done outside of coh_page_guard */
+ }
+ }
+
+ EXIT;
+}
+
+/**
+ * Called when a decision is made to throw page out of memory.
+ *
+ * Notifies all layers about page destruction by calling
+ * cl_page_operations::cpo_delete() method top-to-bottom.
+ *
+ * Moves page into cl_page_state::CPS_FREEING state (this is the only place
+ * where transition to this state happens).
+ *
+ * Eliminates all avenues through which new references to the page can be
+ * obtained:
+ *
+ * - removes page from the radix trees,
+ *
+ * - breaks linkage from VM page to cl_page.
+ *
+ * Once page reaches cl_page_state::CPS_FREEING, all remaining references will
+ * drain after some time, at which point page will be recycled.
+ *
+ * Implemented as cl_page_delete0() with \a radix set to 1, preceded by a
+ * page invariant check.
+ *
+ * \pre pg == cl_page_top(pg)
+ * \pre VM page is locked
+ * \post pg->cp_state == CPS_FREEING
+ *
+ * \see cl_page_operations::cpo_delete()
+ */
+void cl_page_delete(const struct lu_env *env, struct cl_page *pg)
+{
+ PINVRNT(env, pg, cl_page_invariant(pg));
+ ENTRY;
+ cl_page_delete0(env, pg, 1);
+ EXIT;
+}
+EXPORT_SYMBOL(cl_page_delete);
+
+/**
+ * Unmaps page from user virtual memory.
+ *
+ * Calls cl_page_operations::cpo_unmap() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to unmap page from user space
+ * virtual memory.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ *
+ * \return the value produced by cl_page_invoke() for cpo_unmap()
+ *
+ * \see cl_page_operations::cpo_unmap()
+ */
+int cl_page_unmap(const struct lu_env *env,
+ struct cl_io *io, struct cl_page *pg)
+{
+ PINVRNT(env, pg, cl_page_is_owned(pg, io));
+ PINVRNT(env, pg, cl_page_invariant(pg));
+
+ return cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_unmap));
+}
+EXPORT_SYMBOL(cl_page_unmap);
+
+/**
+ * Marks page up-to-date.
+ *
+ * Call cl_page_operations::cpo_export() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to mark/clear page as up-to-date
+ * by the \a uptodate argument.
+ *
+ * \param uptodate forwarded unchanged to every cpo_export() method;
+ *                 cl_page_delete0() passes 0 to invalidate the page content
+ *
+ * \see cl_page_operations::cpo_export()
+ */
+void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate)
+{
+ PINVRNT(env, pg, cl_page_invariant(pg));
+ CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_export),
+ (const struct lu_env *,
+ const struct cl_page_slice *, int), uptodate);
+}
+EXPORT_SYMBOL(cl_page_export);
+
+/**
+ * Returns true, iff \a pg is VM locked in a suitable sense by the calling
+ * thread.
+ *
+ * Only the first slice on the cl_page::cp_layers list of the top page is
+ * consulted. Its cpo_is_vmlocked() method must exist and must answer with
+ * either -EBUSY (reported here as true) or -ENODATA (reported as false).
+ */
+int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg)
+{
+ int result;
+ const struct cl_page_slice *slice;
+
+ ENTRY;
+ pg = cl_page_top_trusted((struct cl_page *)pg);
+ slice = container_of(pg->cp_layers.next,
+ const struct cl_page_slice, cpl_linkage);
+ PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked != NULL);
+ /*
+ * Call ->cpo_is_vmlocked() directly instead of going through
+ * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by
+ * cl_page_invariant().
+ */
+ result = slice->cpl_ops->cpo_is_vmlocked(env, slice);
+ PASSERT(env, pg, result == -EBUSY || result == -ENODATA);
+ RETURN(result == -EBUSY);
+}
+EXPORT_SYMBOL(cl_page_is_vmlocked);
+
+/*
+ * Maps a transfer type onto the page state used while that transfer is in
+ * progress: CRT_WRITE gives CPS_PAGEOUT, every other value gives CPS_PAGEIN.
+ */
+static enum cl_page_state cl_req_type_state(enum cl_req_type crt)
+{
+	ENTRY;
+	if (crt == CRT_WRITE)
+		RETURN(CPS_PAGEOUT);
+	RETURN(CPS_PAGEIN);
+}
+
+/*
+ * Page is queued for IO, change its state: the owner is cleared and the page
+ * enters the transfer state that cl_req_type_state() selects for \a crt.
+ */
+static void cl_page_io_start(const struct lu_env *env,
+			     struct cl_page *pg, enum cl_req_type crt)
+{
+	enum cl_page_state io_state;
+
+	ENTRY;
+	cl_page_owner_clear(pg);
+	io_state = cl_req_type_state(crt);
+	cl_page_state_set(env, pg, io_state);
+	EXIT;
+}
+
+/**
+ * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is
+ * called top-to-bottom. Every layer either agrees to submit this page (by
+ * returning 0), or requests to omit this page (by returning -EALREADY). Layer
+ * handling interactions with the VM also has to inform VM that page is under
+ * transfer now.
+ *
+ * When all layers agree, ownership is dropped and the page enters the
+ * transfer state for \a crt (see cl_page_io_start()).
+ *
+ * \pre cl_page_is_owned(pg, io)
+ *
+ * \retval 0       page is now in a transfer state
+ * \retval -EINVAL \a crt is not a valid request type (crt >= CRT_NR)
+ * \retval other   value returned by the layers' cpo_prep() methods
+ */
+int cl_page_prep(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg, enum cl_req_type crt)
+{
+ int result;
+
+ PINVRNT(env, pg, cl_page_is_owned(pg, io));
+ PINVRNT(env, pg, cl_page_invariant(pg));
+ PINVRNT(env, pg, crt < CRT_NR);
+
+ /*
+ * XXX this has to be called bottom-to-top, so that llite can set up
+ * PG_writeback without risking other layers deciding to skip this
+ * page.
+ */
+ if (crt >= CRT_NR)
+ return -EINVAL; /* crt indexes the per-type io[] operations below */
+ result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_prep));
+ if (result == 0)
+ cl_page_io_start(env, pg, crt);
+
+ KLASSERT(ergo(crt == CRT_WRITE && pg->cp_type == CPT_CACHEABLE,
+ equi(result == 0,
+ PageWriteback(cl_page_vmpage(env, pg)))));
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+ return result;
+}
+EXPORT_SYMBOL(cl_page_prep);
+
+/**
+ * Notify layers about transfer completion.
+ *
+ * Invoked by transfer sub-system (which is a part of osc) to notify layers
+ * that a transfer, of which this page is a part of has completed.
+ *
+ * Completion call-backs are executed in the bottom-up order, so that
+ * uppermost layer (llite), responsible for the VFS/VM interaction runs last
+ * and can release locks safely.
+ *
+ * A successful read (\a crt == CRT_READ and \a ioret == 0) sets
+ * CPF_READ_COMPLETED in cl_page::cp_flags. If the page has a
+ * cl_page::cp_sync_io anchor, the anchor is detached from the page before
+ * the page reference is dropped and is notified with \a ioret afterwards.
+ *
+ * \param ioret result of the transfer, forwarded to cpo_completion() and to
+ *              cl_sync_io_note()
+ *
+ * \pre pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ * \post pg->cp_state == CPS_CACHED
+ *
+ * \see cl_page_operations::cpo_completion()
+ */
+void cl_page_completion(const struct lu_env *env,
+ struct cl_page *pg, enum cl_req_type crt, int ioret)
+{
+ struct cl_sync_io *anchor = pg->cp_sync_io;
+
+ PASSERT(env, pg, crt < CRT_NR);
+ /* cl_page::cp_req already cleared by the caller (osc_completion()) */
+ PASSERT(env, pg, pg->cp_req == NULL);
+ PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
+
+ ENTRY;
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret);
+ if (crt == CRT_READ && ioret == 0) {
+ PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED));
+ pg->cp_flags |= CPF_READ_COMPLETED;
+ }
+
+ cl_page_state_set(env, pg, CPS_CACHED);
+ if (crt >= CRT_NR)
+ return; /* NOTE(review): leaves without EXIT, cl_page_put() or anchor notification */
+ CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(io[crt].cpo_completion),
+ (const struct lu_env *,
+ const struct cl_page_slice *, int), ioret);
+ if (anchor) {
+ LASSERT(cl_page_is_vmlocked(env, pg));
+ LASSERT(pg->cp_sync_io == anchor);
+ pg->cp_sync_io = NULL;
+ }
+ /*
+ * As page->cp_obj is pinned by a reference from page->cp_req, it is
+ * safe to call cl_page_put() without risking object destruction in a
+ * non-blocking context.
+ */
+ cl_page_put(env, pg);
+
+ if (anchor)
+ cl_sync_io_note(anchor, ioret); /* uses the pointer saved on entry, pg is not touched */
+
+ EXIT;
+}
+EXPORT_SYMBOL(cl_page_completion);
+
+/**
+ * Notify layers that transfer formation engine decided to yank this page from
+ * the cache and to make it a part of a transfer.
+ *
+ * When every layer's cpo_make_ready() agrees (returns 0) the page is moved
+ * into the transfer state for \a crt by cl_page_io_start().
+ *
+ * \pre pg->cp_state == CPS_CACHED
+ * \post pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ *
+ * \retval 0       page is now in a transfer state
+ * \retval -EINVAL \a crt is not a valid request type (crt >= CRT_NR)
+ * \retval other   value returned by the layers' cpo_make_ready() methods
+ *
+ * \see cl_page_operations::cpo_make_ready()
+ */
+int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
+ enum cl_req_type crt)
+{
+ int result;
+
+ PINVRNT(env, pg, crt < CRT_NR);
+
+ ENTRY;
+ if (crt >= CRT_NR)
+ RETURN(-EINVAL);
+ result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(io[crt].cpo_make_ready),
+ (const struct lu_env *,
+ const struct cl_page_slice *));
+ if (result == 0) {
+ PASSERT(env, pg, pg->cp_state == CPS_CACHED);
+ cl_page_io_start(env, pg, crt);
+ }
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_make_ready);
+
+/**
+ * Notify layers that high level io decided to place this page into a cache
+ * for future transfer.
+ *
+ * The layer implementing transfer engine (osc) has to register this page in
+ * its queues.
+ *
+ * The slices on cl_page::cp_layers are visited in list order; layers that do
+ * not implement cpo_cache_add() for \a crt are skipped and the walk stops at
+ * the first layer that returns a non-zero value.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ * \post cl_page_is_owned(pg, io)
+ *
+ * \retval 0       every implementing layer accepted the page
+ * \retval -EINVAL \a crt is not a valid request type (crt >= CRT_NR)
+ * \retval other   first non-zero value returned by a cpo_cache_add() method
+ *
+ * \see cl_page_operations::cpo_cache_add()
+ */
+int cl_page_cache_add(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg, enum cl_req_type crt)
+{
+ const struct cl_page_slice *scan;
+ int result = 0;
+
+ PINVRNT(env, pg, crt < CRT_NR);
+ PINVRNT(env, pg, cl_page_is_owned(pg, io));
+ PINVRNT(env, pg, cl_page_invariant(pg));
+
+ ENTRY;
+
+ if (crt >= CRT_NR)
+ RETURN(-EINVAL);
+
+ list_for_each_entry(scan, &pg->cp_layers, cpl_linkage) {
+ if (scan->cpl_ops->io[crt].cpo_cache_add == NULL)
+ continue;
+
+ result = scan->cpl_ops->io[crt].cpo_cache_add(env, scan, io);
+ if (result != 0)
+ break;
+ }
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result);
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_cache_add);
+
+/**
+ * Called if a page is being written back at the kernel's initiative.
+ *
+ * Forwards the request to the layers through cl_page_invoke() and returns
+ * their result.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ * \post ergo(result == 0, pg->cp_state == CPS_PAGEOUT)
+ *
+ * \see cl_page_operations::cpo_flush()
+ */
+int cl_page_flush(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *pg)
+{
+ int result;
+
+ PINVRNT(env, pg, cl_page_is_owned(pg, io));
+ PINVRNT(env, pg, cl_page_invariant(pg));
+
+ ENTRY;
+
+ result = cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_flush));
+
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d\n", result);
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_flush);
+
+/**
+ * Checks whether page is protected by any extent lock of at least the
+ * required mode.
+ *
+ * The result is never 0: the function asserts that the layers produced a
+ * non-zero answer.
+ *
+ * \return the same as in cl_page_operations::cpo_is_under_lock() method.
+ * \see cl_page_operations::cpo_is_under_lock()
+ */
+int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
+ struct cl_page *page)
+{
+ int rc;
+
+ PINVRNT(env, page, cl_page_invariant(page));
+
+ ENTRY;
+ rc = CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_is_under_lock),
+ (const struct lu_env *,
+ const struct cl_page_slice *, struct cl_io *),
+ io);
+ PASSERT(env, page, rc != 0);
+ RETURN(rc);
+}
+EXPORT_SYMBOL(cl_page_is_under_lock);
+
+/*
+ * cl_page_gang_lookup() callback used by cl_pages_prune().
+ *
+ * Takes ownership of \a page, unmaps it, discards it and gives the ownership
+ * back. The results of the individual steps are not examined and \a cbdata
+ * is unused; the callback always returns CLP_GANG_OKAY.
+ */
+static int page_prune_cb(const struct lu_env *env, struct cl_io *io,
+			 struct cl_page *page, void *cbdata)
+{
+	cl_page_own(env, io, page);
+	cl_page_unmap(env, io, page);
+	cl_page_discard(env, io, page);
+	cl_page_disown(env, io, page);
+
+	/* always tell the gang lookup to keep going */
+	return CLP_GANG_OKAY;
+}
+
+/**
+ * Purges all cached pages belonging to the object \a obj.
+ *
+ * Works on the top object of \a clobj. A CIT_MISC io taken from the
+ * per-thread cl_thread_info is initialized only so that the page functions
+ * have an io to work with; page_prune_cb() is then applied to every page in
+ * the index range [0, CL_PAGE_EOF], and the lookup is restarted until it
+ * reports CLP_GANG_OKAY.
+ *
+ * \return io->ci_result if cl_io_init() fails, otherwise the final lookup
+ *         result (CLP_GANG_OKAY, the only value that ends the loop)
+ */
+int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj)
+{
+ struct cl_thread_info *info;
+ struct cl_object *obj = cl_object_top(clobj);
+ struct cl_io *io;
+ int result;
+
+ ENTRY;
+ info = cl_env_info(env);
+ io = &info->clt_io;
+
+ /*
+ * initialize the io. This is ugly since we never do IO in this
+ * function, we just make cl_page_list functions happy. -jay
+ */
+ io->ci_obj = obj;
+ io->ci_ignore_layout = 1;
+ result = cl_io_init(env, io, CIT_MISC, obj);
+ if (result != 0) {
+ cl_io_fini(env, io);
+ RETURN(io->ci_result);
+ }
+
+ do {
+ result = cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF,
+ page_prune_cb, NULL);
+ if (result == CLP_GANG_RESCHED)
+ cond_resched(); /* yield before restarting the lookup */
+ } while (result != CLP_GANG_OKAY);
+
+ cl_io_fini(env, io);
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_pages_prune);
+
+/**
+ * Tells transfer engine that only part of a page is to be transmitted.
+ *
+ * \param from forwarded unchanged to every layer's cpo_clip() method
+ * \param to   forwarded unchanged to every layer's cpo_clip() method
+ *
+ * \see cl_page_operations::cpo_clip()
+ */
+void cl_page_clip(const struct lu_env *env, struct cl_page *pg,
+ int from, int to)
+{
+ PINVRNT(env, pg, cl_page_invariant(pg));
+
+ CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", from, to);
+ CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_clip),
+ (const struct lu_env *,
+ const struct cl_page_slice *,int, int),
+ from, to);
+}
+EXPORT_SYMBOL(cl_page_clip);
+
+/**
+ * Prints a one-line human readable summary of \a pg through \a printer.
+ *
+ * The line shows, in order: the page address, reference count, object,
+ * index, parent, child, state, error, type, owner, request and flags.
+ *
+ * \param cookie  opaque argument handed to \a printer
+ * \param printer output function
+ */
+void cl_page_header_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer, const struct cl_page *pg)
+{
+ (*printer)(env, cookie,
+ "page@%p[%d %p:%lu ^%p_%p %d %d %d %p %p %#x]\n",
+ pg, atomic_read(&pg->cp_ref), pg->cp_obj,
+ pg->cp_index, pg->cp_parent, pg->cp_child,
+ pg->cp_state, pg->cp_error, pg->cp_type,
+ pg->cp_owner, pg->cp_req, pg->cp_flags);
+}
+EXPORT_SYMBOL(cl_page_header_print);
+
+/**
+ * Prints human readable representation of \a pg through \a printer.
+ *
+ * Output consists of one cl_page_header_print() line for the top page and
+ * for every page below it, followed by whatever the layers print from their
+ * cpo_print() methods and a closing "end page" line.
+ *
+ * \param cookie  opaque argument handed to \a printer
+ * \param printer output function
+ */
+void cl_page_print(const struct lu_env *env, void *cookie,
+ lu_printer_t printer, const struct cl_page *pg)
+{
+ struct cl_page *scan;
+
+ for (scan = cl_page_top((struct cl_page *)pg);
+ scan != NULL; scan = scan->cp_child)
+ cl_page_header_print(env, cookie, printer, scan);
+ CL_PAGE_INVOKE(env, (struct cl_page *)pg, CL_PAGE_OP(cpo_print),
+ (const struct lu_env *env,
+ const struct cl_page_slice *slice,
+ void *cookie, lu_printer_t p), cookie, printer);
+ (*printer)(env, cookie, "end page@%p\n", pg);
+}
+EXPORT_SYMBOL(cl_page_print);
+
+/**
+ * Cancel a page which is still in a transfer.
+ *
+ * \return the value produced by CL_PAGE_INVOKE() for the layers'
+ *         cpo_cancel() methods
+ */
+int cl_page_cancel(const struct lu_env *env, struct cl_page *page)
+{
+ return CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_cancel),
+ (const struct lu_env *,
+ const struct cl_page_slice *));
+}
+EXPORT_SYMBOL(cl_page_cancel);
+
+/**
+ * Converts a page index into a byte offset within object \a obj.
+ *
+ * The offset is \a idx shifted left by PAGE_CACHE_SHIFT, computed in loff_t;
+ * \a obj is currently not consulted.
+ */
+loff_t cl_offset(const struct cl_object *obj, pgoff_t idx)
+{
+ /*
+ * XXX for now.
+ */
+ return (loff_t)idx << PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_offset);
+
+/**
+ * Converts a byte offset within object \a obj into a page index.
+ *
+ * The index is \a offset shifted right by PAGE_CACHE_SHIFT; \a obj is
+ * currently not consulted.
+ */
+pgoff_t cl_index(const struct cl_object *obj, loff_t offset)
+{
+ /*
+ * XXX for now.
+ */
+ return offset >> PAGE_CACHE_SHIFT;
+}
+EXPORT_SYMBOL(cl_index);
+
+/*
+ * Returns the size of a page in bytes, i.e. 1 << PAGE_CACHE_SHIFT. The
+ * object argument is currently not consulted.
+ */
+int cl_page_size(const struct cl_object *obj)
+{
+	const int bytes = 1 << PAGE_CACHE_SHIFT;
+
+	return bytes;
+}
+EXPORT_SYMBOL(cl_page_size);
+
+/**
+ * Adds page slice to the compound page.
+ *
+ * This is called by cl_object_operations::coo_page_init() methods to add a
+ * per-layer state to the page. New state is added at the end of
+ * cl_page::cp_layers list, that is, it is at the bottom of the stack.
+ *
+ * \param page  page the slice becomes part of (stored in cpl_page)
+ * \param slice slice to link in
+ * \param obj   object stored in the slice's cpl_obj
+ * \param ops   operation vector stored in the slice's cpl_ops
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add()
+ */
+void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
+ struct cl_object *obj,
+ const struct cl_page_operations *ops)
+{
+ ENTRY;
+ list_add_tail(&slice->cpl_linkage, &page->cp_layers);
+ slice->cpl_obj = obj;
+ slice->cpl_ops = ops;
+ slice->cpl_page = page;
+ EXIT;
+}
+EXPORT_SYMBOL(cl_page_slice_add);
+
+/* cl_page has nothing to set up at the moment; always returns 0. */
+int cl_page_init(void)
+{
+	return 0;
+}
+
+/* Counterpart of cl_page_init(); there is nothing to tear down. */
+void cl_page_fini(void)
+{
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c
new file mode 100644
index 000000000000..af1c2d09c47b
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c
@@ -0,0 +1,689 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+# include <asm/atomic.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+#include <linux/lnet/lnetctl.h>
+#include <lustre_debug.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_build_version.h>
+#include <linux/list.h>
+#include <cl_object.h>
+#include "llog_internal.h"
+
+
+struct obd_device *obd_devs[MAX_OBD_DEVICES];
+EXPORT_SYMBOL(obd_devs);
+struct list_head obd_types; /* initialized with INIT_LIST_HEAD() in init_obdclass() */
+DEFINE_RWLOCK(obd_dev_lock);
+
+__u64 obd_max_pages = 0;
+__u64 obd_max_alloc = 0;
+DEFINE_SPINLOCK(obd_updatemax_lock);
+
+/* The following are visible and mutable through /proc/sys/lustre/. */
+unsigned int obd_alloc_fail_rate = 0; /* compared against a masked random number in obd_alloc_fail() */
+EXPORT_SYMBOL(obd_alloc_fail_rate);
+unsigned int obd_debug_peer_on_timeout;
+EXPORT_SYMBOL(obd_debug_peer_on_timeout);
+unsigned int obd_dump_on_timeout;
+EXPORT_SYMBOL(obd_dump_on_timeout);
+unsigned int obd_dump_on_eviction;
+EXPORT_SYMBOL(obd_dump_on_eviction);
+unsigned int obd_max_dirty_pages = 256;
+EXPORT_SYMBOL(obd_max_dirty_pages);
+atomic_t obd_dirty_pages;
+EXPORT_SYMBOL(obd_dirty_pages);
+unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */
+EXPORT_SYMBOL(obd_timeout);
+unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */
+EXPORT_SYMBOL(ldlm_timeout);
+unsigned int obd_timeout_set;
+EXPORT_SYMBOL(obd_timeout_set);
+unsigned int ldlm_timeout_set;
+EXPORT_SYMBOL(ldlm_timeout_set);
+/* Adaptive timeout defs (at_min, at_max, at_history, at_early_margin and
+ * at_extra) live here instead of in the ptlrpc module for /proc/sys/
+ * access. */
+unsigned int at_min = 0;
+EXPORT_SYMBOL(at_min);
+unsigned int at_max = 600;
+EXPORT_SYMBOL(at_max);
+unsigned int at_history = 600;
+EXPORT_SYMBOL(at_history);
+int at_early_margin = 5;
+EXPORT_SYMBOL(at_early_margin);
+int at_extra = 30;
+EXPORT_SYMBOL(at_extra);
+
+atomic_t obd_dirty_transit_pages;
+EXPORT_SYMBOL(obd_dirty_transit_pages);
+
+char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = JOBSTATS_DISABLE; /* selects the jobid source, see lustre_get_jobid() */
+EXPORT_SYMBOL(obd_jobid_var);
+
+/* Get jobid of current process by reading the environment variable
+ * stored in between the "env_start" & "env_end" of task struct.
+ *
+ * \a jobid must point to a buffer of at least JOBSTATS_JOBID_SIZE bytes; it
+ * is zeroed first. Depending on obd_jobid_var the buffer is then:
+ *  - left empty when the variable equals JOBSTATS_DISABLE (returns 0),
+ *  - filled with "<comm>.<fsuid>" of the current task when it equals
+ *    JOBSTATS_PROCNAME_UID (returns 0),
+ *  - otherwise filled by cfs_get_environ() with the value of the
+ *    environment variable named by obd_jobid_var; the return value is then
+ *    whatever cfs_get_environ() returned. -EOVERFLOW is reported on the
+ *    console only once.
+ *
+ * TODO:
+ * It's better to cache the jobid for later use if there is any
+ * efficient way, the cl_env code probably could be reused for this
+ * purpose.
+ *
+ * If some job scheduler doesn't store jobid in the "env_start/end",
+ * then an upcall could be issued here to get the jobid by utilizing
+ * the userspace tools/api. Then, the jobid must be cached.
+ */
+int lustre_get_jobid(char *jobid)
+{
+ int jobid_len = JOBSTATS_JOBID_SIZE;
+ int rc = 0;
+ ENTRY;
+
+ memset(jobid, 0, JOBSTATS_JOBID_SIZE);
+ /* Jobstats isn't enabled */
+ if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0)
+ RETURN(0);
+
+ /* Use process name + fsuid as jobid */
+ if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) {
+ snprintf(jobid, JOBSTATS_JOBID_SIZE, "%s.%u",
+ current_comm(), current_fsuid());
+ RETURN(0);
+ }
+
+ rc = cfs_get_environ(obd_jobid_var, jobid, &jobid_len);
+ if (rc) {
+ if (rc == -EOVERFLOW) {
+ /* For the PBS_JOBID and LOADL_STEP_ID keys (which are
+ * variable length strings instead of just numbers), it
+ * might make sense to keep the unique parts for JobID,
+ * instead of just returning an error. That means a
+ * larger temp buffer for cfs_get_environ(), then
+ * truncating the string at some separator to fit into
+ * the specified jobid_len. Fix later if needed. */
+ static bool printed; /* limits the console message to one occurrence */
+ if (unlikely(!printed)) {
+ LCONSOLE_ERROR_MSG(0x16b, "%s value too large "
+ "for JobID buffer (%d)\n",
+ obd_jobid_var, jobid_len);
+ printed = true;
+ }
+ } else {
+ CDEBUG((rc == -ENOENT || rc == -EINVAL ||
+ rc == -EDEADLK) ? D_INFO : D_ERROR,
+ "Get jobid for (%s) failed: rc = %d\n",
+ obd_jobid_var, rc);
+ }
+ }
+ RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_get_jobid);
+
+/*
+ * Decides whether an allocation has to be treated as failed and, if so,
+ * logs the failure together with Lustre/LNET memory usage.
+ *
+ * An allocation counts as failed when \a ptr is NULL, or when a random
+ * number masked with OBD_ALLOC_FAIL_MASK is below obd_alloc_fail_rate (a
+ * forced failure, logged with a "force " prefix).
+ *
+ * \param ptr  result of the allocation attempt
+ * \param name name printed as the thing being allocated
+ * \param type prefix printed in front of "alloc" in the message
+ * \param size requested size in bytes
+ * \param file source file of the allocation site
+ * \param line source line of the allocation site
+ *
+ * \retval 1 the allocation is to be treated as failed (messages were logged)
+ * \retval 0 the allocation succeeded
+ */
+int obd_alloc_fail(const void *ptr, const char *name, const char *type,
+		   size_t size, const char *file, int line)
+{
+	if (ptr == NULL ||
+	    (cfs_rand() & OBD_ALLOC_FAIL_MASK) < obd_alloc_fail_rate) {
+		CERROR("%s%salloc of %s ("LPU64" bytes) failed at %s:%d\n",
+		       ptr ? "force " :"", type, name, (__u64)size, file,
+		       line);
+		/*
+		 * Arguments follow the format string: total bytes, total
+		 * pages, then those pages expressed in bytes.
+		 */
+		CERROR(LPU64" total bytes and "LPU64" total pages "
+		       "("LPU64" bytes) allocated by Lustre, "
+		       "%d total bytes by LNET\n",
+		       obd_memory_sum(),
+		       obd_pages_sum(),
+		       obd_pages_sum() << PAGE_CACHE_SHIFT,
+		       atomic_read(&libcfs_kmemory));
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(obd_alloc_fail);
+
+/* Fills \a conn with zeroes and then copies the cookie from \a data. */
+static inline void obd_data2conn(struct lustre_handle *conn,
+				 struct obd_ioctl_data *data)
+{
+	memset(conn, 0, sizeof(*conn));
+	conn->cookie = data->ioc_cookie;
+}
+
+/* Stores the cookie of \a conn in \a data; the inverse of obd_data2conn(). */
+static inline void obd_conn2data(struct obd_ioctl_data *data,
+				 struct lustre_handle *conn)
+{
+	data->ioc_cookie = conn->cookie;
+}
+
+/*
+ * Translates a device name into a device number.
+ *
+ * \param len  size of the \a name buffer, including the terminating NUL
+ * \param name NUL-terminated device name
+ *
+ * \retval dev     number returned by class_name2dev() for \a name
+ * \retval -EINVAL no name was passed, the name is not NUL-terminated at
+ *                 name[len - 1], or class_name2dev() returned -1
+ */
+int class_resolve_dev_name(__u32 len, const char *name)
+{
+	int rc;
+
+	ENTRY;
+	if (len == 0 || name == NULL) {
+		CERROR("No name passed,!\n");
+		GOTO(out, rc = -EINVAL);
+	}
+	if (name[len - 1] != '\0') {
+		CERROR("Name not nul terminated!\n");
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CDEBUG(D_IOCTL, "device name %s\n", name);
+	rc = class_name2dev(name);
+	if (rc == -1) {
+		CDEBUG(D_IOCTL, "No device for name %s!\n", name);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, rc);
+out:
+	RETURN(rc);
+}
+
+/*
+ * Handles an obd ioctl.
+ *
+ * \param cmd ioctl number
+ * \param arg user-space address of the ioctl payload; it is only accessed
+ *            through copy_from_user(), obd_ioctl_getdata() and
+ *            obd_ioctl_popdata(), never dereferenced directly
+ *
+ * Processing happens in three steps:
+ *  - LIBCFS_IOC_DEBUG_MASK is served immediately (debugging aid);
+ *  - commands that need no device (config processing, version query,
+ *    name/uuid lookup, device listing) are served by the first switch;
+ *  - everything else is routed to a device, selected either by name
+ *    (ioc_dev == OBD_DEV_BY_DEVNAME, name in ioc_inlbuf4) or by index, which
+ *    must be set up and not stopping.
+ *
+ * \retval 0       success
+ * \retval -EFAULT data could not be copied from or to user space
+ * \retval -EINVAL malformed request, unknown device or device not set up
+ * \retval -ve     other errors reported by the command handlers
+ */
+int class_handle_ioctl(unsigned int cmd, unsigned long arg)
+{
+	char *buf = NULL;
+	struct obd_ioctl_data *data;
+	struct libcfs_debug_ioctl_data debug_data;
+	struct obd_device *obd = NULL;
+	int err = 0, len = 0;
+	ENTRY;
+
+	/* only for debugging */
+	if (cmd == LIBCFS_IOC_DEBUG_MASK) {
+		/* arg is a user-space address: copy it in before use */
+		if (copy_from_user(&debug_data, (void *)arg,
+				   sizeof(debug_data)))
+			RETURN(-EFAULT);
+		libcfs_subsystem_debug = debug_data.subs;
+		libcfs_debug = debug_data.debug;
+		RETURN(0);
+	}
+
+	CDEBUG(D_IOCTL, "cmd = %x\n", cmd);
+	if (obd_ioctl_getdata(&buf, &len, (void *)arg)) {
+		CERROR("OBD ioctl: data error\n");
+		RETURN(-EINVAL);
+	}
+	data = (struct obd_ioctl_data *)buf;
+
+	/*
+	 * Commands that do not operate on a particular device. Anything not
+	 * listed here falls through to the device lookup below.
+	 */
+	switch (cmd) {
+	case OBD_IOC_PROCESS_CFG: {
+		struct lustre_cfg *lcfg;
+
+		if (!data->ioc_plen1 || !data->ioc_pbuf1) {
+			CERROR("No config buffer passed!\n");
+			GOTO(out, err = -EINVAL);
+		}
+		OBD_ALLOC(lcfg, data->ioc_plen1);
+		if (lcfg == NULL)
+			GOTO(out, err = -ENOMEM);
+		/*
+		 * copy_from_user() returns the number of bytes left
+		 * uncopied, which is not an errno: map it to -EFAULT.
+		 */
+		if (copy_from_user(lcfg, data->ioc_pbuf1, data->ioc_plen1))
+			err = -EFAULT;
+		if (!err)
+			err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1);
+		if (!err)
+			err = class_process_config(lcfg);
+
+		OBD_FREE(lcfg, data->ioc_plen1);
+		GOTO(out, err);
+	}
+
+	case OBD_GET_VERSION:
+		if (!data->ioc_inlbuf1) {
+			CERROR("No buffer passed in ioctl\n");
+			GOTO(out, err = -EINVAL);
+		}
+
+		if (strlen(BUILD_VERSION) + 1 > data->ioc_inllen1) {
+			CERROR("ioctl buffer too small to hold version\n");
+			GOTO(out, err = -EINVAL);
+		}
+
+		memcpy(data->ioc_bulk, BUILD_VERSION,
+		       strlen(BUILD_VERSION) + 1);
+
+		err = obd_ioctl_popdata((void *)arg, data, len);
+		if (err)
+			err = -EFAULT;
+		GOTO(out, err);
+
+	case OBD_IOC_NAME2DEV: {
+		/* Resolve a device name.  This does not change the
+		 * currently selected device.
+		 */
+		int dev;
+
+		dev = class_resolve_dev_name(data->ioc_inllen1,
+					     data->ioc_inlbuf1);
+		data->ioc_dev = dev;
+		if (dev < 0)
+			GOTO(out, err = -EINVAL);
+
+		err = obd_ioctl_popdata((void *)arg, data, sizeof(*data));
+		if (err)
+			err = -EFAULT;
+		GOTO(out, err);
+	}
+
+	case OBD_IOC_UUID2DEV: {
+		/* Resolve a device uuid.  This does not change the
+		 * currently selected device.
+		 */
+		int dev;
+		struct obd_uuid uuid;
+
+		if (!data->ioc_inllen1 || !data->ioc_inlbuf1) {
+			CERROR("No UUID passed!\n");
+			GOTO(out, err = -EINVAL);
+		}
+		if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) {
+			CERROR("UUID not NUL terminated!\n");
+			GOTO(out, err = -EINVAL);
+		}
+
+		CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1);
+		obd_str2uuid(&uuid, data->ioc_inlbuf1);
+		dev = class_uuid2dev(&uuid);
+		data->ioc_dev = dev;
+		if (dev == -1) {
+			CDEBUG(D_IOCTL, "No device for UUID %s!\n",
+			       data->ioc_inlbuf1);
+			GOTO(out, err = -EINVAL);
+		}
+
+		CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1,
+		       dev);
+		err = obd_ioctl_popdata((void *)arg, data, sizeof(*data));
+		if (err)
+			err = -EFAULT;
+		GOTO(out, err);
+	}
+
+	case OBD_IOC_CLOSE_UUID: {
+		CDEBUG(D_IOCTL, "closing all connections to uuid %s (NOOP)\n",
+		       data->ioc_inlbuf1);
+		GOTO(out, err = 0);
+	}
+
+	case OBD_IOC_GETDEVICE: {
+		int index = data->ioc_count;
+		char *status, *str;
+
+		if (!data->ioc_inlbuf1) {
+			CERROR("No buffer passed in ioctl\n");
+			GOTO(out, err = -EINVAL);
+		}
+		if (data->ioc_inllen1 < 128) {
+			CERROR("ioctl buffer too small to hold device info\n");
+			GOTO(out, err = -EINVAL);
+		}
+
+		obd = class_num2obd(index);
+		if (!obd)
+			GOTO(out, err = -ENOENT);
+
+		if (obd->obd_stopping)
+			status = "ST";
+		else if (obd->obd_set_up)
+			status = "UP";
+		else if (obd->obd_attached)
+			status = "AT";
+		else
+			status = "--";
+		str = (char *)data->ioc_bulk;
+		snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d",
+			 (int)index, status, obd->obd_type->typ_name,
+			 obd->obd_name, obd->obd_uuid.uuid,
+			 atomic_read(&obd->obd_refcount));
+		/* report a failed copy-out like every other command does */
+		err = obd_ioctl_popdata((void *)arg, data, len);
+		if (err)
+			err = -EFAULT;
+		GOTO(out, err);
+	}
+
+	}
+
+	/* Everything else needs a device: look it up by name or by index. */
+	if (data->ioc_dev == OBD_DEV_BY_DEVNAME) {
+		if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL)
+			GOTO(out, err = -EINVAL);
+		if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME)
+			GOTO(out, err = -EINVAL);
+		obd = class_name2obd(data->ioc_inlbuf4);
+	} else if (data->ioc_dev < class_devno_max()) {
+		obd = class_num2obd(data->ioc_dev);
+	} else {
+		CERROR("OBD ioctl: No device\n");
+		GOTO(out, err = -EINVAL);
+	}
+
+	if (obd == NULL) {
+		CERROR("OBD ioctl : No Device %d\n", data->ioc_dev);
+		GOTO(out, err = -EINVAL);
+	}
+	LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+
+	if (!obd->obd_set_up || obd->obd_stopping) {
+		CERROR("OBD ioctl: device not setup %d \n", data->ioc_dev);
+		GOTO(out, err = -EINVAL);
+	}
+
+	switch(cmd) {
+	case OBD_IOC_NO_TRANSNO: {
+		if (!obd->obd_attached) {
+			CERROR("Device %d not attached\n", obd->obd_minor);
+			GOTO(out, err = -ENODEV);
+		}
+		CDEBUG(D_HA, "%s: disabling committed-transno notification\n",
+		       obd->obd_name);
+		obd->obd_no_transno = 1;
+		GOTO(out, err = 0);
+	}
+
+	default: {
+		/* hand the command to the device, then copy the reply out */
+		err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL);
+		if (err)
+			GOTO(out, err);
+
+		err = obd_ioctl_popdata((void *)arg, data, len);
+		if (err)
+			err = -EFAULT;
+		GOTO(out, err);
+	}
+	}
+
+ out:
+	if (buf)
+		obd_ioctl_freedata(buf, len);
+	RETURN(err);
+} /* class_handle_ioctl */
+
+extern psdev_t obd_psdev;
+
+#define OBD_INIT_CHECK
+/*
+ * Start-up sanity checks of 64-bit arithmetic and of the LPU64/LPD64/LPX64
+ * printf formats, all performed on the value OBD_OBJECT_EOF.
+ *
+ * \retval 0          every check passed
+ * \retval -EOVERFLOW a 64-bit comparison, shift or do_div() check failed;
+ *                    returned immediately so that a later format failure
+ *                    cannot downgrade it (init_obdclass() aborts only on
+ *                    this value)
+ * \retval -EINVAL    a format or page-mask check failed
+ */
+int obd_init_checks(void)
+{
+	__u64 u64val, div64val;
+	char buf[64];
+	int len, ret = 0;
+
+	CDEBUG(D_INFO, "LPU64=%s, LPD64=%s, LPX64=%s\n", LPU64, LPD64, LPX64);
+
+	CDEBUG(D_INFO, "OBD_OBJECT_EOF = "LPX64"\n", (__u64)OBD_OBJECT_EOF);
+
+	u64val = OBD_OBJECT_EOF;
+	CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val);
+	if (u64val != OBD_OBJECT_EOF) {
+		CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+		       u64val, (int)sizeof(u64val));
+		ret = -EINVAL;
+	}
+	len = snprintf(buf, sizeof(buf), LPX64, u64val);
+	if (len != 18) {
+		CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len);
+		ret = -EINVAL;
+	}
+
+	div64val = OBD_OBJECT_EOF;
+	CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val);
+	if (u64val != OBD_OBJECT_EOF) {
+		CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+		       u64val, (int)sizeof(u64val));
+		/* fatal like the overflow checks below: do not keep going */
+		return -EOVERFLOW;
+	}
+	if (u64val >> 8 != OBD_OBJECT_EOF >> 8) {
+		CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n",
+		       u64val, (int)sizeof(u64val));
+		return -EOVERFLOW;
+	}
+	/* do_div() leaves the quotient in div64val and yields the remainder */
+	if (do_div(div64val, 256) != (u64val & 255)) {
+		CERROR("do_div("LPX64",256) != "LPU64"\n", u64val, u64val &255);
+		return -EOVERFLOW;
+	}
+	if (u64val >> 8 != div64val) {
+		CERROR("do_div("LPX64",256) "LPU64" != "LPU64"\n",
+		       u64val, div64val, u64val >> 8);
+		return -EOVERFLOW;
+	}
+	len = snprintf(buf, sizeof(buf), LPX64, u64val);
+	if (len != 18) {
+		CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len);
+		ret = -EINVAL;
+	}
+	len = snprintf(buf, sizeof(buf), LPU64, u64val);
+	if (len != 20) {
+		CWARN("LPU64 wrong length! strlen(%s)=%d != 20\n", buf, len);
+		ret = -EINVAL;
+	}
+	len = snprintf(buf, sizeof(buf), LPD64, u64val);
+	if (len != 2) {
+		CWARN("LPD64 wrong length! strlen(%s)=%d != 2\n", buf, len);
+		ret = -EINVAL;
+	}
+	if ((u64val & ~CFS_PAGE_MASK) >= PAGE_CACHE_SIZE) {
+		CWARN("mask failed: u64val "LPU64" >= "LPU64"\n", u64val,
+		      (__u64)PAGE_CACHE_SIZE);
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
+
+extern spinlock_t obd_types_lock;
+extern int class_procfs_init(void);
+extern int class_procfs_clean(void);
+
+/*
+ * Module init entry point for obdclass (registered through cfs_module()).
+ *
+ * Initializes the capability lists, the obd type lock, the optional lprocfs
+ * memory statistics, the uuid list and handle table, registers the obd misc
+ * device, sizes the dirty page cap, then brings up caches, procfs, the lu
+ * and cl layers, llog info and finally registers the filesystem.
+ *
+ * Returns 0 on success or the first error encountered.
+ *
+ * NOTE(review): none of the error returns below undo the steps that already
+ * succeeded (e.g. misc_register() is not paired with misc_deregister() on a
+ * later failure) -- confirm whether that is intended.
+ */
+static int __init init_obdclass(void)
+{
+ int i, err;
+ int lustre_register_fs(void);
+
+ for (i = CAPA_SITE_CLIENT; i < CAPA_SITE_MAX; i++)
+ INIT_LIST_HEAD(&capa_list[i]);
+
+ LCONSOLE_INFO("Lustre: Build Version: "BUILD_VERSION"\n");
+
+ spin_lock_init(&obd_types_lock);
+ obd_zombie_impexp_init();
+#ifdef LPROCFS
+ obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM,
+ LPROCFS_STATS_FLAG_NONE |
+ LPROCFS_STATS_FLAG_IRQ_SAFE);
+ if (obd_memory == NULL) {
+ CERROR("kmalloc of 'obd_memory' failed\n");
+ RETURN(-ENOMEM);
+ }
+
+ lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT,
+ LPROCFS_CNTR_AVGMINMAX,
+ "memused", "bytes");
+ lprocfs_counter_init(obd_memory, OBD_MEMORY_PAGES_STAT,
+ LPROCFS_CNTR_AVGMINMAX,
+ "pagesused", "pages");
+#endif
+ /* only -EOVERFLOW from the self test is fatal; any other result
+ * (including -EINVAL) is overwritten by the calls below */
+ err = obd_init_checks();
+ if (err == -EOVERFLOW)
+ return err;
+
+ class_init_uuidlist();
+ err = class_handle_init();
+ if (err)
+ return err;
+
+ INIT_LIST_HEAD(&obd_types);
+
+ err = misc_register(&obd_psdev);
+ if (err) {
+ CERROR("cannot register %d err %d\n", OBD_DEV_MINOR, err);
+ return err;
+ }
+
+ /* This struct is already zeroed for us (static global) */
+ for (i = 0; i < class_devno_max(); i++)
+ obd_devs[i] = NULL;
+
+ /* Default the dirty page cache cap to 1/2 of system memory.
+ * For clients with less memory, a larger fraction is needed
+ * for other purposes (mostly for BGL). */
+ /* the threshold is 512 << (20 - PAGE_CACHE_SHIFT) pages, i.e. 512MB */
+ if (num_physpages <= 512 << (20 - PAGE_CACHE_SHIFT))
+ obd_max_dirty_pages = num_physpages / 4;
+ else
+ obd_max_dirty_pages = num_physpages / 2;
+
+ err = obd_init_caches();
+ if (err)
+ return err;
+ err = class_procfs_init();
+ if (err)
+ return err;
+
+ err = lu_global_init();
+ if (err)
+ return err;
+
+ err = cl_global_init();
+ if (err != 0)
+ return err;
+
+
+ err = llog_info_init();
+ if (err)
+ return err;
+
+ /* last step: its result is the result of module init */
+ err = lustre_register_fs();
+
+ return err;
+}
+
+/*
+ * Sample the current page and memory totals and raise the recorded maxima
+ * (obd_max_pages, obd_max_alloc) if the samples exceed them.  The totals are
+ * read before obd_updatemax_lock is taken; only the compare-and-store runs
+ * under the lock.
+ */
+void obd_update_maxusage(void)
+{
+	__u64 pages_now = obd_pages_sum();
+	__u64 bytes_now = obd_memory_sum();
+
+	spin_lock(&obd_updatemax_lock);
+	if (obd_max_pages < pages_now)
+		obd_max_pages = pages_now;
+	if (obd_max_alloc < bytes_now)
+		obd_max_alloc = bytes_now;
+	spin_unlock(&obd_updatemax_lock);
+}
+EXPORT_SYMBOL(obd_update_maxusage);
+
+#ifdef LPROCFS
+/*
+ * Return the value of obd_max_alloc, read under obd_updatemax_lock.
+ */
+__u64 obd_memory_max(void)
+{
+	__u64 peak_alloc;
+
+	spin_lock(&obd_updatemax_lock);
+	peak_alloc = obd_max_alloc;
+	spin_unlock(&obd_updatemax_lock);
+
+	return peak_alloc;
+}
+EXPORT_SYMBOL(obd_memory_max);
+
+/*
+ * Return the value of obd_max_pages, read under obd_updatemax_lock.
+ */
+__u64 obd_pages_max(void)
+{
+	__u64 peak_pages;
+
+	spin_lock(&obd_updatemax_lock);
+	peak_pages = obd_max_pages;
+	spin_unlock(&obd_updatemax_lock);
+
+	return peak_pages;
+}
+EXPORT_SYMBOL(obd_pages_max);
+#endif
+
+/* liblustre doesn't call cleanup_obdclass, apparently. we carry on in this
+ * ifdef to the end of the file to cover module and versioning goo.*/
+/*
+ * Module exit entry point for obdclass (registered through cfs_module()).
+ *
+ * Unregisters the filesystem and the obd misc device, calls the detach
+ * method of every set-up device that provides one, shuts down the llog, cl
+ * and lu layers, caches, sysctl, procfs, handle table, uuid list and zombie
+ * import/export handling, and finally logs the peak and leaked memory/page
+ * counters (at D_ERROR level when something leaked).
+ */
+static void cleanup_obdclass(void)
+{
+ int i;
+ int lustre_unregister_fs(void);
+ __u64 memory_leaked, pages_leaked;
+ __u64 memory_max, pages_max;
+ ENTRY;
+
+ lustre_unregister_fs();
+
+ misc_deregister(&obd_psdev);
+ for (i = 0; i < class_devno_max(); i++) {
+ struct obd_device *obd = class_num2obd(i);
+ /* only devices that are set up and have a detach method */
+ if (obd && obd->obd_set_up &&
+ OBT(obd) && OBP(obd, detach)) {
+ /* XXX should this call generic detach otherwise? */
+ LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+ OBP(obd, detach)(obd);
+ }
+ }
+ llog_info_fini();
+ cl_global_fini();
+ lu_global_fini();
+
+ obd_cleanup_caches();
+ obd_sysctl_clean();
+
+ class_procfs_clean();
+
+ class_handle_cleanup();
+ class_exit_uuidlist();
+ obd_zombie_impexp_stop();
+
+ /* sample the counters before the stats they live in are freed */
+ memory_leaked = obd_memory_sum();
+ pages_leaked = obd_pages_sum();
+
+ /* NOTE(review): obd_memory_max()/obd_pages_max() are defined under
+ * #ifdef LPROCFS above but used unconditionally here -- confirm the
+ * !LPROCFS build provides them elsewhere. */
+ memory_max = obd_memory_max();
+ pages_max = obd_pages_max();
+
+ lprocfs_free_stats(&obd_memory);
+ CDEBUG((memory_leaked) ? D_ERROR : D_INFO,
+ "obd_memory max: "LPU64", leaked: "LPU64"\n",
+ memory_max, memory_leaked);
+ CDEBUG((pages_leaked) ? D_ERROR : D_INFO,
+ "obd_memory_pages max: "LPU64", leaked: "LPU64"\n",
+ pages_max, pages_leaked);
+
+ EXIT;
+}
+
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("Lustre Class Driver Build Version: " BUILD_VERSION);
+MODULE_LICENSE("GPL");
+
+cfs_module(obdclass, LUSTRE_VERSION_STRING, init_obdclass, cleanup_obdclass);
diff --git a/drivers/staging/lustre/lustre/obdclass/debug.c b/drivers/staging/lustre/lustre/obdclass/debug.c
new file mode 100644
index 000000000000..15f71bbb7276
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/debug.c
@@ -0,0 +1,124 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/debug.c
+ *
+ * Helper routines for dumping data structs for debugging.
+ */
+
+#define DEBUG_SUBSYSTEM D_OTHER
+
+
+#include <obd_ost.h>
+#include <obd_support.h>
+#include <lustre_debug.h>
+#include <lustre_net.h>
+
+/*
+ * Log the fields of a local niobuf (file offset, length, page pointer, rc)
+ * at D_RPCTRACE level, followed by the index of its page, or -1 when the
+ * niobuf has no page attached.
+ */
+void dump_lniobuf(struct niobuf_local *nb)
+{
+ CDEBUG(D_RPCTRACE,
+ "niobuf_local: file_offset="LPD64", len=%d, page=%p, rc=%d\n",
+ nb->lnb_file_offset, nb->len, nb->page, nb->rc);
+ CDEBUG(D_RPCTRACE, "nb->page: index = %ld\n",
+ nb->page ? page_index(nb->page) : -1);
+}
+EXPORT_SYMBOL(dump_lniobuf);
+
+/*
+ * Log a one-line summary of a stripe descriptor at the given debug level:
+ * pointer, object id, maxbytes, magic, stripe size and count, reference
+ * count, layout generation and pool name.
+ *
+ * \param level debug mask passed to CDEBUG()
+ * \param lsm   stripe metadata to dump; dereferenced unconditionally
+ */
+void dump_lsm(int level, struct lov_stripe_md *lsm)
+{
+ CDEBUG(level, "lsm %p, objid "DOSTID", maxbytes "LPX64", magic 0x%08X,"
+ " stripe_size %u, stripe_count %u, refc: %d,"
+ " layout_gen %u, pool ["LOV_POOLNAMEF"]\n", lsm,
+ POSTID(&lsm->lsm_oi), lsm->lsm_maxbytes, lsm->lsm_magic,
+ lsm->lsm_stripe_size, lsm->lsm_stripe_count,
+ atomic_read(&lsm->lsm_refc), lsm->lsm_layout_gen,
+ lsm->lsm_pool_name);
+}
+EXPORT_SYMBOL(dump_lsm);
+
+#define LPDS sizeof(__u64)
+/*
+ * Stamp a buffer with debugging markers: the little-endian encodings of
+ * \a off and \a id are written back to back at the start of the buffer and
+ * again in its last 2 * LPDS bytes.
+ *
+ * \param addr start of the buffer, must not be NULL
+ * \param len  length of the buffer in bytes
+ * \param off  offset value to record
+ * \param id   object id to record
+ *
+ * \retval 0 always
+ */
+int block_debug_setup(void *addr, int len, __u64 off, __u64 id)
+{
+	char *head;
+	char *tail;
+	__u64 le_off;
+	__u64 le_id;
+
+	LASSERT(addr);
+
+	le_off = cpu_to_le64 (off);
+	le_id = cpu_to_le64 (id);
+	head = addr;
+	tail = head + (len - LPDS - LPDS);
+
+	/* leading pair of markers */
+	memcpy(head, (char *)&le_off, LPDS);
+	memcpy(head + LPDS, (char *)&le_id, LPDS);
+
+	/* trailing pair of markers */
+	memcpy(tail, (char *)&le_off, LPDS);
+	memcpy(tail + LPDS, (char *)&le_id, LPDS);
+
+	return 0;
+}
+EXPORT_SYMBOL(block_debug_setup);
+
+/*
+ * Verify the markers written by block_debug_setup(): the offset and id
+ * stored at the start of the buffer and in its last 2 * LPDS bytes are
+ * compared with the expected values.  Every mismatch is logged at D_ERROR
+ * level with \a who as a prefix.
+ *
+ * \param who  label used in the log messages
+ * \param addr start of the buffer, must not be NULL
+ * \param end  length of the buffer in bytes
+ * \param off  expected offset value
+ * \param id   expected object id
+ *
+ * \retval 0       all four markers match
+ * \retval -EINVAL at least one marker differs
+ */
+int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id)
+{
+ __u64 ne_off;
+ int err = 0;
+
+ LASSERT(addr);
+
+ /* NOTE(review): block_debug_setup() stores cpu_to_le64() values, while
+ * le64_to_cpu() is used here to build the comparison values -- the
+ * direction of the conversion looks swapped; confirm. */
+ ne_off = le64_to_cpu (off);
+ id = le64_to_cpu (id);
+ /* markers at the start of the buffer */
+ if (memcmp(addr, (char *)&ne_off, LPDS)) {
+ CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" off: "LPX64" != "
+ LPX64"\n", who, id, off, *(__u64 *)addr, ne_off);
+ err = -EINVAL;
+ }
+ if (memcmp(addr + LPDS, (char *)&id, LPDS)) {
+ CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" id: "LPX64" != "LPX64"\n",
+ who, id, off, *(__u64 *)(addr + LPDS), id);
+ err = -EINVAL;
+ }
+
+ /* markers in the last 2 * LPDS bytes of the buffer */
+ addr += end - LPDS - LPDS;
+ if (memcmp(addr, (char *)&ne_off, LPDS)) {
+ CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end off: "LPX64" != "
+ LPX64"\n", who, id, off, *(__u64 *)addr, ne_off);
+ err = -EINVAL;
+ }
+ if (memcmp(addr + LPDS, (char *)&id, LPDS)) {
+ CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end id: "LPX64" != "
+ LPX64"\n", who, id, off, *(__u64 *)(addr + LPDS), id);
+ err = -EINVAL;
+ }
+
+ return err;
+}
+EXPORT_SYMBOL(block_debug_check);
+#undef LPDS
diff --git a/drivers/staging/lustre/lustre/obdclass/dt_object.c b/drivers/staging/lustre/lustre/obdclass/dt_object.c
new file mode 100644
index 000000000000..1c962dd3bd2f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/dt_object.c
@@ -0,0 +1,1055 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/dt_object.c
+ *
+ * Dt Object.
+ * Generic functions from dt_object.h
+ *
+ * Author: Nikita Danilov <nikita@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd.h>
+#include <dt_object.h>
+#include <linux/list.h>
+/* fid_be_to_cpu() */
+#include <lustre_fid.h>
+
+#include <lustre_quota.h>
+
+/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini */
+LU_KEY_INIT(dt_global, struct dt_thread_info);
+LU_KEY_FINI(dt_global, struct dt_thread_info);
+
+/* Context key for struct dt_thread_info; registered by dt_global_init()
+ * and valid for the MD, DT, MG and local context tags listed below. */
+struct lu_context_key dt_key = {
+ .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL,
+ .lct_init = dt_global_key_init,
+ .lct_fini = dt_global_key_fini
+};
+EXPORT_SYMBOL(dt_key);
+
+/*
+ * Register transaction callback \a cb on device \a dev by linking it into
+ * the device's callback list.
+ *
+ * The list is not protected by a lock because callbacks are added during
+ * system startup. Please refer to "struct dt_device".
+ */
+void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb)
+{
+	struct list_head *callbacks = &dev->dd_txn_callbacks;
+
+	list_add(&cb->dtc_linkage, callbacks);
+}
+EXPORT_SYMBOL(dt_txn_callback_add);
+
+/*
+ * Unlink transaction callback \a cb from whatever callback list it is on
+ * and reinitialize its linkage.  \a dev is not used.
+ */
+void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb)
+{
+	struct list_head *link = &cb->dtc_linkage;
+
+	list_del_init(link);
+}
+EXPORT_SYMBOL(dt_txn_callback_del);
+
+/*
+ * Run the dtc_txn_start hook of every callback registered on \a dev whose
+ * tag intersects the tags of the calling context.  Transactions marked
+ * th_local skip the hooks entirely.
+ *
+ * \retval 0        transaction is local, or no hook ran
+ * \retval negative value of the first hook that failed; later hooks are
+ *                  not called
+ * \retval other    value returned by the last hook that ran
+ */
+int dt_txn_hook_start(const struct lu_env *env,
+		      struct dt_device *dev, struct thandle *th)
+{
+	struct dt_txn_callback *hook;
+	int result = 0;
+
+	if (th->th_local)
+		return 0;
+
+	list_for_each_entry(hook, &dev->dd_txn_callbacks, dtc_linkage) {
+		if (hook->dtc_txn_start != NULL &&
+		    (hook->dtc_tag & env->le_ctx.lc_tags) != 0) {
+			result = hook->dtc_txn_start(env, th,
+						     hook->dtc_cookie);
+			if (result < 0)
+				break;
+		}
+	}
+	return result;
+}
+EXPORT_SYMBOL(dt_txn_hook_start);
+
+/*
+ * Run the dtc_txn_stop hook of every callback registered on the device of
+ * \a txn whose tag intersects the tags of the calling context.
+ * Transactions marked th_local skip the hooks entirely.
+ *
+ * \retval 0        transaction is local, or no hook ran
+ * \retval negative value of the first hook that failed; later hooks are
+ *                  not called
+ * \retval other    value returned by the last hook that ran
+ */
+int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn)
+{
+	struct dt_device *dev = txn->th_dev;
+	struct dt_txn_callback *hook;
+	int result = 0;
+
+	if (txn->th_local)
+		return 0;
+
+	list_for_each_entry(hook, &dev->dd_txn_callbacks, dtc_linkage) {
+		if (hook->dtc_txn_stop != NULL &&
+		    (hook->dtc_tag & env->le_ctx.lc_tags) != 0) {
+			result = hook->dtc_txn_stop(env, txn,
+						    hook->dtc_cookie);
+			if (result < 0)
+				break;
+		}
+	}
+	return result;
+}
+EXPORT_SYMBOL(dt_txn_hook_stop);
+
+/*
+ * Call the dtc_txn_commit hook, where one is set, of every callback
+ * registered on the device of \a txn.  Nothing is done for transactions
+ * marked th_local.  Unlike the start/stop hooks, no tag filtering is
+ * applied and the hooks return no status.
+ */
+void dt_txn_hook_commit(struct thandle *txn)
+{
+	struct dt_txn_callback *hook;
+	struct list_head *callbacks;
+
+	if (txn->th_local)
+		return;
+
+	callbacks = &txn->th_dev->dd_txn_callbacks;
+	list_for_each_entry(hook, callbacks, dtc_linkage) {
+		if (hook->dtc_txn_commit == NULL)
+			continue;
+		hook->dtc_txn_commit(txn, hook->dtc_cookie);
+	}
+}
+EXPORT_SYMBOL(dt_txn_hook_commit);
+
+/*
+ * Initialize a dt_device: start with an empty transaction callback list,
+ * then initialize the embedded lu_device with type \a t.
+ *
+ * \retval result of lu_device_init()
+ */
+int dt_device_init(struct dt_device *dev, struct lu_device_type *t)
+{
+	int rc;
+
+	INIT_LIST_HEAD(&dev->dd_txn_callbacks);
+	rc = lu_device_init(&dev->dd_lu_dev, t);
+	return rc;
+}
+EXPORT_SYMBOL(dt_device_init);
+
+/*
+ * Finalize a dt_device by finalizing its embedded lu_device.
+ */
+void dt_device_fini(struct dt_device *dev)
+{
+	struct lu_device *lu = &dev->dd_lu_dev;
+
+	lu_device_fini(lu);
+}
+EXPORT_SYMBOL(dt_device_fini);
+
+/*
+ * Initialize the lu_object embedded in \a obj with header \a h and device
+ * \a d.
+ *
+ * \retval result of lu_object_init()
+ */
+int dt_object_init(struct dt_object *obj,
+		   struct lu_object_header *h, struct lu_device *d)
+{
+	struct lu_object *lu = &obj->do_lu;
+
+	return lu_object_init(lu, h, d);
+}
+EXPORT_SYMBOL(dt_object_init);
+
+/*
+ * Finalize the lu_object embedded in \a obj.
+ */
+void dt_object_fini(struct dt_object *obj)
+{
+	struct lu_object *lu = &obj->do_lu;
+
+	lu_object_fini(lu);
+}
+EXPORT_SYMBOL(dt_object_fini);
+
+/*
+ * Make sure \a obj has index operations, asking the object to set itself
+ * up with the directory feature set when it has none yet.  The return value
+ * of do_index_try() is not examined; success is judged solely by whether
+ * do_index_ops is set afterwards.
+ *
+ * \retval 1 index operations are available
+ * \retval 0 otherwise
+ */
+int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj)
+{
+	if (!obj->do_index_ops)
+		obj->do_ops->do_index_try(env, obj, &dt_directory_features);
+
+	return obj->do_index_ops ? 1 : 0;
+}
+EXPORT_SYMBOL(dt_try_as_dir);
+
+/*
+ * Map the file-type bits of \a mode to a dt format type: directories to
+ * DFT_DIR, regular files to DFT_REGULAR, symlinks to DFT_SYM, and
+ * character/block devices, FIFOs and sockets to DFT_NODE.  Any other type
+ * triggers LBUG().
+ */
+enum dt_format_type dt_mode_to_dft(__u32 mode)
+{
+	enum dt_format_type result;
+	__u32 type = mode & S_IFMT;
+
+	if (type == S_IFDIR) {
+		result = DFT_DIR;
+	} else if (type == S_IFREG) {
+		result = DFT_REGULAR;
+	} else if (type == S_IFLNK) {
+		result = DFT_SYM;
+	} else if (type == S_IFCHR || type == S_IFBLK ||
+		   type == S_IFIFO || type == S_IFSOCK) {
+		result = DFT_NODE;
+	} else {
+		LBUG();
+	}
+	return result;
+}
+EXPORT_SYMBOL(dt_mode_to_dft);
+
+/**
+ * Look up the fid of the entry named \a name in directory \a dir and store
+ * it in \a fid.
+ *
+ * \retval -ENOTDIR \a dir cannot be used as a directory index
+ * \retval otherwise the result of dt_lookup()
+ */
+int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir,
+		  const char *name, struct lu_fid *fid)
+{
+	if (!dt_try_as_dir(env, dir))
+		return -ENOTDIR;
+
+	return dt_lookup(env, dir, (struct dt_rec *)fid,
+			 (const struct dt_key *)name, BYPASS_CAPA);
+}
+EXPORT_SYMBOL(dt_lookup_dir);
+
+/*
+ * Find the object identified by \a fid starting from \a top_dev and return
+ * the layer of it that belongs to \a dev.
+ *
+ * This differs from dt_locate by taking top_dev as a parameter rather than
+ * using the one from lu_site.
+ *
+ * \retval the dt_object of \a dev inside the located object stack
+ * \retval the error pointer from lu_object_find_at() when the lookup fails
+ * \retval ERR_PTR(-ENOENT) when no layer of the object belongs to \a dev;
+ *         the object obtained from the lookup is released first
+ */
+struct dt_object *dt_locate_at(const struct lu_env *env,
+			       struct dt_device *dev, const struct lu_fid *fid,
+			       struct lu_device *top_dev)
+{
+	struct lu_object *lo, *n;
+	ENTRY;
+
+	lo = lu_object_find_at(env, top_dev, fid, NULL);
+	if (IS_ERR(lo))
+		return (void *)lo;
+
+	LASSERT(lo != NULL);
+
+	list_for_each_entry(n, &lo->lo_header->loh_layers, lo_linkage) {
+		if (n->lo_dev == &dev->dd_lu_dev)
+			return container_of0(n, struct dt_object, do_lu);
+	}
+
+	/* Nothing is handed back to the caller on this path, so release the
+	 * object obtained from the lookup instead of leaking it. */
+	lu_object_put(env, lo);
+	return ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL(dt_locate_at);
+
+/**
+ * Path-walk step: look up \a entry in the directory currently held in
+ * \a data (a struct dt_find_hint), release that directory, and on success
+ * replace it with the object the entry refers to.
+ *
+ * \retval 0 on success, dfh_o then holds the located object
+ * \retval -ve errno from the lookup or from dt_locate()
+ */
+static int dt_find_entry(const struct lu_env *env, const char *entry, void *data)
+{
+	struct dt_find_hint *hint = data;
+	struct dt_device *dev = hint->dfh_dt;
+	struct lu_fid *fid = hint->dfh_fid;
+	struct dt_object *cur = hint->dfh_o;
+	int rc;
+
+	rc = dt_lookup_dir(env, cur, entry, fid);
+	/* the parent is released whether or not the lookup succeeded */
+	lu_object_put(env, &cur->do_lu);
+	if (rc == 0) {
+		cur = dt_locate(env, dev, fid);
+		if (IS_ERR(cur))
+			rc = PTR_ERR(cur);
+	}
+	hint->dfh_o = cur;
+	return rc;
+}
+
+/**
+ * Split \a path on '/' and feed each non-empty component to \a entry_func
+ * together with \a data.  \a path is modified in place by strsep().
+ *
+ * Empty components (from leading or repeated slashes) are skipped; an empty
+ * component with nothing left after it ends the walk.
+ *
+ * \retval 0 every component was accepted, or there were none
+ * \retval first non-zero value returned by \a entry_func
+ */
+int dt_path_parser(const struct lu_env *env,
+		   char *path, dt_entry_func_t entry_func,
+		   void *data)
+{
+	char *component;
+	int rc = 0;
+
+	while ((component = strsep(&path, "/")) != NULL) {
+		if (component[0] == 0) {
+			/* nothing left after this empty component */
+			if (path == NULL || path[0] == '\0')
+				break;
+			continue;
+		}
+
+		rc = entry_func(env, component, data);
+		if (rc != 0)
+			break;
+	}
+
+	return rc;
+}
+
+/*
+ * Resolve \a path, starting at the root of device \a dt, to an object.
+ *
+ * The path is copied (truncated to DT_MAX_PATH - 1 characters) into the
+ * per-thread buffer before being parsed.  \a fid is used as scratch space
+ * during the walk: it first receives the root fid and then the fid of each
+ * component looked up.
+ *
+ * \retval the object the path refers to
+ * \retval ERR_PTR() carrying the error from dt_root_get(), dt_locate() or
+ *         the path walk
+ */
+struct dt_object *
+dt_store_resolve(const struct lu_env *env, struct dt_device *dt,
+		 const char *path, struct lu_fid *fid)
+{
+	struct dt_thread_info *info = dt_info(env);
+	struct dt_find_hint *hint = &info->dti_dfh;
+	char *copy = info->dti_buf;
+	struct dt_object *root;
+	int rc;
+
+	hint->dfh_dt = dt;
+	hint->dfh_fid = fid;
+
+	strncpy(copy, path, DT_MAX_PATH);
+	copy[DT_MAX_PATH - 1] = '\0';
+
+	rc = dt->dd_ops->dt_root_get(env, dt, fid);
+	if (rc != 0)
+		return ERR_PTR(rc);
+
+	root = dt_locate(env, dt, fid);
+	if (IS_ERR(root))
+		return root;
+
+	hint->dfh_o = root;
+	rc = dt_path_parser(env, copy, dt_find_entry, hint);
+	if (rc != 0)
+		return ERR_PTR(rc);
+
+	return hint->dfh_o;
+}
+EXPORT_SYMBOL(dt_store_resolve);
+
+/*
+ * Look up \a name in directory \a p and locate the object it refers to on
+ * device \a dt.  The fid found is stored in \a fid.
+ *
+ * \retval the located object (or the error pointer from dt_locate())
+ * \retval ERR_PTR() carrying the lookup error
+ */
+static struct dt_object *dt_reg_open(const struct lu_env *env,
+				     struct dt_device *dt,
+				     struct dt_object *p,
+				     const char *name,
+				     struct lu_fid *fid)
+{
+	int rc = dt_lookup_dir(env, p, name, fid);
+
+	if (rc != 0)
+		return ERR_PTR(rc);
+
+	return dt_locate(env, dt, fid);
+}
+
+/**
+ * Open the dt object named \a filename inside directory \a dirname.
+ *
+ * The directory is resolved first, used for the lookup and then released.
+ *
+ * \param dt dt device
+ * \param fid on success, object fid is stored in *fid
+ *
+ * \retval the opened object
+ * \retval ERR_PTR() when the directory cannot be resolved or the file
+ *         cannot be opened
+ */
+struct dt_object *dt_store_open(const struct lu_env *env,
+				struct dt_device *dt,
+				const char *dirname,
+				const char *filename,
+				struct lu_fid *fid)
+{
+	struct dt_object *parent;
+	struct dt_object *child;
+
+	parent = dt_store_resolve(env, dt, dirname, fid);
+	if (IS_ERR(parent))
+		return parent;
+
+	child = dt_reg_open(env, dt, parent, filename, fid);
+	lu_object_put(env, &parent->do_lu);
+	return child;
+}
+EXPORT_SYMBOL(dt_store_open);
+
+/*
+ * Locate the object identified by \a fid on device \a dt and create it, in
+ * its own transaction, when it does not exist yet.
+ *
+ * \param dof format passed to the create calls
+ * \param at  attributes passed to the create calls
+ *
+ * \retval the located (and possibly newly created) object
+ * \retval ERR_PTR() on failure; when the failure happens after the object
+ *         was located, the object is released with lu_object_put() first
+ */
+struct dt_object *dt_find_or_create(const struct lu_env *env,
+ struct dt_device *dt,
+ const struct lu_fid *fid,
+ struct dt_object_format *dof,
+ struct lu_attr *at)
+{
+ struct dt_object *dto;
+ struct thandle *th;
+ int rc;
+
+ ENTRY;
+
+ dto = dt_locate(env, dt, fid);
+ if (IS_ERR(dto))
+ RETURN(dto);
+
+ LASSERT(dto != NULL);
+ /* fast path: nothing to create */
+ if (dt_object_exists(dto))
+ RETURN(dto);
+
+ th = dt_trans_create(env, dt);
+ if (IS_ERR(th))
+ GOTO(out, rc = PTR_ERR(th));
+
+ rc = dt_declare_create(env, dto, at, NULL, dof, th);
+ if (rc)
+ GOTO(trans_stop, rc);
+
+ rc = dt_trans_start_local(env, dt, th);
+ if (rc)
+ GOTO(trans_stop, rc);
+
+ dt_write_lock(env, dto, 0);
+ /* re-check under the write lock: the object may have appeared since
+ * the unlocked test above */
+ if (dt_object_exists(dto))
+ GOTO(unlock, rc = 0);
+
+ CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid));
+
+ rc = dt_create(env, dto, at, NULL, dof, th);
+ if (rc)
+ GOTO(unlock, rc);
+ LASSERT(dt_object_exists(dto));
+unlock:
+ dt_write_unlock(env, dto);
+trans_stop:
+ /* the return value of dt_trans_stop() is not checked */
+ dt_trans_stop(env, dt, th);
+out:
+ if (rc) {
+ lu_object_put(env, &dto->do_lu);
+ RETURN(ERR_PTR(rc));
+ }
+ RETURN(dto);
+}
+EXPORT_SYMBOL(dt_find_or_create);
+
+/*
+ * dt class init function: initialize the dt context key and register it.
+ *
+ * \retval result of lu_context_key_register()
+ */
+int dt_global_init(void)
+{
+	LU_CONTEXT_KEY_INIT(&dt_key);
+	return lu_context_key_register(&dt_key);
+}
+
+/* dt class fini function: unregister the dt context key. */
+void dt_global_fini(void)
+{
+ lu_context_key_degister(&dt_key);
+}
+
+/**
+ * Generic read helper: forwards to the object's dbo_read() method with
+ * BYPASS_CAPA and hands its result back unchanged, so a short read is
+ * reported as a byte count rather than converted into an error.
+ *
+ * \param env lustre environment
+ * \param dt object to be read
+ * \param buf lu_buf to be filled, with buffer pointer and length
+ * \param pos position to start reading, updated as data is read
+ *
+ * \retval real size of data read
+ * \retval -ve errno on failure
+ */
+int dt_read(const struct lu_env *env, struct dt_object *dt,
+	    struct lu_buf *buf, loff_t *pos)
+{
+	int rc;
+
+	LASSERTF(dt != NULL, "dt is NULL when we want to read record\n");
+
+	rc = dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA);
+	return rc;
+}
+EXPORT_SYMBOL(dt_read);
+
+/**
+ * Read a fixed-size structure from storage.  In contrast to dt_read(), a
+ * read that returns fewer bytes than requested is treated as a failure.
+ *
+ * \param env lustre environment
+ * \param dt object to be read
+ * \param buf lu_buf to be filled, with buffer pointer and length
+ * \param pos position to start reading, updated as data is read
+ *
+ * \retval 0 on successfully reading full buffer
+ * \retval -EFAULT on short read
+ * \retval -ve errno on failure
+ */
+int dt_record_read(const struct lu_env *env, struct dt_object *dt,
+		   struct lu_buf *buf, loff_t *pos)
+{
+	int nread;
+
+	LASSERTF(dt != NULL, "dt is NULL when we want to read record\n");
+
+	nread = dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA);
+
+	/* exactly the requested length: success */
+	if (nread == buf->lb_len)
+		return 0;
+
+	/* any other non-negative count is a short read */
+	return nread >= 0 ? -EFAULT : nread;
+}
+EXPORT_SYMBOL(dt_record_read);
+
+/**
+ * Write a fixed-size structure to storage inside transaction \a th.  A
+ * write that stores fewer bytes than requested is treated as a failure.
+ *
+ * \param env lustre environment
+ * \param dt object to be written; must provide a dbo_write() method
+ * \param buf lu_buf holding the data, with buffer pointer and length
+ * \param pos position to start writing at, passed to dbo_write()
+ * \param th transaction handle, must not be NULL
+ *
+ * \retval 0 on successfully writing the full buffer
+ * \retval -EFAULT on short write
+ * \retval -ve errno on failure
+ */
+int dt_record_write(const struct lu_env *env, struct dt_object *dt,
+		    const struct lu_buf *buf, loff_t *pos, struct thandle *th)
+{
+	int nwritten;
+
+	LASSERTF(dt != NULL, "dt is NULL when we want to write record\n");
+	LASSERT(th != NULL);
+	LASSERT(dt->do_body_ops);
+	LASSERT(dt->do_body_ops->dbo_write);
+
+	nwritten = dt->do_body_ops->dbo_write(env, dt, buf, pos, th,
+					      BYPASS_CAPA, 1);
+
+	/* exactly the requested length: success */
+	if (nwritten == buf->lb_len)
+		return 0;
+
+	/* any other non-negative count is a short write */
+	return nwritten >= 0 ? -EFAULT : nwritten;
+}
+EXPORT_SYMBOL(dt_record_write);
+
+/*
+ * Declare, within transaction \a th, the intent to store a version on
+ * object \a o.  The declaration is made with a buffer that carries only the
+ * size of a version (no data pointer).
+ *
+ * \retval result of dt_declare_xattr_set()
+ */
+int dt_declare_version_set(const struct lu_env *env, struct dt_object *o,
+			   struct thandle *th)
+{
+	struct lu_buf vbuf = {
+		.lb_buf = NULL,
+		.lb_len = sizeof(dt_obj_version_t),
+	};
+
+	LASSERT(o);
+	return dt_declare_xattr_set(env, o, &vbuf, XATTR_NAME_VERSION, 0, th);
+}
+EXPORT_SYMBOL(dt_declare_version_set);
+
+/*
+ * Store \a version on object \a o as the XATTR_NAME_VERSION extended
+ * attribute, within transaction \a th.  A failure is only logged at D_INODE
+ * level; it is not reported to the caller.
+ */
+void dt_version_set(const struct lu_env *env, struct dt_object *o,
+		    dt_obj_version_t version, struct thandle *th)
+{
+	struct lu_buf vbuf = {
+		.lb_buf = &version,
+		.lb_len = sizeof(version),
+	};
+	int rc;
+
+	LASSERT(o);
+
+	rc = dt_xattr_set(env, o, &vbuf, XATTR_NAME_VERSION, 0, th,
+			  BYPASS_CAPA);
+	if (rc < 0)
+		CDEBUG(D_INODE, "Can't set version, rc %d\n", rc);
+}
+EXPORT_SYMBOL(dt_version_set);
+
+/*
+ * Read the version of object \a o from its XATTR_NAME_VERSION extended
+ * attribute.
+ *
+ * \retval the stored version
+ * \retval 0 when the attribute cannot be read or does not have exactly the
+ *         size of a version (the failure is logged at D_INODE level)
+ */
+dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o)
+{
+	dt_obj_version_t version;
+	struct lu_buf vbuf = {
+		.lb_buf = &version,
+		.lb_len = sizeof(version),
+	};
+	int rc;
+
+	LASSERT(o);
+
+	rc = dt_xattr_get(env, o, &vbuf, XATTR_NAME_VERSION, BYPASS_CAPA);
+	if (rc == sizeof(version))
+		return version;
+
+	CDEBUG(D_INODE, "Can't get version, rc %d\n", rc);
+	return 0;
+}
+EXPORT_SYMBOL(dt_version_get);
+
+/* list of all supported index types */
+
+/* directories */
+/* no initializer: every field of this feature set is zero */
+const struct dt_index_features dt_directory_features;
+EXPORT_SYMBOL(dt_directory_features);
+
+/* scrub iterator */
+/* no initializer: every field of this feature set is zero */
+const struct dt_index_features dt_otable_features;
+EXPORT_SYMBOL(dt_otable_features);
+
+/* lfsck */
+/* fixed-size index: key is a struct lu_fid, record is a single byte */
+const struct dt_index_features dt_lfsck_features = {
+ .dif_flags = DT_IND_UPDATE,
+ .dif_keysize_min = sizeof(struct lu_fid),
+ .dif_keysize_max = sizeof(struct lu_fid),
+ .dif_recsize_min = sizeof(__u8),
+ .dif_recsize_max = sizeof(__u8),
+ .dif_ptrsize = 4
+};
+EXPORT_SYMBOL(dt_lfsck_features);
+
+/* accounting indexes */
+/* fixed-size index: 64-bit id key, struct lquota_acct_rec record */
+const struct dt_index_features dt_acct_features = {
+ .dif_flags = DT_IND_UPDATE,
+ .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */
+ .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */
+ .dif_recsize_min = sizeof(struct lquota_acct_rec), /* 16 bytes */
+ .dif_recsize_max = sizeof(struct lquota_acct_rec), /* 16 bytes */
+ .dif_ptrsize = 4
+};
+EXPORT_SYMBOL(dt_acct_features);
+
+/* global quota files */
+/* fixed-size index: 64-bit id key, struct lquota_glb_rec record */
+const struct dt_index_features dt_quota_glb_features = {
+ .dif_flags = DT_IND_UPDATE,
+ /* a different key would have to be used for per-directory quota */
+ .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */
+ .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */
+ .dif_recsize_min = sizeof(struct lquota_glb_rec), /* 32 bytes */
+ .dif_recsize_max = sizeof(struct lquota_glb_rec), /* 32 bytes */
+ .dif_ptrsize = 4
+};
+EXPORT_SYMBOL(dt_quota_glb_features);
+
+/* slave quota files */
+/* fixed-size index: 64-bit id key, struct lquota_slv_rec record */
+const struct dt_index_features dt_quota_slv_features = {
+ .dif_flags = DT_IND_UPDATE,
+ /* a different key would have to be used for per-directory quota */
+ .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */
+ .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */
+ .dif_recsize_min = sizeof(struct lquota_slv_rec), /* 8 bytes */
+ .dif_recsize_max = sizeof(struct lquota_slv_rec), /* 8 bytes */
+ .dif_ptrsize = 4
+};
+EXPORT_SYMBOL(dt_quota_slv_features);
+
+/*
+ * Pick the dt_index_features structure matching a FID sequence and file
+ * mode. This is used by OBD_IDX_READ RPC.
+ *
+ * \retval &dt_quota_glb_features  global quota index (must be a regular file)
+ * \retval &dt_quota_slv_features  quota slave index (must be a regular file)
+ * \retval &dt_directory_features  namespace object (must be a directory)
+ * \retval ERR_PTR(-ENOENT)        quota sequence, but not a regular file
+ * \retval ERR_PTR(-ENOTDIR)       namespace sequence, but not a directory
+ * \retval ERR_PTR(-EOPNOTSUPP)    any other sequence
+ */
+static inline const struct dt_index_features *dt_index_feat_select(__u64 seq,
+								   __u32 mode)
+{
+	if (seq == FID_SEQ_QUOTA_GLB) {
+		/* global quota index should be a regular file */
+		if (S_ISREG(mode))
+			return &dt_quota_glb_features;
+		return ERR_PTR(-ENOENT);
+	}
+
+	if (seq == FID_SEQ_QUOTA) {
+		/* slave index should be a regular file */
+		if (S_ISREG(mode))
+			return &dt_quota_slv_features;
+		return ERR_PTR(-ENOENT);
+	}
+
+	/* anything below the normal sequence range is not supported */
+	if (seq < FID_SEQ_NORMAL)
+		return ERR_PTR(-EOPNOTSUPP);
+
+	/* object is part of the namespace, we can only deal with a
+	 * directory */
+	if (S_ISDIR(mode))
+		return &dt_directory_features;
+	return ERR_PTR(-ENOTDIR);
+}
+
+/*
+ * Fill a lu_idxpage with key/record pairs read for transfer via OBD_IDX_READ
+ * RPC
+ *
+ * Each entry is laid out as [64-bit hash, unless II_FL_NOHASH is set][key]
+ * [record].  On return ii_hash_end holds the hash of the position where
+ * filling stopped, ii_hash_start is set when the very first entry of the
+ * transfer is stored, and ii_count is incremented when the container
+ * received at least one entry.
+ *
+ * \param env - is the environment passed by the caller
+ * \param lp - is a pointer to the lu_page to fill
+ * \param nob - is the maximum number of bytes that should be copied
+ * \param iops - is the index operation vector associated with the index object
+ * \param it - is a pointer to the current iterator
+ * \param attr - is the index attribute to pass to iops->rec()
+ * \param arg - is a pointer to the idx_info structure
+ *
+ * \retval 0 - the container is full (or the fail-injection point fired)
+ * and the iterator may hold more entries
+ * \retval +ve - iops->next() reported that there are no more entries
+ * \retval -EINVAL - \a nob is too small to hold even one entry
+ * \retval -ve - error returned by iops->rec() or iops->next()
+ */
+static int dt_index_page_build(const struct lu_env *env, union lu_page *lp,
+ int nob, const struct dt_it_ops *iops,
+ struct dt_it *it, __u32 attr, void *arg)
+{
+ struct idx_info *ii = (struct idx_info *)arg;
+ struct lu_idxpage *lip = &lp->lp_idx;
+ char *entry;
+ int rc, size;
+ ENTRY;
+
+ /* no support for variable key & record size for now */
+ LASSERT((ii->ii_flags & II_FL_VARKEY) == 0);
+ LASSERT((ii->ii_flags & II_FL_VARREC) == 0);
+
+ /* initialize the header of the new container */
+ memset(lip, 0, LIP_HDR_SIZE);
+ lip->lip_magic = LIP_MAGIC;
+ nob -= LIP_HDR_SIZE;
+
+ /* compute size needed to store a key/record pair */
+ size = ii->ii_recsize + ii->ii_keysize;
+ if ((ii->ii_flags & II_FL_NOHASH) == 0)
+ /* add hash if the client wants it */
+ size += sizeof(__u64);
+
+ entry = lip->lip_entries;
+ do {
+ /* write through a scratch pointer: entry only advances once
+ * the whole hash/key/record triple has been stored */
+ char *tmp_entry = entry;
+ struct dt_key *key;
+ __u64 hash;
+
+ /* fetch 64-bit hash value */
+ hash = iops->store(env, it);
+ ii->ii_hash_end = hash;
+
+ /* fail-injection point: stop early once the container holds
+ * at least one entry */
+ if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) {
+ if (lip->lip_nr != 0)
+ GOTO(out, rc = 0);
+ }
+
+ /* out of room: an error only if nothing fit at all */
+ if (nob < size) {
+ if (lip->lip_nr == 0)
+ GOTO(out, rc = -EINVAL);
+ GOTO(out, rc = 0);
+ }
+
+ if ((ii->ii_flags & II_FL_NOHASH) == 0) {
+ /* client wants to the 64-bit hash value associated with
+ * each record */
+ memcpy(tmp_entry, &hash, sizeof(hash));
+ tmp_entry += sizeof(hash);
+ }
+
+ /* then the key value */
+ LASSERT(iops->key_size(env, it) == ii->ii_keysize);
+ key = iops->key(env, it);
+ memcpy(tmp_entry, key, ii->ii_keysize);
+ tmp_entry += ii->ii_keysize;
+
+ /* and finally the record */
+ rc = iops->rec(env, it, (struct dt_rec *)tmp_entry, attr);
+ /* -ESTALE from rec() skips this entry: it is not counted and
+ * its slot is reused for the next one */
+ if (rc != -ESTALE) {
+ if (rc != 0)
+ GOTO(out, rc);
+
+ /* hash/key/record successfully copied! */
+ lip->lip_nr++;
+ if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0))
+ ii->ii_hash_start = hash;
+ entry = tmp_entry + ii->ii_recsize;
+ nob -= size;
+ }
+
+ /* move on to the next record */
+ do {
+ rc = iops->next(env, it);
+ } while (rc == -ESTALE);
+
+ } while (rc == 0);
+
+ GOTO(out, rc);
+out:
+ if (rc >= 0 && lip->lip_nr > 0)
+ /* one more container */
+ ii->ii_count++;
+ if (rc > 0)
+ /* no more entries */
+ ii->ii_hash_end = II_END_OFF;
+ return rc;
+}
+
+/*
+ * Walk index and fill lu_page containers with key/record pairs
+ *
+ * The iterator is positioned at rdpg->rp_hash and \a filler is called once
+ * per lu_page container until the byte budget rdpg->rp_count is used up,
+ * the index ends or an error occurs.  Each physical page of rdpg->rp_pages
+ * is mapped while its containers are filled and unmapped afterwards.
+ *
+ * \param env - is the environment passed by the caller
+ * \param obj - is the index object to parse
+ * \param rdpg - is the lu_rdpg descriptor associated with the transfer
+ * \param filler - is the callback function responsible for filling a lu_page
+ * with key/record pairs in the format wanted by the caller
+ * \param arg - is an opaq argument passed to the filler function
+ *
+ * \retval sum (in bytes) of all filled lu_pages
+ * \retval -ve errno on failure
+ */
+int dt_index_walk(const struct lu_env *env, struct dt_object *obj,
+		  const struct lu_rdpg *rdpg, dt_index_page_build_t filler,
+		  void *arg)
+{
+	struct dt_it *it;
+	const struct dt_it_ops *iops;
+	unsigned int pageidx, nob, nlupgs = 0;
+	int rc;
+	ENTRY;
+
+	LASSERT(rdpg->rp_pages != NULL);
+	LASSERT(obj->do_index_ops != NULL);
+
+	nob = rdpg->rp_count;
+	if (nob <= 0)
+		RETURN(-EFAULT);
+
+	/* Iterate through index and fill containers from @rdpg */
+	iops = &obj->do_index_ops->dio_it;
+	LASSERT(iops != NULL);
+	it = iops->init(env, obj, rdpg->rp_attrs, BYPASS_CAPA);
+	if (IS_ERR(it))
+		RETURN(PTR_ERR(it));
+
+	rc = iops->load(env, it, rdpg->rp_hash);
+	if (rc == 0) {
+		/*
+		 * Iterator didn't find record with exactly the key requested.
+		 *
+		 * It is currently either
+		 *
+		 * - positioned above record with key less than
+		 * requested---skip it.
+		 * - or not positioned at all (is in IAM_IT_SKEWED
+		 * state)---position it on the next item.
+		 */
+		rc = iops->next(env, it);
+	} else if (rc > 0) {
+		rc = 0;
+	}
+
+	/* Fill containers one after the other. There might be multiple
+	 * containers per physical page.
+	 *
+	 * At this point and across for-loop:
+	 * rc == 0 -> ok, proceed.
+	 * rc > 0 -> end of index.
+	 * rc < 0 -> error. */
+	for (pageidx = 0; rc == 0 && nob > 0; pageidx++) {
+		union lu_page *lp;
+		int i;
+
+		LASSERT(pageidx < rdpg->rp_npages);
+		lp = kmap(rdpg->rp_pages[pageidx]);
+
+		/* fill lu pages */
+		for (i = 0; i < LU_PAGE_COUNT; i++, lp++, nob -= LU_PAGE_SIZE) {
+			rc = filler(env, lp, min_t(int, nob, LU_PAGE_SIZE),
+				    iops, it, rdpg->rp_attrs, arg);
+			if (rc < 0)
+				break;
+			/* one more lu_page */
+			nlupgs++;
+			if (rc > 0)
+				/* end of index */
+				break;
+		}
+		/* Unmap the page that was mapped above.  It must be indexed
+		 * with pageidx: i counts containers inside the page and
+		 * does not identify an entry of rp_pages. */
+		kunmap(rdpg->rp_pages[pageidx]);
+	}
+
+	iops->put(env, it);
+	iops->fini(env, it);
+
+	if (rc >= 0)
+		rc = min_t(unsigned int, nlupgs * LU_PAGE_SIZE, rdpg->rp_count);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(dt_index_walk);
+
+/**
+ * Walk key/record pairs of an index and copy them into 4KB containers to be
+ * transferred over the network. This is the common handler for OBD_IDX_READ
+ * RPC processing.
+ *
+ * Only quota index files can be read this way.  On return \a ii carries the
+ * supported flags, the key and record sizes, the object version fetched
+ * before the walk and the hash range covered.
+ *
+ * \param env - is the environment passed by the caller
+ * \param dev - is the dt_device storing the index
+ * \param ii - is the idx_info structure packed by the client in the
+ * OBD_IDX_READ request
+ * \param rdpg - is the lu_rdpg descriptor
+ *
+ * \retval on success, return sum (in bytes) of all filled containers
+ * \retval -EFAULT if rp_count is zero or not a multiple of LU_PAGE_SIZE
+ * \retval -EOPNOTSUPP for namespace objects and for indexes with variable
+ * key or record size
+ * \retval -EPERM if the fid does not designate a quota file
+ * \retval -ENOENT if the object does not exist
+ * \retval appropriate error otherwise.
+ */
+int dt_index_read(const struct lu_env *env, struct dt_device *dev,
+		  struct idx_info *ii, const struct lu_rdpg *rdpg)
+{
+	const struct dt_index_features *feat;
+	struct dt_object *obj;
+	int rc;
+	ENTRY;
+
+	/* rp_count shouldn't be null and should be a multiple of the container
+	 * size.  Either condition alone makes the request invalid, so the two
+	 * tests are joined with ||. */
+	if (rdpg->rp_count <= 0 || (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0)
+		RETURN(-EFAULT);
+
+	if (fid_seq(&ii->ii_fid) >= FID_SEQ_NORMAL)
+		/* we don't support directory transfer via OBD_IDX_READ for the
+		 * time being */
+		RETURN(-EOPNOTSUPP);
+
+	if (!fid_is_quota(&ii->ii_fid))
+		/* block access to all local files except quota files */
+		RETURN(-EPERM);
+
+	/* lookup index object subject to the transfer */
+	obj = dt_locate(env, dev, &ii->ii_fid);
+	if (IS_ERR(obj))
+		RETURN(PTR_ERR(obj));
+	if (dt_object_exists(obj) == 0)
+		GOTO(out, rc = -ENOENT);
+
+	/* fetch index features associated with index object */
+	feat = dt_index_feat_select(fid_seq(&ii->ii_fid),
+				    lu_object_attr(&obj->do_lu));
+	if (IS_ERR(feat))
+		GOTO(out, rc = PTR_ERR(feat));
+
+	/* load index feature if not done already */
+	if (obj->do_index_ops == NULL) {
+		rc = obj->do_ops->do_index_try(env, obj, feat);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	/* fill ii_flags with supported index features */
+	ii->ii_flags &= II_FL_NOHASH;
+
+	ii->ii_keysize = feat->dif_keysize_max;
+	if ((feat->dif_flags & DT_IND_VARKEY) != 0) {
+		/* key size is variable */
+		ii->ii_flags |= II_FL_VARKEY;
+		/* we don't support variable key size for the time being */
+		GOTO(out, rc = -EOPNOTSUPP);
+	}
+
+	ii->ii_recsize = feat->dif_recsize_max;
+	if ((feat->dif_flags & DT_IND_VARREC) != 0) {
+		/* record size is variable */
+		ii->ii_flags |= II_FL_VARREC;
+		/* we don't support variable record size for the time being */
+		GOTO(out, rc = -EOPNOTSUPP);
+	}
+
+	if ((feat->dif_flags & DT_IND_NONUNQ) != 0)
+		/* key isn't necessarily unique */
+		ii->ii_flags |= II_FL_NONUNQ;
+
+	dt_read_lock(env, obj, 0);
+	/* fetch object version before walking the index */
+	ii->ii_version = dt_version_get(env, obj);
+
+	/* walk the index and fill lu_idxpages with key/record pairs */
+	rc = dt_index_walk(env, obj, rdpg, dt_index_page_build ,ii);
+	dt_read_unlock(env, obj);
+
+	if (rc == 0) {
+		/* index is empty */
+		LASSERT(ii->ii_count == 0);
+		ii->ii_hash_end = II_END_OFF;
+	}
+
+	GOTO(out, rc);
+out:
+	lu_object_put(env, &obj->do_lu);
+	return rc;
+}
+EXPORT_SYMBOL(dt_index_read);
+
+#ifdef LPROCFS
+
+/* procfs read handler: print the block size reported by dt_statfs().
+ * \a data is the dt_device; returns the snprintf() result on success or the
+ * dt_statfs() error code otherwise. */
+int lprocfs_dt_rd_blksize(char *page, char **start, off_t off,
+			  int count, int *eof, void *data)
+{
+	struct obd_statfs	 statfs;
+	struct dt_device	*dev = data;
+	int			 rc;
+
+	rc = dt_statfs(NULL, dev, &statfs);
+	if (rc != 0)
+		return rc;
+
+	*eof = 1;
+	return snprintf(page, count, "%u\n", (unsigned) statfs.os_bsize);
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_blksize);
+
+/* procfs read handler: print os_blocks scaled from blocks to kilobytes.
+ * \a data is the dt_device; returns the snprintf() result on success or the
+ * dt_statfs() error code otherwise. */
+int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off,
+			      int count, int *eof, void *data)
+{
+	struct obd_statfs	 statfs;
+	struct dt_device	*dev = data;
+	__u64			 kbytes;
+	__u32			 scale;
+	int			 rc;
+
+	rc = dt_statfs(NULL, dev, &statfs);
+	if (rc != 0)
+		return rc;
+
+	/* double the count once for every halving of (block size / 1KB)
+	 * that is still non-zero */
+	kbytes = statfs.os_blocks;
+	for (scale = statfs.os_bsize >> 10; (scale >>= 1) != 0; )
+		kbytes <<= 1;
+
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", kbytes);
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytestotal);
+
+/* procfs read handler: print os_bfree scaled from blocks to kilobytes.
+ * \a data is the dt_device; returns the snprintf() result on success or the
+ * dt_statfs() error code otherwise. */
+int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off,
+			     int count, int *eof, void *data)
+{
+	struct obd_statfs	 statfs;
+	struct dt_device	*dev = data;
+	__u64			 kbytes;
+	__u32			 scale;
+	int			 rc;
+
+	rc = dt_statfs(NULL, dev, &statfs);
+	if (rc != 0)
+		return rc;
+
+	/* double the count once for every halving of (block size / 1KB)
+	 * that is still non-zero */
+	kbytes = statfs.os_bfree;
+	for (scale = statfs.os_bsize >> 10; (scale >>= 1) != 0; )
+		kbytes <<= 1;
+
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", kbytes);
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytesfree);
+
+/* procfs read handler: print os_bavail scaled from blocks to kilobytes.
+ * \a data is the dt_device; returns the snprintf() result on success or the
+ * dt_statfs() error code otherwise. */
+int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off,
+			      int count, int *eof, void *data)
+{
+	struct obd_statfs	 statfs;
+	struct dt_device	*dev = data;
+	__u64			 kbytes;
+	__u32			 scale;
+	int			 rc;
+
+	rc = dt_statfs(NULL, dev, &statfs);
+	if (rc != 0)
+		return rc;
+
+	/* double the count once for every halving of (block size / 1KB)
+	 * that is still non-zero */
+	kbytes = statfs.os_bavail;
+	for (scale = statfs.os_bsize >> 10; (scale >>= 1) != 0; )
+		kbytes <<= 1;
+
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", kbytes);
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_kbytesavail);
+
+/* procfs read handler: print the os_files value reported by dt_statfs().
+ * \a data is the dt_device; returns the snprintf() result on success or the
+ * dt_statfs() error code otherwise. */
+int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off,
+			     int count, int *eof, void *data)
+{
+	struct obd_statfs	 statfs;
+	struct dt_device	*dev = data;
+	int			 rc;
+
+	rc = dt_statfs(NULL, dev, &statfs);
+	if (rc != 0)
+		return rc;
+
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", statfs.os_files);
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_filestotal);
+
+/* procfs read handler: print the os_ffree value reported by dt_statfs().
+ * \a data is the dt_device; returns the snprintf() result on success or the
+ * dt_statfs() error code otherwise. */
+int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off,
+			    int count, int *eof, void *data)
+{
+	struct obd_statfs	 statfs;
+	struct dt_device	*dev = data;
+	int			 rc;
+
+	rc = dt_statfs(NULL, dev, &statfs);
+	if (rc != 0)
+		return rc;
+
+	*eof = 1;
+	return snprintf(page, count, LPU64"\n", statfs.os_ffree);
+}
+EXPORT_SYMBOL(lprocfs_dt_rd_filesfree);
+
+#endif /* LPROCFS */
diff --git a/drivers/staging/lustre/lustre/obdclass/genops.c b/drivers/staging/lustre/lustre/obdclass/genops.c
new file mode 100644
index 000000000000..d96876e0bc68
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/genops.c
@@ -0,0 +1,1853 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/genops.c
+ *
+ * These are the only exported functions; they provide some generic
+ * infrastructure for managing object devices.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_ost.h>
+#include <obd_class.h>
+#include <lprocfs_status.h>
+
+/* list of registered obd types (defined elsewhere); this file walks and
+ * modifies it under obd_types_lock */
+extern struct list_head obd_types;
+spinlock_t obd_types_lock;
+
+/* slab caches created by obd_init_caches() and torn down by
+ * obd_cleanup_caches() */
+struct kmem_cache *obd_device_cachep;
+struct kmem_cache *obdo_cachep;
+EXPORT_SYMBOL(obdo_cachep);
+struct kmem_cache *import_cachep;
+
+/* NOTE(review): presumably the queues of exports/imports whose last reference
+ * was dropped and which await destruction - confirm against the
+ * obd_zombie_*() helpers defined later in the file */
+struct list_head obd_zombie_imports;
+struct list_head obd_zombie_exports;
+spinlock_t obd_zombie_impexp_lock;
+static void obd_zombie_impexp_notify(void);
+static void obd_zombie_export_add(struct obd_export *exp);
+static void obd_zombie_import_add(struct obd_import *imp);
+static void print_export_data(struct obd_export *exp,
+ const char *status, int locks);
+
+/* hook called by class_export_destroy() and class_import_destroy() to release
+ * a ptlrpc connection; it is not assigned in this part of the file */
+int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c);
+EXPORT_SYMBOL(ptlrpc_put_connection_superhack);
+
+/*
+ * support functions: we could use inter-module communication, but this
+ * is more portable to other OS's
+ */
+/* Allocate an obd_device from obd_device_cachep and stamp it with
+ * OBD_DEVICE_MAGIC. Returns NULL if the allocation fails. */
+static struct obd_device *obd_device_alloc(void)
+{
+	struct obd_device *dev;
+
+	OBD_SLAB_ALLOC_PTR_GFP(dev, obd_device_cachep, __GFP_IO);
+	if (dev == NULL)
+		return NULL;
+
+	dev->obd_magic = OBD_DEVICE_MAGIC;
+	return dev;
+}
+
+/* Return an obd_device to obd_device_cachep. The device must carry the
+ * expected magic and must no longer own a namespace; a leftover namespace is
+ * reported and triggers LBUG(). */
+static void obd_device_free(struct obd_device *obd)
+{
+	LASSERT(obd != NULL);
+	LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+		 "obd %p obd_magic %08x != %08x\n",
+		 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+
+	if (obd->obd_namespace) {
+		CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n",
+		       obd, obd->obd_namespace, obd->obd_force);
+		LBUG();
+	}
+
+	lu_ref_fini(&obd->obd_reference);
+	OBD_SLAB_FREE_PTR(obd, obd_device_cachep);
+}
+
+/* Look up a registered obd type by exact name. The obd_types list is walked
+ * under obd_types_lock; no reference is taken on the returned type.
+ * Returns NULL when no type matches. */
+struct obd_type *class_search_type(const char *name)
+{
+	struct obd_type *cur;
+	struct obd_type *found = NULL;
+
+	spin_lock(&obd_types_lock);
+	list_for_each_entry(cur, &obd_types, typ_chain) {
+		if (strcmp(cur->typ_name, name) != 0)
+			continue;
+		found = cur;
+		break;
+	}
+	spin_unlock(&obd_types_lock);
+
+	return found;
+}
+EXPORT_SYMBOL(class_search_type);
+
+/* Find the obd type called \a name and take a reference on it.
+ *
+ * If the type is not registered yet, try to load the module that provides it
+ * (a few type names map to a differently named module) and search again.
+ * On success the type's refcount is bumped and its owning module is pinned;
+ * class_put_type() undoes both.
+ *
+ * Returns the type, or NULL if it is unknown, its module cannot be loaded,
+ * or the owning module can no longer be pinned. */
+struct obd_type *class_get_type(const char *name)
+{
+	struct obd_type *type = class_search_type(name);
+
+	if (!type) {
+		const char *modname = name;
+
+		if (strcmp(modname, "obdfilter") == 0)
+			modname = "ofd";
+
+		if (strcmp(modname, LUSTRE_LWP_NAME) == 0)
+			modname = LUSTRE_OSP_NAME;
+
+		if (!strncmp(modname, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME)))
+			modname = LUSTRE_MDT_NAME;
+
+		if (!request_module("%s", modname)) {
+			CDEBUG(D_INFO, "Loaded module '%s'\n", modname);
+			type = class_search_type(name);
+		} else {
+			LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n",
+					   modname);
+		}
+	}
+	if (type) {
+		int pinned;
+
+		spin_lock(&type->obd_type_lock);
+		/* only count the reference if the module was really pinned,
+		 * so that class_put_type()'s module_put() stays balanced */
+		pinned = try_module_get(type->typ_dt_ops->o_owner);
+		if (pinned)
+			type->typ_refcnt++;
+		spin_unlock(&type->obd_type_lock);
+
+		if (!pinned)
+			type = NULL;
+	}
+	return type;
+}
+EXPORT_SYMBOL(class_get_type);
+
+/* Drop a type reference: decrement typ_refcnt and release the module
+ * reference on the type's o_owner, both under obd_type_lock. */
+void class_put_type(struct obd_type *type)
+{
+	spinlock_t *guard;
+
+	LASSERT(type);
+
+	guard = &type->obd_type_lock;
+	spin_lock(guard);
+	type->typ_refcnt--;
+	module_put(type->typ_dt_ops->o_owner);
+	spin_unlock(guard);
+}
+EXPORT_SYMBOL(class_put_type);
+
+#define CLASS_MAX_NAME 1024
+
+/**
+ * Register a new obd type.
+ *
+ * Private copies of \a dt_ops, \a md_ops (optional) and \a name are stored in
+ * a freshly allocated obd_type, a proc directory is registered for it when
+ * LPROCFS is enabled, \a ldt (optional) is initialized, and finally the type
+ * is linked into obd_types.
+ *
+ * \param dt_ops - obd operations, copied
+ * \param md_ops - metadata operations, copied; may be NULL
+ * \param vars   - lprocfs variables handed to lprocfs_register()
+ * \param name   - type name, shorter than CLASS_MAX_NAME
+ * \param ldt    - lu device type to initialize; may be NULL
+ *
+ * \retval 0 on success
+ * \retval -EEXIST if a type with that name is already registered
+ * \retval -ENOMEM on allocation failure
+ * \retval error from lprocfs_register() or lu_device_type_init() otherwise
+ */
+int class_register_type(struct obd_ops *dt_ops, struct md_ops *md_ops,
+			struct lprocfs_vars *vars, const char *name,
+			struct lu_device_type *ldt)
+{
+	struct obd_type *type;
+	int rc = 0;
+	ENTRY;
+
+	/* sanity check */
+	LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME);
+
+	if (class_search_type(name)) {
+		CDEBUG(D_IOCTL, "Type %s already registered\n", name);
+		RETURN(-EEXIST);
+	}
+
+	rc = -ENOMEM;
+	OBD_ALLOC(type, sizeof(*type));
+	if (type == NULL)
+		RETURN(rc);
+
+	OBD_ALLOC_PTR(type->typ_dt_ops);
+	OBD_ALLOC_PTR(type->typ_md_ops);
+	OBD_ALLOC(type->typ_name, strlen(name) + 1);
+
+	if (type->typ_dt_ops == NULL ||
+	    type->typ_md_ops == NULL ||
+	    type->typ_name == NULL)
+		GOTO (failed, rc);
+
+	*(type->typ_dt_ops) = *dt_ops;
+	/* md_ops is optional */
+	if (md_ops)
+		*(type->typ_md_ops) = *md_ops;
+	strcpy(type->typ_name, name);
+	spin_lock_init(&type->obd_type_lock);
+
+#ifdef LPROCFS
+	type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root,
+					      vars, type);
+	if (IS_ERR(type->typ_procroot)) {
+		rc = PTR_ERR(type->typ_procroot);
+		type->typ_procroot = NULL;
+		GOTO (failed, rc);
+	}
+#endif
+	if (ldt != NULL) {
+		type->typ_lu = ldt;
+		rc = lu_device_type_init(ldt);
+		if (rc != 0)
+			GOTO (failed, rc);
+	}
+
+	spin_lock(&obd_types_lock);
+	list_add(&type->typ_chain, &obd_types);
+	spin_unlock(&obd_types_lock);
+
+	RETURN (0);
+
+ failed:
+#ifdef LPROCFS
+	/* the proc directory may already exist if lu_device_type_init()
+	 * failed; remove it before the type it points at is freed */
+	if (type->typ_procroot != NULL)
+		lprocfs_remove(&type->typ_procroot);
+#endif
+	if (type->typ_name != NULL)
+		OBD_FREE(type->typ_name, strlen(name) + 1);
+	if (type->typ_md_ops != NULL)
+		OBD_FREE_PTR(type->typ_md_ops);
+	if (type->typ_dt_ops != NULL)
+		OBD_FREE_PTR(type->typ_dt_ops);
+	OBD_FREE(type, sizeof(*type));
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_register_type);
+
+/* Unregister the obd type called \a name and free everything
+ * class_register_type() set up for it.
+ *
+ * Returns 0 on success, -EINVAL if the type is unknown, or -EBUSY if the
+ * type still has references (in which case it stays on obd_types). */
+int class_unregister_type(const char *name)
+{
+ struct obd_type *type = class_search_type(name);
+ ENTRY;
+
+ if (!type) {
+ CERROR("unknown obd type\n");
+ RETURN(-EINVAL);
+ }
+
+ if (type->typ_refcnt) {
+ CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt);
+ /* This is a bad situation, let's make the best of it */
+ /* Remove ops, but leave the name for debugging */
+ /* NOTE(review): unless OBD_FREE_PTR() clears its argument, the two
+ * ops pointers keep pointing at freed memory while the type is still
+ * reachable through obd_types - confirm this is intended */
+ OBD_FREE_PTR(type->typ_dt_ops);
+ OBD_FREE_PTR(type->typ_md_ops);
+ RETURN(-EBUSY);
+ }
+
+ if (type->typ_procroot) {
+ lprocfs_remove(&type->typ_procroot);
+ }
+
+ if (type->typ_lu)
+ lu_device_type_fini(type->typ_lu);
+
+ /* unlink under the list lock, then free outside of it */
+ spin_lock(&obd_types_lock);
+ list_del(&type->typ_chain);
+ spin_unlock(&obd_types_lock);
+ OBD_FREE(type->typ_name, strlen(name) + 1);
+ if (type->typ_dt_ops != NULL)
+ OBD_FREE_PTR(type->typ_dt_ops);
+ if (type->typ_md_ops != NULL)
+ OBD_FREE_PTR(type->typ_md_ops);
+ OBD_FREE(type, sizeof(*type));
+ RETURN(0);
+} /* class_unregister_type */
+EXPORT_SYMBOL(class_unregister_type);
+
+/**
+ * Create a new obd device.
+ *
+ * Find an empty slot in ::obd_devs[], create a new obd device in it.
+ * The whole table is scanned so that a device with the same name in a later
+ * slot is still detected after a free slot has been claimed.
+ *
+ * \param[in] type_name obd device type string.
+ * \param[in] name obd device name.
+ *
+ * \retval the new obd device on success.
+ * \retval ERR_PTR(-EINVAL) if \a name is too long.
+ * \retval ERR_PTR(-ENODEV) if \a type_name is not a known type.
+ * \retval ERR_PTR(-ENOMEM) if the device cannot be allocated.
+ * \retval ERR_PTR(-EEXIST) if a device called \a name already exists.
+ * \retval ERR_PTR(-EOVERFLOW) if every slot of ::obd_devs[] is in use.
+ */
+struct obd_device *class_newdev(const char *type_name, const char *name)
+{
+ struct obd_device *result = NULL;
+ struct obd_device *newdev;
+ struct obd_type *type = NULL;
+ int i;
+ int new_obd_minor = 0;
+ ENTRY;
+
+ if (strlen(name) >= MAX_OBD_NAME) {
+ CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME);
+ RETURN(ERR_PTR(-EINVAL));
+ }
+
+ /* takes a type reference; dropped at out_type on failure */
+ type = class_get_type(type_name);
+ if (type == NULL){
+ CERROR("OBD: unknown type: %s\n", type_name);
+ RETURN(ERR_PTR(-ENODEV));
+ }
+
+ newdev = obd_device_alloc();
+ if (newdev == NULL)
+ GOTO(out_type, result = ERR_PTR(-ENOMEM));
+
+ LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC);
+
+ write_lock(&obd_dev_lock);
+ for (i = 0; i < class_devno_max(); i++) {
+ struct obd_device *obd = class_num2obd(i);
+
+ if (obd && (strcmp(name, obd->obd_name) == 0)) {
+ CERROR("Device %s already exists at %d, won't add\n",
+ name, i);
+ /* a slot was claimed earlier in the scan: give it back */
+ if (result) {
+ LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC,
+ "%p obd_magic %08x != %08x\n", result,
+ result->obd_magic, OBD_DEVICE_MAGIC);
+ LASSERTF(result->obd_minor == new_obd_minor,
+ "%p obd_minor %d != %d\n", result,
+ result->obd_minor, new_obd_minor);
+
+ obd_devs[result->obd_minor] = NULL;
+ result->obd_name[0]='\0';
+ }
+ result = ERR_PTR(-EEXIST);
+ break;
+ }
+ /* claim the first free slot, but keep scanning for duplicates */
+ if (!result && !obd) {
+ result = newdev;
+ result->obd_minor = i;
+ new_obd_minor = i;
+ result->obd_type = type;
+ strncpy(result->obd_name, name,
+ sizeof(result->obd_name) - 1);
+ obd_devs[i] = result;
+ }
+ }
+ write_unlock(&obd_dev_lock);
+
+ if (result == NULL && i >= class_devno_max()) {
+ CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n",
+ class_devno_max());
+ GOTO(out, result = ERR_PTR(-EOVERFLOW));
+ }
+
+ if (IS_ERR(result))
+ GOTO(out, result);
+
+ CDEBUG(D_IOCTL, "Adding new device %s (%p)\n",
+ result->obd_name, result);
+
+ RETURN(result);
+out:
+ obd_device_free(newdev);
+out_type:
+ class_put_type(type);
+ return result;
+}
+
+/* Remove \a obd from ::obd_devs[], free it and drop the type reference it
+ * was holding. The device must still occupy its own slot. */
+void class_release_dev(struct obd_device *obd)
+{
+	struct obd_type *type = obd->obd_type;
+
+	LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+		 "%p obd_magic %08x != %08x\n",
+		 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+	LASSERTF(obd == obd_devs[obd->obd_minor],
+		 "obd %p != obd_devs[%d] %p\n",
+		 obd, obd->obd_minor, obd_devs[obd->obd_minor]);
+	LASSERT(type != NULL);
+
+	CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n",
+	       obd->obd_name, obd->obd_minor, type->typ_name);
+
+	/* clear the slot under the table lock, free outside of it */
+	write_lock(&obd_dev_lock);
+	obd_devs[obd->obd_minor] = NULL;
+	write_unlock(&obd_dev_lock);
+
+	obd_device_free(obd);
+	class_put_type(type);
+}
+
+/* Map a device name to its index in ::obd_devs[].
+ * Returns -1 if \a name is NULL, if no device has that name, or if the
+ * matching device has not finished attaching. */
+int class_name2dev(const char *name)
+{
+	int idx;
+	int devno = -1;
+
+	if (name == NULL)
+		return -1;
+
+	read_lock(&obd_dev_lock);
+	for (idx = 0; idx < class_devno_max(); idx++) {
+		struct obd_device *obd = class_num2obd(idx);
+
+		if (obd == NULL || strcmp(name, obd->obd_name) != 0)
+			continue;
+
+		/* Make sure we finished attaching before we give
+		   out any references */
+		LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+		if (obd->obd_attached)
+			devno = idx;
+		/* the first name match ends the search either way */
+		break;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return devno;
+}
+EXPORT_SYMBOL(class_name2dev);
+
+/* Map a device name to its obd_device, or NULL if class_name2dev() finds no
+ * attached device with that name. */
+struct obd_device *class_name2obd(const char *name)
+{
+	int dev = class_name2dev(name);
+
+	/* valid indices are 0 .. class_devno_max() - 1 */
+	if (dev < 0 || dev >= class_devno_max())
+		return NULL;
+	return class_num2obd(dev);
+}
+EXPORT_SYMBOL(class_name2obd);
+
+/* Map a device uuid to its index in ::obd_devs[], or -1 if no device
+ * carries that uuid. */
+int class_uuid2dev(struct obd_uuid *uuid)
+{
+	int idx;
+	int devno = -1;
+
+	read_lock(&obd_dev_lock);
+	for (idx = 0; idx < class_devno_max(); idx++) {
+		struct obd_device *obd = class_num2obd(idx);
+
+		if (obd == NULL || !obd_uuid_equals(uuid, &obd->obd_uuid))
+			continue;
+
+		LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+		devno = idx;
+		break;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return devno;
+}
+EXPORT_SYMBOL(class_uuid2dev);
+
+/* Map a device uuid to its obd_device, or NULL if class_uuid2dev() finds
+ * no match. */
+struct obd_device *class_uuid2obd(struct obd_uuid *uuid)
+{
+	int devno = class_uuid2dev(uuid);
+
+	return devno < 0 ? NULL : class_num2obd(devno);
+}
+EXPORT_SYMBOL(class_uuid2obd);
+
+/**
+ * Get obd device from ::obd_devs[]
+ *
+ * \param num [in] array index
+ *
+ * \retval NULL if \a num is outside of ::obd_devs[] or if ::obd_devs[\a num]
+ *         does not contain an obd device
+ * \retval the obd device stored there otherwise.
+ */
+struct obd_device *class_num2obd(int num)
+{
+	struct obd_device *obd;
+
+	/* reject negative indices too: they would read before the table */
+	if (num < 0 || num >= class_devno_max())
+		return NULL;
+
+	obd = obd_devs[num];
+	if (obd == NULL)
+		return NULL;
+
+	LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+		 "%p obd_magic %08x != %08x\n",
+		 obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+	LASSERTF(obd->obd_minor == num,
+		 "%p obd_minor %0d != %0d\n",
+		 obd, obd->obd_minor, num);
+
+	return obd;
+}
+EXPORT_SYMBOL(class_num2obd);
+
+/**
+ * Get obd devices count. Devices in any
+ * state are counted.
+ * \retval obd device count
+ */
+int get_devices_count(void)
+{
+	int index, max_index = class_devno_max(), dev_count = 0;
+
+	read_lock(&obd_dev_lock);
+	/* valid slots are 0 .. max_index - 1 */
+	for (index = 0; index < max_index; index++) {
+		struct obd_device *obd = class_num2obd(index);
+
+		if (obd != NULL)
+			dev_count++;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return dev_count;
+}
+EXPORT_SYMBOL(get_devices_count);
+
+/* Print one console line per existing device: index, two-letter state
+ * (ST, UP, AT or --), type name, device name, uuid and refcount. */
+void class_obd_list(void)
+{
+	int devno;
+
+	read_lock(&obd_dev_lock);
+	for (devno = 0; devno < class_devno_max(); devno++) {
+		struct obd_device *obd = class_num2obd(devno);
+		char *state = "--";
+
+		if (obd == NULL)
+			continue;
+
+		/* stopping takes precedence over set up, set up over
+		 * attached */
+		if (obd->obd_stopping)
+			state = "ST";
+		else if (obd->obd_set_up)
+			state = "UP";
+		else if (obd->obd_attached)
+			state = "AT";
+
+		LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n",
+			 devno, state, obd->obd_type->typ_name,
+			 obd->obd_name, obd->obd_uuid.uuid,
+			 atomic_read(&obd->obd_refcount));
+	}
+	read_unlock(&obd_dev_lock);
+}
+
+/* Search for a client OBD connected to tgt_uuid whose type name starts with
+   typ_name. If grp_uuid is specified, then only the client with that uuid
+   is returned, otherwise any client connected to the tgt is returned.
+   Returns NULL when nothing matches. */
+struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid,
+					  const char * typ_name,
+					  struct obd_uuid *grp_uuid)
+{
+	int devno;
+
+	read_lock(&obd_dev_lock);
+	for (devno = 0; devno < class_devno_max(); devno++) {
+		struct obd_device *obd = class_num2obd(devno);
+
+		if (obd == NULL)
+			continue;
+		if (strncmp(obd->obd_type->typ_name, typ_name,
+			    strlen(typ_name)) != 0)
+			continue;
+		if (!obd_uuid_equals(tgt_uuid, &obd->u.cli.cl_target_uuid))
+			continue;
+		if (grp_uuid && !obd_uuid_equals(grp_uuid, &obd->obd_uuid))
+			continue;
+
+		read_unlock(&obd_dev_lock);
+		return obd;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return NULL;
+}
+EXPORT_SYMBOL(class_find_client_obd);
+
+/* Iterate the obd_device list looking for devices whose uuid is grp_uuid.
+   The search starts at *next, and if a device is found, the index to resume
+   from is saved in *next. If next is NULL, then the first matching device
+   will always be returned. An out-of-range *next yields NULL. */
+struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, int *next)
+{
+	int idx = 0;
+
+	if (next != NULL) {
+		if (*next < 0 || *next >= class_devno_max())
+			return NULL;
+		idx = *next;
+	}
+
+	read_lock(&obd_dev_lock);
+	while (idx < class_devno_max()) {
+		struct obd_device *obd = class_num2obd(idx);
+
+		if (obd != NULL && obd_uuid_equals(grp_uuid, &obd->obd_uuid)) {
+			if (next != NULL)
+				*next = idx + 1;
+			read_unlock(&obd_dev_lock);
+			return obd;
+		}
+		idx++;
+	}
+	read_unlock(&obd_dev_lock);
+
+	return NULL;
+}
+EXPORT_SYMBOL(class_devices_in_group);
+
+/**
+ * to notify sptlrpc log for \a fsname has changed, let every relevant OBD
+ * adjust sptlrpc settings accordingly.
+ *
+ * Every set-up, non-stopping mdc/osc/mdt/ost device whose name starts with
+ * the first \a namelen bytes of \a fsname receives a KEY_SPTLRPC_CONF
+ * obd_set_info_async() call.
+ *
+ * \retval 0 if every notification succeeded (or none was sent)
+ * \retval the first non-zero code returned by obd_set_info_async() otherwise
+ */
+int class_notify_sptlrpc_conf(const char *fsname, int namelen)
+{
+ struct obd_device *obd;
+ const char *type;
+ int i, rc = 0, rc2;
+
+ LASSERT(namelen > 0);
+
+ read_lock(&obd_dev_lock);
+ for (i = 0; i < class_devno_max(); i++) {
+ obd = class_num2obd(i);
+
+ if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping)
+ continue;
+
+ /* only notify mdc, osc, mdt, ost */
+ type = obd->obd_type->typ_name;
+ if (strcmp(type, LUSTRE_MDC_NAME) != 0 &&
+ strcmp(type, LUSTRE_OSC_NAME) != 0 &&
+ strcmp(type, LUSTRE_MDT_NAME) != 0 &&
+ strcmp(type, LUSTRE_OST_NAME) != 0)
+ continue;
+
+ if (strncmp(obd->obd_name, fsname, namelen))
+ continue;
+
+ /* hold a device reference across the call, which is made with
+ * obd_dev_lock dropped; the lock is retaken before the scan
+ * continues */
+ class_incref(obd, __FUNCTION__, obd);
+ read_unlock(&obd_dev_lock);
+ rc2 = obd_set_info_async(NULL, obd->obd_self_export,
+ sizeof(KEY_SPTLRPC_CONF),
+ KEY_SPTLRPC_CONF, 0, NULL, NULL);
+ /* remember the first failure only */
+ rc = rc ? rc : rc2;
+ class_decref(obd, __FUNCTION__, obd);
+ read_lock(&obd_dev_lock);
+ }
+ read_unlock(&obd_dev_lock);
+ return rc;
+}
+EXPORT_SYMBOL(class_notify_sptlrpc_conf);
+
+/* Destroy every slab cache set up by obd_init_caches() that currently
+ * exists and reset its pointer, so calling this on a partially initialized
+ * set of caches is fine. */
+void obd_cleanup_caches(void)
+{
+	ENTRY;
+
+	if (obd_device_cachep != NULL) {
+		kmem_cache_destroy(obd_device_cachep);
+		obd_device_cachep = NULL;
+	}
+
+	if (obdo_cachep != NULL) {
+		kmem_cache_destroy(obdo_cachep);
+		obdo_cachep = NULL;
+	}
+
+	if (import_cachep != NULL) {
+		kmem_cache_destroy(import_cachep);
+		import_cachep = NULL;
+	}
+
+	if (capa_cachep != NULL) {
+		kmem_cache_destroy(capa_cachep);
+		capa_cachep = NULL;
+	}
+
+	EXIT;
+}
+
+/* Create the obd_device, obdo, import and capa slab caches.
+ * Returns 0 on success; on any failure the caches created so far are
+ * destroyed again and -ENOMEM is returned. */
+int obd_init_caches(void)
+{
+	ENTRY;
+
+	LASSERT(obd_device_cachep == NULL);
+	obd_device_cachep = kmem_cache_create("ll_obd_dev_cache",
+					      sizeof(struct obd_device),
+					      0, 0, NULL);
+	if (obd_device_cachep == NULL)
+		GOTO(out, -ENOMEM);
+
+	LASSERT(obdo_cachep == NULL);
+	obdo_cachep = kmem_cache_create("ll_obdo_cache",
+					sizeof(struct obdo),
+					0, 0, NULL);
+	if (obdo_cachep == NULL)
+		GOTO(out, -ENOMEM);
+
+	LASSERT(import_cachep == NULL);
+	import_cachep = kmem_cache_create("ll_import_cache",
+					  sizeof(struct obd_import),
+					  0, 0, NULL);
+	if (import_cachep == NULL)
+		GOTO(out, -ENOMEM);
+
+	LASSERT(capa_cachep == NULL);
+	capa_cachep = kmem_cache_create("capa_cache",
+					sizeof(struct obd_capa),
+					0, 0, NULL);
+	if (capa_cachep == NULL)
+		GOTO(out, -ENOMEM);
+
+	RETURN(0);
+
+ out:
+	/* undo whatever was created before the failure */
+	obd_cleanup_caches();
+	RETURN(-ENOMEM);
+}
+
+/* Map a connection handle to the export registered under its cookie.
+ * Returns NULL for a NULL handle, for the "new connection" cookie (-1), or
+ * when class_handle2object() finds nothing. */
+struct obd_export *class_conn2export(struct lustre_handle *conn)
+{
+	struct obd_export *exp;
+	ENTRY;
+
+	if (conn == NULL) {
+		CDEBUG(D_CACHE, "looking for null handle\n");
+		RETURN(NULL);
+	}
+
+	/* this means assign a new connection */
+	if (conn->cookie == -1) {
+		CDEBUG(D_CACHE, "want a new connection\n");
+		RETURN(NULL);
+	}
+
+	CDEBUG(D_INFO, "looking for export cookie "LPX64"\n", conn->cookie);
+	exp = class_handle2object(conn->cookie);
+	RETURN(exp);
+}
+EXPORT_SYMBOL(class_conn2export);
+
+/* Return the device an export belongs to; a NULL export yields NULL. */
+struct obd_device *class_exp2obd(struct obd_export *exp)
+{
+	return exp != NULL ? exp->exp_obd : NULL;
+}
+EXPORT_SYMBOL(class_exp2obd);
+
+/* Map a connection handle to the device of its export, or NULL if the
+ * handle does not resolve. The export is released again with
+ * class_export_put() before returning. */
+struct obd_device *class_conn2obd(struct lustre_handle *conn)
+{
+	struct obd_export *exp = class_conn2export(conn);
+	struct obd_device *obd;
+
+	if (exp == NULL)
+		return NULL;
+
+	obd = exp->exp_obd;
+	class_export_put(exp);
+	return obd;
+}
+EXPORT_SYMBOL(class_conn2obd);
+
+/* Return the client import of the device behind \a exp, or NULL if the
+ * export has no device. \a exp itself must not be NULL. */
+struct obd_import *class_exp2cliimp(struct obd_export *exp)
+{
+	struct obd_device *dev = exp->exp_obd;
+
+	return dev != NULL ? dev->u.cli.cl_import : NULL;
+}
+EXPORT_SYMBOL(class_exp2cliimp);
+
+/* Return the client import of the device behind a connection handle, or
+ * NULL if the handle does not resolve to a device. */
+struct obd_import *class_conn2cliimp(struct lustre_handle *conn)
+{
+	struct obd_device *dev = class_conn2obd(conn);
+
+	return dev != NULL ? dev->u.cli.cl_import : NULL;
+}
+EXPORT_SYMBOL(class_conn2cliimp);
+
+/* Export management functions */
+
+/* Final teardown of an export whose refcount is zero: release its
+ * connection (if any), check that no requests or replies are still queued on
+ * it, run obd_destroy_export(), drop the device reference taken under the
+ * "export" scope and free the export via OBD_FREE_RCU(). */
+static void class_export_destroy(struct obd_export *exp)
+{
+ struct obd_device *obd = exp->exp_obd;
+ ENTRY;
+
+ LASSERT_ATOMIC_ZERO(&exp->exp_refcount);
+ LASSERT(obd != NULL);
+
+ CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", exp,
+ exp->exp_client_uuid.uuid, obd->obd_name);
+
+ /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. */
+ if (exp->exp_connection)
+ ptlrpc_put_connection_superhack(exp->exp_connection);
+
+ LASSERT(list_empty(&exp->exp_outstanding_replies));
+ LASSERT(list_empty(&exp->exp_uncommitted_replies));
+ LASSERT(list_empty(&exp->exp_req_replay_queue));
+ LASSERT(list_empty(&exp->exp_hp_rpcs));
+ obd_destroy_export(exp);
+ /* pairs with class_incref(obd, "export", export) in class_new_export() */
+ class_decref(obd, "export", exp);
+
+ OBD_FREE_RCU(exp, sizeof(*exp), &exp->exp_handle);
+ EXIT;
+}
+
+/* hop_addref callback for export handles: takes an export reference */
+static void export_handle_addref(void *export)
+{
+ class_export_get(export);
+}
+
+/* handle operations passed to class_handle_hash() for every new export */
+static struct portals_handle_ops export_handle_ops = {
+ .hop_addref = export_handle_addref,
+ .hop_free = NULL,
+};
+
+/* Take a reference on \a exp and return it, so calls can be chained. */
+struct obd_export *class_export_get(struct obd_export *exp)
+{
+	atomic_inc(&exp->exp_refcount);
+
+	CDEBUG(D_INFO, "GETting export %p : new refcount %d\n",
+	       exp, atomic_read(&exp->exp_refcount));
+
+	return exp;
+}
+EXPORT_SYMBOL(class_export_get);
+
+/* Drop a reference on \a exp. When the last reference goes away the nid
+ * stat reference is released and the export is handed to
+ * obd_zombie_export_add(). */
+void class_export_put(struct obd_export *exp)
+{
+	LASSERT(exp != NULL);
+	LASSERT_ATOMIC_GT_LT(&exp->exp_refcount, 0, LI_POISON);
+	CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp,
+	       atomic_read(&exp->exp_refcount) - 1);
+
+	if (!atomic_dec_and_test(&exp->exp_refcount))
+		return;
+
+	LASSERT(!list_empty(&exp->exp_obd_chain));
+	CDEBUG(D_IOCTL, "final put %p/%s\n",
+	       exp, exp->exp_client_uuid.uuid);
+
+	/* release nid stat reference */
+	lprocfs_exp_cleanup(exp);
+
+	obd_zombie_export_add(exp);
+}
+EXPORT_SYMBOL(class_export_put);
+
+/* Creates a new export, adds it to the hash table, and returns a
+ * pointer to it. The refcount is 2: one for the hash reference, and
+ * one for the pointer returned by this function.
+ *
+ * On failure an ERR_PTR is returned: -ENOMEM if the export cannot be
+ * allocated, -ENODEV if the device is stopping or has no uuid hash, and
+ * -EALREADY if an export for \a cluuid is already hashed. */
+struct obd_export *class_new_export(struct obd_device *obd,
+ struct obd_uuid *cluuid)
+{
+ struct obd_export *export;
+ cfs_hash_t *hash = NULL;
+ int rc = 0;
+ ENTRY;
+
+ OBD_ALLOC_PTR(export);
+ if (!export)
+ return ERR_PTR(-ENOMEM);
+
+ export->exp_conn_cnt = 0;
+ export->exp_lock_hash = NULL;
+ export->exp_flock_hash = NULL;
+ atomic_set(&export->exp_refcount, 2);
+ atomic_set(&export->exp_rpc_count, 0);
+ atomic_set(&export->exp_cb_count, 0);
+ atomic_set(&export->exp_locks_count, 0);
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+ INIT_LIST_HEAD(&export->exp_locks_list);
+ spin_lock_init(&export->exp_locks_list_guard);
+#endif
+ atomic_set(&export->exp_replay_count, 0);
+ export->exp_obd = obd;
+ INIT_LIST_HEAD(&export->exp_outstanding_replies);
+ spin_lock_init(&export->exp_uncommitted_replies_lock);
+ INIT_LIST_HEAD(&export->exp_uncommitted_replies);
+ INIT_LIST_HEAD(&export->exp_req_replay_queue);
+ INIT_LIST_HEAD(&export->exp_handle.h_link);
+ INIT_LIST_HEAD(&export->exp_hp_rpcs);
+ class_handle_hash(&export->exp_handle, &export_handle_ops);
+ export->exp_last_request_time = cfs_time_current_sec();
+ spin_lock_init(&export->exp_lock);
+ spin_lock_init(&export->exp_rpc_lock);
+ INIT_HLIST_NODE(&export->exp_uuid_hash);
+ INIT_HLIST_NODE(&export->exp_nid_hash);
+ spin_lock_init(&export->exp_bl_list_lock);
+ INIT_LIST_HEAD(&export->exp_bl_list);
+
+ export->exp_sp_peer = LUSTRE_SP_ANY;
+ export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID;
+ export->exp_client_uuid = *cluuid;
+ obd_init_export(export);
+
+ spin_lock(&obd->obd_dev_lock);
+ /* shouldn't happen, but might race */
+ if (obd->obd_stopping)
+ GOTO(exit_unlock, rc = -ENODEV);
+
+ /* pin the uuid hash so it can be used with obd_dev_lock dropped */
+ hash = cfs_hash_getref(obd->obd_uuid_hash);
+ if (hash == NULL)
+ GOTO(exit_unlock, rc = -ENODEV);
+ spin_unlock(&obd->obd_dev_lock);
+
+ /* an export carrying the device's own uuid is not hashed */
+ if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) {
+ rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash);
+ if (rc != 0) {
+ LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n",
+ obd->obd_name, cluuid->uuid, rc);
+ GOTO(exit_err, rc = -EALREADY);
+ }
+ }
+
+ /* the lock was dropped above, so check for a stopping device again */
+ spin_lock(&obd->obd_dev_lock);
+ if (obd->obd_stopping) {
+ cfs_hash_del(hash, cluuid, &export->exp_uuid_hash);
+ GOTO(exit_unlock, rc = -ENODEV);
+ }
+
+ class_incref(obd, "export", export);
+ list_add(&export->exp_obd_chain, &export->exp_obd->obd_exports);
+ list_add_tail(&export->exp_obd_chain_timed,
+ &export->exp_obd->obd_exports_timed);
+ export->exp_obd->obd_num_exports++;
+ spin_unlock(&obd->obd_dev_lock);
+ cfs_hash_putref(hash);
+ RETURN(export);
+
+exit_unlock:
+ spin_unlock(&obd->obd_dev_lock);
+exit_err:
+ if (hash)
+ cfs_hash_putref(hash);
+ class_handle_unhash(&export->exp_handle);
+ LASSERT(hlist_unhashed(&export->exp_uuid_hash));
+ obd_destroy_export(export);
+ OBD_FREE_PTR(export);
+ return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(class_new_export);
+
+/* Detach an export from its device: unhash its handle, remove it from the
+ * uuid hash if it is hashed there, move it to the device's list of unlinked
+ * exports, and drop one export reference. */
+void class_unlink_export(struct obd_export *exp)
+{
+	struct obd_device *obd = exp->exp_obd;
+
+	class_handle_unhash(&exp->exp_handle);
+
+	spin_lock(&obd->obd_dev_lock);
+
+	/* delete an uuid-export hashitem from hashtables */
+	if (!hlist_unhashed(&exp->exp_uuid_hash))
+		cfs_hash_del(obd->obd_uuid_hash, &exp->exp_client_uuid,
+			     &exp->exp_uuid_hash);
+
+	list_move(&exp->exp_obd_chain, &obd->obd_unlinked_exports);
+	list_del_init(&exp->exp_obd_chain_timed);
+	obd->obd_num_exports--;
+
+	spin_unlock(&obd->obd_dev_lock);
+
+	class_export_put(exp);
+}
+EXPORT_SYMBOL(class_unlink_export);
+
+/* Import management functions */
+
+/* Final teardown of an import whose refcount is zero: release its current
+ * connection and every connection on imp_conn_list, drop the device
+ * reference taken under the "import" scope and free the import via
+ * OBD_FREE_RCU(). */
+void class_import_destroy(struct obd_import *imp)
+{
+ ENTRY;
+
+ CDEBUG(D_IOCTL, "destroying import %p for %s\n", imp,
+ imp->imp_obd->obd_name);
+
+ LASSERT_ATOMIC_ZERO(&imp->imp_refcount);
+
+ ptlrpc_put_connection_superhack(imp->imp_connection);
+
+ /* pop and free connection entries until the list is empty */
+ while (!list_empty(&imp->imp_conn_list)) {
+ struct obd_import_conn *imp_conn;
+
+ imp_conn = list_entry(imp->imp_conn_list.next,
+ struct obd_import_conn, oic_item);
+ list_del_init(&imp_conn->oic_item);
+ ptlrpc_put_connection_superhack(imp_conn->oic_conn);
+ OBD_FREE(imp_conn, sizeof(*imp_conn));
+ }
+
+ LASSERT(imp->imp_sec == NULL);
+ /* pairs with class_incref(obd, "import", imp) in class_new_import() */
+ class_decref(imp->imp_obd, "import", imp);
+ OBD_FREE_RCU(imp, sizeof(*imp), &imp->imp_handle);
+ EXIT;
+}
+
+/* hop_addref callback for import handles: takes an import reference */
+static void import_handle_addref(void *import)
+{
+ class_import_get(import);
+}
+
+/* handle operations passed to class_handle_hash() for every new import */
+static struct portals_handle_ops import_handle_ops = {
+ .hop_addref = import_handle_addref,
+ .hop_free = NULL,
+};
+
+/* Take a reference on \a import and return it, so calls can be chained. */
+struct obd_import *class_import_get(struct obd_import *import)
+{
+	atomic_inc(&import->imp_refcount);
+
+	CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n",
+	       import, atomic_read(&import->imp_refcount),
+	       import->imp_obd->obd_name);
+
+	return import;
+}
+EXPORT_SYMBOL(class_import_get);
+
+/* Drop a reference on \a imp. When the last reference goes away the import
+ * is handed to obd_zombie_import_add(). The import must not already be on a
+ * zombie chain. */
+void class_import_put(struct obd_import *imp)
+{
+ ENTRY;
+
+ LASSERT(list_empty(&imp->imp_zombie_chain));
+ LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON);
+
+ CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp,
+ atomic_read(&imp->imp_refcount) - 1,
+ imp->imp_obd->obd_name);
+
+ if (atomic_dec_and_test(&imp->imp_refcount)) {
+ CDEBUG(D_INFO, "final put import %p\n", imp);
+ obd_zombie_import_add(imp);
+ }
+
+ /* catch possible import put race */
+ /* NOTE(review): this reads imp after our reference is gone; confirm
+ * the import cannot already have been freed at this point */
+ LASSERT_ATOMIC_GE_LT(&imp->imp_refcount, 0, LI_POISON);
+ EXIT;
+}
+EXPORT_SYMBOL(class_import_put);
+
+/* Initialize the adaptive-timeout state of an import: the network latency
+ * estimate plus one service estimate per portal. */
+static void init_imp_at(struct imp_at *at)
+{
+	int portal;
+
+	at_init(&at->iat_net_latency, 0, 0);
+
+	/* max service estimates are tracked on the server side, so
+	   don't use the AT history here, just use the last reported
+	   val. (But keep hist for proc histogram, worst_ever) */
+	for (portal = 0; portal < IMP_AT_MAX_PORTALS; portal++)
+		at_init(&at->iat_service_estimate[portal],
+			INITIAL_CONNECT_TIMEOUT, AT_FLG_NOHIST);
+}
+
+/* Allocate and initialize a new import for \a obd.
+ *
+ * The import takes a device reference under the "import" scope, starts in
+ * state LUSTRE_IMP_NEW with a refcount of 2, and is registered with
+ * class_handle_hash(). Returns NULL if the allocation fails. */
+struct obd_import *class_new_import(struct obd_device *obd)
+{
+ struct obd_import *imp;
+
+ OBD_ALLOC(imp, sizeof(*imp));
+ if (imp == NULL)
+ return NULL;
+
+ INIT_LIST_HEAD(&imp->imp_pinger_chain);
+ INIT_LIST_HEAD(&imp->imp_zombie_chain);
+ INIT_LIST_HEAD(&imp->imp_replay_list);
+ INIT_LIST_HEAD(&imp->imp_sending_list);
+ INIT_LIST_HEAD(&imp->imp_delayed_list);
+ spin_lock_init(&imp->imp_lock);
+ imp->imp_last_success_conn = 0;
+ imp->imp_state = LUSTRE_IMP_NEW;
+ /* released again by class_import_destroy() */
+ imp->imp_obd = class_incref(obd, "import", imp);
+ mutex_init(&imp->imp_sec_mutex);
+ init_waitqueue_head(&imp->imp_recovery_waitq);
+
+ atomic_set(&imp->imp_refcount, 2);
+ atomic_set(&imp->imp_unregistering, 0);
+ atomic_set(&imp->imp_inflight, 0);
+ atomic_set(&imp->imp_replay_inflight, 0);
+ atomic_set(&imp->imp_inval_count, 0);
+ INIT_LIST_HEAD(&imp->imp_conn_list);
+ INIT_LIST_HEAD(&imp->imp_handle.h_link);
+ class_handle_hash(&imp->imp_handle, &import_handle_ops);
+ init_imp_at(&imp->imp_at);
+
+ /* the default magic is V2, will be used in connect RPC, and
+ * then adjusted according to the flags in request/reply. */
+ imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2;
+
+ return imp;
+}
+EXPORT_SYMBOL(class_new_import);
+
+/*
+ * Tear down an import: unhash its handle, bump imp_generation under
+ * imp_lock, then release one reference through class_import_put().
+ */
+void class_destroy_import(struct obd_import *import)
+{
+	LASSERT(import != NULL);
+	LASSERT(import != LP_POISON);
+
+	class_handle_unhash(&import->imp_handle);
+
+	spin_lock(&import->imp_lock);
+	import->imp_generation += 1;
+	spin_unlock(&import->imp_lock);
+
+	class_import_put(import);
+}
+EXPORT_SYMBOL(class_destroy_import);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+
+/*
+ * Account one more reference from @lock to @exp.  On the first reference
+ * the lock is linked onto exp->exp_locks_list and records @exp as its
+ * target.  A console warning is printed if the lock already targets a
+ * different export.  Everything runs under exp->exp_locks_list_guard.
+ */
+void __class_export_add_lock_ref(struct obd_export *exp, struct ldlm_lock *lock)
+{
+	spin_lock(&exp->exp_locks_list_guard);
+
+	LASSERT(lock->l_exp_refs_nr >= 0);
+
+	if (lock->l_exp_refs_target != NULL &&
+	    lock->l_exp_refs_target != exp) {
+		LCONSOLE_WARN("setting export %p for lock %p which already has export %p\n",
+			      exp, lock, lock->l_exp_refs_target);
+	}
+
+	/* first reference: put the lock on the export's list */
+	if (lock->l_exp_refs_nr == 0) {
+		list_add(&lock->l_exp_refs_link, &exp->exp_locks_list);
+		lock->l_exp_refs_target = exp;
+	}
+	lock->l_exp_refs_nr++;
+
+	CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n",
+	       lock, exp, lock->l_exp_refs_nr);
+
+	spin_unlock(&exp->exp_locks_list_guard);
+}
+EXPORT_SYMBOL(__class_export_add_lock_ref);
+
+/*
+ * Drop one reference from @lock to @exp.  When the count reaches zero the
+ * lock is unlinked from the export's list and its target is cleared.  A
+ * console warning is printed if the lock's recorded target is not @exp.
+ * Everything runs under exp->exp_locks_list_guard.
+ */
+void __class_export_del_lock_ref(struct obd_export *exp, struct ldlm_lock *lock)
+{
+	spin_lock(&exp->exp_locks_list_guard);
+
+	LASSERT(lock->l_exp_refs_nr > 0);
+
+	if (lock->l_exp_refs_target != exp) {
+		LCONSOLE_WARN("lock %p, "
+			      "mismatching export pointers: %p, %p\n",
+			      lock, lock->l_exp_refs_target, exp);
+	}
+
+	lock->l_exp_refs_nr--;
+	if (lock->l_exp_refs_nr == 0) {
+		/* last reference gone: take the lock off the export's list */
+		list_del_init(&lock->l_exp_refs_link);
+		lock->l_exp_refs_target = NULL;
+	}
+
+	CDEBUG(D_INFO, "lock = %p, export = %p, refs = %u\n",
+	       lock, exp, lock->l_exp_refs_nr);
+
+	spin_unlock(&exp->exp_locks_list_guard);
+}
+EXPORT_SYMBOL(__class_export_del_lock_ref);
+#endif
+
+/*
+ * A connection defines an export context in which preallocation can be
+ * managed.  This creates a new export for @cluuid on @obd, stores the
+ * export's handle cookie in @conn and releases the export pointer
+ * reference, so the export refcount is 1 when this function returns.
+ *
+ * Returns 0 on success, or the error encoded in the pointer returned by
+ * class_new_export().
+ */
+int class_connect(struct lustre_handle *conn, struct obd_device *obd,
+		  struct obd_uuid *cluuid)
+{
+	struct obd_export *exp;
+	LASSERT(conn != NULL);
+	LASSERT(obd != NULL);
+	LASSERT(cluuid != NULL);
+	ENTRY;
+
+	exp = class_new_export(obd, cluuid);
+	if (IS_ERR(exp))
+		RETURN(PTR_ERR(exp));
+
+	/* hand the cookie back before dropping our pointer reference */
+	conn->cookie = exp->exp_handle.h_cookie;
+	class_export_put(exp);
+
+	CDEBUG(D_IOCTL, "connect: client %s, cookie "LPX64"\n",
+	       cluuid->uuid, conn->cookie);
+	RETURN(0);
+}
+EXPORT_SYMBOL(class_connect);
+
+/*
+ * If the export is involved in recovery, clean up the related state.
+ *
+ * Under obd_recovery_task_lock this adjusts the device's delayed, connected
+ * and stale client counters.  Afterwards it clears the export's request-
+ * and lock-replay flags (each under exp_lock) and decrements the matching
+ * per-device atomic counters.
+ */
+void class_export_recovery_cleanup(struct obd_export *exp)
+{
+ struct obd_device *obd = exp->exp_obd;
+
+ spin_lock(&obd->obd_recovery_task_lock);
+ if (exp->exp_delayed)
+ obd->obd_delayed_clients--;
+ if (obd->obd_recovering) {
+ if (exp->exp_in_recovery) {
+ spin_lock(&exp->exp_lock);
+ exp->exp_in_recovery = 0;
+ spin_unlock(&exp->exp_lock);
+ LASSERT_ATOMIC_POS(&obd->obd_connected_clients);
+ atomic_dec(&obd->obd_connected_clients);
+ }
+
+ /* if called during recovery then the obd_stale_clients
+ * counter should be updated;
+ * lightweight exports are not counted */
+ if (exp->exp_failed &&
+ (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) == 0)
+ exp->exp_obd->obd_stale_clients++;
+ }
+ spin_unlock(&obd->obd_recovery_task_lock);
+ /** Cleanup req replay fields */
+ if (exp->exp_req_replay_needed) {
+ spin_lock(&exp->exp_lock);
+ exp->exp_req_replay_needed = 0;
+ spin_unlock(&exp->exp_lock);
+ LASSERT(atomic_read(&obd->obd_req_replay_clients));
+ atomic_dec(&obd->obd_req_replay_clients);
+ }
+ /** Cleanup lock replay data */
+ if (exp->exp_lock_replay_needed) {
+ spin_lock(&exp->exp_lock);
+ exp->exp_lock_replay_needed = 0;
+ spin_unlock(&exp->exp_lock);
+ LASSERT(atomic_read(&obd->obd_lock_replay_clients));
+ atomic_dec(&obd->obd_lock_replay_clients);
+ }
+}
+
+/* This function removes 1-3 references from the export:
+ * 1 - the reference for the export pointer passed in,
+ * and, if a disconnect is really needed:
+ * 2 - the one for removing it from the hash,
+ * 3 - the one dropped in class_unlink_export().
+ * The export passed to this function may be destroyed by the time it
+ * returns.
+ *
+ * Returns -EINVAL for a NULL export and 0 otherwise. */
+int class_disconnect(struct obd_export *export)
+{
+ int already_disconnected;
+ ENTRY;
+
+ if (export == NULL) {
+ CWARN("attempting to free NULL export %p\n", export);
+ RETURN(-EINVAL);
+ }
+
+ spin_lock(&export->exp_lock);
+ already_disconnected = export->exp_disconnected;
+ export->exp_disconnected = 1;
+ spin_unlock(&export->exp_lock);
+
+ /* class_cleanup(), abort_recovery(), and class_fail_export()
+ * all end up in here, and if any of them race we shouldn't
+ * call extra class_export_puts(). */
+ if (already_disconnected) {
+ LASSERT(hlist_unhashed(&export->exp_nid_hash));
+ GOTO(no_disconn, already_disconnected);
+ }
+
+ CDEBUG(D_IOCTL, "disconnect: cookie "LPX64"\n",
+ export->exp_handle.h_cookie);
+
+ if (!hlist_unhashed(&export->exp_nid_hash))
+ cfs_hash_del(export->exp_obd->obd_nid_hash,
+ &export->exp_connection->c_peer.nid,
+ &export->exp_nid_hash);
+
+ class_export_recovery_cleanup(export);
+ class_unlink_export(export);
+no_disconn: /* both paths drop the caller's reference here */
+ class_export_put(export);
+ RETURN(0);
+}
+EXPORT_SYMBOL(class_disconnect);
+
+/*
+ * Return non-zero for a fully connected export, i.e. one whose
+ * exp_conn_cnt (sampled under exp_lock) is positive.  A NULL export is
+ * reported as not connected.
+ */
+int class_connected_export(struct obd_export *exp)
+{
+	int is_connected;
+
+	if (exp == NULL)
+		return 0;
+
+	spin_lock(&exp->exp_lock);
+	is_connected = exp->exp_conn_cnt > 0;
+	spin_unlock(&exp->exp_lock);
+
+	return is_connected;
+}
+EXPORT_SYMBOL(class_connected_export);
+
+/*
+ * Disconnect the exports on @list one by one, storing @flags in each
+ * export's exp_flags first.  An export whose client UUID equals the UUID
+ * of its own obd device is only unlinked from the list, not disconnected.
+ */
+static void class_disconnect_export_list(struct list_head *list,
+ enum obd_option flags)
+{
+ int rc;
+ struct obd_export *exp;
+ ENTRY;
+
+ /* It's possible that an export may disconnect itself, but
+ * nothing else will be added to this list. */
+ while (!list_empty(list)) {
+ exp = list_entry(list->next, struct obd_export,
+ exp_obd_chain);
+ /* needed so that CDEBUG can be called safely after obd_disconnect */
+ class_export_get(exp);
+
+ spin_lock(&exp->exp_lock);
+ exp->exp_flags = flags;
+ spin_unlock(&exp->exp_lock);
+
+ if (obd_uuid_equals(&exp->exp_client_uuid,
+ &exp->exp_obd->obd_uuid)) {
+ CDEBUG(D_HA,
+ "exp %p export uuid == obd uuid, don't discon\n",
+ exp);
+ /* Need to delete this now so we don't end up pointing
+ * to work_list later when this export is cleaned up. */
+ list_del_init(&exp->exp_obd_chain);
+ class_export_put(exp);
+ continue;
+ }
+
+ /* second reference: this is the one obd_disconnect() releases */
+ class_export_get(exp);
+ CDEBUG(D_HA, "%s: disconnecting export at %s (%p), "
+ "last request at "CFS_TIME_T"\n",
+ exp->exp_obd->obd_name, obd_export_nid2str(exp),
+ exp, exp->exp_last_request_time);
+ /* release one export reference anyway */
+ rc = obd_disconnect(exp);
+
+ CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n",
+ obd_export_nid2str(exp), exp, rc);
+ /* drop the reference taken at the top of the loop */
+ class_export_put(exp);
+ }
+ EXIT;
+}
+
+/*
+ * Disconnect every export of @obd.  The obd_exports and
+ * obd_delayed_exports lists are spliced onto a private work list under
+ * obd_dev_lock, and that list is then handed to
+ * class_disconnect_export_list() with the flags from exp_flags_from_obd().
+ */
+void class_disconnect_exports(struct obd_device *obd)
+{
+	struct list_head work_list;
+	ENTRY;
+
+	INIT_LIST_HEAD(&work_list);
+
+	/* Move all of the exports from obd_exports to a work list, en masse. */
+	spin_lock(&obd->obd_dev_lock);
+	list_splice_init(&obd->obd_exports, &work_list);
+	list_splice_init(&obd->obd_delayed_exports, &work_list);
+	spin_unlock(&obd->obd_dev_lock);
+
+	if (list_empty(&work_list)) {
+		CDEBUG(D_HA, "OBD device %d (%p) has no exports\n",
+		       obd->obd_minor, obd);
+	} else {
+		CDEBUG(D_HA, "OBD device %d (%p) has exports, "
+		       "disconnecting them\n", obd->obd_minor, obd);
+		class_disconnect_export_list(&work_list,
+					     exp_flags_from_obd(obd));
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(class_disconnect_exports);
+
+/* Remove exports that have not completed recovery.
+ *
+ * Walks obd_exports under obd_dev_lock.  An export is skipped if it is the
+ * self-export, has ted_lr_idx == -1, is already marked failed, or if
+ * test_export() returns non-zero for it.  Every other export is marked
+ * failed, moved to a private work list and then disconnected with
+ * OBD_OPT_ABORT_RECOV added to the flags.
+ */
+void class_disconnect_stale_exports(struct obd_device *obd,
+ int (*test_export)(struct obd_export *))
+{
+ struct list_head work_list;
+ struct obd_export *exp, *n;
+ int evicted = 0;
+ ENTRY;
+
+ INIT_LIST_HEAD(&work_list);
+ spin_lock(&obd->obd_dev_lock);
+ list_for_each_entry_safe(exp, n, &obd->obd_exports,
+ exp_obd_chain) {
+ /* don't count self-export as client */
+ if (obd_uuid_equals(&exp->exp_client_uuid,
+ &exp->exp_obd->obd_uuid))
+ continue;
+
+ /* don't evict clients which have no slot in last_rcvd
+ * (e.g. lightweight connection) */
+ if (exp->exp_target_data.ted_lr_idx == -1)
+ continue;
+
+ spin_lock(&exp->exp_lock);
+ if (exp->exp_failed || test_export(exp)) {
+ spin_unlock(&exp->exp_lock);
+ continue;
+ }
+ exp->exp_failed = 1;
+ spin_unlock(&exp->exp_lock);
+
+ list_move(&exp->exp_obd_chain, &work_list);
+ evicted++;
+ CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n",
+ obd->obd_name, exp->exp_client_uuid.uuid,
+ exp->exp_connection == NULL ? "<unknown>" :
+ libcfs_nid2str(exp->exp_connection->c_peer.nid));
+ print_export_data(exp, "EVICTING", 0);
+ }
+ spin_unlock(&obd->obd_dev_lock);
+
+ if (evicted)
+ LCONSOLE_WARN("%s: disconnecting %d stale clients\n",
+ obd->obd_name, evicted);
+
+ class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) |
+ OBD_OPT_ABORT_RECOV);
+ EXIT;
+}
+EXPORT_SYMBOL(class_disconnect_stale_exports);
+
+/*
+ * Mark @exp as failed and disconnect it.  If the export was already marked
+ * failed, only a debug message is logged and nothing else is done.
+ */
+void class_fail_export(struct obd_export *exp)
+{
+ int rc, already_failed;
+
+ /* test-and-set exp_failed under exp_lock */
+ spin_lock(&exp->exp_lock);
+ already_failed = exp->exp_failed;
+ exp->exp_failed = 1;
+ spin_unlock(&exp->exp_lock);
+
+ if (already_failed) {
+ CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n",
+ exp, exp->exp_client_uuid.uuid);
+ return;
+ }
+
+ CDEBUG(D_HA, "disconnecting export %p/%s\n",
+ exp, exp->exp_client_uuid.uuid);
+
+ if (obd_dump_on_timeout)
+ libcfs_debug_dumplog();
+
+ /* needed so that CDEBUG can be called safely after obd_disconnect */
+ class_export_get(exp);
+
+ /* Most callers into obd_disconnect are removing their own reference
+ * (request, for example) in addition to the one from the hash table.
+ * We don't have such a reference here, so make one. */
+ class_export_get(exp);
+ rc = obd_disconnect(exp);
+ if (rc)
+ CERROR("disconnecting export %p failed: %d\n", exp, rc);
+ else
+ CDEBUG(D_HA, "disconnected export %p/%s\n",
+ exp, exp->exp_client_uuid.uuid);
+ class_export_put(exp);
+}
+EXPORT_SYMBOL(class_fail_export);
+
+/*
+ * Return a printable NID for the export's connection, or the literal
+ * "(no nid)" when the export has no connection.
+ */
+char *obd_export_nid2str(struct obd_export *exp)
+{
+	if (exp->exp_connection == NULL)
+		return "(no nid)";
+
+	return libcfs_nid2str(exp->exp_connection->c_peer.nid);
+}
+EXPORT_SYMBOL(obd_export_nid2str);
+
+/*
+ * Evict every export of @obd that the device's NID hash returns for the
+ * NID parsed from the string @nid.
+ *
+ * Returns the number of exports evicted; this is 0 when the device is
+ * already stopping or when no export is found.
+ */
+int obd_export_evict_by_nid(struct obd_device *obd, const char *nid)
+{
+ cfs_hash_t *nid_hash;
+ struct obd_export *doomed_exp = NULL;
+ int exports_evicted = 0;
+
+ lnet_nid_t nid_key = libcfs_str2nid((char *)nid);
+
+ spin_lock(&obd->obd_dev_lock);
+ /* umount has run already, so evict thread should leave
+ * its task to umount thread now */
+ if (obd->obd_stopping) {
+ spin_unlock(&obd->obd_dev_lock);
+ return exports_evicted;
+ }
+ nid_hash = obd->obd_nid_hash;
+ cfs_hash_getref(nid_hash);
+ spin_unlock(&obd->obd_dev_lock);
+
+ /* keep looking the NID up until the hash no longer returns an export */
+ do {
+ doomed_exp = cfs_hash_lookup(nid_hash, &nid_key);
+ if (doomed_exp == NULL)
+ break;
+
+ LASSERTF(doomed_exp->exp_connection->c_peer.nid == nid_key,
+ "nid %s found, wanted nid %s, requested nid %s\n",
+ obd_export_nid2str(doomed_exp),
+ libcfs_nid2str(nid_key), nid);
+ LASSERTF(doomed_exp != obd->obd_self_export,
+ "self-export is hashed by NID?\n");
+ exports_evicted++;
+ LCONSOLE_WARN("%s: evicting %s (at %s) by administrative "
+ "request\n", obd->obd_name,
+ obd_uuid2str(&doomed_exp->exp_client_uuid),
+ obd_export_nid2str(doomed_exp));
+ class_fail_export(doomed_exp);
+ class_export_put(doomed_exp);
+ } while (1);
+
+ cfs_hash_putref(nid_hash);
+
+ if (!exports_evicted)
+ CDEBUG(D_HA,"%s: can't disconnect NID '%s': no exports found\n",
+ obd->obd_name, nid);
+ return exports_evicted;
+}
+EXPORT_SYMBOL(obd_export_evict_by_nid);
+
+/*
+ * Evict the export of @obd that the device's UUID hash returns for @uuid.
+ *
+ * Returns the number of exports evicted (0 or 1).  Nothing is evicted when
+ * the device is stopping, when @uuid is the device's own UUID, or when no
+ * export is found.
+ */
+int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid)
+{
+ cfs_hash_t *uuid_hash;
+ struct obd_export *doomed_exp = NULL;
+ struct obd_uuid doomed_uuid;
+ int exports_evicted = 0;
+
+ spin_lock(&obd->obd_dev_lock);
+ if (obd->obd_stopping) {
+ spin_unlock(&obd->obd_dev_lock);
+ return exports_evicted;
+ }
+ uuid_hash = obd->obd_uuid_hash;
+ cfs_hash_getref(uuid_hash);
+ spin_unlock(&obd->obd_dev_lock);
+
+ obd_str2uuid(&doomed_uuid, uuid);
+ if (obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) {
+ CERROR("%s: can't evict myself\n", obd->obd_name);
+ cfs_hash_putref(uuid_hash);
+ return exports_evicted;
+ }
+
+ doomed_exp = cfs_hash_lookup(uuid_hash, &doomed_uuid);
+
+ if (doomed_exp == NULL) {
+ CERROR("%s: can't disconnect %s: no exports found\n",
+ obd->obd_name, uuid);
+ } else {
+ CWARN("%s: evicting %s at adminstrative request\n",
+ obd->obd_name, doomed_exp->exp_client_uuid.uuid);
+ class_fail_export(doomed_exp);
+ class_export_put(doomed_exp);
+ exports_evicted++;
+ }
+ cfs_hash_putref(uuid_hash);
+
+ return exports_evicted;
+}
+EXPORT_SYMBOL(obd_export_evict_by_uuid);
+
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+/* Optional hook; print_export_data() calls it when set and its locks argument is non-zero. */
+void (*class_export_dump_hook)(struct obd_export*) = NULL;
+EXPORT_SYMBOL(class_export_dump_hook);
+#endif
+
+/*
+ * Log one D_HA debug line describing @exp, tagged with @status.
+ *
+ * The outstanding replies are counted under exp_lock and the first one is
+ * remembered for the message.  When LUSTRE_TRACKS_LOCK_EXP_REFS is enabled
+ * and @locks is non-zero, class_export_dump_hook is also called if set.
+ */
+static void print_export_data(struct obd_export *exp, const char *status,
+ int locks)
+{
+ struct ptlrpc_reply_state *rs;
+ struct ptlrpc_reply_state *first_reply = NULL;
+ int nreplies = 0;
+
+ spin_lock(&exp->exp_lock);
+ list_for_each_entry(rs, &exp->exp_outstanding_replies,
+ rs_exp_list) {
+ if (nreplies == 0)
+ first_reply = rs;
+ nreplies++;
+ }
+ spin_unlock(&exp->exp_lock);
+
+ CDEBUG(D_HA, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: %p %s "LPU64"\n",
+ exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid,
+ obd_export_nid2str(exp), atomic_read(&exp->exp_refcount),
+ atomic_read(&exp->exp_rpc_count),
+ atomic_read(&exp->exp_cb_count),
+ atomic_read(&exp->exp_locks_count),
+ exp->exp_disconnected, exp->exp_delayed, exp->exp_failed,
+ nreplies, first_reply, nreplies > 3 ? "..." : "",
+ exp->exp_last_committed);
+#if LUSTRE_TRACKS_LOCK_EXP_REFS
+ if (locks && class_export_dump_hook != NULL)
+ class_export_dump_hook(exp);
+#endif
+}
+
+/*
+ * Print debug data for every export of @obd: the active, unlinked and
+ * delayed lists are walked under obd_dev_lock, then the global zombie
+ * export list under obd_zombie_impexp_lock.  @locks is passed through to
+ * print_export_data().
+ */
+void dump_exports(struct obd_device *obd, int locks)
+{
+ struct obd_export *exp;
+
+ spin_lock(&obd->obd_dev_lock);
+ list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain)
+ print_export_data(exp, "ACTIVE", locks);
+ list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain)
+ print_export_data(exp, "UNLINKED", locks);
+ list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain)
+ print_export_data(exp, "DELAYED", locks);
+ spin_unlock(&obd->obd_dev_lock);
+ /* NOTE(review): the zombie list is global, so this part is not limited to @obd */
+ spin_lock(&obd_zombie_impexp_lock);
+ list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain)
+ print_export_data(exp, "ZOMBIE", locks);
+ spin_unlock(&obd_zombie_impexp_lock);
+}
+EXPORT_SYMBOL(dump_exports);
+
+/*
+ * Wait until @obd's obd_unlinked_exports list is empty.  The caller must
+ * already have emptied obd_exports (asserted).  The sleep interval starts
+ * at 2 seconds and doubles on every pass.
+ */
+void obd_exports_barrier(struct obd_device *obd)
+{
+ int waited = 2;
+ LASSERT(list_empty(&obd->obd_exports));
+ spin_lock(&obd->obd_dev_lock);
+ while (!list_empty(&obd->obd_unlinked_exports)) {
+ /* drop the lock while sleeping */
+ spin_unlock(&obd->obd_dev_lock);
+ schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE,
+ cfs_time_seconds(waited));
+ /* warn and dump the exports once the interval exceeds 5 seconds
+ * (IS_PO2 presumably tests for a power of two) */
+ if (waited > 5 && IS_PO2(waited)) {
+ LCONSOLE_WARN("%s is waiting for obd_unlinked_exports "
+ "more than %d seconds. "
+ "The obd refcount = %d. Is it stuck?\n",
+ obd->obd_name, waited,
+ atomic_read(&obd->obd_refcount));
+ dump_exports(obd, 1);
+ }
+ waited *= 2;
+ spin_lock(&obd->obd_dev_lock);
+ }
+ spin_unlock(&obd->obd_dev_lock);
+}
+EXPORT_SYMBOL(obd_exports_barrier);
+
+/* Total number of zombies (imports and exports) still to be destroyed;
+ * read and modified under obd_zombie_impexp_lock. */
+static int zombies_count = 0;
+
+/**
+ * kill zombie imports and exports
+ *
+ * Each pass takes at most one import and one export off the zombie lists
+ * under obd_zombie_impexp_lock, destroys them with the lock released, and
+ * decrements zombies_count only after the destroy call has returned.
+ * The loop ends when a pass finds both lists empty.
+ */
+void obd_zombie_impexp_cull(void)
+{
+ struct obd_import *import;
+ struct obd_export *export;
+ ENTRY;
+
+ do {
+ spin_lock(&obd_zombie_impexp_lock);
+
+ import = NULL;
+ if (!list_empty(&obd_zombie_imports)) {
+ import = list_entry(obd_zombie_imports.next,
+ struct obd_import,
+ imp_zombie_chain);
+ list_del_init(&import->imp_zombie_chain);
+ }
+
+ export = NULL;
+ if (!list_empty(&obd_zombie_exports)) {
+ export = list_entry(obd_zombie_exports.next,
+ struct obd_export,
+ exp_obd_chain);
+ list_del_init(&export->exp_obd_chain);
+ }
+
+ spin_unlock(&obd_zombie_impexp_lock);
+
+ if (import != NULL) {
+ class_import_destroy(import);
+ spin_lock(&obd_zombie_impexp_lock);
+ zombies_count--;
+ spin_unlock(&obd_zombie_impexp_lock);
+ }
+
+ if (export != NULL) {
+ class_export_destroy(export);
+ spin_lock(&obd_zombie_impexp_lock);
+ zombies_count--;
+ spin_unlock(&obd_zombie_impexp_lock);
+ }
+
+ cond_resched();
+ } while (import != NULL || export != NULL);
+ EXIT;
+}
+
+/* completed by the zombie thread when it starts; obd_zombie_impexp_init() waits on it */
+static struct completion obd_zombie_start;
+/* completed by the zombie thread when it exits; obd_zombie_impexp_stop() waits on it */
+static struct completion obd_zombie_stop;
+/* flag bits for the zombie thread, see OBD_ZOMBIE_STOP */
+static unsigned long obd_zombie_flags;
+/* woken when a zombie is queued and after each cull pass */
+static wait_queue_head_t obd_zombie_waitq;
+/* pid of the zombie thread; reset to 0 by obd_zombie_impexp_init() */
+static pid_t obd_zombie_pid;
+
+enum {
+ /* passed to set_bit()/test_bit() as a bit number, not as a mask */
+ OBD_ZOMBIE_STOP = 0x0001,
+};
+
+/**
+ * Check whether the zombie import/export thread has nothing to do.
+ *
+ * Returns non-zero when no zombies are queued and OBD_ZOMBIE_STOP is not
+ * set, i.e. when the thread should keep sleeping.  \a arg is unused.
+ */
+static int obd_zombie_impexp_check(void *arg)
+{
+	int nothing_to_do;
+
+	spin_lock(&obd_zombie_impexp_lock);
+	nothing_to_do = zombies_count == 0 &&
+			!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags);
+	spin_unlock(&obd_zombie_impexp_lock);
+
+	RETURN(nothing_to_do);
+}
+
+/**
+ * Add export to the obd_zombie thread and notify it.
+ *
+ * The export is first unlinked from its current exp_obd_chain list under
+ * the device's obd_dev_lock, then queued on obd_zombie_exports under
+ * obd_zombie_impexp_lock, with zombies_count incremented.
+ */
+static void obd_zombie_export_add(struct obd_export *exp) {
+ spin_lock(&exp->exp_obd->obd_dev_lock);
+ LASSERT(!list_empty(&exp->exp_obd_chain));
+ list_del_init(&exp->exp_obd_chain);
+ spin_unlock(&exp->exp_obd->obd_dev_lock);
+ spin_lock(&obd_zombie_impexp_lock);
+ zombies_count++;
+ list_add(&exp->exp_obd_chain, &obd_zombie_exports);
+ spin_unlock(&obd_zombie_impexp_lock);
+
+ obd_zombie_impexp_notify();
+}
+
+/**
+ * Add import to the obd_zombie thread and notify it.
+ *
+ * The import must have no imp_sec and no imp_rq_pool left and must not
+ * already be on a zombie list (all asserted).
+ */
+static void obd_zombie_import_add(struct obd_import *imp) {
+ LASSERT(imp->imp_sec == NULL);
+ LASSERT(imp->imp_rq_pool == NULL);
+ spin_lock(&obd_zombie_impexp_lock);
+ LASSERT(list_empty(&imp->imp_zombie_chain));
+ zombies_count++;
+ list_add(&imp->imp_zombie_chain, &obd_zombie_imports);
+ spin_unlock(&obd_zombie_impexp_lock);
+
+ obd_zombie_impexp_notify();
+}
+
+/**
+ * Notify the import/export destroy thread about a new zombie.
+ */
+static void obd_zombie_impexp_notify(void)
+{
+ /*
+ * Make sure obd_zombie_impexp_thread gets this notification.
+ * Otherwise the signal might be received only by obd_zombie_barrier,
+ * which would swallow the notification and go back to sleep, and a
+ * hang would ensue; hence every waiter is woken.
+ */
+ wake_up_all(&obd_zombie_waitq);
+}
+
+/**
+ * Check whether the obd_zombie thread is idle, i.e. whether zombies_count
+ * (sampled under obd_zombie_impexp_lock) is zero.  Must not be called
+ * once OBD_ZOMBIE_STOP has been set (asserted).
+ */
+static int obd_zombie_is_idle(void)
+{
+	int idle;
+
+	LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags));
+
+	spin_lock(&obd_zombie_impexp_lock);
+	idle = !zombies_count;
+	spin_unlock(&obd_zombie_impexp_lock);
+
+	return idle;
+}
+
+/**
+ * Wait until the obd_zombie import/export queues become empty.
+ * Returns immediately when called from the zombie thread itself.
+ */
+void obd_zombie_barrier(void)
+{
+	struct l_wait_info lwi = { 0 };
+
+	/* the zombie thread must not wait for itself */
+	if (obd_zombie_pid != current_pid()) {
+		l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi);
+	}
+}
+EXPORT_SYMBOL(obd_zombie_barrier);
+
+
+/**
+ * Thread that destroys zombie exports/imports.
+ *
+ * Signals obd_zombie_start once running, then loops until OBD_ZOMBIE_STOP
+ * is set: sleep until obd_zombie_impexp_check() reports work (or stop),
+ * cull the zombie lists, and wake the wait queue.  Signals
+ * obd_zombie_stop on exit.  \a unused is ignored.
+ */
+static int obd_zombie_impexp_thread(void *unused)
+{
+ unshare_fs_struct();
+ complete(&obd_zombie_start);
+
+ /* recorded so obd_zombie_barrier() can detect calls from this thread */
+ obd_zombie_pid = current_pid();
+
+ while (!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) {
+ struct l_wait_info lwi = { 0 };
+
+ l_wait_event(obd_zombie_waitq,
+ !obd_zombie_impexp_check(NULL), &lwi);
+ obd_zombie_impexp_cull();
+
+ /*
+ * Notify obd_zombie_barrier callers that queues
+ * may be empty.
+ */
+ wake_up(&obd_zombie_waitq);
+ }
+
+ complete(&obd_zombie_stop);
+
+ RETURN(0);
+}
+
+
+/**
+ * Start the thread that destroys zombie imports/exports.
+ *
+ * Initialises the zombie lists, lock, completions and wait queue, starts
+ * the "obd_zombid" kthread and waits until it has signalled
+ * obd_zombie_start.
+ *
+ * Returns 0 on success, or the error from kthread_run().
+ */
+int obd_zombie_impexp_init(void)
+{
+ task_t *task;
+
+ INIT_LIST_HEAD(&obd_zombie_imports);
+ INIT_LIST_HEAD(&obd_zombie_exports);
+ spin_lock_init(&obd_zombie_impexp_lock);
+ init_completion(&obd_zombie_start);
+ init_completion(&obd_zombie_stop);
+ init_waitqueue_head(&obd_zombie_waitq);
+ obd_zombie_pid = 0;
+
+ task = kthread_run(obd_zombie_impexp_thread, NULL, "obd_zombid");
+ if (IS_ERR(task))
+ RETURN(PTR_ERR(task));
+
+ wait_for_completion(&obd_zombie_start);
+ RETURN(0);
+}
+/**
+ * Stop the thread that destroys zombie imports/exports: set the
+ * OBD_ZOMBIE_STOP bit, wake the thread, and wait until it has signalled
+ * obd_zombie_stop.
+ */
+void obd_zombie_impexp_stop(void)
+{
+ set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags);
+ obd_zombie_impexp_notify();
+ wait_for_completion(&obd_zombie_stop);
+}
+
+/***** Kernel-userspace comm helpers *******/
+
+/* Get the length of an entire message: the kuc header plus
+ * @payload_len bytes of payload.
+ */
+int kuc_len(int payload_len)
+{
+	return payload_len + sizeof(struct kuc_hdr);
+}
+EXPORT_SYMBOL(kuc_len);
+
+/* Get a pointer to the kuc header that immediately precedes a payload.
+ * Asserts that the header carries KUC_MAGIC.
+ * @param p Pointer to payload area
+ * @returns Pointer to kuc header
+ */
+struct kuc_hdr * kuc_ptr(void *p)
+{
+	struct kuc_hdr *hdr = (struct kuc_hdr *)p - 1;
+
+	LASSERT(hdr->kuc_magic == KUC_MAGIC);
+	return hdr;
+}
+EXPORT_SYMBOL(kuc_ptr);
+
+/* Test whether a payload is part of a kuc message, i.e. whether the
+ * header-sized area just before it carries KUC_MAGIC.
+ * @param p Pointer to payload area
+ * @returns 1 if the magic matches, 0 otherwise
+ */
+int kuc_ispayload(void *p)
+{
+	struct kuc_hdr *hdr = (struct kuc_hdr *)p - 1;
+
+	return hdr->kuc_magic == KUC_MAGIC;
+}
+EXPORT_SYMBOL(kuc_ispayload);
+
+/* Allocate space for a message (header plus payload) and fill in the header.
+ * @param payload_len Payload size in bytes
+ * @param transport   Value stored in kuc_transport
+ * @param type        Value stored in kuc_msgtype
+ * @return Pointer to payload area, or ERR_PTR(-ENOMEM) if allocation fails
+ */
+void *kuc_alloc(int payload_len, int transport, int type)
+{
+	struct kuc_hdr *hdr;
+	int total = kuc_len(payload_len);
+
+	OBD_ALLOC(hdr, total);
+	if (!hdr)
+		return ERR_PTR(-ENOMEM);
+
+	hdr->kuc_magic = KUC_MAGIC;
+	hdr->kuc_transport = transport;
+	hdr->kuc_msgtype = type;
+	hdr->kuc_msglen = total;	/* length includes the header */
+
+	/* the payload starts right after the header */
+	return hdr + 1;
+}
+EXPORT_SYMBOL(kuc_alloc);
+
+/* Free a kuc message.
+ * @param p           Pointer to payload area (not to the header)
+ * @param payload_len Payload size; header size is added via kuc_len() to
+ *                    get the number of bytes freed, so it should match the
+ *                    value the message was allocated with
+ *
+ * Not declared inline: the function has external linkage and is exported,
+ * so an out-of-line definition must always exist.
+ */
+void kuc_free(void *p, int payload_len)
+{
+	struct kuc_hdr *lh = kuc_ptr(p);
+
+	OBD_FREE(lh, kuc_len(payload_len));
+}
+EXPORT_SYMBOL(kuc_free);
diff --git a/drivers/staging/lustre/lustre/obdclass/idmap.c b/drivers/staging/lustre/lustre/obdclass/idmap.c
new file mode 100644
index 000000000000..622f8d165275
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/idmap.c
@@ -0,0 +1,474 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/idmap.c
+ *
+ * Lustre user identity mapping.
+ *
+ * Author: Fan Yong <fanyong@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_SEC
+
+#include <lustre_idmap.h>
+#include <md_object.h>
+#include <obd_support.h>
+
+/* Take a reference on group_info by incrementing its usage counter. */
+#define lustre_get_group_info(group_info) do { \
+ atomic_inc(&(group_info)->usage); \
+} while (0)
+
+/* Drop a reference on group_info; frees it with groups_free() when the
+ * usage counter reaches zero. */
+#define lustre_put_group_info(group_info) do { \
+ if (atomic_dec_and_test(&(group_info)->usage)) \
+ groups_free(group_info); \
+} while (0)
+
+/*
+ * groups_search() is copied from linux kernel!
+ * A simple bsearch over a group_info whose entries are in ascending order
+ * (as produced by lustre_groups_sort()).
+ *
+ * Returns 1 if @grp is present, 0 if it is absent or @group_info is NULL.
+ *
+ * The gids are compared directly rather than by looking at the sign of
+ * their difference: a difference stored in an int gives the wrong order
+ * for gids that are 2^31 or more apart, and lustre_groups_sort() orders
+ * entries with a direct comparison as well.
+ */
+static int lustre_groups_search(group_info_t *group_info,
+				gid_t grp)
+{
+	int left, right;
+
+	if (!group_info)
+		return 0;
+
+	left = 0;
+	right = group_info->ngroups;
+	while (left < right) {
+		int mid = left + (right - left) / 2;
+		gid_t cur = CFS_GROUP_AT(group_info, mid);
+
+		if (grp > cur)
+			left = mid + 1;
+		else if (grp < cur)
+			right = mid;
+		else
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * Fill @ginfo's blocks from the flat gid array @glist.  ginfo->ngroups
+ * entries are copied, at most CFS_NGROUPS_PER_BLOCK per block.
+ */
+void lustre_groups_from_list(group_info_t *ginfo, gid_t *glist)
+{
+	int remaining = ginfo->ngroups;
+	int blk;
+
+	for (blk = 0; blk < ginfo->nblocks && remaining > 0; blk++) {
+		int chunk = min(CFS_NGROUPS_PER_BLOCK, remaining);
+		int start = blk * CFS_NGROUPS_PER_BLOCK;
+		int bytes = chunk * sizeof(*glist);
+
+		memcpy(ginfo->blocks[blk], &glist[start], bytes);
+		remaining -= chunk;
+	}
+}
+EXPORT_SYMBOL(lustre_groups_from_list);
+
+/* groups_sort() is copied from linux kernel! */
+/* A simple shell-metzner sort: sorts the gids of group_info in place into
+ * ascending order. */
+void lustre_groups_sort(group_info_t *group_info)
+{
+ int base, max, stride;
+ int gidsetsize = group_info->ngroups;
+
+ /* find the first stride of the 1, 4, 13, ... sequence that is not
+ * below gidsetsize, then step back one */
+ for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
+ ; /* nothing */
+ stride /= 3;
+
+ while (stride) {
+ max = gidsetsize - stride;
+ for (base = 0; base < max; base++) {
+ int left = base;
+ int right = left + stride;
+ gid_t tmp = CFS_GROUP_AT(group_info, right);
+
+ /* shift larger entries up by one stride, then drop tmp
+ * into the gap */
+ while (left >= 0 &&
+ CFS_GROUP_AT(group_info, left) > tmp) {
+ CFS_GROUP_AT(group_info, right) =
+ CFS_GROUP_AT(group_info, left);
+ right = left;
+ left -= stride;
+ }
+ CFS_GROUP_AT(group_info, right) = tmp;
+ }
+ stride /= 3;
+ }
+}
+EXPORT_SYMBOL(lustre_groups_sort);
+
+/*
+ * Check whether @grp is one of the groups of credential @mu.
+ *
+ * Returns 1 if @grp equals uc_fsgid, matches one of the two uc_suppgids
+ * (consulted only when uc_ginfo is set, uc_identity is missing, or
+ * uc_valid is UCRED_OLD), or is found in the group_info taken from
+ * uc_ginfo or else uc_identity->mi_ginfo.  Returns 0 otherwise.
+ */
+int lustre_in_group_p(struct lu_ucred *mu, gid_t grp)
+{
+	group_info_t *ginfo = NULL;
+	int found;
+
+	if (grp == mu->uc_fsgid)
+		return 1;
+
+	if ((mu->uc_ginfo || !mu->uc_identity || mu->uc_valid == UCRED_OLD) &&
+	    (grp == mu->uc_suppgids[0] || grp == mu->uc_suppgids[1]))
+		return 1;
+
+	if (mu->uc_ginfo)
+		ginfo = mu->uc_ginfo;
+	else if (mu->uc_identity)
+		ginfo = mu->uc_identity->mi_ginfo;
+
+	if (!ginfo)
+		return 0;
+
+	/* hold a reference while searching */
+	lustre_get_group_info(ginfo);
+	found = lustre_groups_search(ginfo, grp);
+	lustre_put_group_info(ginfo);
+
+	return found;
+}
+EXPORT_SYMBOL(lustre_in_group_p);
+
+/*
+ * One remote<->local uid/gid mapping.  Each entry is linked into four hash
+ * chains, one per id, so it can be found by any of its four ids.
+ */
+struct lustre_idmap_entry {
+ struct list_head lie_rmt_uid_hash; /* hashed as lie_rmt_uid; */
+ struct list_head lie_lcl_uid_hash; /* hashed as lie_lcl_uid; */
+ struct list_head lie_rmt_gid_hash; /* hashed as lie_rmt_gid; */
+ struct list_head lie_lcl_gid_hash; /* hashed as lie_lcl_gid; */
+ uid_t lie_rmt_uid; /* remote uid */
+ uid_t lie_lcl_uid; /* local uid */
+ gid_t lie_rmt_gid; /* remote gid */
+ gid_t lie_lcl_gid; /* local gid */
+};
+
+/* Map an id to its hash bucket index by masking it with CFS_IDMAP_HASHSIZE - 1. */
+static inline __u32 lustre_idmap_hashfunc(__u32 id)
+{
+	const __u32 mask = CFS_IDMAP_HASHSIZE - 1;
+
+	return id & mask;
+}
+
+/*
+ * Allocate an idmap entry holding the given remote/local uid and gid pair.
+ * The entry is not linked into any hash chain.  Returns NULL if the
+ * allocation fails.
+ */
+static
+struct lustre_idmap_entry *idmap_entry_alloc(uid_t rmt_uid, uid_t lcl_uid,
+					     gid_t rmt_gid, gid_t lcl_gid)
+{
+	struct lustre_idmap_entry *entry;
+
+	OBD_ALLOC_PTR(entry);
+	if (!entry)
+		return NULL;
+
+	entry->lie_rmt_uid = rmt_uid;
+	entry->lie_lcl_uid = lcl_uid;
+	entry->lie_rmt_gid = rmt_gid;
+	entry->lie_lcl_gid = lcl_gid;
+
+	INIT_LIST_HEAD(&entry->lie_rmt_uid_hash);
+	INIT_LIST_HEAD(&entry->lie_lcl_uid_hash);
+	INIT_LIST_HEAD(&entry->lie_rmt_gid_hash);
+	INIT_LIST_HEAD(&entry->lie_lcl_gid_hash);
+
+	return entry;
+}
+
+/*
+ * Unlink an idmap entry from every hash chain it is currently on and
+ * free it.
+ */
+static void idmap_entry_free(struct lustre_idmap_entry *e)
+{
+	struct list_head *links[] = {
+		&e->lie_rmt_uid_hash,
+		&e->lie_lcl_uid_hash,
+		&e->lie_rmt_gid_hash,
+		&e->lie_lcl_gid_hash,
+	};
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(links); i++) {
+		if (!list_empty(links[i]))
+			list_del(links[i]);
+	}
+
+	OBD_FREE_PTR(e);
+}
+
+/*
+ * Look up the mapping (rmt_uid, lcl_uid, rmt_gid, lcl_gid) in table t.
+ *
+ * return value
+ * NULL: no entry found
+ * ERR_PTR(-EACCES): found a 1(remote):N(local) mapped entry, i.e. the
+ * remote uid or gid is already mapped to a different local id
+ * others: found a normal entry (all four ids match)
+ */
+static
+struct lustre_idmap_entry *idmap_search_entry(struct lustre_idmap_table *t,
+ uid_t rmt_uid, uid_t lcl_uid,
+ gid_t rmt_gid, gid_t lcl_gid)
+{
+ struct list_head *head;
+ struct lustre_idmap_entry *e;
+
+ /* first pass: the chain hashed by remote uid */
+ head = &t->lit_idmaps[RMT_UIDMAP_IDX][lustre_idmap_hashfunc(rmt_uid)];
+ list_for_each_entry(e, head, lie_rmt_uid_hash)
+ if (e->lie_rmt_uid == rmt_uid) {
+ if (e->lie_lcl_uid == lcl_uid) {
+ if (e->lie_rmt_gid == rmt_gid &&
+ e->lie_lcl_gid == lcl_gid)
+ /* must be quaternion match */
+ return e;
+ } else {
+ /* 1:N uid mapping */
+ CERROR("rmt uid %u already be mapped to %u"
+ " (new %u)\n", e->lie_rmt_uid,
+ e->lie_lcl_uid, lcl_uid);
+ return ERR_PTR(-EACCES);
+ }
+ }
+
+ /* second pass: the chain hashed by remote gid, only to detect
+ * conflicting gid mappings */
+ head = &t->lit_idmaps[RMT_GIDMAP_IDX][lustre_idmap_hashfunc(rmt_gid)];
+ list_for_each_entry(e, head, lie_rmt_gid_hash)
+ if (e->lie_rmt_gid == rmt_gid) {
+ if (e->lie_lcl_gid == lcl_gid) {
+ if (unlikely(e->lie_rmt_uid == rmt_uid &&
+ e->lie_lcl_uid == lcl_uid))
+ /* after uid mapping search above,
+ * we should never come here */
+ LBUG();
+ } else {
+ /* 1:N gid mapping */
+ CERROR("rmt gid %u already be mapped to %u"
+ " (new %u)\n", e->lie_rmt_gid,
+ e->lie_lcl_gid, lcl_gid);
+ return ERR_PTR(-EACCES);
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Translate @uid using the hash array @hash.  With @reverse == 0 the
+ * chain is searched by remote uid and the local uid is returned; otherwise
+ * it is searched by local uid and the remote uid is returned.
+ * Returns CFS_IDMAP_NOTFOUND if no entry matches.
+ */
+static __u32 idmap_lookup_uid(struct list_head *hash, int reverse,
+			      __u32 uid)
+{
+	struct list_head *bucket = &hash[lustre_idmap_hashfunc(uid)];
+	struct lustre_idmap_entry *entry;
+
+	if (reverse) {
+		list_for_each_entry(entry, bucket, lie_lcl_uid_hash) {
+			if (entry->lie_lcl_uid == uid)
+				return entry->lie_rmt_uid;
+		}
+		return CFS_IDMAP_NOTFOUND;
+	}
+
+	list_for_each_entry(entry, bucket, lie_rmt_uid_hash) {
+		if (entry->lie_rmt_uid == uid)
+			return entry->lie_lcl_uid;
+	}
+	return CFS_IDMAP_NOTFOUND;
+}
+
+/*
+ * Translate @gid using the hash array @hash.  With @reverse == 0 the
+ * chain is searched by remote gid and the local gid is returned; otherwise
+ * it is searched by local gid and the remote gid is returned.
+ * Returns CFS_IDMAP_NOTFOUND if no entry matches.
+ */
+static __u32 idmap_lookup_gid(struct list_head *hash, int reverse, __u32 gid)
+{
+	struct list_head *bucket = &hash[lustre_idmap_hashfunc(gid)];
+	struct lustre_idmap_entry *entry;
+
+	if (reverse) {
+		list_for_each_entry(entry, bucket, lie_lcl_gid_hash) {
+			if (entry->lie_lcl_gid == gid)
+				return entry->lie_rmt_gid;
+		}
+		return CFS_IDMAP_NOTFOUND;
+	}
+
+	list_for_each_entry(entry, bucket, lie_rmt_gid_hash) {
+		if (entry->lie_rmt_gid == gid)
+			return entry->lie_lcl_gid;
+	}
+	return CFS_IDMAP_NOTFOUND;
+}
+
+/*
+ * Add the mapping (ruid -> luid, rgid -> lgid) to table t.
+ *
+ * Returns 0 if the mapping was added or an identical one already exists,
+ * -ENOMEM if the entry cannot be allocated, or the error reported by
+ * idmap_search_entry() (-EACCES) when the remote uid or gid is already
+ * mapped to a different local id.
+ */
+int lustre_idmap_add(struct lustre_idmap_table *t,
+ uid_t ruid, uid_t luid,
+ gid_t rgid, gid_t lgid)
+{
+ struct lustre_idmap_entry *e0, *e1;
+
+ LASSERT(t);
+
+ spin_lock(&t->lit_lock);
+ e0 = idmap_search_entry(t, ruid, luid, rgid, lgid);
+ spin_unlock(&t->lit_lock);
+ if (!e0) {
+ /* the allocation is done with lit_lock released ... */
+ e0 = idmap_entry_alloc(ruid, luid, rgid, lgid);
+ if (!e0)
+ return -ENOMEM;
+
+ /* ... so search again: the table may have changed meanwhile */
+ spin_lock(&t->lit_lock);
+ e1 = idmap_search_entry(t, ruid, luid, rgid, lgid);
+ if (e1 == NULL) {
+ list_add_tail(&e0->lie_rmt_uid_hash,
+ &t->lit_idmaps[RMT_UIDMAP_IDX]
+ [lustre_idmap_hashfunc(ruid)]);
+ list_add_tail(&e0->lie_lcl_uid_hash,
+ &t->lit_idmaps[LCL_UIDMAP_IDX]
+ [lustre_idmap_hashfunc(luid)]);
+ list_add_tail(&e0->lie_rmt_gid_hash,
+ &t->lit_idmaps[RMT_GIDMAP_IDX]
+ [lustre_idmap_hashfunc(rgid)]);
+ list_add_tail(&e0->lie_lcl_gid_hash,
+ &t->lit_idmaps[LCL_GIDMAP_IDX]
+ [lustre_idmap_hashfunc(lgid)]);
+ }
+ spin_unlock(&t->lit_lock);
+ if (e1 != NULL) {
+ /* an entry or a conflict appeared; discard the new one */
+ idmap_entry_free(e0);
+ if (IS_ERR(e1))
+ return PTR_ERR(e1);
+ }
+ } else if (IS_ERR(e0)) {
+ return PTR_ERR(e0);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(lustre_idmap_add);
+
+/*
+ * Remove the mapping (ruid -> luid, rgid -> lgid) from table t.
+ *
+ * Returns 0 if the entry was removed or did not exist, or the error
+ * reported by idmap_search_entry() when the lookup hits a conflicting
+ * mapping.
+ */
+int lustre_idmap_del(struct lustre_idmap_table *t,
+		     uid_t ruid, uid_t luid,
+		     gid_t rgid, gid_t lgid)
+{
+	struct lustre_idmap_entry *entry;
+	int rc = 0;
+
+	LASSERT(t);
+
+	spin_lock(&t->lit_lock);
+	entry = idmap_search_entry(t, ruid, luid, rgid, lgid);
+	if (IS_ERR(entry)) {
+		rc = PTR_ERR(entry);
+	} else if (entry != NULL) {
+		idmap_entry_free(entry);
+	}
+	spin_unlock(&t->lit_lock);
+
+	return rc;
+}
+EXPORT_SYMBOL(lustre_idmap_del);
+
+/*
+ * Translate @uid between remote and local id space.
+ *
+ * If @mu is given and its uc_valid is UCRED_OLD or UCRED_NEW, the uid is
+ * first compared with the ids stored in the credential: forward lookups
+ * map uc_o_uid/uc_o_fsuid to uc_uid/uc_fsuid, reverse lookups the other
+ * way round.  Otherwise table @t is consulted under lit_lock.
+ *
+ * Returns the mapped uid, or CFS_IDMAP_NOTFOUND if there is no table or
+ * no matching entry.
+ */
+int lustre_idmap_lookup_uid(struct lu_ucred *mu,
+			    struct lustre_idmap_table *t,
+			    int reverse, uid_t uid)
+{
+	struct list_head *hash;
+	uid_t mapped;
+
+	if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) {
+		if (reverse) {
+			if (uid == mu->uc_uid)
+				return mu->uc_o_uid;
+			if (uid == mu->uc_fsuid)
+				return mu->uc_o_fsuid;
+		} else {
+			if (uid == mu->uc_o_uid)
+				return mu->uc_uid;
+			if (uid == mu->uc_o_fsuid)
+				return mu->uc_fsuid;
+		}
+	}
+
+	if (t == NULL)
+		return CFS_IDMAP_NOTFOUND;
+
+	if (reverse)
+		hash = t->lit_idmaps[LCL_UIDMAP_IDX];
+	else
+		hash = t->lit_idmaps[RMT_UIDMAP_IDX];
+
+	spin_lock(&t->lit_lock);
+	mapped = idmap_lookup_uid(hash, reverse, uid);
+	spin_unlock(&t->lit_lock);
+
+	return mapped;
+}
+EXPORT_SYMBOL(lustre_idmap_lookup_uid);
+
+/*
+ * Translate @gid between remote and local id space.
+ *
+ * If @mu is given and its uc_valid is UCRED_OLD or UCRED_NEW, the gid is
+ * first compared with the ids stored in the credential: forward lookups
+ * map uc_o_gid/uc_o_fsgid to uc_gid/uc_fsgid, reverse lookups the other
+ * way round.  Otherwise table @t is consulted under lit_lock.
+ *
+ * Returns the mapped gid, or CFS_IDMAP_NOTFOUND if there is no table or
+ * no matching entry.
+ */
+int lustre_idmap_lookup_gid(struct lu_ucred *mu, struct lustre_idmap_table *t,
+			    int reverse, gid_t gid)
+{
+	struct list_head *hash;
+	gid_t mapped;
+
+	if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) {
+		if (reverse) {
+			if (gid == mu->uc_gid)
+				return mu->uc_o_gid;
+			if (gid == mu->uc_fsgid)
+				return mu->uc_o_fsgid;
+		} else {
+			if (gid == mu->uc_o_gid)
+				return mu->uc_gid;
+			if (gid == mu->uc_o_fsgid)
+				return mu->uc_fsgid;
+		}
+	}
+
+	if (t == NULL)
+		return CFS_IDMAP_NOTFOUND;
+
+	if (reverse)
+		hash = t->lit_idmaps[LCL_GIDMAP_IDX];
+	else
+		hash = t->lit_idmaps[RMT_GIDMAP_IDX];
+
+	spin_lock(&t->lit_lock);
+	mapped = idmap_lookup_gid(hash, reverse, gid);
+	spin_unlock(&t->lit_lock);
+
+	return mapped;
+}
+EXPORT_SYMBOL(lustre_idmap_lookup_gid);
+
+/*
+ * Allocate an empty idmap table with its lock and every hash chain
+ * initialised.  Returns ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+struct lustre_idmap_table *lustre_idmap_init(void)
+{
+	struct lustre_idmap_table *table;
+	int map, bucket;
+
+	OBD_ALLOC_PTR(table);
+	if (unlikely(table == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&table->lit_lock);
+
+	for (map = 0; map < ARRAY_SIZE(table->lit_idmaps); map++) {
+		for (bucket = 0;
+		     bucket < ARRAY_SIZE(table->lit_idmaps[map]);
+		     bucket++)
+			INIT_LIST_HEAD(&table->lit_idmaps[map][bucket]);
+	}
+
+	return table;
+}
+EXPORT_SYMBOL(lustre_idmap_init);
+
+/*
+ * Free every entry of table t and then the table itself.  Only the
+ * remote-uid chains are walked; idmap_entry_free() unlinks each entry
+ * from its other chains as well.
+ */
+void lustre_idmap_fini(struct lustre_idmap_table *t)
+{
+	struct list_head *buckets;
+	struct lustre_idmap_entry *entry;
+	int i;
+	LASSERT(t);
+
+	buckets = t->lit_idmaps[RMT_UIDMAP_IDX];
+
+	spin_lock(&t->lit_lock);
+	for (i = 0; i < CFS_IDMAP_HASHSIZE; i++) {
+		struct list_head *bucket = &buckets[i];
+
+		while (!list_empty(bucket)) {
+			entry = list_entry(bucket->next,
+					   struct lustre_idmap_entry,
+					   lie_rmt_uid_hash);
+			idmap_entry_free(entry);
+		}
+	}
+	spin_unlock(&t->lit_lock);
+
+	OBD_FREE_PTR(t);
+}
+EXPORT_SYMBOL(lustre_idmap_fini);
diff --git a/drivers/staging/lustre/lustre/obdclass/linkea.c b/drivers/staging/lustre/lustre/obdclass/linkea.c
new file mode 100644
index 000000000000..b5c19ac1470f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/linkea.c
@@ -0,0 +1,194 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2013, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: Di Wang <di.wang@intel.com>
+ */
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+#include <lustre_linkea.h>
+
+/**
+ * Attach \a buf to \a ldata and write an empty link EA header into it.
+ *
+ * The buffer is obtained through lu_buf_check_and_alloc() with a size of
+ * PAGE_CACHE_SIZE. The header starts with zero records and a length equal
+ * to the header size.
+ *
+ * \retval 0       success
+ * \retval -ENOMEM the buffer has no backing memory
+ */
+int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf)
+{
+	struct link_ea_header *hdr;
+
+	ldata->ld_buf = lu_buf_check_and_alloc(buf, PAGE_CACHE_SIZE);
+	hdr = ldata->ld_buf->lb_buf;
+	if (!hdr)
+		return -ENOMEM;
+
+	hdr->leh_magic = LINK_EA_MAGIC;
+	hdr->leh_len = sizeof(struct link_ea_header);
+	hdr->leh_reccount = 0;
+	ldata->ld_leh = hdr;
+
+	return 0;
+}
+EXPORT_SYMBOL(linkea_data_new);
+
+/**
+ * Validate the link EA header found in ldata->ld_buf and publish it as
+ * ldata->ld_leh.
+ *
+ * A header whose magic is byte-swapped is converted in place (magic,
+ * record count and length); the entries are left for
+ * linkea_entry_unpack() to decode.
+ *
+ * \retval 0        header is valid and holds at least one record
+ * \retval -EINVAL  magic does not match LINK_EA_MAGIC in either byte order
+ * \retval -ENODATA header is valid but the record count is zero
+ */
+int linkea_init(struct linkea_data *ldata)
+{
+	struct link_ea_header *hdr;
+
+	LASSERT(ldata->ld_buf != NULL);
+	hdr = ldata->ld_buf->lb_buf;
+
+	if (hdr->leh_magic == __swab32(LINK_EA_MAGIC)) {
+		/* opposite byte order: fix the header up in place,
+		 * entries are swabbed by linkea_entry_unpack */
+		hdr->leh_magic = LINK_EA_MAGIC;
+		hdr->leh_reccount = __swab32(hdr->leh_reccount);
+		hdr->leh_len = __swab64(hdr->leh_len);
+	} else if (hdr->leh_magic != LINK_EA_MAGIC) {
+		return -EINVAL;
+	}
+
+	if (hdr->leh_reccount == 0)
+		return -ENODATA;
+
+	ldata->ld_leh = hdr;
+	return 0;
+}
+EXPORT_SYMBOL(linkea_init);
+
+/**
+ * Serialize one link_ea_entry.
+ *
+ * Every field is written as raw bytes so that no alignment is required.
+ * Numbers are always big-endian: the parent fid goes through
+ * fid_cpu_to_be() and the record length is stored as two bytes, most
+ * significant first.
+ *
+ * \param lee   destination entry
+ * \param lname name to store (not NUL-terminated in the entry)
+ * \param pfid  parent fid in CPU byte order
+ *
+ * \retval record length
+ */
+static int linkea_entry_pack(struct link_ea_entry *lee,
+			     const struct lu_name *lname,
+			     const struct lu_fid *pfid)
+{
+	struct lu_fid be_fid;
+	int total = sizeof(struct link_ea_entry) + lname->ln_namelen;
+
+	fid_cpu_to_be(&be_fid, pfid);
+	/* fault injection: deliberately store an invalid fid version */
+	if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LINKEA_CRASH))
+		be_fid.f_ver = ~0;
+
+	memcpy(&lee->lee_parent_fid, &be_fid, sizeof(be_fid));
+	memcpy(lee->lee_name, lname->ln_name, lname->ln_namelen);
+
+	lee->lee_reclen[0] = (total >> 8) & 0xff;
+	lee->lee_reclen[1] = total & 0xff;
+
+	return total;
+}
+
+/**
+ * Decode one link_ea_entry.
+ *
+ * \param lee    packed entry
+ * \param reclen [out] record length rebuilt from the two length bytes
+ * \param lname  [out] name; ln_name points into \a lee (no copy is made)
+ *               and ln_namelen is the record length minus the entry header
+ * \param pfid   [out] parent fid converted with fid_be_to_cpu()
+ */
+void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen,
+			 struct lu_name *lname, struct lu_fid *pfid)
+{
+	/* length is stored most significant byte first */
+	*reclen = (lee->lee_reclen[0] << 8) | lee->lee_reclen[1];
+
+	/* bytewise copy first, the packed fid may be unaligned */
+	memcpy(pfid, &lee->lee_parent_fid, sizeof(*pfid));
+	fid_be_to_cpu(pfid, pfid);
+
+	lname->ln_namelen = *reclen - sizeof(struct link_ea_entry);
+	lname->ln_name = lee->lee_name;
+}
+EXPORT_SYMBOL(linkea_entry_unpack);
+
+/**
+ * Append a record to the end of the link EA buffer.
+ *
+ * The buffer is grown through lu_buf_check_and_grow() when the new record
+ * does not fit. On success ld_leh, ld_lee and ld_reclen describe the
+ * record just added, and the header length and record count are updated.
+ *
+ * \retval 0       success
+ * \retval -EINVAL \a lname or \a pfid is NULL
+ * \retval -ENOMEM the buffer could not be grown
+ */
+int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname,
+		   const struct lu_fid *pfid)
+{
+	LASSERT(ldata->ld_leh != NULL);
+
+	if (!lname || !pfid)
+		return -EINVAL;
+
+	ldata->ld_reclen = lname->ln_namelen + sizeof(struct link_ea_entry);
+
+	/* grow only when the record would run past the end of the buffer */
+	if (ldata->ld_leh->leh_len + ldata->ld_reclen >
+	    ldata->ld_buf->lb_len &&
+	    lu_buf_check_and_grow(ldata->ld_buf,
+				  ldata->ld_leh->leh_len +
+				  ldata->ld_reclen) < 0)
+		return -ENOMEM;
+
+	/* re-read lb_buf: presumably growing may have replaced it */
+	ldata->ld_leh = ldata->ld_buf->lb_buf;
+	ldata->ld_lee = ldata->ld_buf->lb_buf + ldata->ld_leh->leh_len;
+
+	ldata->ld_reclen = linkea_entry_pack(ldata->ld_lee, lname, pfid);
+	ldata->ld_leh->leh_len += ldata->ld_reclen;
+	ldata->ld_leh->leh_reccount++;
+
+	CDEBUG(D_INODE, "New link_ea name '%.*s' is added\n",
+	       lname->ln_namelen, lname->ln_name);
+	return 0;
+}
+EXPORT_SYMBOL(linkea_add_buf);
+
+/**
+ * Delete the current record (ld_lee, ld_reclen bytes long) from the link
+ * EA buffer.
+ *
+ * The header is updated first, then everything stored behind the record is
+ * moved down over it. \a lname is only used for the debug message.
+ */
+void linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname)
+{
+	struct link_ea_header *hdr;
+	char *victim;
+	char *data_end;
+
+	LASSERT(ldata->ld_leh != NULL && ldata->ld_lee != NULL);
+
+	hdr = ldata->ld_leh;
+	victim = (char *)ldata->ld_lee;
+
+	hdr->leh_reccount--;
+	hdr->leh_len -= ldata->ld_reclen;
+
+	/* end of the data once the record is gone */
+	data_end = (char *)hdr + hdr->leh_len;
+	memmove(victim, victim + ldata->ld_reclen, data_end - victim);
+
+	CDEBUG(D_INODE, "Old link_ea name '%.*s' is removed\n",
+	       lname->ln_namelen, lname->ln_name);
+}
+EXPORT_SYMBOL(linkea_del_buf);
+
+/**
+ * Check if such a link exists in linkEA.
+ *
+ * The records are walked from the first one. On a match ld_lee and
+ * ld_reclen are left describing the matching record; when nothing matches
+ * ld_lee is reset to NULL.
+ *
+ * \param ldata link data the search to be done on
+ * \param lname name in the parent's directory entry pointing to this object
+ * \param pfid parent fid the link to be found for
+ *
+ * \retval 0 success
+ * \retval -ENOENT link does not exist
+ */
+int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname,
+		      const struct lu_fid *pfid)
+{
+	struct lu_name cur_name;
+	struct lu_fid cur_fid;
+	int idx;
+
+	LASSERT(ldata->ld_leh != NULL);
+
+	/* the first record sits right behind the header */
+	ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1);
+
+	for (idx = 0; idx < ldata->ld_leh->leh_reccount; idx++) {
+		linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen,
+				    &cur_name, &cur_fid);
+
+		if (cur_name.ln_namelen == lname->ln_namelen &&
+		    lu_fid_eq(&cur_fid, pfid) &&
+		    strncmp(cur_name.ln_name, lname->ln_name,
+			    cur_name.ln_namelen) == 0)
+			return 0;
+
+		/* step over the record that was just decoded */
+		ldata->ld_lee = (struct link_ea_entry *)
+				((char *)ldata->ld_lee + ldata->ld_reclen);
+	}
+
+	CDEBUG(D_INODE, "Old link_ea name '%.*s' not found\n",
+	       lname->ln_namelen, lname->ln_name);
+	ldata->ld_lee = NULL;
+	return -ENOENT;
+}
+EXPORT_SYMBOL(linkea_links_find);
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c
new file mode 100644
index 000000000000..d2c3072541d1
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c
@@ -0,0 +1,408 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/linux/linux-module.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/major.h>
+#include <linux/sched.h>
+#include <linux/lp.h>
+#include <linux/slab.h>
+#include <linux/ioport.h>
+#include <linux/fcntl.h>
+#include <linux/delay.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <asm/io.h>
+#include <asm/ioctls.h>
+#include <asm/poll.h>
+#include <asm/uaccess.h>
+#include <linux/miscdevice.h>
+#include <linux/seq_file.h>
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <linux/lnet/lnetctl.h>
+#include <lprocfs_status.h>
+#include <lustre_ver.h>
+#include <lustre/lustre_build_version.h>
+
+int proc_version;
+
+/**
+ * Copy an obd ioctl request from user space into a newly allocated buffer.
+ *
+ * The fixed-size header is read first and validated (version, upper and
+ * lower length bounds), then hdr.ioc_len bytes are allocated with
+ * OBD_ALLOC_LARGE() and the whole request is copied in. The ioc_inlbuf1..4
+ * pointers of the request are set to their payloads inside ioc_bulk, each
+ * payload starting at a cfs_size_round()ed offset.
+ *
+ * \param buf [out] allocated request buffer; presumably released by the
+ *            caller with OBD_FREE_LARGE(*buf, *len)
+ * \param len [out] size of the allocation
+ * \param arg user-space address of the request
+ *
+ * \retval 0       success
+ * \retval -EFAULT the request could not be read from user space
+ * \retval -EINVAL bad version, bad length or malformed request
+ * \retval -ENOMEM the buffer could not be allocated
+ */
+int obd_ioctl_getdata(char **buf, int *len, void *arg)
+{
+	struct obd_ioctl_hdr hdr;
+	struct obd_ioctl_data *data;
+	int err;
+	int offset = 0;
+	ENTRY;
+
+	/* copy_from_user() returns the number of bytes left uncopied, which
+	 * is not an errno; report a fault instead */
+	if (copy_from_user(&hdr, arg, sizeof(hdr)))
+		RETURN(-EFAULT);
+
+	if (hdr.ioc_version != OBD_IOCTL_VERSION) {
+		CERROR("Version mismatch kernel (%x) vs application (%x)\n",
+		       OBD_IOCTL_VERSION, hdr.ioc_version);
+		RETURN(-EINVAL);
+	}
+
+	if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) {
+		CERROR("User buffer len %d exceeds %d max buffer\n",
+		       hdr.ioc_len, OBD_MAX_IOCTL_BUFFER);
+		RETURN(-EINVAL);
+	}
+
+	if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) {
+		CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len);
+		RETURN(-EINVAL);
+	}
+
+	/* When there are lots of processes calling vmalloc on multi-core
+	 * system, the high lock contention will hurt performance badly,
+	 * obdfilter-survey is an example, which relies on ioctl. So we'd
+	 * better avoid vmalloc on ioctl path. LU-66 */
+	OBD_ALLOC_LARGE(*buf, hdr.ioc_len);
+	if (*buf == NULL) {
+		CERROR("Cannot allocate control buffer of len %d\n",
+		       hdr.ioc_len);
+		RETURN(-ENOMEM);
+	}
+	*len = hdr.ioc_len;
+	data = (struct obd_ioctl_data *)*buf;
+
+	if (copy_from_user(*buf, arg, hdr.ioc_len)) {
+		err = -EFAULT;
+		goto free_buf;
+	}
+
+	/* The header has now been fetched twice. User space may have changed
+	 * it in between, so the length validated above must still be the
+	 * one the request claims. */
+	if (data->ioc_len != hdr.ioc_len) {
+		CERROR("ioctl not correctly formatted\n");
+		err = -EINVAL;
+		goto free_buf;
+	}
+
+	if (obd_ioctl_is_invalid(data)) {
+		CERROR("ioctl not correctly formatted\n");
+		err = -EINVAL;
+		goto free_buf;
+	}
+
+	/* point each inline buffer at its payload inside ioc_bulk */
+	if (data->ioc_inllen1) {
+		data->ioc_inlbuf1 = &data->ioc_bulk[0];
+		offset += cfs_size_round(data->ioc_inllen1);
+	}
+
+	if (data->ioc_inllen2) {
+		data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset;
+		offset += cfs_size_round(data->ioc_inllen2);
+	}
+
+	if (data->ioc_inllen3) {
+		data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset;
+		offset += cfs_size_round(data->ioc_inllen3);
+	}
+
+	if (data->ioc_inllen4)
+		data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset;
+
+	EXIT;
+	return 0;
+
+free_buf:
+	OBD_FREE_LARGE(*buf, hdr.ioc_len);
+	RETURN(err);
+}
+EXPORT_SYMBOL(obd_ioctl_getdata);
+
+/**
+ * Copy \a len bytes of ioctl reply \a data to the user-space address
+ * \a arg.
+ *
+ * \retval 0       everything was copied
+ * \retval -EFAULT some bytes could not be copied
+ */
+int obd_ioctl_popdata(void *arg, void *data, int len)
+{
+	if (copy_to_user(arg, data, len))
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL(obd_ioctl_popdata);
+
+/*
+ * Opening /dev/obd: take a module reference that obd_class_release() drops
+ * again.
+ *
+ * try_module_get() can fail; in that case the open is refused so that the
+ * module_put() in obd_class_release() is never run for a reference that
+ * was not obtained.
+ */
+static int obd_class_open(struct inode * inode, struct file * file)
+{
+	ENTRY;
+
+	if (!try_module_get(THIS_MODULE))
+		RETURN(-ENODEV);
+	RETURN(0);
+}
+
+/*
+ * Closing /dev/obd: drop the module reference taken in obd_class_open().
+ * Always returns 0.
+ */
+static int obd_class_release(struct inode * inode, struct file * file)
+{
+ ENTRY;
+
+ module_put(THIS_MODULE);
+ RETURN(0);
+}
+
+/*
+ * ioctl entry point of /dev/obd.
+ *
+ * Callers without CFS_CAP_SYS_ADMIN are rejected with -EACCES unless the
+ * command is OBD_IOC_PING_TARGET; tty ioctls ('T' in the type byte) are
+ * rejected with -ENOTTY. Everything else is handed to
+ * class_handle_ioctl() and its result is returned.
+ */
+static long obd_class_ioctl(struct file *filp, unsigned int cmd,
+			    unsigned long arg)
+{
+	int rc;
+	ENTRY;
+
+	/* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */
+	if (!cfs_capable(CFS_CAP_SYS_ADMIN) && cmd != OBD_IOC_PING_TARGET) {
+		rc = -EACCES;
+		RETURN(rc);
+	}
+
+	/* ignore all tty ioctls */
+	if ((cmd & 0xffffff00) == ((int)'T') << 8) {
+		rc = -ENOTTY;
+		RETURN(rc);
+	}
+
+	rc = class_handle_ioctl(cmd, (unsigned long)arg);
+	RETURN(rc);
+}
+
+/*
+ * File operations of the obd control character device: only open, release
+ * and unlocked_ioctl are provided.
+ */
+static struct file_operations obd_psdev_fops = {
+ .owner = THIS_MODULE,
+ .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */
+ .open = obd_class_open, /* open */
+ .release = obd_class_release, /* release */
+};
+
+/*
+ * Device descriptor for the obd control node: minor OBD_DEV_MINOR, name
+ * OBD_DEV_NAME, served by obd_psdev_fops.
+ * NOTE(review): psdev_t presumably maps to struct miscdevice - confirm.
+ */
+psdev_t obd_psdev = {
+ .minor = OBD_DEV_MINOR,
+ .name = OBD_DEV_NAME,
+ .fops = &obd_psdev_fops,
+};
+
+
+#ifdef LPROCFS
+/*
+ * seq_file show handler of the "version" proc entry: prints the Lustre
+ * version string, the fixed kernel flavour "patchless_client" and the build
+ * version, one per line. Returns whatever seq_printf() returns.
+ */
+int obd_proc_version_seq_show(struct seq_file *m, void *v)
+{
+ return seq_printf(m, "lustre: %s\nkernel: %s\nbuild: %s\n",
+ LUSTRE_VERSION_STRING, "patchless_client",
+ BUILD_VERSION);
+}
+LPROC_SEQ_FOPS_RO(obd_proc_version);
+
+/*
+ * seq_file show handler of the "pinger" proc entry: always prints "on".
+ * Returns whatever seq_printf() returns.
+ */
+int obd_proc_pinger_seq_show(struct seq_file *m, void *v)
+{
+ return seq_printf(m, "%s\n", "on");
+}
+LPROC_SEQ_FOPS_RO(obd_proc_pinger);
+
+/*
+ * seq_file show handler of the "health_check" proc entry.
+ *
+ * Prints "LBUG" when libcfs_catastrophe is set, one line for every device
+ * whose obd_health_check() reports a problem, and finally "healthy" or
+ * "NOT HEALTHY". An LBUG counts as unhealthy, so the summary line never
+ * contradicts it.
+ *
+ * Only devices that are attached, set up and not stopping are checked.
+ * obd_dev_lock is dropped around each health check; the device is pinned
+ * with class_incref() for that time.
+ */
+static int obd_proc_health_seq_show(struct seq_file *m, void *v)
+{
+	int unhealthy = 0, i;
+
+	if (libcfs_catastrophe) {
+		seq_printf(m, "LBUG\n");
+		unhealthy++;
+	}
+
+	read_lock(&obd_dev_lock);
+	for (i = 0; i < class_devno_max(); i++) {
+		struct obd_device *obd;
+
+		obd = class_num2obd(i);
+		if (obd == NULL || !obd->obd_attached || !obd->obd_set_up)
+			continue;
+
+		LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+		if (obd->obd_stopping)
+			continue;
+
+		/* hold a reference so the lock can be released while the
+		 * device is being checked */
+		class_incref(obd, __FUNCTION__, current);
+		read_unlock(&obd_dev_lock);
+
+		if (obd_health_check(NULL, obd)) {
+			seq_printf(m, "device %s reported unhealthy\n",
+				   obd->obd_name);
+			unhealthy++;
+		}
+		class_decref(obd, __FUNCTION__, current);
+		read_lock(&obd_dev_lock);
+	}
+	read_unlock(&obd_dev_lock);
+
+	if (unhealthy == 0)
+		return seq_printf(m, "healthy\n");
+
+	seq_printf(m, "NOT HEALTHY\n");
+	return 0;
+}
+LPROC_SEQ_FOPS_RO(obd_proc_health);
+
+/*
+ * seq_file show handler of the "jobid_var" proc entry: prints the current
+ * contents of obd_jobid_var followed by a newline.
+ */
+static int obd_proc_jobid_var_seq_show(struct seq_file *m, void *v)
+{
+ return seq_printf(m, "%s\n", obd_jobid_var);
+}
+
+/*
+ * Write handler of the "jobid_var" proc entry: replace obd_jobid_var with
+ * the text written by the user, minus one trailing newline if present.
+ *
+ * \a buffer is a user-space pointer, so it is brought in with
+ * copy_from_user() instead of being dereferenced directly. The data is
+ * staged in a local buffer so that obd_jobid_var is left untouched when
+ * the copy faults.
+ *
+ * Returns \a count on success, -EINVAL when \a count is zero or larger
+ * than JOBSTATS_JOBID_VAR_MAX_LEN, -EFAULT when the user buffer cannot be
+ * read.
+ */
+static ssize_t obd_proc_jobid_var_seq_write(struct file *file, const char *buffer,
+					     size_t count, loff_t *off)
+{
+	char tmp[JOBSTATS_JOBID_VAR_MAX_LEN + 1];
+	size_t keep;
+
+	if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN)
+		return -EINVAL;
+
+	if (copy_from_user(tmp, buffer, count))
+		return -EFAULT;
+
+	/* Trim the trailing '\n' if any */
+	keep = count - (tmp[count - 1] == '\n');
+
+	memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1);
+	memcpy(obd_jobid_var, tmp, keep);
+	return count;
+}
+LPROC_SEQ_FOPS(obd_proc_jobid_var);
+
+/* Root for /proc/fs/lustre */
+struct proc_dir_entry *proc_lustre_root = NULL;
+EXPORT_SYMBOL(proc_lustre_root);
+
+/*
+ * Entries handed to lprocfs_register() by class_procfs_init() for the
+ * "fs/lustre" proc directory; the list ends with a zeroed element.
+ */
+struct lprocfs_vars lprocfs_base[] = {
+ { "version", &obd_proc_version_fops },
+ { "pinger", &obd_proc_pinger_fops },
+ { "health_check", &obd_proc_health_fops },
+ { "jobid_var", &obd_proc_jobid_var_fops },
+ { 0 }
+};
+#else
+#define lprocfs_base NULL
+#endif /* LPROCFS */
+
+/*
+ * seq_file start: the iterator is the position itself, used as a device
+ * number. Returns NULL once the position reaches class_devno_max().
+ */
+static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos)
+{
+	return *pos < class_devno_max() ? pos : NULL;
+}
+
+/* seq_file stop: nothing to release, start/next take no locks or references */
+static void obd_device_list_seq_stop(struct seq_file *p, void *v)
+{
+}
+
+/*
+ * seq_file next: advance the position by one and return it, or NULL once
+ * it reaches class_devno_max().
+ */
+static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	++*pos;
+
+	return *pos < class_devno_max() ? pos : NULL;
+}
+
+/*
+ * seq_file show: print one line for the device whose number is the current
+ * position: index, two-letter state, type name, device name, uuid and
+ * reference count. Unused device numbers print nothing.
+ *
+ * State is the first that applies of: "ST" stopping, "IN" inactive,
+ * "UP" set up, "AT" attached, "--" none of these.
+ */
+static int obd_device_list_seq_show(struct seq_file *p, void *v)
+{
+	loff_t index = *(loff_t *)v;
+	struct obd_device *obd = class_num2obd((int)index);
+	char *state;
+
+	if (!obd)
+		return 0;
+
+	LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+
+	state = obd->obd_stopping ? "ST" :
+		obd->obd_inactive ? "IN" :
+		obd->obd_set_up   ? "UP" :
+		obd->obd_attached ? "AT" : "--";
+
+	return seq_printf(p, "%3d %s %s %s %s %d\n",
+			  (int)index, state, obd->obd_type->typ_name,
+			  obd->obd_name, obd->obd_uuid.uuid,
+			  atomic_read(&obd->obd_refcount));
+}
+
+/* seq_file iterator over device numbers, installed by obd_device_list_open() */
+struct seq_operations obd_device_list_sops = {
+ .start = obd_device_list_seq_start,
+ .stop = obd_device_list_seq_stop,
+ .next = obd_device_list_seq_next,
+ .show = obd_device_list_seq_show,
+};
+
+/*
+ * Open handler of the "devices" proc file: set up the seq_file with
+ * obd_device_list_sops and store the proc entry's data in seq->private.
+ * Returns 0 or the error from seq_open().
+ */
+static int obd_device_list_open(struct inode *inode, struct file *file)
+{
+	int rc;
+
+	rc = seq_open(file, &obd_device_list_sops);
+	if (rc == 0) {
+		struct seq_file *seq = file->private_data;
+
+		seq->private = PDE_DATA(inode);
+	}
+
+	return rc;
+}
+
+/*
+ * File operations of the "devices" proc file created in
+ * class_procfs_init(); reading goes through the generic seq_file helpers.
+ */
+struct file_operations obd_device_list_fops = {
+ .owner = THIS_MODULE,
+ .open = obd_device_list_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+/*
+ * Register the obd sysctl table and create the "fs/lustre" proc directory
+ * together with its "devices" file.
+ *
+ * Failures are logged but not propagated: the function always returns 0,
+ * as before. If lprocfs_register() hands back an error pointer,
+ * proc_lustre_root is reset to NULL so that neither the "devices" file
+ * creation below nor class_procfs_clean() ever operates on it.
+ * NOTE(review): assumes lprocfs_register() reports failure through
+ * ERR_PTR() - confirm against lprocfs_status.c.
+ */
+int class_procfs_init(void)
+{
+	int rc;
+	ENTRY;
+
+	obd_sysctl_init();
+	proc_lustre_root = lprocfs_register("fs/lustre", NULL,
+					    lprocfs_base, NULL);
+	if (IS_ERR(proc_lustre_root)) {
+		CERROR("error adding /proc/fs/lustre: rc = %ld\n",
+		       PTR_ERR(proc_lustre_root));
+		proc_lustre_root = NULL;
+		RETURN(0);
+	}
+
+	rc = lprocfs_seq_create(proc_lustre_root, "devices", 0444,
+				&obd_device_list_fops, NULL);
+	if (rc)
+		CERROR("error adding /proc/fs/lustre/devices file\n");
+	RETURN(0);
+}
+
+/*
+ * Remove the proc tree rooted at proc_lustre_root, if there is one.
+ * Always returns 0.
+ */
+int class_procfs_clean(void)
+{
+ ENTRY;
+ if (proc_lustre_root) {
+ lprocfs_remove(&proc_lustre_root);
+ }
+ RETURN(0);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c
new file mode 100644
index 000000000000..6ee347153a16
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c
@@ -0,0 +1,222 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/linux/linux-obdo.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <obd_class.h>
+#include <lustre/lustre_idl.h>
+
+#include <linux/fs.h>
+#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */
+
+/*
+ * Copy the attributes selected by \a valid (LA_* bits) from \a la into
+ * \a dst and OR the matching OBD_MD_FL* bits into dst->o_valid.
+ *
+ * LA_TYPE replaces only the S_IFMT bits of o_mode and LA_MODE only the
+ * S_IALLUGO bits, so each keeps what the other one stores.
+ *
+ * FIXME: Just copy from obdo_from_inode
+ */
+void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid)
+{
+	obd_flag filled = 0;
+
+	if (valid & LA_ATIME) {
+		filled |= OBD_MD_FLATIME;
+		dst->o_atime = la->la_atime;
+	}
+	if (valid & LA_MTIME) {
+		filled |= OBD_MD_FLMTIME;
+		dst->o_mtime = la->la_mtime;
+	}
+	if (valid & LA_CTIME) {
+		filled |= OBD_MD_FLCTIME;
+		dst->o_ctime = la->la_ctime;
+	}
+	if (valid & LA_SIZE) {
+		filled |= OBD_MD_FLSIZE;
+		dst->o_size = la->la_size;
+	}
+	/* allocation of space (x512 bytes) */
+	if (valid & LA_BLOCKS) {
+		filled |= OBD_MD_FLBLOCKS;
+		dst->o_blocks = la->la_blocks;
+	}
+	if (valid & LA_TYPE) {
+		filled |= OBD_MD_FLTYPE;
+		dst->o_mode = (la->la_mode & S_IFMT) |
+			      (dst->o_mode & S_IALLUGO);
+	}
+	if (valid & LA_MODE) {
+		filled |= OBD_MD_FLMODE;
+		dst->o_mode = (la->la_mode & S_IALLUGO) |
+			      (dst->o_mode & S_IFMT);
+	}
+	if (valid & LA_UID) {
+		filled |= OBD_MD_FLUID;
+		dst->o_uid = la->la_uid;
+	}
+	if (valid & LA_GID) {
+		filled |= OBD_MD_FLGID;
+		dst->o_gid = la->la_gid;
+	}
+
+	dst->o_valid |= filled;
+}
+EXPORT_SYMBOL(obdo_from_la);
+
+/*
+ * Copy the attributes selected by \a valid (OBD_MD_FL* bits) from \a obdo
+ * into \a dst. Only fields that \a obdo itself marks valid are copied.
+ *
+ * Unlike obdo_from_la(), the result mask is assigned, not OR-ed:
+ * dst->la_valid ends up holding exactly the LA_* bits copied here.
+ *
+ * FIXME: Just copy from obdo_from_inode
+ */
+void la_from_obdo(struct lu_attr *dst, struct obdo *obdo, obd_flag valid)
+{
+	__u64 filled = 0;
+
+	/* never trust a field the source does not declare valid */
+	valid &= obdo->o_valid;
+
+	if (valid & OBD_MD_FLATIME) {
+		filled |= LA_ATIME;
+		dst->la_atime = obdo->o_atime;
+	}
+	if (valid & OBD_MD_FLMTIME) {
+		filled |= LA_MTIME;
+		dst->la_mtime = obdo->o_mtime;
+	}
+	if (valid & OBD_MD_FLCTIME) {
+		filled |= LA_CTIME;
+		dst->la_ctime = obdo->o_ctime;
+	}
+	if (valid & OBD_MD_FLSIZE) {
+		filled |= LA_SIZE;
+		dst->la_size = obdo->o_size;
+	}
+	if (valid & OBD_MD_FLBLOCKS) {
+		filled |= LA_BLOCKS;
+		dst->la_blocks = obdo->o_blocks;
+	}
+	if (valid & OBD_MD_FLTYPE) {
+		filled |= LA_TYPE;
+		dst->la_mode = (obdo->o_mode & S_IFMT) |
+			       (dst->la_mode & S_IALLUGO);
+	}
+	if (valid & OBD_MD_FLMODE) {
+		filled |= LA_MODE;
+		dst->la_mode = (obdo->o_mode & S_IALLUGO) |
+			       (dst->la_mode & S_IFMT);
+	}
+	if (valid & OBD_MD_FLUID) {
+		filled |= LA_UID;
+		dst->la_uid = obdo->o_uid;
+	}
+	if (valid & OBD_MD_FLGID) {
+		filled |= LA_GID;
+		dst->la_gid = obdo->o_gid;
+	}
+
+	dst->la_valid = filled;
+}
+EXPORT_SYMBOL(la_from_obdo);
+
+/*
+ * Refresh \a dst from \a src without ever moving an attribute backwards.
+ *
+ * Only fields selected by \a valid and marked valid in \a src are used.
+ * Timestamps, the block size and the block count are taken only when they
+ * are larger than what the inode already has; the size is always taken.
+ * Regardless of \a valid, i_blkbits is raised to at least
+ * PAGE_CACHE_SHIFT.
+ */
+void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid)
+{
+	valid &= src->o_valid;
+
+	if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) {
+		CDEBUG(D_INODE,
+		       "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n",
+		       src->o_valid, LTIME_S(dst->i_mtime),
+		       LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime);
+	}
+
+	/* timestamps only ever move forward */
+	if ((valid & OBD_MD_FLATIME) && src->o_atime > LTIME_S(dst->i_atime))
+		LTIME_S(dst->i_atime) = src->o_atime;
+	if ((valid & OBD_MD_FLMTIME) && src->o_mtime > LTIME_S(dst->i_mtime))
+		LTIME_S(dst->i_mtime) = src->o_mtime;
+	if ((valid & OBD_MD_FLCTIME) && src->o_ctime > LTIME_S(dst->i_ctime))
+		LTIME_S(dst->i_ctime) = src->o_ctime;
+
+	if (valid & OBD_MD_FLSIZE)
+		i_size_write(dst, src->o_size);
+
+	/* optimum IO size */
+	if ((valid & OBD_MD_FLBLKSZ) &&
+	    src->o_blksize > (1 << dst->i_blkbits))
+		dst->i_blkbits = ffs(src->o_blksize) - 1;
+
+	if (dst->i_blkbits < PAGE_CACHE_SHIFT)
+		dst->i_blkbits = PAGE_CACHE_SHIFT;
+
+	/* allocation of space */
+	if ((valid & OBD_MD_FLBLOCKS) && src->o_blocks > dst->i_blocks) {
+		/*
+		 * XXX shouldn't overflow be checked here like in
+		 * obdo_to_inode().
+		 */
+		dst->i_blocks = src->o_blocks;
+	}
+}
+EXPORT_SYMBOL(obdo_refresh_inode);
+
+/*
+ * Store the attributes of \a src selected by \a valid into \a dst.
+ *
+ * Only fields marked valid in \a src are used. Asking for the type,
+ * generation, fid, id or group is a caller bug and trips the assertion.
+ * atime and mtime are overwritten unconditionally, ctime only when the new
+ * value is larger. A block count that does not fit i_blocks is stored as
+ * -1.
+ */
+void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid)
+{
+	valid &= src->o_valid;
+
+	LASSERTF(!(valid & (OBD_MD_FLTYPE | OBD_MD_FLGENER | OBD_MD_FLFID |
+			    OBD_MD_FLID | OBD_MD_FLGROUP)),
+		 "object "DOSTID", valid %x\n", POSTID(&src->o_oi), valid);
+
+	if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) {
+		CDEBUG(D_INODE,
+		       "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n",
+		       src->o_valid, LTIME_S(dst->i_mtime),
+		       LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime);
+	}
+
+	if (valid & OBD_MD_FLATIME)
+		LTIME_S(dst->i_atime) = src->o_atime;
+	if (valid & OBD_MD_FLMTIME)
+		LTIME_S(dst->i_mtime) = src->o_mtime;
+	if ((valid & OBD_MD_FLCTIME) && src->o_ctime > LTIME_S(dst->i_ctime))
+		LTIME_S(dst->i_ctime) = src->o_ctime;
+
+	if (valid & OBD_MD_FLSIZE)
+		i_size_write(dst, src->o_size);
+
+	/* allocation of space */
+	if (valid & OBD_MD_FLBLOCKS) {
+		dst->i_blocks = src->o_blocks;
+		/* the store truncated the value: overflow */
+		if (dst->i_blocks < src->o_blocks)
+			dst->i_blocks = -1;
+	}
+
+	if (valid & OBD_MD_FLBLKSZ)
+		dst->i_blkbits = ffs(src->o_blksize) - 1;
+	if (valid & OBD_MD_FLMODE)
+		dst->i_mode = (src->o_mode & ~S_IFMT) | (dst->i_mode & S_IFMT);
+	if (valid & OBD_MD_FLUID)
+		dst->i_uid = src->o_uid;
+	if (valid & OBD_MD_FLGID)
+		dst->i_gid = src->o_gid;
+	if (valid & OBD_MD_FLFLAGS)
+		dst->i_flags = src->o_flags;
+}
+EXPORT_SYMBOL(obdo_to_inode);
diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c
new file mode 100644
index 000000000000..46aad6813cab
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c
@@ -0,0 +1,445 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/sysctl.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/sysctl.h>
+#include <linux/version.h>
+#include <linux/proc_fs.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/ctype.h>
+#include <asm/bitops.h>
+#include <asm/uaccess.h>
+#include <linux/utsname.h>
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_support.h>
+#include <lprocfs_status.h>
+
+#ifdef CONFIG_SYSCTL
+ctl_table_header_t *obd_table_header = NULL;
+#endif
+
+
+#define OBD_SYSCTL 300
+
+/*
+ * Identifiers of the entries in the obd sysctl table; each one is passed to
+ * INIT_CTL_NAME() for the matching entry. Numbering starts at 3.
+ */
+enum {
+ OBD_TIMEOUT = 3, /* RPC timeout before recovery/intr */
+ OBD_DUMP_ON_TIMEOUT, /* presumably: dump kernel debug log upon timeout (sysctl "dump_on_timeout") */
+ OBD_MEMUSED, /* bytes currently OBD_ALLOCated */
+ OBD_PAGESUSED, /* pages currently OBD_PAGE_ALLOCated */
+ OBD_MAXMEMUSED, /* maximum bytes OBD_ALLOCated concurrently */
+ OBD_MAXPAGESUSED, /* maximum pages OBD_PAGE_ALLOCated concurrently */
+ OBD_SYNCFILTER, /* XXX temporary, as we play with sync osts.. */
+ OBD_LDLM_TIMEOUT, /* LDLM timeout for ASTs before client eviction */
+ OBD_DUMP_ON_EVICTION, /* dump kernel debug log upon eviction */
+ OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */
+ OBD_ALLOC_FAIL_RATE, /* memory allocation random failure rate */
+ OBD_MAX_DIRTY_PAGES, /* maximum dirty pages */
+ OBD_AT_MIN, /* Adaptive timeouts params */
+ OBD_AT_MAX,
+ OBD_AT_EXTRA,
+ OBD_AT_EARLY_MARGIN,
+ OBD_AT_HISTORY,
+};
+
+
+/*
+ * sysctl handler shared by "timeout" and "ldlm_timeout".
+ *
+ * After the integer has been read or written through ll_proc_dointvec(),
+ * ldlm_timeout is forced below obd_timeout: whenever it is not smaller it
+ * is reset to a third of obd_timeout, but at least 1.
+ */
+int LL_PROC_PROTO(proc_set_timeout)
+{
+	int rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+
+	if (obd_timeout <= ldlm_timeout)
+		ldlm_timeout = max(obd_timeout / 3, 1U);
+
+	return rc;
+}
+
+/*
+ * Read-only sysctl handler of "memused": prints obd_memory_sum() as a
+ * decimal number followed by a newline.
+ *
+ * A zero-length request, or a read at a non-zero position, yields an
+ * empty result; a write is refused with -EINVAL. The output is truncated
+ * to the size of the user buffer.
+ */
+int LL_PROC_PROTO(proc_memory_alloc)
+{
+	char text[22];
+	int n;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (*lenp == 0 || (!write && *ppos != 0)) {
+		*lenp = 0;
+		return 0;
+	}
+	if (write)
+		return -EINVAL;
+
+	n = snprintf(text, sizeof(text), LPU64"\n", obd_memory_sum());
+	if (n > *lenp)
+		n = *lenp;
+	text[n] = '\0';
+
+	if (copy_to_user(buffer, text, n))
+		return -EFAULT;
+
+	*lenp = n;
+	*ppos += *lenp;
+	return 0;
+}
+
+/*
+ * Read-only sysctl handler of "pagesused": prints obd_pages_sum() as a
+ * decimal number followed by a newline.
+ *
+ * A zero-length request, or a read at a non-zero position, yields an
+ * empty result; a write is refused with -EINVAL. The output is truncated
+ * to the size of the user buffer.
+ */
+int LL_PROC_PROTO(proc_pages_alloc)
+{
+	char text[22];
+	int n;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (*lenp == 0 || (!write && *ppos != 0)) {
+		*lenp = 0;
+		return 0;
+	}
+	if (write)
+		return -EINVAL;
+
+	n = snprintf(text, sizeof(text), LPU64"\n", obd_pages_sum());
+	if (n > *lenp)
+		n = *lenp;
+	text[n] = '\0';
+
+	if (copy_to_user(buffer, text, n))
+		return -EFAULT;
+
+	*lenp = n;
+	*ppos += *lenp;
+	return 0;
+}
+
+/*
+ * Read-only sysctl handler of "memused_max": prints obd_memory_max() as a
+ * decimal number followed by a newline.
+ *
+ * A zero-length request, or a read at a non-zero position, yields an
+ * empty result; a write is refused with -EINVAL. The output is truncated
+ * to the size of the user buffer.
+ */
+int LL_PROC_PROTO(proc_mem_max)
+{
+	char text[22];
+	int n;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (*lenp == 0 || (!write && *ppos != 0)) {
+		*lenp = 0;
+		return 0;
+	}
+	if (write)
+		return -EINVAL;
+
+	n = snprintf(text, sizeof(text), LPU64"\n", obd_memory_max());
+	if (n > *lenp)
+		n = *lenp;
+	text[n] = '\0';
+
+	if (copy_to_user(buffer, text, n))
+		return -EFAULT;
+
+	*lenp = n;
+	*ppos += *lenp;
+	return 0;
+}
+
+/*
+ * Read-only sysctl handler of "pagesused_max": prints obd_pages_max() as a
+ * decimal number followed by a newline.
+ *
+ * A zero-length request, or a read at a non-zero position, yields an
+ * empty result; a write is refused with -EINVAL. The output is truncated
+ * to the size of the user buffer.
+ */
+int LL_PROC_PROTO(proc_pages_max)
+{
+	char text[22];
+	int n;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (*lenp == 0 || (!write && *ppos != 0)) {
+		*lenp = 0;
+		return 0;
+	}
+	if (write)
+		return -EINVAL;
+
+	n = snprintf(text, sizeof(text), LPU64"\n", obd_pages_max());
+	if (n > *lenp)
+		n = *lenp;
+	text[n] = '\0';
+
+	if (copy_to_user(buffer, text, n))
+		return -EFAULT;
+
+	*lenp = n;
+	*ppos += *lenp;
+	return 0;
+}
+
+/*
+ * sysctl handler of "max_dirty_mb".
+ *
+ * The value behind table->data is kept in pages and exchanged with user
+ * space through the lprocfs frac helpers using a multiplier of
+ * 1 << (20 - PAGE_CACHE_SHIFT), i.e. the number of pages per megabyte.
+ *
+ * After a write, obd_max_dirty_pages is clamped to at most 90% of
+ * num_physpages and at least 4MB worth of pages. The clamp runs whatever
+ * the helper returned, and it works on the global obd_max_dirty_pages
+ * rather than on table->data.
+ * NOTE(review): the read path assumes lprocfs_read_frac_helper() returns a
+ * length smaller than sizeof(buf), otherwise buf[len] is out of bounds -
+ * confirm against the helper.
+ */
+int LL_PROC_PROTO(proc_max_dirty_pages_in_mb)
+{
+ int rc = 0;
+ DECLARE_LL_PROC_PPOS_DECL;
+
+ /* nothing to do: no backing variable, empty request, or a read that
+ * is not at the start */
+ if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+ if (write) {
+ rc = lprocfs_write_frac_helper(buffer, *lenp,
+ (unsigned int*)table->data,
+ 1 << (20 - PAGE_CACHE_SHIFT));
+ /* Don't allow them to let dirty pages exceed 90% of system
+ * memory and set a hard minimum of 4MB. */
+ if (obd_max_dirty_pages > ((num_physpages / 10) * 9)) {
+ CERROR("Refusing to set max dirty pages to %u, which "
+ "is more than 90%% of available RAM; setting "
+ "to %lu\n", obd_max_dirty_pages,
+ ((num_physpages / 10) * 9));
+ obd_max_dirty_pages = ((num_physpages / 10) * 9);
+ } else if (obd_max_dirty_pages < 4 << (20 - PAGE_CACHE_SHIFT)) {
+ obd_max_dirty_pages = 4 << (20 - PAGE_CACHE_SHIFT);
+ }
+ } else {
+ char buf[21];
+ int len;
+
+ len = lprocfs_read_frac_helper(buf, sizeof(buf),
+ *(unsigned int*)table->data,
+ 1 << (20 - PAGE_CACHE_SHIFT));
+ /* never hand back more than the caller asked for */
+ if (len > *lenp)
+ len = *lenp;
+ buf[len] = '\0';
+ if (copy_to_user(buffer, buf, len))
+ return -EFAULT;
+ *lenp = len;
+ }
+ *ppos += *lenp;
+ return rc;
+}
+
+/*
+ * sysctl handler of "alloc_fail_rate".
+ *
+ * The value behind table->data is exchanged with user space through the
+ * lprocfs frac helpers using OBD_ALLOC_FAIL_MULT as the multiplier. A
+ * request with no backing variable or zero length, or a read at a
+ * non-zero position, yields an empty result.
+ */
+int LL_PROC_PROTO(proc_alloc_fail_rate)
+{
+	int rc = 0;
+	DECLARE_LL_PROC_PPOS_DECL;
+
+	if (!table->data || !table->maxlen || *lenp == 0 ||
+	    (!write && *ppos != 0)) {
+		*lenp = 0;
+		return 0;
+	}
+
+	if (!write) {
+		char text[21];
+		int n;
+
+		n = lprocfs_read_frac_helper(text, sizeof(text),
+					     *(unsigned int*)table->data,
+					     OBD_ALLOC_FAIL_MULT);
+		/* never hand back more than the caller asked for */
+		if (n > *lenp)
+			n = *lenp;
+		text[n] = '\0';
+		if (copy_to_user(buffer, text, n))
+			return -EFAULT;
+		*lenp = n;
+	} else {
+		rc = lprocfs_write_frac_helper(buffer, *lenp,
+					       (unsigned int*)table->data,
+					       OBD_ALLOC_FAIL_MULT);
+	}
+
+	*ppos += *lenp;
+	return rc;
+}
+
+/*
+ * sysctl handlers of the adaptive timeout parameters (at_min, at_max,
+ * at_extra, at_early_margin, at_history). All five simply forward to
+ * ll_proc_dointvec() and return its result.
+ */
+int LL_PROC_PROTO(proc_at_min)
+{
+ return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_max)
+{
+ return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_extra)
+{
+ return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_early_margin)
+{
+ return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+int LL_PROC_PROTO(proc_at_history)
+{
+ return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos);
+}
+
+#ifdef CONFIG_SYSCTL
+/*
+ * Entries of the "lustre" sysctl directory (child of parent_table).
+ *
+ * Entries with mode 0644 are backed by a variable through .data; the four
+ * memory statistics entries are mode 0444, have no backing variable and
+ * compute their value inside the handler. "timeout" and "ldlm_timeout"
+ * share proc_set_timeout. The table ends with an entry that only carries
+ * INIT_CTL_NAME(0).
+ */
+static ctl_table_t obd_table[] = {
+ {
+ INIT_CTL_NAME(OBD_TIMEOUT)
+ .procname = "timeout",
+ .data = &obd_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_set_timeout
+ },
+ {
+ INIT_CTL_NAME(OBD_DEBUG_PEER_ON_TIMEOUT)
+ .procname = "debug_peer_on_timeout",
+ .data = &obd_debug_peer_on_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(OBD_DUMP_ON_TIMEOUT)
+ .procname = "dump_on_timeout",
+ .data = &obd_dump_on_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
+ INIT_CTL_NAME(OBD_DUMP_ON_EVICTION)
+ .procname = "dump_on_eviction",
+ .data = &obd_dump_on_eviction,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ /* read-only statistics, value produced by the handler */
+ {
+ INIT_CTL_NAME(OBD_MEMUSED)
+ .procname = "memused",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0444,
+ .proc_handler = &proc_memory_alloc
+ },
+ {
+ INIT_CTL_NAME(OBD_PAGESUSED)
+ .procname = "pagesused",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0444,
+ .proc_handler = &proc_pages_alloc
+ },
+ {
+ INIT_CTL_NAME(OBD_MAXMEMUSED)
+ .procname = "memused_max",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0444,
+ .proc_handler = &proc_mem_max
+ },
+ {
+ INIT_CTL_NAME(OBD_MAXPAGESUSED)
+ .procname = "pagesused_max",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0444,
+ .proc_handler = &proc_pages_max
+ },
+ {
+ INIT_CTL_NAME(OBD_LDLM_TIMEOUT)
+ .procname = "ldlm_timeout",
+ .data = &ldlm_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_set_timeout
+ },
+ {
+ INIT_CTL_NAME(OBD_ALLOC_FAIL_RATE)
+ .procname = "alloc_fail_rate",
+ .data = &obd_alloc_fail_rate,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_alloc_fail_rate
+ },
+ {
+ INIT_CTL_NAME(OBD_MAX_DIRTY_PAGES)
+ .procname = "max_dirty_mb",
+ .data = &obd_max_dirty_pages,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_max_dirty_pages_in_mb
+ },
+ /* adaptive timeout parameters */
+ {
+ INIT_CTL_NAME(OBD_AT_MIN)
+ .procname = "at_min",
+ .data = &at_min,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_at_min
+ },
+ {
+ INIT_CTL_NAME(OBD_AT_MAX)
+ .procname = "at_max",
+ .data = &at_max,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_at_max
+ },
+ {
+ INIT_CTL_NAME(OBD_AT_EXTRA)
+ .procname = "at_extra",
+ .data = &at_extra,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_at_extra
+ },
+ {
+ INIT_CTL_NAME(OBD_AT_EARLY_MARGIN)
+ .procname = "at_early_margin",
+ .data = &at_early_margin,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_at_early_margin
+ },
+ {
+ INIT_CTL_NAME(OBD_AT_HISTORY)
+ .procname = "at_history",
+ .data = &at_history,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = &proc_at_history
+ },
+ { INIT_CTL_NAME(0) }
+};
+
+/*
+ * Root of the obd sysctl tree: a single "lustre" directory (mode 0555)
+ * whose children are the entries of obd_table. This is the table that
+ * obd_sysctl_init() registers.
+ */
+static ctl_table_t parent_table[] = {
+ {
+ INIT_CTL_NAME(OBD_SYSCTL)
+ .procname = "lustre",
+ .data = NULL,
+ .maxlen = 0,
+ .mode = 0555,
+ .child = obd_table
+ },
+ { INIT_CTL_NAME(0) }
+};
+#endif
+
+/*
+ * Register the obd sysctl tree unless it is already registered.
+ * Does nothing when CONFIG_SYSCTL is not set.
+ */
+void obd_sysctl_init (void)
+{
+#ifdef CONFIG_SYSCTL
+	if (obd_table_header == NULL) {
+		obd_table_header = cfs_register_sysctl_table(parent_table,
+							     0);
+	}
+#endif
+}
+
+/*
+ * Unregister the obd sysctl tree if it is registered and forget the
+ * header, so that obd_sysctl_init() can register it again.
+ * Does nothing when CONFIG_SYSCTL is not set.
+ */
+void obd_sysctl_clean (void)
+{
+#ifdef CONFIG_SYSCTL
+	if (obd_table_header != NULL) {
+		unregister_sysctl_table(obd_table_header);
+		obd_table_header = NULL;
+	}
+#endif
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/llog.c b/drivers/staging/lustre/lustre/obdclass/llog.c
new file mode 100644
index 000000000000..b1d215e56991
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog.c
@@ -0,0 +1,966 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ * if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Alex Zhuravlev <bzzz@whamcloud.com>
+ * Author: Mikhail Pershin <tappro@whamcloud.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include "llog_internal.h"
+
+/*
+ * Allocate and initialise a new log or catalog handle.
+ * Used inside llog_open().
+ *
+ * The handle starts with a reference count of 1, an initialised lgh_lock
+ * rwsem and lgh_hdr_lock spinlock, and an empty u.phd.phd_entry list head.
+ *
+ * Returns the new handle, or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+struct llog_handle *llog_alloc_handle(void)
+{
+	struct llog_handle *loghandle;
+
+	OBD_ALLOC_PTR(loghandle);
+	if (!loghandle)
+		return ERR_PTR(-ENOMEM);
+
+	atomic_set(&loghandle->lgh_refcount, 1);
+	INIT_LIST_HEAD(&loghandle->u.phd.phd_entry);
+	spin_lock_init(&loghandle->lgh_hdr_lock);
+	init_rwsem(&loghandle->lgh_lock);
+
+	return loghandle;
+}
+
+/*
+ * Release a llog handle together with its header buffer, if one is
+ * attached. Used in llog_close() only.
+ *
+ * For a handle with a header, the list head matching the header's type
+ * flag (plain or catalog) is asserted to be empty before the header is
+ * freed.
+ */
+void llog_free_handle(struct llog_handle *loghandle)
+{
+	LASSERT(loghandle != NULL);
+
+	/* no header means a failed llog_init_handle: only the handle to free */
+	if (loghandle->lgh_hdr) {
+		if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)
+			LASSERT(list_empty(&loghandle->u.phd.phd_entry));
+		else if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)
+			LASSERT(list_empty(&loghandle->u.chd.chd_head));
+		LASSERT(sizeof(*(loghandle->lgh_hdr)) == LLOG_CHUNK_SIZE);
+		OBD_FREE(loghandle->lgh_hdr, LLOG_CHUNK_SIZE);
+	}
+
+	OBD_FREE_PTR(loghandle);
+}
+
+/* Take one additional reference on @loghandle. */
+void llog_handle_get(struct llog_handle *loghandle)
+{
+	atomic_inc(&loghandle->lgh_refcount);
+}
+
+/*
+ * Drop one reference on @loghandle. When the count reaches zero the handle
+ * is released through llog_free_handle().
+ */
+void llog_handle_put(struct llog_handle *loghandle)
+{
+	LASSERT(atomic_read(&loghandle->lgh_refcount) > 0);
+	if (!atomic_dec_and_test(&loghandle->lgh_refcount))
+		return;
+
+	llog_free_handle(loghandle);
+}
+
+/*
+ * Cancel the record at @index: clear its bit in the header bitmap and
+ * decrement the header's record count.
+ *
+ * If the header has LLOG_F_ZAP_WHEN_EMPTY set, llh_count has dropped to 1
+ * and lgh_last_idx equals the final bitmap slot, the whole log is
+ * destroyed via llog_destroy(). Otherwise the updated header is written
+ * back with llog_write(). If the destroy or the header write fails, the
+ * bit and the count are restored before returning the error.
+ *
+ * Index 0 is the header record and is rejected with -EINVAL; a bit that is
+ * already clear yields -ENOENT.
+ *
+ * returns negative on error; 0 if success; 1 if success & log destroyed
+ */
+int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle,
+ int index)
+{
+ struct llog_log_hdr *llh = loghandle->lgh_hdr;
+ int rc = 0;
+ ENTRY;
+
+ CDEBUG(D_RPCTRACE, "Canceling %d in log "DOSTID"\n",
+ index, POSTID(&loghandle->lgh_id.lgl_oi));
+
+ if (index == 0) {
+ CERROR("Can't cancel index 0 which is header\n");
+ RETURN(-EINVAL);
+ }
+
+ /* bitmap and record count are modified under lgh_hdr_lock */
+ spin_lock(&loghandle->lgh_hdr_lock);
+ if (!ext2_clear_bit(index, llh->llh_bitmap)) {
+ spin_unlock(&loghandle->lgh_hdr_lock);
+ CDEBUG(D_RPCTRACE, "Catalog index %u already clear?\n", index);
+ RETURN(-ENOENT);
+ }
+
+ llh->llh_count--;
+
+ if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+ (llh->llh_count == 1) &&
+ (loghandle->lgh_last_idx == (LLOG_BITMAP_BYTES * 8) - 1)) {
+ /* the spinlock is released before calling llog_destroy() */
+ spin_unlock(&loghandle->lgh_hdr_lock);
+ rc = llog_destroy(env, loghandle);
+ if (rc < 0) {
+ CERROR("%s: can't destroy empty llog #"DOSTID
+ "#%08x: rc = %d\n",
+ loghandle->lgh_ctxt->loc_obd->obd_name,
+ POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen, rc);
+ GOTO(out_err, rc);
+ }
+ RETURN(1);
+ }
+ spin_unlock(&loghandle->lgh_hdr_lock);
+
+ /* write the updated header back */
+ rc = llog_write(env, loghandle, &llh->llh_hdr, NULL, 0, NULL, 0);
+ if (rc < 0) {
+ CERROR("%s: fail to write header for llog #"DOSTID
+ "#%08x: rc = %d\n",
+ loghandle->lgh_ctxt->loc_obd->obd_name,
+ POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen, rc);
+ GOTO(out_err, rc);
+ }
+ RETURN(0);
+out_err:
+ /* undo the cancel: set the bit again and restore the count */
+ spin_lock(&loghandle->lgh_hdr_lock);
+ ext2_set_bit(index, llh->llh_bitmap);
+ llh->llh_count++;
+ spin_unlock(&loghandle->lgh_hdr_lock);
+ return rc;
+}
+EXPORT_SYMBOL(llog_cancel_rec);
+
+/*
+ * Read the llog header through the handle's lop_read_header() method.
+ *
+ * When the method returns LLOG_EEMPTY, a fresh header is set up in
+ * handle->lgh_hdr instead: record count 1 (the header itself), header
+ * magic, chunk-sized header/tail lengths, index 0, current timestamp, the
+ * optional target @uuid, the bitmap offset, and bit 0 set in the bitmap.
+ *
+ * Returns 0 on success (including the fresh-header case), the error from
+ * llog_handle2ops(), -EOPNOTSUPP when the method is missing, or any other
+ * value returned by lop_read_header().
+ */
+static int llog_read_header(const struct lu_env *env,
+			    struct llog_handle *handle,
+			    struct obd_uuid *uuid)
+{
+	struct llog_operations *ops;
+	struct llog_log_hdr *hdr;
+	int rc;
+
+	rc = llog_handle2ops(handle, &ops);
+	if (rc)
+		RETURN(rc);
+
+	if (ops->lop_read_header == NULL)
+		RETURN(-EOPNOTSUPP);
+
+	rc = ops->lop_read_header(env, handle);
+	if (rc != LLOG_EEMPTY)
+		return rc;
+
+	/* LLOG_EEMPTY: initialise a fresh header in memory */
+	hdr = handle->lgh_hdr;
+
+	handle->lgh_last_idx = 0; /* header is record with index 0 */
+	hdr->llh_count = 1; /* for the header record */
+	hdr->llh_hdr.lrh_type = LLOG_HDR_MAGIC;
+	hdr->llh_tail.lrt_len = LLOG_CHUNK_SIZE;
+	hdr->llh_hdr.lrh_len = hdr->llh_tail.lrt_len;
+	hdr->llh_tail.lrt_index = 0;
+	hdr->llh_hdr.lrh_index = hdr->llh_tail.lrt_index;
+	hdr->llh_timestamp = cfs_time_current_sec();
+	if (uuid)
+		memcpy(&hdr->llh_tgtuuid, uuid, sizeof(hdr->llh_tgtuuid));
+	hdr->llh_bitmap_offset = offsetof(typeof(*hdr), llh_bitmap);
+	ext2_set_bit(0, hdr->llh_bitmap);
+
+	return 0;
+}
+
+/*
+ * Allocate the in-memory header for @handle and fill it through
+ * llog_read_header().
+ *
+ * @flags is stored in the new header before the read and may carry
+ * LLOG_F_IS_PLAIN or LLOG_F_IS_CAT. After a successful read, the type bits
+ * found in the header are checked: a plain/catalog mismatch with @flags,
+ * or a header with neither type bit set, fails with -EINVAL; otherwise the
+ * header's flags replace @flags. If @uuid is given and differs from the
+ * header's llh_tgtuuid the call fails with -EEXIST.
+ *
+ * For a catalog the chd_head list is initialised and llh_size is set to
+ * the size of struct llog_logid_rec.
+ *
+ * On any failure the header is freed and handle->lgh_hdr is reset to NULL.
+ * Returns 0 on success, otherwise an error code.
+ */
+int llog_init_handle(const struct lu_env *env, struct llog_handle *handle,
+ int flags, struct obd_uuid *uuid)
+{
+ struct llog_log_hdr *llh;
+ int rc;
+
+ ENTRY;
+ LASSERT(handle->lgh_hdr == NULL);
+
+ OBD_ALLOC_PTR(llh);
+ if (llh == NULL)
+ RETURN(-ENOMEM);
+ handle->lgh_hdr = llh;
+ /* first assign flags to use llog_client_ops */
+ llh->llh_flags = flags;
+ rc = llog_read_header(env, handle, uuid);
+ if (rc == 0) {
+ /* requested type and type found in the header must agree */
+ if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN &&
+ flags & LLOG_F_IS_CAT) ||
+ (llh->llh_flags & LLOG_F_IS_CAT &&
+ flags & LLOG_F_IS_PLAIN))) {
+ CERROR("%s: llog type is %s but initializing %s\n",
+ handle->lgh_ctxt->loc_obd->obd_name,
+ llh->llh_flags & LLOG_F_IS_CAT ?
+ "catalog" : "plain",
+ flags & LLOG_F_IS_CAT ? "catalog" : "plain");
+ GOTO(out, rc = -EINVAL);
+ } else if (llh->llh_flags &
+ (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) {
+ /*
+ * it is possible to open llog without specifying llog
+ * type so it is taken from llh_flags
+ */
+ flags = llh->llh_flags;
+ } else {
+ /* for some reason the llh_flags has no type set */
+ CERROR("llog type is not specified!\n");
+ GOTO(out, rc = -EINVAL);
+ }
+ if (unlikely(uuid &&
+ !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) {
+ CERROR("%s: llog uuid mismatch: %s/%s\n",
+ handle->lgh_ctxt->loc_obd->obd_name,
+ (char *)uuid->uuid,
+ (char *)llh->llh_tgtuuid.uuid);
+ GOTO(out, rc = -EEXIST);
+ }
+ }
+ /* reached with rc != 0 as well; the error is handled at "out" */
+ if (flags & LLOG_F_IS_CAT) {
+ LASSERT(list_empty(&handle->u.chd.chd_head));
+ INIT_LIST_HEAD(&handle->u.chd.chd_head);
+ llh->llh_size = sizeof(struct llog_logid_rec);
+ } else if (!(flags & LLOG_F_IS_PLAIN)) {
+ CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n",
+ handle->lgh_ctxt->loc_obd->obd_name,
+ flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN);
+ rc = -EINVAL;
+ }
+out:
+ /* on failure, detach and free the header allocated above */
+ if (rc) {
+ OBD_FREE_PTR(llh);
+ handle->lgh_hdr = NULL;
+ }
+ RETURN(rc);
+}
+EXPORT_SYMBOL(llog_init_handle);
+
+/*
+ * llog callback that appends @rec to the llog handle passed in @data.
+ *
+ * A copy of the record header, with its length reduced by the sizes of the
+ * record header and tail, is written together with the payload that
+ * follows @rec by means of llog_write(). The payload is also viewed as a
+ * struct lustre_cfg for the debug message.
+ *
+ * Returns the result of llog_write().
+ */
+int llog_copy_handler(const struct lu_env *env,
+		      struct llog_handle *llh,
+		      struct llog_rec_hdr *rec,
+		      void *data)
+{
+	struct llog_handle *dst = (struct llog_handle *)data;
+	struct llog_rec_hdr hdr_copy = *rec;
+	struct lustre_cfg *lcfg;
+	char *payload;
+	int rc = 0;
+	ENTRY;
+
+	payload = (char *)(rec + 1);
+	lcfg = (struct lustre_cfg *)payload;
+
+	/* Append all records */
+	hdr_copy.lrh_len -= sizeof(*rec) + sizeof(struct llog_rec_tail);
+	rc = llog_write(env, dst, &hdr_copy, NULL, 0, (void *)payload, -1);
+
+	CDEBUG(D_INFO, "idx=%d, rc=%d, len=%d, cmd %x %s %s\n",
+	       rec->lrh_index, rc, rec->lrh_len, lcfg->lcfg_command,
+	       lustre_cfg_string(lcfg, 0), lustre_cfg_string(lcfg, 1));
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_copy_handler);
+
+/*
+ * Walk the records of lpi->lpi_loghandle in ascending index order and
+ * invoke lpi->lpi_cb on every record whose bit is set in the header bitmap.
+ *
+ * The range defaults to indices 1 .. LLOG_BITMAP_BYTES * 8 - 1. When
+ * lpi->lpi_catdata is given, processing starts after lpcd_first_idx and,
+ * if lpcd_last_idx is non-zero, stops at lpcd_last_idx. When leaving
+ * through the "out" label, lpcd_last_idx is overwritten with the last
+ * index the callback was invoked for.
+ *
+ * A callback result of LLOG_DEL_RECORD cancels the record and processing
+ * continues; LLOG_PROC_BREAK or any other non-zero result stops it.
+ *
+ * The outcome is stored in lpi->lpi_rc (-ENOMEM if the chunk buffer cannot
+ * be allocated); the function itself always returns 0.
+ */
+static int llog_process_thread(void *arg)
+{
+ struct llog_process_info *lpi = arg;
+ struct llog_handle *loghandle = lpi->lpi_loghandle;
+ struct llog_log_hdr *llh = loghandle->lgh_hdr;
+ struct llog_process_cat_data *cd = lpi->lpi_catdata;
+ char *buf;
+ __u64 cur_offset = LLOG_CHUNK_SIZE;
+ __u64 last_offset;
+ int rc = 0, index = 1, last_index;
+ int saved_index = 0;
+ int last_called_index = 0;
+
+ ENTRY;
+
+ LASSERT(llh);
+
+ OBD_ALLOC(buf, LLOG_CHUNK_SIZE);
+ if (!buf) {
+ lpi->lpi_rc = -ENOMEM;
+ RETURN(0);
+ }
+
+ /* optional caller-supplied range */
+ if (cd != NULL) {
+ last_called_index = cd->lpcd_first_idx;
+ index = cd->lpcd_first_idx + 1;
+ }
+ if (cd != NULL && cd->lpcd_last_idx)
+ last_index = cd->lpcd_last_idx;
+ else
+ last_index = LLOG_BITMAP_BYTES * 8 - 1;
+
+ while (rc == 0) {
+ struct llog_rec_hdr *rec;
+
+ /* skip records not set in bitmap */
+ while (index <= last_index &&
+ !ext2_test_bit(index, llh->llh_bitmap))
+ ++index;
+
+ LASSERT(index <= last_index + 1);
+ if (index == last_index + 1)
+ break;
+repeat:
+ CDEBUG(D_OTHER, "index: %d last_index %d\n",
+ index, last_index);
+
+ /* get the buf with our target record; avoid old garbage */
+ memset(buf, 0, LLOG_CHUNK_SIZE);
+ /* remember the offset before the read; used for lgh_cur_offset */
+ last_offset = cur_offset;
+ rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index,
+ index, &cur_offset, buf, LLOG_CHUNK_SIZE);
+ if (rc)
+ GOTO(out, rc);
+
+ /* NB: when rec->lrh_len is accessed it is already swabbed
+ * since it is used at the "end" of the loop and the rec
+ * swabbing is done at the beginning of the loop. */
+ for (rec = (struct llog_rec_hdr *)buf;
+ (char *)rec < buf + LLOG_CHUNK_SIZE;
+ rec = (struct llog_rec_hdr *)((char *)rec + rec->lrh_len)){
+
+ CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n",
+ rec, rec->lrh_type);
+
+ if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+ lustre_swab_llog_rec(rec);
+
+ CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n",
+ rec->lrh_type, rec->lrh_index);
+
+ if (rec->lrh_index == 0) {
+ /* probably another rec just got added? */
+ if (index <= loghandle->lgh_last_idx)
+ GOTO(repeat, rc = 0);
+ GOTO(out, rc = 0); /* no more records */
+ }
+ /* a zero or oversized length would break the buffer walk */
+ if (rec->lrh_len == 0 ||
+ rec->lrh_len > LLOG_CHUNK_SIZE) {
+ CWARN("invalid length %d in llog record for "
+ "index %d/%d\n", rec->lrh_len,
+ rec->lrh_index, index);
+ GOTO(out, rc = -EINVAL);
+ }
+
+ /* records before the wanted index are stepped over */
+ if (rec->lrh_index < index) {
+ CDEBUG(D_OTHER, "skipping lrh_index %d\n",
+ rec->lrh_index);
+ continue;
+ }
+
+ CDEBUG(D_OTHER,
+ "lrh_index: %d lrh_len: %d (%d remains)\n",
+ rec->lrh_index, rec->lrh_len,
+ (int)(buf + LLOG_CHUNK_SIZE - (char *)rec));
+
+ /* record the current position in the handle */
+ loghandle->lgh_cur_idx = rec->lrh_index;
+ loghandle->lgh_cur_offset = (char *)rec - (char *)buf +
+ last_offset;
+
+ /* if set, process the callback on this record */
+ if (ext2_test_bit(index, llh->llh_bitmap)) {
+ rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec,
+ lpi->lpi_cbdata);
+ last_called_index = index;
+ if (rc == LLOG_PROC_BREAK) {
+ GOTO(out, rc);
+ } else if (rc == LLOG_DEL_RECORD) {
+ /* result of the cancel is not checked */
+ llog_cancel_rec(lpi->lpi_env,
+ loghandle,
+ rec->lrh_index);
+ rc = 0;
+ }
+ if (rc)
+ GOTO(out, rc);
+ } else {
+ CDEBUG(D_OTHER, "Skipped index %d\n", index);
+ }
+
+ /* next record, still in buffer? */
+ ++index;
+ if (index > last_index)
+ GOTO(out, rc = 0);
+ }
+ }
+
+out:
+ /* report back how far the callback got */
+ if (cd != NULL)
+ cd->lpcd_last_idx = last_called_index;
+
+ OBD_FREE(buf, LLOG_CHUNK_SIZE);
+ lpi->lpi_rc = rc;
+ return 0;
+}
+
+/*
+ * Thread entry point used by llog_process_or_fork() when forking.
+ *
+ * Sets up a private lu_env for the new thread, runs llog_process_thread()
+ * with it and signals lpi->lpi_completion when done.
+ *
+ * The creator only inspects lpi->lpi_rc after the completion fires, so a
+ * failure to initialise the environment is stored there as well; without
+ * that the caller would see the zero-initialised field and report success
+ * although nothing was processed.
+ *
+ * Returns the lu_env_init() error, or the result of llog_process_thread().
+ */
+static int llog_process_thread_daemonize(void *arg)
+{
+	struct llog_process_info *lpi = arg;
+	struct lu_env env;
+	int rc;
+
+	unshare_fs_struct();
+
+	/* client env has no keys, tags is just 0 */
+	rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+	if (rc) {
+		/* must be set before complete(): the waiter reads lpi_rc */
+		lpi->lpi_rc = rc;
+		goto out;
+	}
+	lpi->lpi_env = &env;
+
+	rc = llog_process_thread(arg);
+
+	lu_env_fini(&env);
+out:
+	complete(&lpi->lpi_completion);
+	return rc;
+}
+
+/*
+ * Process the records of @loghandle with callback @cb, either directly in
+ * the calling context or, when @fork is true, in a freshly started kernel
+ * thread whose completion is awaited.
+ *
+ * @data is handed to the callback, @catdata (may be NULL) is passed on as
+ * the processing range descriptor.
+ *
+ * Returns -ENOMEM if the bookkeeping structure cannot be allocated, the
+ * kthread_run() error if the thread cannot be started, otherwise the
+ * processing result left in lpi_rc.
+ */
+int llog_process_or_fork(const struct lu_env *env,
+			 struct llog_handle *loghandle,
+			 llog_cb_t cb, void *data, void *catdata, bool fork)
+{
+	struct llog_process_info *lpi;
+	struct task_struct *task;
+	int rc;
+
+	ENTRY;
+
+	OBD_ALLOC_PTR(lpi);
+	if (lpi == NULL) {
+		CERROR("cannot alloc pointer\n");
+		RETURN(-ENOMEM);
+	}
+	lpi->lpi_loghandle = loghandle;
+	lpi->lpi_cb = cb;
+	lpi->lpi_cbdata = data;
+	lpi->lpi_catdata = catdata;
+
+	if (fork) {
+		/* The new thread can't use parent env,
+		 * init the new one in llog_process_thread_daemonize. */
+		lpi->lpi_env = NULL;
+		init_completion(&lpi->lpi_completion);
+		/*
+		 * Test the returned pointer itself: squeezing it through an
+		 * int first can make a valid task pointer look like an errno.
+		 */
+		task = kthread_run(llog_process_thread_daemonize, lpi,
+				   "llog_process_thread");
+		if (IS_ERR(task)) {
+			rc = PTR_ERR(task);
+			CERROR("%s: cannot start thread: rc = %d\n",
+			       loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+			OBD_FREE_PTR(lpi);
+			RETURN(rc);
+		}
+		wait_for_completion(&lpi->lpi_completion);
+	} else {
+		lpi->lpi_env = env;
+		llog_process_thread(lpi);
+	}
+	rc = lpi->lpi_rc;
+	OBD_FREE_PTR(lpi);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_process_or_fork);
+
+/*
+ * Process @loghandle with callback @cb in a separate thread; shorthand for
+ * llog_process_or_fork() with fork set to true.
+ */
+int llog_process(const struct lu_env *env, struct llog_handle *loghandle,
+		 llog_cb_t cb, void *data, void *catdata)
+{
+	const bool fork = true;
+
+	return llog_process_or_fork(env, loghandle, cb, data, catdata, fork);
+}
+EXPORT_SYMBOL(llog_process);
+
+/*
+ * Return the record count kept in the log header (llh_count), or 0 when
+ * @loghandle is NULL or has no header attached.
+ */
+inline int llog_get_size(struct llog_handle *loghandle)
+{
+	if (!loghandle || !loghandle->lgh_hdr)
+		return 0;
+
+	return loghandle->lgh_hdr->llh_count;
+}
+EXPORT_SYMBOL(llog_get_size);
+
+/*
+ * Walk the records of @loghandle in descending index order and invoke @cb
+ * on every record whose bit is set in the header bitmap.
+ *
+ * The walk starts at LLOG_BITMAP_BYTES * 8 - 1, or at lpcd_last_idx when
+ * @catdata is given and that field is non-zero, and stops once the index
+ * drops below first_index (1, or lpcd_first_idx + 1 when @catdata is
+ * given).
+ *
+ * A callback result of LLOG_DEL_RECORD cancels the record and processing
+ * continues; LLOG_PROC_BREAK or any other non-zero result stops it.
+ *
+ * Returns -ENOMEM if the chunk buffer cannot be allocated, the error from
+ * llog_prev_block(), the stopping callback result, or 0.
+ */
+int llog_reverse_process(const struct lu_env *env,
+ struct llog_handle *loghandle, llog_cb_t cb,
+ void *data, void *catdata)
+{
+ struct llog_log_hdr *llh = loghandle->lgh_hdr;
+ struct llog_process_cat_data *cd = catdata;
+ void *buf;
+ int rc = 0, first_index = 1, index, idx;
+ ENTRY;
+
+ OBD_ALLOC(buf, LLOG_CHUNK_SIZE);
+ if (!buf)
+ RETURN(-ENOMEM);
+
+ /* optional caller-supplied range */
+ if (cd != NULL)
+ first_index = cd->lpcd_first_idx + 1;
+ if (cd != NULL && cd->lpcd_last_idx)
+ index = cd->lpcd_last_idx;
+ else
+ index = LLOG_BITMAP_BYTES * 8 - 1;
+
+ while (rc == 0) {
+ struct llog_rec_hdr *rec;
+ struct llog_rec_tail *tail;
+
+ /* skip records not set in bitmap */
+ while (index >= first_index &&
+ !ext2_test_bit(index, llh->llh_bitmap))
+ --index;
+
+ LASSERT(index >= first_index - 1);
+ if (index == first_index - 1)
+ break;
+
+ /* get the buf with our target record; avoid old garbage */
+ memset(buf, 0, LLOG_CHUNK_SIZE);
+ rc = llog_prev_block(env, loghandle, index, buf,
+ LLOG_CHUNK_SIZE);
+ if (rc)
+ GOTO(out, rc);
+
+ /* step forward from the first record in buf to record "index" */
+ rec = buf;
+ idx = rec->lrh_index;
+ CDEBUG(D_RPCTRACE, "index %u : idx %u\n", index, idx);
+ while (idx < index) {
+ rec = (void *)rec + rec->lrh_len;
+ if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+ lustre_swab_llog_rec(rec);
+ idx ++;
+ }
+ LASSERT(idx == index);
+ /* the tail sits at the very end of the record */
+ tail = (void *)rec + rec->lrh_len - sizeof(*tail);
+
+ /* process records in buffer, starting where we found one */
+ while ((void *)tail > buf) {
+ if (tail->lrt_index == 0)
+ GOTO(out, rc = 0); /* no more records */
+
+ /* if set, process the callback on this record */
+ if (ext2_test_bit(index, llh->llh_bitmap)) {
+ /* record start = end of tail minus lrt_len */
+ rec = (void *)tail - tail->lrt_len +
+ sizeof(*tail);
+
+ rc = cb(env, loghandle, rec, data);
+ if (rc == LLOG_PROC_BREAK) {
+ GOTO(out, rc);
+ } else if (rc == LLOG_DEL_RECORD) {
+ /* result of the cancel is not checked */
+ llog_cancel_rec(env, loghandle,
+ tail->lrt_index);
+ rc = 0;
+ }
+ if (rc)
+ GOTO(out, rc);
+ }
+
+ /* previous record, still in buffer? */
+ --index;
+ if (index < first_index)
+ GOTO(out, rc = 0);
+ /* step back by this record's length to the previous tail */
+ tail = (void *)tail - tail->lrt_len;
+ }
+ }
+
+out:
+ /* buf cannot be NULL here: allocation failure returned earlier */
+ if (buf)
+ OBD_FREE(buf, LLOG_CHUNK_SIZE);
+ RETURN(rc);
+}
+EXPORT_SYMBOL(llog_reverse_process);
+
+/**
+ * new llog API
+ *
+ * API functions:
+ * llog_open - open llog, may not exist
+ * llog_exist - check if llog exists
+ * llog_close - close opened llog, pair for open, frees llog_handle
+ * llog_declare_create - declare llog creation
+ * llog_create - create new llog on disk, need transaction handle
+ * llog_declare_write_rec - declaration of llog write
+ * llog_write_rec - write llog record on disk, need transaction handle
+ * llog_declare_add - declare llog catalog record addition
+ * llog_add - add llog record in catalog, need transaction handle
+ */
+/*
+ * Ask the handle's lop_exist() method about the llog.
+ *
+ * Returns the method's result, the error from llog_handle2ops(), or
+ * -EOPNOTSUPP when the method is not provided.
+ */
+int llog_exist(struct llog_handle *loghandle)
+{
+	struct llog_operations *ops;
+	int rc;
+
+	ENTRY;
+
+	rc = llog_handle2ops(loghandle, &ops);
+	if (rc == 0) {
+		if (ops->lop_exist != NULL)
+			rc = ops->lop_exist(loghandle);
+		else
+			rc = -EOPNOTSUPP;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_exist);
+
+/*
+ * Declare the creation of the llog behind @loghandle within transaction
+ * @th by calling the handle's lop_declare_create() method.
+ *
+ * CFS_CAP_SYS_RESOURCE is raised for the duration of the call unless it is
+ * already raised.
+ *
+ * Returns the method's result, the error from llog_handle2ops(), or
+ * -EOPNOTSUPP when the method is not provided.
+ */
+int llog_declare_create(const struct lu_env *env,
+			struct llog_handle *loghandle, struct thandle *th)
+{
+	struct llog_operations *ops;
+	int need_cap, rc;
+
+	ENTRY;
+
+	rc = llog_handle2ops(loghandle, &ops);
+	if (rc)
+		RETURN(rc);
+	if (!ops->lop_declare_create)
+		RETURN(-EOPNOTSUPP);
+
+	need_cap = !cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (need_cap)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	rc = ops->lop_declare_create(env, loghandle, th);
+	if (need_cap)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_declare_create);
+
+/*
+ * Create the llog behind @handle within transaction @th by calling the
+ * handle's lop_create() method.
+ *
+ * CFS_CAP_SYS_RESOURCE is raised for the duration of the call unless it is
+ * already raised.
+ *
+ * Returns the method's result, the error from llog_handle2ops(), or
+ * -EOPNOTSUPP when the method is not provided.
+ */
+int llog_create(const struct lu_env *env, struct llog_handle *handle,
+		struct thandle *th)
+{
+	struct llog_operations *ops;
+	int need_cap, rc;
+
+	ENTRY;
+
+	rc = llog_handle2ops(handle, &ops);
+	if (rc)
+		RETURN(rc);
+	if (!ops->lop_create)
+		RETURN(-EOPNOTSUPP);
+
+	need_cap = !cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (need_cap)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	rc = ops->lop_create(env, handle, th);
+	if (need_cap)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_create);
+
+/*
+ * Declare the write of record @rec at index @idx within transaction @th by
+ * calling the handle's lop_declare_write_rec() method.
+ *
+ * CFS_CAP_SYS_RESOURCE is raised for the duration of the call unless it is
+ * already raised.
+ *
+ * Returns the method's result, the error from llog_handle2ops(), or
+ * -EOPNOTSUPP when the method is not provided.
+ */
+int llog_declare_write_rec(const struct lu_env *env,
+			   struct llog_handle *handle,
+			   struct llog_rec_hdr *rec, int idx,
+			   struct thandle *th)
+{
+	struct llog_operations *lop;
+	int need_cap, rc;
+
+	ENTRY;
+
+	rc = llog_handle2ops(handle, &lop);
+	if (rc)
+		RETURN(rc);
+	LASSERT(lop);
+	if (!lop->lop_declare_write_rec)
+		RETURN(-EOPNOTSUPP);
+
+	need_cap = !cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (need_cap)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	rc = lop->lop_declare_write_rec(env, handle, rec, idx, th);
+	if (need_cap)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_declare_write_rec);
+
+/*
+ * Write record @rec (with optional payload @buf) at index @idx within
+ * transaction @th by calling the handle's lop_write_rec() method.
+ *
+ * The total record length is asserted to be unchanged by cfs_size_round().
+ * With a payload, rec->lrh_len counts the payload only, so the header and
+ * tail sizes are added before the check.
+ *
+ * CFS_CAP_SYS_RESOURCE is raised for the duration of the call unless it is
+ * already raised.
+ *
+ * Returns the method's result, the error from llog_handle2ops(), or
+ * -EOPNOTSUPP when the method is not provided.
+ */
+int llog_write_rec(const struct lu_env *env, struct llog_handle *handle,
+		   struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+		   int numcookies, void *buf, int idx, struct thandle *th)
+{
+	struct llog_operations *lop;
+	int need_cap, rc, buflen;
+
+	ENTRY;
+
+	rc = llog_handle2ops(handle, &lop);
+	if (rc)
+		RETURN(rc);
+
+	LASSERT(lop);
+	if (!lop->lop_write_rec)
+		RETURN(-EOPNOTSUPP);
+
+	buflen = rec->lrh_len;
+	if (buf)
+		buflen += sizeof(struct llog_rec_hdr) +
+			  sizeof(struct llog_rec_tail);
+	LASSERT(cfs_size_round(buflen) == buflen);
+
+	need_cap = !cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (need_cap)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	rc = lop->lop_write_rec(env, handle, rec, logcookies, numcookies,
+				buf, idx, th);
+	if (need_cap)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_write_rec);
+
+/*
+ * Add record @rec (with optional payload @buf) through the handle's
+ * lop_add() method within transaction @th.
+ *
+ * CFS_CAP_SYS_RESOURCE is raised for the duration of the call unless it is
+ * already raised.
+ *
+ * Returns the method's result, or -EOPNOTSUPP when the method is not
+ * provided.
+ */
+int llog_add(const struct lu_env *env, struct llog_handle *lgh,
+	     struct llog_rec_hdr *rec, struct llog_cookie *logcookies,
+	     void *buf, struct thandle *th)
+{
+	int need_cap, rc;
+
+	ENTRY;
+
+	if (!lgh->lgh_logops->lop_add)
+		RETURN(-EOPNOTSUPP);
+
+	need_cap = !cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (need_cap)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	rc = lgh->lgh_logops->lop_add(env, lgh, rec, logcookies, buf, th);
+	if (need_cap)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_add);
+
+/*
+ * Declare the addition of record @rec through the handle's
+ * lop_declare_add() method within transaction @th.
+ *
+ * CFS_CAP_SYS_RESOURCE is raised for the duration of the call unless it is
+ * already raised.
+ *
+ * Returns the method's result, or -EOPNOTSUPP when the method is not
+ * provided.
+ */
+int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh,
+		     struct llog_rec_hdr *rec, struct thandle *th)
+{
+	int need_cap, rc;
+
+	ENTRY;
+
+	if (!lgh->lgh_logops->lop_declare_add)
+		RETURN(-EOPNOTSUPP);
+
+	need_cap = !cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (need_cap)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	rc = lgh->lgh_logops->lop_declare_add(env, lgh, rec, th);
+	if (need_cap)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_declare_add);
+
+/**
+ * Helper function to open an llog, creating it if it does not exist yet.
+ * It hides all transaction handling from the caller.
+ *
+ * The llog is opened with LLOG_OPEN_NEW. If llog_exist() returns non-zero
+ * nothing else is done. Otherwise the llog is created: inside a local
+ * transaction when the handle has a backing object (lgh_obj), or directly
+ * without a transaction handle in the lvfs compatibility case.
+ *
+ * If creation fails the handle is closed again before returning.
+ * Returns 0 on success, otherwise the error of the failing step.
+ */
+int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt,
+		     struct llog_handle **res, struct llog_logid *logid,
+		     char *name)
+{
+	struct thandle *th;
+	int rc;
+
+	ENTRY;
+
+	rc = llog_open(env, ctxt, res, logid, name, LLOG_OPEN_NEW);
+	if (rc)
+		RETURN(rc);
+
+	if (llog_exist(*res))
+		RETURN(0);
+
+	if ((*res)->lgh_obj == NULL) {
+		/* lvfs compat code */
+		LASSERT((*res)->lgh_file == NULL);
+		rc = llog_create(env, *res, NULL);
+	} else {
+		struct dt_device *dt;
+
+		dt = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev);
+
+		th = dt_trans_create(env, dt);
+		if (IS_ERR(th))
+			GOTO(out, rc = PTR_ERR(th));
+
+		/* declare, start, create; stop at the first failure */
+		rc = llog_declare_create(env, *res, th);
+		if (rc == 0)
+			rc = dt_trans_start_local(env, dt, th);
+		if (rc == 0)
+			rc = llog_create(env, *res, th);
+		dt_trans_stop(env, dt, th);
+	}
+out:
+	if (rc)
+		llog_close(env, *res);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_open_create);
+
+/**
+ * Helper function to delete an existing llog.
+ *
+ * The llog identified by @logid or @name is opened with LLOG_OPEN_EXISTS,
+ * initialised as a plain llog, destroyed and closed. With neither @logid
+ * nor @name there is nothing to erase and 0 is returned.
+ *
+ * Returns the first error of init/destroy, else the result of the close.
+ */
+int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt,
+	       struct llog_logid *logid, char *name)
+{
+	struct llog_handle *handle;
+	int rc, close_rc;
+
+	ENTRY;
+
+	/* nothing to erase */
+	if (!(name != NULL || logid != NULL))
+		RETURN(0);
+
+	rc = llog_open(env, ctxt, &handle, logid, name, LLOG_OPEN_EXISTS);
+	if (rc < 0)
+		RETURN(rc);
+
+	rc = llog_init_handle(env, handle, LLOG_F_IS_PLAIN, NULL);
+	if (rc == 0)
+		rc = llog_destroy(env, handle);
+
+	/* the handle is closed whether or not the destroy succeeded */
+	close_rc = llog_close(env, handle);
+	RETURN(rc ? rc : close_rc);
+}
+EXPORT_SYMBOL(llog_erase);
+
+/*
+ * Helper function to write a record into an llog.
+ * It hides all transaction handling from the caller.
+ * Valid only with local llog.
+ *
+ * When the handle has a backing object (lgh_obj) the write is declared and
+ * executed inside a local transaction; otherwise (lvfs compatibility) the
+ * record is written without a transaction handle. In both cases
+ * llog_write_rec() runs with lgh_lock held for writing.
+ *
+ * Returns the result of the failing step or of llog_write_rec().
+ */
+int llog_write(const struct lu_env *env, struct llog_handle *loghandle,
+	       struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+	       int cookiecount, void *buf, int idx)
+{
+	struct dt_device *dt;
+	struct thandle *th;
+	int rc;
+
+	ENTRY;
+
+	LASSERT(loghandle);
+	LASSERT(loghandle->lgh_ctxt);
+
+	if (loghandle->lgh_obj == NULL) {
+		/* lvfs compatibility */
+		down_write(&loghandle->lgh_lock);
+		rc = llog_write_rec(env, loghandle, rec, reccookie,
+				    cookiecount, buf, idx, NULL);
+		up_write(&loghandle->lgh_lock);
+		RETURN(rc);
+	}
+
+	dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev);
+
+	th = dt_trans_create(env, dt);
+	if (IS_ERR(th))
+		RETURN(PTR_ERR(th));
+
+	rc = llog_declare_write_rec(env, loghandle, rec, idx, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	rc = dt_trans_start_local(env, dt, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	down_write(&loghandle->lgh_lock);
+	rc = llog_write_rec(env, loghandle, rec, reccookie,
+			    cookiecount, buf, idx, th);
+	up_write(&loghandle->lgh_lock);
+out_trans:
+	/* the transaction is stopped on success and on failure alike */
+	dt_trans_stop(env, dt, th);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_write);
+
+/*
+ * Open the llog identified by @logid or @name in context @ctxt.
+ *
+ * A new handle is allocated, bound to @ctxt and its operations, and handed
+ * to the context's lop_open() method. CFS_CAP_SYS_RESOURCE is raised for
+ * the duration of that call unless it is already raised.
+ *
+ * On success *@lgh points to the opened handle. On any failure *@lgh is
+ * set to NULL and an error is returned: -EOPNOTSUPP when lop_open() is not
+ * provided, the allocation error when no handle can be allocated, or the
+ * error returned by lop_open().
+ */
+int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt,
+	      struct llog_handle **lgh, struct llog_logid *logid,
+	      char *name, enum llog_open_param open_param)
+{
+	int raised;
+	int rc;
+
+	ENTRY;
+
+	LASSERT(ctxt);
+	LASSERT(ctxt->loc_logops);
+
+	if (ctxt->loc_logops->lop_open == NULL) {
+		*lgh = NULL;
+		RETURN(-EOPNOTSUPP);
+	}
+
+	/*
+	 * llog_alloc_handle() reports failure as ERR_PTR(-ENOMEM), not NULL;
+	 * a plain NULL test would let the error pointer be dereferenced
+	 * below. NULL is still treated as -ENOMEM.
+	 */
+	*lgh = llog_alloc_handle();
+	if (IS_ERR_OR_NULL(*lgh)) {
+		rc = *lgh ? PTR_ERR(*lgh) : -ENOMEM;
+		*lgh = NULL;
+		RETURN(rc);
+	}
+	(*lgh)->lgh_ctxt = ctxt;
+	(*lgh)->lgh_logops = ctxt->loc_logops;
+
+	raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (!raised)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+	rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param);
+	if (!raised)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+	if (rc) {
+		/* open failed: release the handle allocated above */
+		llog_free_handle(*lgh);
+		*lgh = NULL;
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_open);
+
+/*
+ * Close an opened llog: call the handle's lop_close() method and drop one
+ * reference on the handle.
+ *
+ * The reference is dropped even when the operations cannot be resolved or
+ * lop_close() is missing (-EOPNOTSUPP).
+ *
+ * Returns the method's result or the error that prevented calling it.
+ */
+int llog_close(const struct lu_env *env, struct llog_handle *loghandle)
+{
+	struct llog_operations *ops;
+	int rc;
+
+	ENTRY;
+
+	rc = llog_handle2ops(loghandle, &ops);
+	if (rc)
+		GOTO(out, rc);
+	if (!ops->lop_close)
+		GOTO(out, rc = -EOPNOTSUPP);
+	rc = ops->lop_close(env, loghandle);
+out:
+	llog_handle_put(loghandle);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_close);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_cat.c b/drivers/staging/lustre/lustre/obdclass/llog_cat.c
new file mode 100644
index 000000000000..cf00b2f550ac
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_cat.c
@@ -0,0 +1,833 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_cat.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ *
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ * if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd_class.h>
+
+#include "llog_internal.h"
+
+/* Create a new log handle and add it to the open list.
+ * This log handle will be closed when all of the records in it are removed.
+ *
+ * Assumes caller has already pushed us into the kernel context and is locking.
+ */
+static int llog_cat_new_log(const struct lu_env *env,
+ struct llog_handle *cathandle,
+ struct llog_handle *loghandle,
+ struct thandle *th)
+{
+
+ struct llog_log_hdr *llh;
+ struct llog_logid_rec rec = { { 0 }, };
+ int rc, index, bitmap_size;
+ ENTRY;
+
+ /* llh is the CATALOG header, not the header of the new plain log */
+ llh = cathandle->lgh_hdr;
+ bitmap_size = LLOG_BITMAP_SIZE(llh);
+
+ /* candidate catalog slot: the one after the last used, with wrap */
+ index = (cathandle->lgh_last_idx + 1) % bitmap_size;
+
+ /* maximum number of available slots in catlog is bitmap_size - 2 */
+ if (llh->llh_cat_idx == index) {
+ CERROR("no free catalog slots for log...\n");
+ RETURN(-ENOSPC);
+ }
+
+ /* fault-injection hook: pretend the catalog is full */
+ if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED))
+ RETURN(-ENOSPC);
+
+ rc = llog_create(env, loghandle, th);
+ /* if llog is already created, no need to initialize it */
+ if (rc == -EEXIST) {
+ RETURN(0);
+ } else if (rc != 0) {
+ CERROR("%s: can't create new plain llog in catalog: rc = %d\n",
+ loghandle->lgh_ctxt->loc_obd->obd_name, rc);
+ RETURN(rc);
+ }
+
+ /* the plain log gets the catalog's target uuid */
+ rc = llog_init_handle(env, loghandle,
+ LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY,
+ &cathandle->lgh_hdr->llh_tgtuuid);
+ if (rc)
+ GOTO(out_destroy, rc);
+
+ /* slot 0 is never handed out: a wrap to 0 becomes slot 1 */
+ if (index == 0)
+ index = 1;
+
+ /* NOTE(review): the lock taken here is the plain log's lgh_hdr_lock,
+ * while the count and bitmap being modified belong to the catalog
+ * header (llh == cathandle->lgh_hdr) - confirm this is intended */
+ spin_lock(&loghandle->lgh_hdr_lock);
+ llh->llh_count++;
+ if (ext2_set_bit(index, llh->llh_bitmap)) {
+ CERROR("argh, index %u already set in log bitmap?\n",
+ index);
+ spin_unlock(&loghandle->lgh_hdr_lock);
+ LBUG(); /* should never happen */
+ }
+ spin_unlock(&loghandle->lgh_hdr_lock);
+
+ cathandle->lgh_last_idx = index;
+ llh->llh_tail.lrt_index = index;
+
+ CDEBUG(D_RPCTRACE,"new recovery log "DOSTID":%x for index %u of catalog"
+ DOSTID"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen, index,
+ POSTID(&cathandle->lgh_id.lgl_oi));
+ /* build the record for this log in the catalog */
+ rec.lid_hdr.lrh_len = sizeof(rec);
+ rec.lid_hdr.lrh_index = index;
+ rec.lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
+ rec.lid_id = loghandle->lgh_id;
+ rec.lid_tail.lrt_len = sizeof(rec);
+ rec.lid_tail.lrt_index = index;
+
+ /* update the catalog: header and record */
+ rc = llog_write_rec(env, cathandle, &rec.lid_hdr,
+ &loghandle->u.phd.phd_cookie, 1, NULL, index, th);
+ if (rc < 0)
+ GOTO(out_destroy, rc);
+
+ /* remember in the plain log which catalog slot refers to it */
+ loghandle->lgh_hdr->llh_cat_idx = index;
+ RETURN(0);
+out_destroy:
+ /* NOTE(review): this path only destroys the plain log; when reached
+ * after the bitmap update above, llh_count, the bitmap bit and
+ * lgh_last_idx of the catalog are not rolled back here */
+ llog_destroy(env, loghandle);
+ RETURN(rc);
+}
+
+/* Open an existing log handle and add it to the open list.
+ * This log handle will be closed when all of the records in it are removed.
+ *
+ * Assumes caller has already pushed us into the kernel context and is locking.
+ * We return a lock on the handle to ensure nobody yanks it from us.
+ *
+ * This takes an extra reference on the llog_handle via llog_handle_get() and
+ * requires the caller to put this reference using llog_handle_put().
+ */
+int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle,
+ struct llog_handle **res, struct llog_logid *logid)
+{
+ struct llog_handle *loghandle;
+ int rc = 0;
+
+ ENTRY;
+
+ if (cathandle == NULL)
+ RETURN(-EBADF);
+
+ /* first look for an already-open handle on the catalog's list */
+ down_write(&cathandle->lgh_lock);
+ list_for_each_entry(loghandle, &cathandle->u.chd.chd_head,
+ u.phd.phd_entry) {
+ struct llog_logid *cgl = &loghandle->lgh_id;
+
+ if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) &&
+ ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) {
+ /* same id and sequence but a different generation
+ * is reported and treated as "not this one" */
+ if (cgl->lgl_ogen != logid->lgl_ogen) {
+ CERROR("%s: log "DOSTID" generation %x != %x\n",
+ loghandle->lgh_ctxt->loc_obd->obd_name,
+ POSTID(&logid->lgl_oi), cgl->lgl_ogen,
+ logid->lgl_ogen);
+ continue;
+ }
+ loghandle->u.phd.phd_cat_handle = cathandle;
+ /* NOTE(review): the reference is taken at "out", after
+ * lgh_lock has been released - confirm nothing can
+ * drop the list's reference in between */
+ up_write(&cathandle->lgh_lock);
+ GOTO(out, rc = 0);
+ }
+ }
+ up_write(&cathandle->lgh_lock);
+
+ /* not on the list: open the existing log by id */
+ rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL,
+ LLOG_OPEN_EXISTS);
+ if (rc < 0) {
+ CERROR("%s: error opening log id "DOSTID":%x: rc = %d\n",
+ cathandle->lgh_ctxt->loc_obd->obd_name,
+ POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+ RETURN(rc);
+ }
+
+ rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, NULL);
+ if (rc < 0) {
+ llog_close(env, loghandle);
+ loghandle = NULL;
+ RETURN(rc);
+ }
+
+ /* publish the freshly opened handle on the catalog's list */
+ down_write(&cathandle->lgh_lock);
+ list_add(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head);
+ up_write(&cathandle->lgh_lock);
+
+ /* record the back pointer and the cookie naming the catalog slot */
+ loghandle->u.phd.phd_cat_handle = cathandle;
+ loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id;
+ loghandle->u.phd.phd_cookie.lgc_index =
+ loghandle->lgh_hdr->llh_cat_idx;
+ EXIT;
+out:
+ /* both the "found" and the "opened" path hand the caller an extra
+ * reference, to be dropped with llog_handle_put() */
+ llog_handle_get(loghandle);
+ *res = loghandle;
+ return 0;
+}
+
+/*
+ * Close a catalog: unlink and close every plain log on its open list,
+ * then close the catalog handle itself.
+ *
+ * A plain log that has a backing object, a header flagged
+ * LLOG_F_ZAP_WHEN_EMPTY and a record count of exactly 1 is destroyed and
+ * its slot is removed from the catalog before it is closed.
+ *
+ * Returns the result of closing the catalog handle.
+ */
+int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle)
+{
+	struct llog_handle *loghandle, *next;
+	int rc;
+
+	ENTRY;
+
+	list_for_each_entry_safe(loghandle, next, &cathandle->u.chd.chd_head,
+				 u.phd.phd_entry) {
+		struct llog_log_hdr *hdr;
+		bool zap_empty;
+
+		/* unlink open-not-created llogs */
+		list_del_init(&loghandle->u.phd.phd_entry);
+
+		hdr = loghandle->lgh_hdr;
+		zap_empty = loghandle->lgh_obj != NULL && hdr != NULL &&
+			    (hdr->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
+			    (hdr->llh_count == 1);
+		if (zap_empty) {
+			int slot;
+
+			rc = llog_destroy(env, loghandle);
+			if (rc)
+				CERROR("%s: failure destroying log during "
+				       "cleanup: rc = %d\n",
+				       loghandle->lgh_ctxt->loc_obd->obd_name,
+				       rc);
+
+			slot = loghandle->u.phd.phd_cookie.lgc_index;
+			llog_cat_cleanup(env, cathandle, NULL, slot);
+		}
+		llog_close(env, loghandle);
+	}
+
+	/* if handle was stored in ctxt, remove it too */
+	if (cathandle->lgh_ctxt->loc_handle == cathandle)
+		cathandle->lgh_ctxt->loc_handle = NULL;
+
+	rc = llog_close(env, cathandle);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_close);
+
+/**
+ * lockdep markers for nested struct llog_handle::lgh_lock locking.
+ */
+enum {
+ /* subclass passed when taking a catalog handle's lgh_lock */
+ LLOGH_CAT,
+ /* subclass passed when taking a plain log handle's lgh_lock while the
+ * catalog's lgh_lock is already held */
+ LLOGH_LOG
+};
+
+/** Return the currently active log handle. If the current log handle doesn't
+ * have enough space left for the current record, start a new one.
+ *
+ * If reclen is 0, we only want to know what the currently active log is,
+ * otherwise we get a lock on this log so nobody can steal our space.
+ *
+ * Assumes caller has already pushed us into the kernel context and is locking.
+ *
+ * NOTE: loghandle is write-locked upon successful return
+ */
+/*
+ * Pick the plain log that the next record should be written to.
+ *
+ * Fast path: under the catalog's read lock, return chd_current_log if it
+ * has no header yet or still has free slots.  Slow path: retake the
+ * catalog lock for writing, re-check the current log (the state may have
+ * changed while the lock was dropped) and, if it is still full, promote
+ * chd_next_log to chd_current_log.
+ *
+ * The returned handle always has its lgh_lock held for writing; the
+ * catalog lock is released before returning.  @th is not used here.
+ */
+static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle,
+						struct thandle *th)
+{
+	struct llog_handle *loghandle = NULL;
+	ENTRY;
+
+	down_read_nested(&cathandle->lgh_lock, LLOGH_CAT);
+	loghandle = cathandle->u.chd.chd_current_log;
+	if (loghandle) {
+		struct llog_log_hdr *llh;
+
+		down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+		llh = loghandle->lgh_hdr;
+		if (llh == NULL ||
+		    loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) {
+			up_read(&cathandle->lgh_lock);
+			RETURN(loghandle);
+		} else {
+			up_write(&loghandle->lgh_lock);
+		}
+	}
+	up_read(&cathandle->lgh_lock);
+
+	/* time to use next log */
+
+	/* first, we have to make sure the state hasn't changed */
+	down_write_nested(&cathandle->lgh_lock, LLOGH_CAT);
+	loghandle = cathandle->u.chd.chd_current_log;
+	if (loghandle) {
+		struct llog_log_hdr *llh;
+
+		down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+		llh = loghandle->lgh_hdr;
+		LASSERT(llh);
+		if (loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) {
+			up_write(&cathandle->lgh_lock);
+			RETURN(loghandle);
+		} else {
+			up_write(&loghandle->lgh_lock);
+		}
+	}
+
+	CDEBUG(D_INODE, "use next log\n");
+
+	loghandle = cathandle->u.chd.chd_next_log;
+	/* check before the handle is dereferenced by the lock call below;
+	 * asserting afterwards could never fire on a NULL handle */
+	LASSERT(loghandle);
+	cathandle->u.chd.chd_current_log = loghandle;
+	cathandle->u.chd.chd_next_log = NULL;
+	down_write_nested(&loghandle->lgh_lock, LLOGH_LOG);
+	up_write(&cathandle->lgh_lock);
+	RETURN(loghandle);
+}
+
+/* Add a single record to the recovery log(s) using a catalog
+ * Returns as llog_write_record
+ *
+ * Assumes caller has already pushed us into the kernel context.
+ */
+int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle,
+ struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+ void *buf, struct thandle *th)
+{
+ struct llog_handle *loghandle;
+ int rc;
+ ENTRY;
+
+ LASSERT(rec->lrh_len <= LLOG_CHUNK_SIZE);
+ loghandle = llog_cat_current_log(cathandle, th);
+ LASSERT(!IS_ERR(loghandle));
+
+ /* loghandle is already locked by llog_cat_current_log() for us */
+ if (!llog_exist(loghandle)) {
+ /* the plain log is only open so far: create it and register it
+ * in the catalog */
+ rc = llog_cat_new_log(env, cathandle, loghandle, th);
+ if (rc < 0) {
+ up_write(&loghandle->lgh_lock);
+ RETURN(rc);
+ }
+ }
+ /* now let's try to add the record */
+ rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, -1, th);
+ if (rc < 0)
+ /* -ENOSPC is retried below, so it is logged at D_HA rather
+ * than D_ERROR */
+ CDEBUG_LIMIT(rc == -ENOSPC ? D_HA : D_ERROR,
+ "llog_write_rec %d: lh=%p\n", rc, loghandle);
+ up_write(&loghandle->lgh_lock);
+ if (rc == -ENOSPC) {
+ /* try to use next log */
+ loghandle = llog_cat_current_log(cathandle, th);
+ LASSERT(!IS_ERR(loghandle));
+ /* new llog can be created concurrently */
+ if (!llog_exist(loghandle)) {
+ rc = llog_cat_new_log(env, cathandle, loghandle, th);
+ if (rc < 0) {
+ up_write(&loghandle->lgh_lock);
+ RETURN(rc);
+ }
+ }
+ /* now let's try to add the record */
+ rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf,
+ -1, th);
+ /* there is only one retry: a second failure is final */
+ if (rc < 0)
+ CERROR("llog_write_rec %d: lh=%p\n", rc, loghandle);
+ up_write(&loghandle->lgh_lock);
+ }
+
+ RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_add_rec);
+
+/*
+ * Declare, in transaction @th, the updates that adding @rec through the
+ * catalog may perform.  A plain log handle is opened (LLOG_OPEN_NEW) for
+ * chd_current_log if there is none, otherwise for chd_next_log if that is
+ * missing; then creation and record writes are declared for the current
+ * log and, when present, for the next log.
+ */
+int llog_cat_declare_add_rec(const struct lu_env *env,
+ struct llog_handle *cathandle,
+ struct llog_rec_hdr *rec, struct thandle *th)
+{
+ struct llog_handle *loghandle, *next;
+ int rc = 0;
+
+ ENTRY;
+
+ /* the pointers are tested without the lock first and re-tested once
+ * lgh_lock is held for writing */
+ if (cathandle->u.chd.chd_current_log == NULL) {
+ /* declare new plain llog */
+ down_write(&cathandle->lgh_lock);
+ if (cathandle->u.chd.chd_current_log == NULL) {
+ rc = llog_open(env, cathandle->lgh_ctxt, &loghandle,
+ NULL, NULL, LLOG_OPEN_NEW);
+ if (rc == 0) {
+ cathandle->u.chd.chd_current_log = loghandle;
+ list_add_tail(&loghandle->u.phd.phd_entry,
+ &cathandle->u.chd.chd_head);
+ }
+ }
+ up_write(&cathandle->lgh_lock);
+ } else if (cathandle->u.chd.chd_next_log == NULL) {
+ /* declare next plain llog */
+ down_write(&cathandle->lgh_lock);
+ if (cathandle->u.chd.chd_next_log == NULL) {
+ rc = llog_open(env, cathandle->lgh_ctxt, &loghandle,
+ NULL, NULL, LLOG_OPEN_NEW);
+ if (rc == 0) {
+ cathandle->u.chd.chd_next_log = loghandle;
+ list_add_tail(&loghandle->u.phd.phd_entry,
+ &cathandle->u.chd.chd_head);
+ }
+ }
+ up_write(&cathandle->lgh_lock);
+ }
+ if (rc)
+ GOTO(out, rc);
+
+ if (!llog_exist(cathandle->u.chd.chd_current_log)) {
+ rc = llog_declare_create(env, cathandle->u.chd.chd_current_log,
+ th);
+ if (rc)
+ GOTO(out, rc);
+ /* declare the catalog update for the new plain log.
+ * NOTE(review): the return value is ignored here */
+ llog_declare_write_rec(env, cathandle, NULL, -1, th);
+ }
+ /* declare records in the llogs */
+ rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log,
+ rec, -1, th);
+ if (rc)
+ GOTO(out, rc);
+
+ next = cathandle->u.chd.chd_next_log;
+ if (next) {
+ if (!llog_exist(next)) {
+ /* NOTE(review): rc from llog_declare_create() is what
+ * the function returns, but the declarations below
+ * still run after a failure and their own return
+ * values are ignored */
+ rc = llog_declare_create(env, next, th);
+ llog_declare_write_rec(env, cathandle, NULL, -1, th);
+ }
+ llog_declare_write_rec(env, next, rec, -1, th);
+ }
+out:
+ RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_declare_add_rec);
+
+/*
+ * Add @rec to the catalog in a transaction of its own.
+ *
+ * When the catalog has a backing object (lgh_obj) a local transaction is
+ * created on the context's dt device, the update is declared, the
+ * transaction is started, the record is added and the transaction is
+ * stopped.  Otherwise (lvfs compatibility) the declare and add steps run
+ * with a NULL transaction handle.
+ *
+ * Returns 0 or a negative errno from the first step that failed.
+ */
+int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle,
+		 struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
+		 void *buf)
+{
+	struct llog_ctxt *ctxt = cathandle->lgh_ctxt;
+	struct thandle *th = NULL;
+	struct dt_device *dt;
+	int rc;
+
+	LASSERT(ctxt);
+	LASSERT(ctxt->loc_exp);
+
+	if (cathandle->lgh_obj == NULL) {
+		/* lvfs compat code */
+		LASSERT(cathandle->lgh_file != NULL);
+		rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
+		if (rc == 0)
+			rc = llog_cat_add_rec(env, cathandle, rec, reccookie,
+					      buf, th);
+		RETURN(rc);
+	}
+
+	dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+	LASSERT(dt);
+
+	th = dt_trans_create(env, dt);
+	if (IS_ERR(th))
+		RETURN(PTR_ERR(th));
+
+	rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	rc = dt_trans_start_local(env, dt, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	rc = llog_cat_add_rec(env, cathandle, rec, reccookie, buf, th);
+out_trans:
+	/* the transaction is stopped whether or not the steps succeeded */
+	dt_trans_stop(env, dt, th);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_add);
+
+/* For each cookie in the cookie array, we clear the log in-use bit and either:
+ * - the log is empty, so mark it free in the catalog header and delete it
+ * - the log is not empty, just write out the log header
+ *
+ * The cookies may be in different log files, so we need to get new logs
+ * each time.
+ *
+ * Assumes caller has already pushed us into the kernel context.
+ */
+/*
+ * Cancel @count records, each identified by one entry of @cookies.
+ *
+ * Every cookie is processed even if an earlier one failed.  The status of
+ * each step is kept in a per-iteration variable so that an error seen for
+ * one cookie is not wiped out by a later cookie that succeeds: the value
+ * returned is the most recent hard error, or -ENOENT if that was the only
+ * kind of failure, or 0 when everything succeeded.
+ */
+int llog_cat_cancel_records(const struct lu_env *env,
+			    struct llog_handle *cathandle, int count,
+			    struct llog_cookie *cookies)
+{
+	int i, index, rc = 0, failed = 0;
+
+	ENTRY;
+
+	for (i = 0; i < count; i++, cookies++) {
+		struct llog_handle *loghandle;
+		struct llog_logid *lgl = &cookies->lgc_lgl;
+		int lrc;
+
+		lrc = llog_cat_id2handle(env, cathandle, &loghandle, lgl);
+		if (lrc) {
+			CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+			       cathandle->lgh_ctxt->loc_obd->obd_name,
+			       POSTID(&lgl->lgl_oi), lrc);
+			failed++;
+			rc = lrc;
+			continue;
+		}
+
+		lrc = llog_cancel_rec(env, loghandle, cookies->lgc_index);
+		if (lrc == 1) { /* log has been destroyed */
+			index = loghandle->u.phd.phd_cookie.lgc_index;
+			lrc = llog_cat_cleanup(env, cathandle, loghandle,
+					       index);
+			/* a clean removal must not hide an earlier error */
+			if (lrc)
+				rc = lrc;
+		} else if (lrc == -ENOENT) {
+			if (rc == 0) /* ENOENT shouldn't rewrite any error */
+				rc = lrc;
+		} else if (lrc < 0) {
+			failed++;
+			rc = lrc;
+		}
+		/* drop the reference taken by llog_cat_id2handle() */
+		llog_handle_put(loghandle);
+	}
+	if (rc)
+		CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n",
+		       cathandle->lgh_ctxt->loc_obd->obd_name, failed, count,
+		       rc);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_cancel_records);
+
+/*
+ * Catalog-processing callback: called for each record of the catalog
+ * @cat_llh.  Each record must be a LLOG_LOGID_MAGIC record naming a plain
+ * log; that log is looked up (or opened) and processed with the callback
+ * and data carried in @data (a struct llog_process_data).
+ *
+ * Catalog records with an index below lpd_startcat are skipped.  For the
+ * first log that is processed, a positive lpd_startidx selects the first
+ * record index; it is then reset so later logs start from the beginning.
+ *
+ * The reference obtained from llog_cat_id2handle() is dropped on every
+ * path after a successful lookup, including the skip path.
+ *
+ * Returns -EINVAL for a non-logid record, the lookup error, or the result
+ * of processing the plain log (0 when skipped).
+ */
+int llog_cat_process_cb(const struct lu_env *env, struct llog_handle *cat_llh,
+			struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_process_data *d = data;
+	struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+	struct llog_handle *llh;
+	int rc;
+
+	ENTRY;
+	if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+		CERROR("invalid record in catalog\n");
+		RETURN(-EINVAL);
+	}
+	CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+	       DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
+	       rec->lrh_index, POSTID(&cat_llh->lgh_id.lgl_oi));
+
+	rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id);
+	if (rc) {
+		CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+		       cat_llh->lgh_ctxt->loc_obd->obd_name,
+		       POSTID(&lir->lid_id.lgl_oi), rc);
+		RETURN(rc);
+	}
+
+	if (rec->lrh_index < d->lpd_startcat) {
+		/* Skip processing of the logs until startcat; fall through
+		 * to the common exit so the handle reference is released */
+		rc = 0;
+	} else if (d->lpd_startidx > 0) {
+		struct llog_process_cat_data cd;
+
+		cd.lpcd_first_idx = d->lpd_startidx;
+		cd.lpcd_last_idx = 0;
+		rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data,
+					  &cd, false);
+		/* Continue processing the next log from idx 0 */
+		d->lpd_startidx = 0;
+	} else {
+		rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data,
+					  NULL, false);
+	}
+	llog_handle_put(llh);
+
+	RETURN(rc);
+}
+
+/*
+ * Process every plain log referenced by the catalog @cat_llh, calling @cb
+ * with @data for each record.  @startcat and @startidx are handed to
+ * llog_cat_process_cb() through a struct llog_process_data; @fork is
+ * passed on to llog_process_or_fork() for the catalog itself.
+ *
+ * When the catalog's first index (llh_cat_idx) is greater than its last
+ * used index, the catalog is processed in two passes: from llh_cat_idx
+ * onwards, then from index 0 up to lgh_last_idx.  The second pass is
+ * skipped if the first one fails.
+ */
+int llog_cat_process_or_fork(const struct lu_env *env,
+			     struct llog_handle *cat_llh,
+			     llog_cb_t cb, void *data, int startcat,
+			     int startidx, bool fork)
+{
+	struct llog_log_hdr *cat_hdr = cat_llh->lgh_hdr;
+	struct llog_process_cat_data range;
+	struct llog_process_data pd;
+	int rc;
+	ENTRY;
+
+	LASSERT(cat_hdr->llh_flags & LLOG_F_IS_CAT);
+	pd.lpd_data = data;
+	pd.lpd_cb = cb;
+	pd.lpd_startcat = startcat;
+	pd.lpd_startidx = startidx;
+
+	if (cat_hdr->llh_cat_idx <= cat_llh->lgh_last_idx) {
+		/* the used slots form one contiguous range */
+		rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+					  &pd, NULL, fork);
+		RETURN(rc);
+	}
+
+	CWARN("catlog "DOSTID" crosses index zero\n",
+	      POSTID(&cat_llh->lgh_id.lgl_oi));
+
+	/* first pass: from the first index onwards */
+	range.lpcd_first_idx = cat_hdr->llh_cat_idx;
+	range.lpcd_last_idx = 0;
+	rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+				  &pd, &range, fork);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* second pass: from index 0 up to the last used index */
+	range.lpcd_first_idx = 0;
+	range.lpcd_last_idx = cat_llh->lgh_last_idx;
+	rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
+				  &pd, &range, fork);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_process_or_fork);
+
+/*
+ * Convenience wrapper: llog_cat_process_or_fork() with fork disabled.
+ */
+int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh,
+		     llog_cb_t cb, void *data, int startcat, int startidx)
+{
+	int rc;
+
+	rc = llog_cat_process_or_fork(env, cat_llh, cb, data, startcat,
+				      startidx, false);
+	return rc;
+}
+EXPORT_SYMBOL(llog_cat_process);
+
+/*
+ * Catalog callback used by llog_cat_reverse_process(): resolve the plain
+ * log named by the catalog record @rec and process it in reverse with the
+ * callback and data carried in @data (a struct llog_process_data).
+ *
+ * Returns -EINVAL for a record that is not a LLOG_LOGID_MAGIC record, the
+ * lookup error, or the result of llog_reverse_process().
+ */
+static int llog_cat_reverse_process_cb(const struct lu_env *env,
+				       struct llog_handle *cat_llh,
+				       struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_logid_rec *logid_rec = (struct llog_logid_rec *)rec;
+	struct llog_process_data *pd = data;
+	struct llog_handle *plain;
+	int rc;
+
+	if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) {
+		CERROR("invalid record in catalog\n");
+		RETURN(-EINVAL);
+	}
+	CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+	       DOSTID"\n", POSTID(&logid_rec->lid_id.lgl_oi),
+	       logid_rec->lid_id.lgl_ogen,
+	       le32_to_cpu(rec->lrh_index), POSTID(&cat_llh->lgh_id.lgl_oi));
+
+	rc = llog_cat_id2handle(env, cat_llh, &plain, &logid_rec->lid_id);
+	if (rc) {
+		CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+		       cat_llh->lgh_ctxt->loc_obd->obd_name,
+		       POSTID(&logid_rec->lid_id.lgl_oi), rc);
+		RETURN(rc);
+	}
+
+	rc = llog_reverse_process(env, plain, pd->lpd_cb, pd->lpd_data, NULL);
+	/* drop the reference taken by llog_cat_id2handle() */
+	llog_handle_put(plain);
+	RETURN(rc);
+}
+
+/*
+ * Process the catalog @cat_llh in reverse, calling @cb with @data for the
+ * records of every plain log it references.
+ *
+ * When the catalog's first index is greater than its last used index the
+ * work is split into two passes: index 0 up to lgh_last_idx first, then
+ * from llh_cat_idx onwards.  The second pass is skipped if the first
+ * fails.
+ */
+int llog_cat_reverse_process(const struct lu_env *env,
+			     struct llog_handle *cat_llh,
+			     llog_cb_t cb, void *data)
+{
+	struct llog_log_hdr *cat_hdr = cat_llh->lgh_hdr;
+	struct llog_process_cat_data range;
+	struct llog_process_data pd;
+	int rc;
+	ENTRY;
+
+	LASSERT(cat_hdr->llh_flags & LLOG_F_IS_CAT);
+	pd.lpd_data = data;
+	pd.lpd_cb = cb;
+
+	if (cat_hdr->llh_cat_idx <= cat_llh->lgh_last_idx) {
+		/* the used slots form one contiguous range */
+		rc = llog_reverse_process(env, cat_llh,
+					  llog_cat_reverse_process_cb,
+					  &pd, NULL);
+		RETURN(rc);
+	}
+
+	CWARN("catalog "DOSTID" crosses index zero\n",
+	      POSTID(&cat_llh->lgh_id.lgl_oi));
+
+	/* first pass: index 0 up to the last used index */
+	range.lpcd_first_idx = 0;
+	range.lpcd_last_idx = cat_llh->lgh_last_idx;
+	rc = llog_reverse_process(env, cat_llh,
+				  llog_cat_reverse_process_cb,
+				  &pd, &range);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* second pass: from the first index onwards */
+	range.lpcd_first_idx = le32_to_cpu(cat_hdr->llh_cat_idx);
+	range.lpcd_last_idx = 0;
+	rc = llog_reverse_process(env, cat_llh,
+				  llog_cat_reverse_process_cb,
+				  &pd, &range);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cat_reverse_process);
+
+/*
+ * Advance the catalog's first-index marker (llh_cat_idx) after the record
+ * at @index has been removed.  Nothing happens unless @index is the slot
+ * immediately after the current marker.  Always returns 0.
+ */
+int llog_cat_set_first_idx(struct llog_handle *cathandle, int index)
+{
+ struct llog_log_hdr *llh = cathandle->lgh_hdr;
+ int i, bitmap_size, idx;
+ ENTRY;
+
+ bitmap_size = LLOG_BITMAP_SIZE(llh);
+ if (llh->llh_cat_idx == (index - 1)) {
+ /* step the marker onto the slot being removed */
+ idx = llh->llh_cat_idx + 1;
+ llh->llh_cat_idx = idx;
+ /* marker caught up with the last used slot: done */
+ if (idx == cathandle->lgh_last_idx)
+ goto out;
+ /* walk the following slots (wrapping at bitmap_size) up to, but
+ * not including, lgh_last_idx */
+ for (i = (index + 1) % bitmap_size;
+ i != cathandle->lgh_last_idx;
+ i = (i + 1) % bitmap_size) {
+ if (!ext2_test_bit(i, llh->llh_bitmap)) {
+ /* unused slot: the marker moves past it.
+ * NOTE(review): this increment is not reduced
+ * modulo bitmap_size - confirm the wrap is
+ * always handled by the i == 0 case below */
+ idx = llh->llh_cat_idx + 1;
+ llh->llh_cat_idx = idx;
+ } else if (i == 0) {
+ /* bit 0 is set: restart the marker at 0 and
+ * keep walking */
+ llh->llh_cat_idx = 0;
+ } else {
+ /* first slot still in use: stop here */
+ break;
+ }
+ }
+out:
+ CDEBUG(D_RPCTRACE, "set catlog "DOSTID" first idx %u\n",
+ POSTID(&cathandle->lgh_id.lgl_oi), llh->llh_cat_idx);
+ }
+
+ RETURN(0);
+}
+
+/* Cleanup deleted plain llog traces from catalog */
+/*
+ * Remove the catalog entry at @index.
+ *
+ * If @loghandle is given it is first detached from the catalog (cleared
+ * from chd_current_log if it is the current log, unlinked from the open
+ * list) and closed; @index must then equal the index stored in the
+ * handle's cookie.  Afterwards the catalog's first index is updated and
+ * the catalog record at @index is cancelled.
+ *
+ * Returns the result of llog_cancel_rec() on the catalog.
+ */
+int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
+		     struct llog_handle *loghandle, int index)
+{
+	int rc;
+
+	LASSERT(index);
+
+	if (loghandle) {
+		/* remove destroyed llog from catalog list and
+		 * chd_current_log variable */
+		down_write(&cathandle->lgh_lock);
+		if (loghandle == cathandle->u.chd.chd_current_log)
+			cathandle->u.chd.chd_current_log = NULL;
+		list_del_init(&loghandle->u.phd.phd_entry);
+		up_write(&cathandle->lgh_lock);
+
+		LASSERT(index == loghandle->u.phd.phd_cookie.lgc_index);
+
+		/* the llog was opened and kept on the list, close it now */
+		llog_close(env, loghandle);
+	}
+
+	/* remove plain llog entry from catalog by index */
+	llog_cat_set_first_idx(cathandle, index);
+	rc = llog_cancel_rec(env, cathandle, index);
+	if (!rc)
+		CDEBUG(D_HA, "cancel plain log at index"
+		       " %u of catalog "DOSTID"\n",
+		       index, POSTID(&cathandle->lgh_id.lgl_oi));
+	return rc;
+}
+
+/*
+ * Catalog callback that removes empty plain logs.
+ *
+ * The plain log named by the catalog record @rec is looked up.  If the
+ * lookup fails with -ENOENT or -ESTALE the catalog entry itself is
+ * removed.  If the log is flagged LLOG_F_ZAP_WHEN_EMPTY and its record
+ * count is exactly 1 it is destroyed and its catalog entry removed.
+ *
+ * Returns -EINVAL for a non-logid record, the lookup error, the result of
+ * llog_destroy() when a log was destroyed, otherwise 0.  @data is unused.
+ */
+int cat_cancel_cb(const struct lu_env *env, struct llog_handle *cathandle,
+		  struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_logid_rec *logid_rec = (struct llog_logid_rec *)rec;
+	struct llog_handle *plain;
+	struct llog_log_hdr *hdr;
+	int rc;
+
+	ENTRY;
+
+	if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+		CERROR("invalid record in catalog\n");
+		RETURN(-EINVAL);
+	}
+
+	CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
+	       DOSTID"\n", POSTID(&logid_rec->lid_id.lgl_oi),
+	       logid_rec->lid_id.lgl_ogen,
+	       rec->lrh_index, POSTID(&cathandle->lgh_id.lgl_oi));
+
+	rc = llog_cat_id2handle(env, cathandle, &plain, &logid_rec->lid_id);
+	if (rc) {
+		CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
+		       cathandle->lgh_ctxt->loc_obd->obd_name,
+		       POSTID(&logid_rec->lid_id.lgl_oi), rc);
+		/* remove index from catalog */
+		if (rc == -ENOENT || rc == -ESTALE)
+			llog_cat_cleanup(env, cathandle, NULL, rec->lrh_index);
+		RETURN(rc);
+	}
+
+	hdr = plain->lgh_hdr;
+	if (!(hdr->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) ||
+	    hdr->llh_count != 1) {
+		/* nothing to zap: just drop the lookup reference */
+		llog_handle_put(plain);
+		RETURN(rc);
+	}
+
+	rc = llog_destroy(env, plain);
+	if (rc)
+		CERROR("%s: fail to destroy empty log: rc = %d\n",
+		       plain->lgh_ctxt->loc_obd->obd_name, rc);
+
+	llog_cat_cleanup(env, cathandle, plain,
+			 plain->u.phd.phd_cookie.lgc_index);
+	llog_handle_put(plain);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(cat_cancel_cb);
+
+/* helper to initialize catalog llog and process it to cancel */
+/*
+ * Initialise @llh as a catalog and run cat_cancel_cb over it.
+ *
+ * Only a failure to initialise the handle is returned to the caller; a
+ * failure while processing is logged and 0 is returned regardless.
+ */
+int llog_cat_init_and_process(const struct lu_env *env,
+			      struct llog_handle *llh)
+{
+	int rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, NULL);
+
+	if (rc != 0)
+		RETURN(rc);
+
+	rc = llog_process_or_fork(env, llh, cat_cancel_cb, NULL, NULL, false);
+	if (rc != 0)
+		CERROR("%s: llog_process() with cat_cancel_cb failed: rc = "
+		       "%d\n", llh->lgh_ctxt->loc_obd->obd_name, rc);
+	RETURN(0);
+}
+EXPORT_SYMBOL(llog_cat_init_and_process);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_internal.h b/drivers/staging/lustre/lustre/obdclass/llog_internal.h
new file mode 100644
index 000000000000..539e1d4f9d4c
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_internal.h
@@ -0,0 +1,98 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#ifndef __LLOG_INTERNAL_H__
+#define __LLOG_INTERNAL_H__
+
+#include <lustre_log.h>
+
+/*
+ * Bundle of arguments and result for processing one log.
+ * NOTE(review): no user of this struct is visible in this header; the
+ * field names mirror the parameters of llog_process_or_fork() (handle,
+ * callback, callback data, catalog data, environment) - confirm against
+ * the implementation before relying on these descriptions.
+ */
+struct llog_process_info {
+ struct llog_handle *lpi_loghandle;
+ llog_cb_t lpi_cb;
+ void *lpi_cbdata;
+ void *lpi_catdata;
+ int lpi_rc;
+ struct completion lpi_completion;
+ const struct lu_env *lpi_env;
+
+};
+
+/*
+ * Per-environment llog data, looked up through llog_thread_key by
+ * llog_info().
+ * NOTE(review): presumably scratch storage that saves the llog code from
+ * putting these objects on the stack - confirm against the users.
+ */
+struct llog_thread_info {
+ struct lu_attr lgi_attr;
+ struct lu_fid lgi_fid;
+ struct dt_object_format lgi_dof;
+ struct lu_buf lgi_buf;
+ loff_t lgi_off;
+ struct llog_rec_hdr lgi_lrh;
+ struct llog_rec_tail lgi_tail;
+};
+
+extern struct lu_context_key llog_thread_key;
+
+/*
+ * Return the llog_thread_info registered under llog_thread_key in the
+ * context of @env.  Asserts that the lookup did not return NULL.
+ */
+static inline struct llog_thread_info *llog_info(const struct lu_env *env)
+{
+	struct llog_thread_info *info =
+		lu_context_key_get(&env->le_ctx, &llog_thread_key);
+
+	LASSERT(info);
+	return info;
+}
+
+/*
+ * Fill @logid: the generation is set to @gen, the object id's sequence is
+ * set with ostid_set_seq_llog() and its id is set to @ino.
+ */
+static inline void
+lustre_build_llog_lvfs_oid(struct llog_logid *logid, __u64 ino, __u32 gen)
+{
+	logid->lgl_ogen = gen;
+
+	/* the sequence is set before the id, as in the original ordering */
+	ostid_set_seq_llog(&logid->lgl_oi);
+	ostid_set_id(&logid->lgl_oi, ino);
+}
+
+/* set-up / tear-down counterparts; NOTE(review): presumably for the
+ * llog_thread_key context key - confirm against the definitions */
+int llog_info_init(void);
+void llog_info_fini(void);
+
+/* take / drop a reference on a log handle */
+void llog_handle_get(struct llog_handle *loghandle);
+void llog_handle_put(struct llog_handle *loghandle);
+/* find or open the plain log @logid of @cathandle; on success *res holds
+ * an extra reference that the caller must drop with llog_handle_put() */
+int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle,
+ struct llog_handle **res, struct llog_logid *logid);
+int class_config_dump_handler(const struct lu_env *env,
+ struct llog_handle *handle,
+ struct llog_rec_hdr *rec, void *data);
+int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size);
+/* process the records of @loghandle with @cb; @catdata optionally limits
+ * the range as a struct llog_process_cat_data (NULL for the whole log) */
+int llog_process_or_fork(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ llog_cb_t cb, void *data, void *catdata, bool fork);
+/* remove the catalog entry at @index; a non-NULL @loghandle is also
+ * unlinked from the catalog and closed */
+int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
+ struct llog_handle *loghandle, int index);
+#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c b/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c
new file mode 100644
index 000000000000..0732874e26c5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c
@@ -0,0 +1,427 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include "llog_internal.h"
+
+static int str2logid(struct llog_logid *logid, char *str, int len)
+{ /*
+ * Parse a log id string of the form "#<id>#<seq>#<ogen>" into @logid.
+ *
+ * <id> and <seq> are read with simple_strtoull() using base 0; <ogen>
+ * is read with simple_strtoul() using base 16.
+ * @str is modified: both inner '#' separators are overwritten with '\0'
+ * so that each field can be checked to have been consumed completely.
+ * @len is the caller-supplied length of @str; it is only used to reject
+ * input whose next field would start at or beyond the last byte.
+ *
+ * Returns 0 on success and -EINVAL for any malformed input.
+ */
+ char *start, *end, *endp;
+ __u64 id, seq;
+
+ ENTRY;
+ start = str;
+ if (*start != '#')
+ RETURN(-EINVAL);
+
+ start++;
+ if (start - str >= len - 1)
+ RETURN(-EINVAL);
+ end = strchr(start, '#');
+ if (end == NULL || end == start) /* missing separator or empty <id> */
+ RETURN(-EINVAL);
+
+ *end = '\0';
+ id = simple_strtoull(start, &endp, 0);
+ if (endp != end) /* trailing garbage inside the <id> field */
+ RETURN(-EINVAL);
+
+ start = ++end;
+ if (start - str >= len - 1)
+ RETURN(-EINVAL);
+ end = strchr(start, '#');
+ if (end == NULL || end == start) /* missing separator or empty <seq> */
+ RETURN(-EINVAL);
+
+ *end = '\0';
+ seq = simple_strtoull(start, &endp, 0);
+ if (endp != end)
+ RETURN(-EINVAL);
+
+ ostid_set_seq(&logid->lgl_oi, seq);
+ ostid_set_id(&logid->lgl_oi, id);
+
+ start = ++end;
+ if (start - str >= len - 1)
+ RETURN(-EINVAL);
+ logid->lgl_ogen = simple_strtoul(start, &endp, 16); /* generation is always hex */
+ if (*endp != '\0')
+ RETURN(-EINVAL);
+
+ RETURN(0);
+}
+
+/*
+ * llog_process() callback used by OBD_IOC_LLOG_CHECK.
+ *
+ * On the first call of a run (@data non-NULL with ioc_inllen1 > 0) the
+ * output cursor (out/remains) and the index window (from/to) are taken
+ * from the ioctl data and ioc_inllen1 is cleared so that later calls
+ * keep the cursor.  The cursor lives in function-static variables, so
+ * the callback is not reentrant; nested runs on the plain logs of a
+ * catalog pass @data == NULL and therefore continue the same output.
+ *
+ * For every record inside the window one text line is appended to the
+ * output buffer.  Records of a catalog are followed into the plain log
+ * they reference, which is then checked with this same callback.
+ *
+ * Returns 0 to continue, -LLOG_EEMPTY to stop (window passed or output
+ * buffer full), -EINVAL for unparsable from/to strings, -EOPNOTSUPP for
+ * a catalog handle without context, or the error of the nested lookup
+ * or processing.
+ */
+static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle,
+			 struct llog_rec_hdr *rec, void *data)
+{
+	struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data;
+	static int l, remains, from, to;
+	static char *out;
+	char *endp;
+	int cur_index, rc = 0;
+
+	ENTRY;
+
+	if (ioc_data && ioc_data->ioc_inllen1 > 0) {
+		l = 0;
+		remains = ioc_data->ioc_inllen4 +
+			  cfs_size_round(ioc_data->ioc_inllen1) +
+			  cfs_size_round(ioc_data->ioc_inllen2) +
+			  cfs_size_round(ioc_data->ioc_inllen3);
+		from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0);
+		if (*endp != '\0')
+			RETURN(-EINVAL);
+		to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0);
+		if (*endp != '\0')
+			RETURN(-EINVAL);
+		/* mark the cursor as initialised for the rest of the run */
+		ioc_data->ioc_inllen1 = 0;
+		out = ioc_data->ioc_bulk;
+	}
+
+	cur_index = rec->lrh_index;
+	if (cur_index < from)
+		RETURN(0);
+	if (to > 0 && cur_index > to)
+		RETURN(-LLOG_EEMPTY);
+
+	if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) {
+		struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+		struct llog_handle *loghandle;
+
+		if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+			l = snprintf(out, remains, "[index]: %05d [type]: "
+				     "%02x [len]: %04d failed\n",
+				     cur_index, rec->lrh_type,
+				     rec->lrh_len);
+			/*
+			 * Consume the space just written; without this the
+			 * "failed" line is overwritten by the next record.
+			 */
+			out += l;
+			remains -= l;
+			if (remains <= 0) {
+				CERROR("no space to print log records\n");
+				RETURN(-LLOG_EEMPTY);
+			}
+		}
+		if (handle->lgh_ctxt == NULL)
+			RETURN(-EOPNOTSUPP);
+		rc = llog_cat_id2handle(env, handle, &loghandle, &lir->lid_id);
+		if (rc) {
+			CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n",
+			       POSTID(&lir->lid_id.lgl_oi),
+			       lir->lid_id.lgl_ogen);
+			RETURN(rc);
+		}
+		/* NULL data: the nested run continues the static cursor */
+		rc = llog_process(env, loghandle, llog_check_cb, NULL, NULL);
+		llog_handle_put(loghandle);
+	} else {
+		bool ok;
+
+		switch (rec->lrh_type) {
+		case OST_SZ_REC:
+		case MDS_UNLINK_REC:
+		case MDS_UNLINK64_REC:
+		case MDS_SETATTR64_REC:
+		case OBD_CFG_REC:
+		case LLOG_GEN_REC:
+		case LLOG_HDR_MAGIC:
+			ok = true;
+			break;
+		default:
+			ok = false;
+		}
+
+		l = snprintf(out, remains, "[index]: %05d [type]: "
+			     "%02x [len]: %04d %s\n",
+			     cur_index, rec->lrh_type, rec->lrh_len,
+			     ok ? "ok" : "failed");
+		out += l;
+		remains -= l;
+		if (remains <= 0) {
+			CERROR("%s: no space to print log records\n",
+			       handle->lgh_ctxt->loc_obd->obd_name);
+			RETURN(-LLOG_EEMPTY);
+		}
+	}
+	RETURN(rc);
+}
+
+static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle,
+ struct llog_rec_hdr *rec, void *data)
+{ /*
+ * llog_process() callback used by OBD_IOC_LLOG_PRINT.
+ *
+ * The first call of a run (ioc_inllen1 > 0) loads the output cursor and
+ * the from/to index window from the ioctl data and clears ioc_inllen1;
+ * the cursor is kept in function-static variables, so the callback is
+ * not reentrant.  One text line is appended per record in the window:
+ * the referenced log id for catalog records, the parsed configuration
+ * for OBD_CFG_REC records, or index/type/length otherwise.
+ *
+ * Returns 0 to continue, -LLOG_EEMPTY when the window is passed or the
+ * buffer is full, -EINVAL for bad from/to strings or a non-logid record
+ * in a catalog, or the negative result of class_config_parse_rec().
+ */
+ struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data;
+ static int l, remains, from, to;
+ static char *out;
+ char *endp;
+ int cur_index;
+
+ ENTRY;
+ if (ioc_data != NULL && ioc_data->ioc_inllen1 > 0) {
+ l = 0;
+ remains = ioc_data->ioc_inllen4 +
+ cfs_size_round(ioc_data->ioc_inllen1) +
+ cfs_size_round(ioc_data->ioc_inllen2) +
+ cfs_size_round(ioc_data->ioc_inllen3);
+ from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0);
+ if (*endp != '\0')
+ RETURN(-EINVAL);
+ to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0);
+ if (*endp != '\0')
+ RETURN(-EINVAL);
+ out = ioc_data->ioc_bulk;
+ ioc_data->ioc_inllen1 = 0; /* cursor initialised; skip this block on later calls */
+ }
+
+ cur_index = rec->lrh_index;
+ if (cur_index < from)
+ RETURN(0);
+ if (to > 0 && cur_index > to) /* to <= 0 means no upper bound */
+ RETURN(-LLOG_EEMPTY);
+
+ if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) {
+ struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
+
+ if (rec->lrh_type != LLOG_LOGID_MAGIC) {
+ CERROR("invalid record in catalog\n");
+ RETURN(-EINVAL);
+ }
+
+ l = snprintf(out, remains,
+ "[index]: %05d [logid]: #"DOSTID"#%08x\n",
+ cur_index, POSTID(&lir->lid_id.lgl_oi),
+ lir->lid_id.lgl_ogen);
+ } else if (rec->lrh_type == OBD_CFG_REC) {
+ int rc;
+
+ rc = class_config_parse_rec(rec, out, remains);
+ if (rc < 0)
+ RETURN(rc);
+ l = rc; /* a non-negative result is used as the number of bytes produced */
+ } else {
+ l = snprintf(out, remains,
+ "[index]: %05d [type]: %02x [len]: %04d\n",
+ cur_index, rec->lrh_type, rec->lrh_len);
+ }
+ out += l;
+ remains -= l;
+ if (remains <= 0) {
+ CERROR("not enough space for print log records\n");
+ RETURN(-LLOG_EEMPTY);
+ }
+
+ RETURN(0);
+}
+/*
+ * Destroy the plain log identified by @logid and drop it from the
+ * catalog @cat.
+ *
+ * Returns -ENOENT when the log cannot be looked up in the catalog, the
+ * error of llog_destroy() when destroying fails, and 0 otherwise.  The
+ * result of llog_cat_cleanup() is not propagated.
+ */
+static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat,
+			   struct llog_logid *logid)
+{
+	struct llog_handle *plain;
+	int err;
+
+	ENTRY;
+
+	err = llog_cat_id2handle(env, cat, &plain, logid);
+	if (err != 0) {
+		CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n",
+		       POSTID(&logid->lgl_oi), logid->lgl_ogen);
+		RETURN(-ENOENT);
+	}
+
+	err = llog_destroy(env, plain);
+	if (err != 0) {
+		CDEBUG(D_IOCTL, "cannot destroy log\n");
+		GOTO(put, err);
+	}
+
+	/* the log is gone; remove its record from the catalog as well */
+	llog_cat_cleanup(env, cat, plain, plain->u.phd.phd_cookie.lgc_index);
+put:
+	llog_handle_put(plain);
+	RETURN(err);
+}
+
+/*
+ * llog_process() callback that removes the plain log referenced by each
+ * catalog record.  Records that are not LLOG_LOGID_MAGIC yield -EINVAL;
+ * otherwise the result of llog_remove_log() is returned.
+ */
+static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle,
+			  struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_logid_rec *idrec;
+	int result;
+
+	ENTRY;
+	if (rec->lrh_type != LLOG_LOGID_MAGIC)
+		RETURN(-EINVAL);
+
+	idrec = (struct llog_logid_rec *)rec;
+	result = llog_remove_log(env, handle, &idrec->lid_id);
+
+	RETURN(result);
+}
+
+
+/*
+ * Execute one llog ioctl against the log selected by @data.
+ *
+ * data->ioc_inlbuf1 names the log: "#<id>#<seq>#<ogen>" selects it by
+ * log id, "$<name>" selects it by name.  The log must already exist.
+ *
+ * Supported @cmd values:
+ *  OBD_IOC_LLOG_INFO   - print header information into data->ioc_bulk
+ *  OBD_IOC_LLOG_CHECK  - run llog_check_cb() over the log
+ *  OBD_IOC_LLOG_PRINT  - run llog_print_cb() over the log
+ *  OBD_IOC_LLOG_CANCEL - cancel record ioc_inlbuf3; for a catalog,
+ *                        ioc_inlbuf2 names the plain log holding it
+ *  OBD_IOC_LLOG_REMOVE - destroy a plain log, one plain log of a
+ *                        catalog (ioc_inlbuf2), or all logs of a catalog
+ *
+ * Returns 0 on success or a negative errno: -EINVAL for malformed or
+ * missing arguments, -ENOENT when the handle cannot be initialised,
+ * -ENOSPC when the INFO output does not fit, -ENOTTY for an unknown
+ * command or a catalog cancel without log id, or the error of the
+ * underlying llog operation.
+ */
+int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd,
+	       struct obd_ioctl_data *data)
+{
+	struct llog_logid logid;
+	int rc = 0;
+	struct llog_handle *handle = NULL;
+
+	ENTRY;
+
+	/* the log selector is mandatory; never dereference a missing buffer */
+	if (data->ioc_inlbuf1 == NULL)
+		RETURN(-EINVAL);
+
+	if (*data->ioc_inlbuf1 == '#') {
+		rc = str2logid(&logid, data->ioc_inlbuf1, data->ioc_inllen1);
+		if (rc)
+			RETURN(rc);
+		rc = llog_open(env, ctxt, &handle, &logid, NULL,
+			       LLOG_OPEN_EXISTS);
+		if (rc)
+			RETURN(rc);
+	} else if (*data->ioc_inlbuf1 == '$') {
+		char *name = data->ioc_inlbuf1 + 1;
+
+		rc = llog_open(env, ctxt, &handle, NULL, name,
+			       LLOG_OPEN_EXISTS);
+		if (rc)
+			RETURN(rc);
+	} else {
+		RETURN(-EINVAL);
+	}
+
+	rc = llog_init_handle(env, handle, 0, NULL);
+	if (rc)
+		GOTO(out_close, rc = -ENOENT);
+
+	switch (cmd) {
+	case OBD_IOC_LLOG_INFO: {
+		int l;
+		int remains = data->ioc_inllen2 +
+			      cfs_size_round(data->ioc_inllen1);
+		char *out = data->ioc_bulk;
+
+		l = snprintf(out, remains,
+			     "logid: #"DOSTID"#%08x\n"
+			     "flags: %x (%s)\n"
+			     "records count: %d\n"
+			     "last index: %d\n",
+			     POSTID(&handle->lgh_id.lgl_oi),
+			     handle->lgh_id.lgl_ogen,
+			     handle->lgh_hdr->llh_flags,
+			     handle->lgh_hdr->llh_flags &
+			     LLOG_F_IS_CAT ? "cat" : "plain",
+			     handle->lgh_hdr->llh_count,
+			     handle->lgh_last_idx);
+		out += l;
+		remains -= l;
+		if (remains <= 0) {
+			CERROR("%s: not enough space for log header info\n",
+			       ctxt->loc_obd->obd_name);
+			rc = -ENOSPC;
+		}
+		break;
+	}
+	case OBD_IOC_LLOG_CHECK:
+		LASSERT(data->ioc_inllen1 > 0);
+		rc = llog_process(env, handle, llog_check_cb, data, NULL);
+		/* -LLOG_EEMPTY only means the callback stopped the walk */
+		if (rc == -LLOG_EEMPTY)
+			rc = 0;
+		else if (rc)
+			GOTO(out_close, rc);
+		break;
+	case OBD_IOC_LLOG_PRINT:
+		LASSERT(data->ioc_inllen1 > 0);
+		rc = llog_process(env, handle, llog_print_cb, data, NULL);
+		if (rc == -LLOG_EEMPTY)
+			rc = 0;
+		else if (rc)
+			GOTO(out_close, rc);
+		break;
+	case OBD_IOC_LLOG_CANCEL: {
+		struct llog_cookie cookie;
+		struct llog_logid plain;
+		char *endp;
+
+		/* the record index is mandatory for a cancel */
+		if (data->ioc_inlbuf3 == NULL)
+			GOTO(out_close, rc = -EINVAL);
+
+		cookie.lgc_index = simple_strtoul(data->ioc_inlbuf3, &endp, 0);
+		if (*endp != '\0')
+			GOTO(out_close, rc = -EINVAL);
+
+		if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) {
+			/* use the caller's environment, as every other call here */
+			rc = llog_cancel_rec(env, handle, cookie.lgc_index);
+			GOTO(out_close, rc);
+		} else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) {
+			GOTO(out_close, rc = -EINVAL);
+		}
+
+		if (data->ioc_inlbuf2 == NULL) /* catalog but no logid */
+			GOTO(out_close, rc = -ENOTTY);
+
+		rc = str2logid(&plain, data->ioc_inlbuf2, data->ioc_inllen2);
+		if (rc)
+			GOTO(out_close, rc);
+		cookie.lgc_lgl = plain;
+		rc = llog_cat_cancel_records(env, handle, 1, &cookie);
+		if (rc)
+			GOTO(out_close, rc);
+		break;
+	}
+	case OBD_IOC_LLOG_REMOVE: {
+		struct llog_logid plain;
+
+		if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) {
+			rc = llog_destroy(env, handle);
+			GOTO(out_close, rc);
+		} else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) {
+			GOTO(out_close, rc = -EINVAL);
+		}
+
+		/* ioc_inlbuf2 is a pointer: test it against NULL, not "> 0" */
+		if (data->ioc_inlbuf2 != NULL) {
+			/* remove indicate log from the catalog */
+			rc = str2logid(&plain, data->ioc_inlbuf2,
+				       data->ioc_inllen2);
+			if (rc)
+				GOTO(out_close, rc);
+			rc = llog_remove_log(env, handle, &plain);
+		} else {
+			/* remove all the log of the catalog */
+			rc = llog_process(env, handle, llog_delete_cb, NULL,
+					  NULL);
+			if (rc)
+				GOTO(out_close, rc);
+		}
+		break;
+	}
+	default:
+		CERROR("%s: Unknown ioctl cmd %#x\n",
+		       ctxt->loc_obd->obd_name, cmd);
+		GOTO(out_close, rc = -ENOTTY);
+	}
+
+out_close:
+	/* lgh_hdr may be unset when llog_init_handle() failed */
+	if (handle->lgh_hdr &&
+	    handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)
+		llog_cat_close(env, handle);
+	else
+		llog_close(env, handle);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_ioctl);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c b/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c
new file mode 100644
index 000000000000..7e12dc62141f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c
@@ -0,0 +1,862 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_lvfs.c
+ *
+ * OST<->MDS recovery logging infrastructure.
+ * Invariants in implementation:
+ * - we do not share logs among different OST<->MDS connections, so that
+ * if an OST or MDS fails it need only look at log(s) relevant to itself
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_log.h>
+#include <obd_ost.h>
+#include <linux/list.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+#include <lustre_disk.h>
+#include "llog_internal.h"
+
+#if defined(LLOG_LVFS)
+
+/*
+ * Write a padding record of @len bytes with index @index at the current
+ * position of @file: a header of type LLOG_PAD_MAGIC, a gap that is
+ * skipped by advancing f_pos, and a matching tail.
+ *
+ * @len must be at least LLOG_MIN_REC_SIZE and a multiple of 8.
+ * Returns 0 or the error of fsfilt_write_record().
+ */
+static int llog_lvfs_pad(struct obd_device *obd, struct l_file *file,
+			 int len, int index)
+{
+	struct llog_rec_hdr rec = { 0 };
+	struct llog_rec_tail tail;
+	int rc;
+	ENTRY;
+
+	LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0);
+
+	rec.lrh_type = LLOG_PAD_MAGIC;
+	rec.lrh_len = len;
+	rec.lrh_index = index;
+	tail.lrt_len = rec.lrh_len;
+	tail.lrt_index = rec.lrh_index;
+
+	rc = fsfilt_write_record(obd, file, &rec, sizeof(rec), &file->f_pos, 0);
+	if (rc != 0) {
+		CERROR("error writing padding record: rc %d\n", rc);
+		RETURN(rc);
+	}
+
+	/* skip the unused body of the padding record */
+	file->f_pos += len - sizeof(rec) - sizeof(tail);
+	rc = fsfilt_write_record(obd, file, &tail, sizeof(tail),&file->f_pos,0);
+	if (rc != 0)
+		CERROR("error writing padding record: rc %d\n", rc);
+
+	RETURN(rc);
+}
+
+static int llog_lvfs_write_blob(struct obd_device *obd, struct l_file *file,
+ struct llog_rec_hdr *rec, void *buf, loff_t off)
+{ /*
+ * Write one record to @file starting at offset @off.
+ *
+ * Without @buf, @rec is written as is (rec->lrh_len bytes).  With @buf,
+ * the header @rec, then @buf (the original rec->lrh_len bytes) and a
+ * tail built here are written back to back, and rec->lrh_len is updated
+ * to the combined length first.
+ *
+ * file->f_pos is set back to its value on entry when that value is
+ * beyond the position reached by the writes.
+ *
+ * Returns 0 on success or the error of fsfilt_write_record(); the
+ * result is asserted to be <= 0.
+ */
+ int rc;
+ struct llog_rec_tail end;
+ loff_t saved_off = file->f_pos;
+ int buflen = rec->lrh_len;
+
+ ENTRY;
+
+ file->f_pos = off;
+
+ if (buflen == 0)
+ CWARN("0-length record\n");
+
+ if (!buf) {
+ rc = fsfilt_write_record(obd, file, rec, buflen,&file->f_pos,0);
+ if (rc) {
+ CERROR("error writing log record: rc %d\n", rc);
+ goto out;
+ }
+ GOTO(out, rc = 0);
+ }
+
+ /* the buf case */
+ rec->lrh_len = sizeof(*rec) + buflen + sizeof(end);
+ rc = fsfilt_write_record(obd, file, rec, sizeof(*rec), &file->f_pos, 0);
+ if (rc) {
+ CERROR("error writing log hdr: rc %d\n", rc);
+ goto out;
+ }
+
+ rc = fsfilt_write_record(obd, file, buf, buflen, &file->f_pos, 0);
+ if (rc) {
+ CERROR("error writing log buffer: rc %d\n", rc);
+ goto out;
+ }
+
+ end.lrt_len = rec->lrh_len;
+ end.lrt_index = rec->lrh_index;
+ rc = fsfilt_write_record(obd, file, &end, sizeof(end), &file->f_pos, 0);
+ if (rc) {
+ CERROR("error writing log tail: rc %d\n", rc);
+ goto out;
+ }
+
+ rc = 0;
+ out:
+ if (saved_off > file->f_pos) /* keep f_pos at the furthest position seen */
+ file->f_pos = saved_off;
+ LASSERT(rc <= 0);
+ RETURN(rc);
+}
+
+/*
+ * Read @size bytes at offset @off of @file into @buf.  The offset is
+ * passed through a local copy.  Returns 0 or the error of
+ * fsfilt_read_record(), which is also logged.
+ */
+static int llog_lvfs_read_blob(struct obd_device *obd, struct l_file *file,
+			       void *buf, int size, loff_t off)
+{
+	loff_t pos = off;
+	int err;
+	ENTRY;
+
+	err = fsfilt_read_record(obd, file, buf, size, &pos);
+	if (err == 0)
+		RETURN(0);
+
+	CERROR("error reading log record: rc %d\n", err);
+	RETURN(err);
+}
+
+static int llog_lvfs_read_header(const struct lu_env *env,
+ struct llog_handle *handle)
+{ /*
+ * Read the LLOG_CHUNK_SIZE log header from offset 0 of the log file
+ * into handle->lgh_hdr, byte-swapping it when needed, and validate its
+ * magic and length.
+ *
+ * Returns LLOG_EEMPTY (a positive value) for a 0-byte file without
+ * touching the handle, -EIO for a bad magic or header length, the error
+ * of the read, or 0.  Apart from the 0-byte case, lgh_last_idx and the
+ * file position are updated even when an error is returned.
+ */
+ struct obd_device *obd;
+ int rc;
+ ENTRY;
+
+ LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE);
+
+ obd = handle->lgh_ctxt->loc_exp->exp_obd;
+
+ if (i_size_read(handle->lgh_file->f_dentry->d_inode) == 0) {
+ CDEBUG(D_HA, "not reading header from 0-byte log\n");
+ RETURN(LLOG_EEMPTY);
+ }
+
+ rc = llog_lvfs_read_blob(obd, handle->lgh_file, handle->lgh_hdr,
+ LLOG_CHUNK_SIZE, 0);
+ if (rc) {
+ CERROR("error reading log header from %.*s\n",
+ handle->lgh_file->f_dentry->d_name.len,
+ handle->lgh_file->f_dentry->d_name.name);
+ } else {
+ struct llog_rec_hdr *llh_hdr = &handle->lgh_hdr->llh_hdr;
+
+ if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
+ lustre_swab_llog_hdr(handle->lgh_hdr);
+
+ if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
+ CERROR("bad log %.*s header magic: %#x (expected %#x)\n",
+ handle->lgh_file->f_dentry->d_name.len,
+ handle->lgh_file->f_dentry->d_name.name,
+ llh_hdr->lrh_type, LLOG_HDR_MAGIC);
+ rc = -EIO;
+ } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
+ CERROR("incorrectly sized log %.*s header: %#x "
+ "(expected %#x)\n",
+ handle->lgh_file->f_dentry->d_name.len,
+ handle->lgh_file->f_dentry->d_name.name,
+ llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
+ CERROR("you may need to re-run lconf --write_conf.\n");
+ rc = -EIO;
+ }
+ }
+
+ handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index; /* last index as recorded in the header tail */
+ handle->lgh_file->f_pos = i_size_read(handle->lgh_file->f_dentry->d_inode); /* position at end of file */
+
+ RETURN(rc);
+}
+
+/* returns negative on error; 0 if success && reccookie == 0; 1 otherwise
+ * (an appended LLOG_GEN_REC also yields 1 without a cookie) */
+/* appends if idx == -1, otherwise overwrites record idx.
+ *
+ * -E2BIG is returned when the record does not fit into LLOG_CHUNK_SIZE
+ * (less header and tail when @buf is given).
+ *
+ * Overwrite (idx != -1): the log header is rewritten first; idx == 0
+ * stops there.  With @buf the target offset is loghandle->lgh_cur_offset,
+ * otherwise it is computed from idx assuming fixed-size records.
+ *
+ * Append (idx == -1): a padding record is inserted when the record would
+ * cross a chunk boundary, -ENOSPC is returned when the bitmap is full,
+ * the bitmap bit and record count are updated under lgh_hdr_lock, and
+ * the header is written before the record itself.
+ */
+static int llog_lvfs_write_rec(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ struct llog_rec_hdr *rec,
+ struct llog_cookie *reccookie, int cookiecount,
+ void *buf, int idx, struct thandle *th)
+{
+ struct llog_log_hdr *llh;
+ int reclen = rec->lrh_len, index, rc;
+ struct llog_rec_tail *lrt;
+ struct obd_device *obd;
+ struct file *file;
+ size_t left;
+ ENTRY;
+
+ llh = loghandle->lgh_hdr;
+ file = loghandle->lgh_file;
+ obd = loghandle->lgh_ctxt->loc_exp->exp_obd;
+
+ /* record length should not bigger than LLOG_CHUNK_SIZE */
+ if (buf)
+ rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
+ sizeof(struct llog_rec_tail)) ? -E2BIG : 0;
+ else
+ rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0;
+ if (rc)
+ RETURN(rc);
+
+ if (buf)
+ /* write_blob adds header and tail to lrh_len. */
+ reclen = sizeof(*rec) + rec->lrh_len +
+ sizeof(struct llog_rec_tail);
+
+ if (idx != -1) {
+ loff_t saved_offset;
+
+ /* no header: only allowed to insert record 1 */
+ if (idx != 1 && !i_size_read(file->f_dentry->d_inode)) {
+ CERROR("idx != -1 in empty log\n");
+ LBUG();
+ }
+
+ if (idx && llh->llh_size && llh->llh_size != rec->lrh_len)
+ RETURN(-EINVAL);
+
+ if (!ext2_test_bit(idx, llh->llh_bitmap))
+ CERROR("Modify unset record %u\n", idx);
+ if (idx != rec->lrh_index)
+ CERROR("Index mismatch %d %u\n", idx, rec->lrh_index);
+
+ rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0);
+ /* we are done if we only write the header or on error */
+ if (rc || idx == 0)
+ RETURN(rc);
+
+ if (buf) {
+ /* We assume that caller has set lgh_cur_* */
+ saved_offset = loghandle->lgh_cur_offset;
+ CDEBUG(D_OTHER,
+ "modify record "DOSTID": idx:%d/%u/%d, len:%u "
+ "offset %llu\n",
+ POSTID(&loghandle->lgh_id.lgl_oi), idx, rec->lrh_index,
+ loghandle->lgh_cur_idx, rec->lrh_len,
+ (long long)(saved_offset - sizeof(*llh)));
+ if (rec->lrh_index != loghandle->lgh_cur_idx) {
+ CERROR("modify idx mismatch %u/%d\n",
+ idx, loghandle->lgh_cur_idx);
+ RETURN(-EFAULT);
+ }
+ } else {
+ /* Assumes constant lrh_len */
+ saved_offset = sizeof(*llh) + (idx - 1) * reclen;
+ }
+
+ rc = llog_lvfs_write_blob(obd, file, rec, buf, saved_offset);
+ if (rc == 0 && reccookie) {
+ reccookie->lgc_lgl = loghandle->lgh_id;
+ reccookie->lgc_index = idx;
+ rc = 1;
+ }
+ RETURN(rc);
+ }
+
+ /* Make sure that records don't cross a chunk boundary, so we can
+ * process them page-at-a-time if needed. If it will cross a chunk
+ * boundary, write in a fake (but referenced) entry to pad the chunk.
+ *
+ * We know that llog_current_log() will return a loghandle that is
+ * big enough to hold reclen, so all we care about is padding here.
+ */
+ left = LLOG_CHUNK_SIZE - (file->f_pos & (LLOG_CHUNK_SIZE - 1));
+
+ /* NOTE: padding is a record, but no bit is set */
+ if (left != 0 && left != reclen &&
+ left < (reclen + LLOG_MIN_REC_SIZE)) {
+ index = loghandle->lgh_last_idx + 1;
+ rc = llog_lvfs_pad(obd, file, left, index);
+ if (rc)
+ RETURN(rc);
+ loghandle->lgh_last_idx++; /*for pad rec*/
+ }
+ /* if it's the last idx in log file, then return -ENOSPC */
+ if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1)
+ RETURN(-ENOSPC);
+ loghandle->lgh_last_idx++;
+ index = loghandle->lgh_last_idx;
+ LASSERT(index < LLOG_BITMAP_SIZE(llh));
+ rec->lrh_index = index;
+ if (buf == NULL) { /* the record carries its own tail; fill it in place */
+ lrt = (struct llog_rec_tail *)
+ ((char *)rec + rec->lrh_len - sizeof(*lrt));
+ lrt->lrt_len = rec->lrh_len;
+ lrt->lrt_index = rec->lrh_index;
+ }
+ /*The caller should make sure only 1 process access the lgh_last_idx,
+ *Otherwise it might hit the assert.*/
+ LASSERT(index < LLOG_BITMAP_SIZE(llh));
+ spin_lock(&loghandle->lgh_hdr_lock);
+ if (ext2_set_bit(index, llh->llh_bitmap)) {
+ CERROR("argh, index %u already set in log bitmap?\n", index);
+ spin_unlock(&loghandle->lgh_hdr_lock);
+ LBUG(); /* should never happen */
+ }
+ llh->llh_count++;
+ spin_unlock(&loghandle->lgh_hdr_lock);
+ llh->llh_tail.lrt_index = index;
+
+ rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0);
+ if (rc)
+ RETURN(rc);
+
+ rc = llog_lvfs_write_blob(obd, file, rec, buf, file->f_pos);
+ if (rc)
+ RETURN(rc);
+
+ CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u \n",
+ POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len);
+ if (rc == 0 && reccookie) { /* rc is always 0 here; errors returned above */
+ reccookie->lgc_lgl = loghandle->lgh_id;
+ reccookie->lgc_index = index;
+ if ((rec->lrh_type == MDS_UNLINK_REC) ||
+ (rec->lrh_type == MDS_SETATTR64_REC))
+ reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
+ else if (rec->lrh_type == OST_SZ_REC)
+ reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT;
+ else
+ reccookie->lgc_subsys = -1;
+ rc = 1;
+ }
+ if (rc == 0 && rec->lrh_type == LLOG_GEN_REC)
+ rc = 1;
+
+ RETURN(rc);
+}
+
+/*
+ * Advance *off towards record @goal, given that record @curr is the
+ * current one.  Every skipped record occupies at least
+ * LLOG_MIN_REC_SIZE bytes, so that many bytes per record can be jumped
+ * over safely; the result is rounded down to a chunk boundary.  If the
+ * records turn out to be larger, the caller simply has to skip some
+ * more.  Nothing is changed when @goal is not beyond @curr.
+ */
+
+static void llog_skip_over(__u64 *off, int curr, int goal)
+{
+	if (goal > curr)
+		*off = (*off + (goal-curr-1) * LLOG_MIN_REC_SIZE) &
+			~(LLOG_CHUNK_SIZE - 1);
+}
+
+
+/* sets:
+ * - cur_offset to the furthest point read in the log file
+ * - cur_idx to the log index preceding cur_offset
+ * returns -EIO/-EINVAL on error
+ *
+ * Reads forward, one chunk-bounded read at a time, until a block whose
+ * last record index is at least next_idx has been placed in buf.
+ * len must be a non-zero multiple of LLOG_CHUNK_SIZE.  Also returns 0
+ * when a read yields no data, -ENOENT when the block read starts beyond
+ * next_idx, and -EIO when the end of the file is reached first.
+ */
+static int llog_lvfs_next_block(const struct lu_env *env,
+ struct llog_handle *loghandle, int *cur_idx,
+ int next_idx, __u64 *cur_offset, void *buf,
+ int len)
+{
+ int rc;
+ ENTRY;
+
+ if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+ RETURN(-EINVAL);
+
+ CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n",
+ next_idx, *cur_idx, *cur_offset);
+
+ while (*cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) {
+ struct llog_rec_hdr *rec, *last_rec;
+ struct llog_rec_tail *tail;
+ loff_t ppos;
+ int llen;
+
+ llog_skip_over(cur_offset, *cur_idx, next_idx);
+
+ /* read up to next LLOG_CHUNK_SIZE block */
+ ppos = *cur_offset;
+ llen = LLOG_CHUNK_SIZE - (*cur_offset & (LLOG_CHUNK_SIZE - 1));
+ rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
+ loghandle->lgh_file, buf, llen,
+ cur_offset);
+ if (rc < 0) {
+ CERROR("Cant read llog block at log id "DOSTID
+ "/%u offset "LPU64"\n",
+ POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen,
+ *cur_offset);
+ RETURN(rc);
+ }
+
+ /* put number of bytes read into rc to make code simpler */
+ rc = *cur_offset - ppos;
+ if (rc < len) {
+ /* signal the end of the valid buffer to llog_process */
+ memset(buf + rc, 0, len - rc);
+ }
+
+ if (rc == 0) /* end of file, nothing to do */
+ RETURN(0);
+
+ if (rc < sizeof(*tail)) {
+ CERROR("Invalid llog block at log id "DOSTID"/%u offset"
+ LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen, *cur_offset);
+ RETURN(-EINVAL);
+ }
+
+ rec = buf;
+ if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+ lustre_swab_llog_rec(rec);
+
+ tail = (struct llog_rec_tail *)(buf + rc -
+ sizeof(struct llog_rec_tail));
+
+ /* get the last record in block */
+ last_rec = (struct llog_rec_hdr *)(buf + rc -
+ le32_to_cpu(tail->lrt_len)); /* NOTE(review): lrt_len comes from disk and is not range-checked here */
+
+ if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+ lustre_swab_llog_rec(last_rec);
+ LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+ *cur_idx = tail->lrt_index;
+
+ /* this shouldn't happen */
+ if (tail->lrt_index == 0) {
+ CERROR("Invalid llog tail at log id "DOSTID"/%u offset "
+ LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen, *cur_offset);
+ RETURN(-EINVAL);
+ }
+ if (tail->lrt_index < next_idx) /* wanted record lies in a later block */
+ continue;
+
+ /* sanity check that the start of the new buffer is no farther
+ * than the record that we wanted. This shouldn't happen. */
+ if (rec->lrh_index > next_idx) {
+ CERROR("missed desired record? %u > %u\n",
+ rec->lrh_index, next_idx);
+ RETURN(-ENOENT);
+ }
+ RETURN(0);
+ }
+ RETURN(-EIO);
+}
+
+static int llog_lvfs_prev_block(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ int prev_idx, void *buf, int len)
+{ /*
+ * Search the log from its first record chunk (offset LLOG_CHUNK_SIZE,
+ * advanced by llog_skip_over()) for the block that contains record
+ * prev_idx and leave that block in buf.  Each read asks for len bytes;
+ * len must be a non-zero multiple of LLOG_CHUNK_SIZE.
+ *
+ * Returns 0 when the block was found or a read yielded no data, the
+ * read error, -EINVAL for a bad len or an invalid block/tail, -ENOENT
+ * when the block read starts beyond prev_idx, and -EIO when the end of
+ * the file is reached first.
+ */
+ __u64 cur_offset;
+ int rc;
+ ENTRY;
+
+ if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+ RETURN(-EINVAL);
+
+ CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx);
+
+ cur_offset = LLOG_CHUNK_SIZE;
+ llog_skip_over(&cur_offset, 0, prev_idx);
+
+ while (cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) {
+ struct llog_rec_hdr *rec, *last_rec;
+ struct llog_rec_tail *tail;
+ loff_t ppos = cur_offset;
+
+ rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd,
+ loghandle->lgh_file, buf, len,
+ &cur_offset);
+ if (rc < 0) {
+ CERROR("Cant read llog block at log id "DOSTID
+ "/%u offset "LPU64"\n",
+ POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen,
+ cur_offset);
+ RETURN(rc);
+ }
+
+ /* put number of bytes read into rc to make code simpler */
+ rc = cur_offset - ppos;
+
+ if (rc == 0) /* end of file, nothing to do */
+ RETURN(0);
+
+ if (rc < sizeof(*tail)) {
+ CERROR("Invalid llog block at log id "DOSTID"/%u offset"
+ LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen, cur_offset);
+ RETURN(-EINVAL);
+ }
+
+ rec = buf;
+ if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+ lustre_swab_llog_rec(rec);
+
+ tail = (struct llog_rec_tail *)(buf + rc -
+ sizeof(struct llog_rec_tail));
+
+ /* get the last record in block */
+ last_rec = (struct llog_rec_hdr *)(buf + rc -
+ le32_to_cpu(tail->lrt_len)); /* NOTE(review): lrt_len comes from disk and is not range-checked here */
+
+ if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+ lustre_swab_llog_rec(last_rec);
+ LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+ /* this shouldn't happen */
+ if (tail->lrt_index == 0) {
+ CERROR("Invalid llog tail at log id "DOSTID"/%u offset"
+ LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen, cur_offset);
+ RETURN(-EINVAL);
+ }
+ if (tail->lrt_index < prev_idx) /* wanted record lies in a later block */
+ continue;
+
+ /* sanity check that the start of the new buffer is no farther
+ * than the record that we wanted. This shouldn't happen. */
+ if (rec->lrh_index > prev_idx) {
+ CERROR("missed desired record? %u > %u\n",
+ rec->lrh_index, prev_idx);
+ RETURN(-ENOENT);
+ }
+ RETURN(0);
+ }
+ RETURN(-EIO);
+}
+
+/*
+ * Open the file "<dir>/<name>" with the given @flags and @mode.
+ *
+ * Returns the opened file, or an ERR_PTR: -ENOMEM when the path buffer
+ * cannot be allocated, -ENAMETOOLONG when the path does not fit, or the
+ * error of l_filp_open().  Open errors other than -ENOENT are logged.
+ */
+static struct file *llog_filp_open(char *dir, char *name, int flags, int mode)
+{
+	struct file *filp = ERR_PTR(-ENAMETOOLONG);
+	char *logname;
+
+	OBD_ALLOC(logname, PATH_MAX);
+	if (!logname)
+		return ERR_PTR(-ENOMEM);
+
+	/* only try to open when the full path fitted into the buffer */
+	if (snprintf(logname, PATH_MAX, "%s/%s", dir, name) < PATH_MAX - 1) {
+		filp = l_filp_open(logname, flags, mode);
+		if (IS_ERR(filp) && PTR_ERR(filp) != -ENOENT)
+			CERROR("logfile creation %s: %ld\n", logname,
+			       PTR_ERR(filp));
+	}
+
+	OBD_FREE(logname, PATH_MAX);
+	return filp;
+}
+
+/*
+ * Open the backing file of a log and record it in @handle.
+ *
+ * The log is selected by @logid when given, otherwise by @name below
+ * MOUNT_CONFIGS_DIR.  With neither, @open_param must be LLOG_OPEN_NEW
+ * and no file is opened.  When a named log does not exist and
+ * @open_param is LLOG_OPEN_NEW, only a copy of the name is stored in
+ * handle->lgh_name so that the file can be created later.
+ *
+ * Returns 0 on success, -ENOENT when the log does not exist although an
+ * existing one was requested, -ENOMEM when the name cannot be copied,
+ * or the error of the lookup/open.
+ */
+static int llog_lvfs_open(const struct lu_env *env, struct llog_handle *handle,
+			  struct llog_logid *logid, char *name,
+			  enum llog_open_param open_param)
+{
+	struct llog_ctxt *ctxt;
+	struct l_dentry *dchild = NULL;
+	struct obd_device *obd;
+	int rc = 0;
+
+	ENTRY;
+
+	/* check the handle before it is dereferenced, not afterwards */
+	LASSERT(handle);
+	ctxt = handle->lgh_ctxt;
+	LASSERT(ctxt);
+	LASSERT(ctxt->loc_exp);
+	LASSERT(ctxt->loc_exp->exp_obd);
+	obd = ctxt->loc_exp->exp_obd;
+
+	if (logid != NULL) {
+		dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &logid->lgl_oi,
+					     logid->lgl_ogen);
+		if (IS_ERR(dchild)) {
+			rc = PTR_ERR(dchild);
+			CERROR("%s: error looking up logfile #"DOSTID "#%08x:"
+			       " rc = %d\n", ctxt->loc_obd->obd_name,
+			       POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+			GOTO(out, rc);
+		}
+		if (dchild->d_inode == NULL) {
+			l_dput(dchild);
+			rc = -ENOENT;
+			CERROR("%s: nonexistent llog #"DOSTID"#%08x:"
+			       "rc = %d\n", ctxt->loc_obd->obd_name,
+			       POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+			GOTO(out, rc);
+		}
+		handle->lgh_file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild,
+						 O_RDWR | O_LARGEFILE);
+		l_dput(dchild);
+		if (IS_ERR(handle->lgh_file)) {
+			rc = PTR_ERR(handle->lgh_file);
+			handle->lgh_file = NULL;
+			CERROR("%s: error opening llog #"DOSTID"#%08x:"
+			       "rc = %d\n", ctxt->loc_obd->obd_name,
+			       POSTID(&logid->lgl_oi), logid->lgl_ogen, rc);
+			GOTO(out, rc);
+		}
+		handle->lgh_id = *logid;
+	} else if (name) {
+		handle->lgh_file = llog_filp_open(MOUNT_CONFIGS_DIR, name,
+						  O_RDWR | O_LARGEFILE, 0644);
+		if (IS_ERR(handle->lgh_file)) {
+			rc = PTR_ERR(handle->lgh_file);
+			handle->lgh_file = NULL;
+			if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) {
+				/* remember the name for the later create */
+				OBD_ALLOC(handle->lgh_name, strlen(name) + 1);
+				if (handle->lgh_name)
+					strcpy(handle->lgh_name, name);
+				else
+					GOTO(out, rc = -ENOMEM);
+				rc = 0;
+			} else {
+				GOTO(out, rc);
+			}
+		} else {
+			lustre_build_llog_lvfs_oid(&handle->lgh_id,
+				handle->lgh_file->f_dentry->d_inode->i_ino,
+				handle->lgh_file->f_dentry->d_inode->i_generation);
+		}
+	} else {
+		LASSERTF(open_param == LLOG_OPEN_NEW, "%#x\n", open_param);
+		handle->lgh_file = NULL;
+	}
+
+	/* No new llog is expected but doesn't exist */
+	if (open_param != LLOG_OPEN_NEW && handle->lgh_file == NULL)
+		GOTO(out_name, rc = -ENOENT);
+
+	RETURN(0);
+out_name:
+	if (handle->lgh_name != NULL) {
+		/*
+		 * Size the free by the stored string (name may be NULL on
+		 * this path) and clear the pointer so that a later
+		 * llog_lvfs_close() does not free it a second time.
+		 */
+		OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1);
+		handle->lgh_name = NULL;
+	}
+out:
+	RETURN(rc);
+}
+
+/* Report whether @handle has a backing file: 1 if it has, 0 if not. */
+static int llog_lvfs_exist(struct llog_handle *handle)
+{
+	return handle->lgh_file ? 1 : 0;
+}
+
+/* This is a callback from the llog_* functions.
+ * Assumes caller has already pushed us into the kernel context.
+ *
+ * Creates the backing file of a log that has none yet (asserted).  A
+ * named log is created below MOUNT_CONFIGS_DIR and its log id is built
+ * from the inode number and generation.  An unnamed log is created
+ * through obd_create() and then opened via its dentry; its log id is
+ * taken from the returned obdo.
+ *
+ * Returns 0 on success, -ENOMEM when the obdo cannot be allocated, or
+ * the error of the failing create/lookup/open.
+ *
+ * Note that the o_generation macro defined in this function stays
+ * defined for the code that follows, until the #undef in
+ * llog_lvfs_destroy(). */
+static int llog_lvfs_create(const struct lu_env *env,
+ struct llog_handle *handle,
+ struct thandle *th)
+{
+ struct llog_ctxt *ctxt = handle->lgh_ctxt;
+ struct obd_device *obd;
+ struct l_dentry *dchild = NULL;
+ struct file *file;
+ struct obdo *oa = NULL;
+ int rc = 0;
+ int open_flags = O_RDWR | O_CREAT | O_LARGEFILE;
+
+ ENTRY;
+
+ LASSERT(ctxt);
+ LASSERT(ctxt->loc_exp);
+ obd = ctxt->loc_exp->exp_obd;
+ LASSERT(handle->lgh_file == NULL);
+
+ if (handle->lgh_name) {
+ file = llog_filp_open(MOUNT_CONFIGS_DIR, handle->lgh_name,
+ open_flags, 0644);
+ if (IS_ERR(file))
+ RETURN(PTR_ERR(file));
+
+ lustre_build_llog_lvfs_oid(&handle->lgh_id,
+ file->f_dentry->d_inode->i_ino,
+ file->f_dentry->d_inode->i_generation);
+ handle->lgh_file = file;
+ } else {
+ OBDO_ALLOC(oa);
+ if (oa == NULL)
+ RETURN(-ENOMEM);
+
+ ostid_set_seq_llog(&oa->o_oi);
+ oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLGROUP;
+
+ rc = obd_create(NULL, ctxt->loc_exp, oa, NULL, NULL);
+ if (rc)
+ GOTO(out, rc);
+
+ /* FIXME: rationalize the misuse of o_generation in
+ * this API along with mds_obd_{create,destroy}.
+ * Hopefully it is only an internal API issue. */
+#define o_generation o_parent_oid
+ dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &oa->o_oi,
+ oa->o_generation);
+ if (IS_ERR(dchild))
+ GOTO(out, rc = PTR_ERR(dchild));
+
+ file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild, open_flags);
+ l_dput(dchild);
+ if (IS_ERR(file))
+ GOTO(out, rc = PTR_ERR(file));
+ handle->lgh_id.lgl_oi = oa->o_oi;
+ handle->lgh_id.lgl_ogen = oa->o_generation;
+ handle->lgh_file = file;
+out:
+ OBDO_FREE(oa); /* the obdo is released on success and on error alike */
+ }
+ RETURN(rc);
+}
+
+/*
+ * Close the backing file of @handle, if there is one, and release the
+ * stored log name.  The handle is cleaned up even when filp_close()
+ * fails; its error is logged and returned.  Returns 0 when there is no
+ * file to close.
+ */
+static int llog_lvfs_close(const struct lu_env *env,
+			   struct llog_handle *handle)
+{
+	int rc;
+
+	ENTRY;
+
+	if (!handle->lgh_file)
+		RETURN(0);
+
+	rc = filp_close(handle->lgh_file, 0);
+	handle->lgh_file = NULL;
+	if (rc != 0)
+		CERROR("%s: error closing llog #"DOSTID"#%08x: "
+		       "rc = %d\n", handle->lgh_ctxt->loc_obd->obd_name,
+		       POSTID(&handle->lgh_id.lgl_oi),
+		       handle->lgh_id.lgl_ogen, rc);
+
+	if (handle->lgh_name != NULL) {
+		OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1);
+		handle->lgh_name = NULL;
+	}
+
+	RETURN(rc);
+}
+
+static int llog_lvfs_destroy(const struct lu_env *env,
+ struct llog_handle *handle)
+{ /*
+ * Close the log and remove its backing object.  The log must be open
+ * (asserted).
+ *
+ * When the name of the parent directory equals MOUNT_CONFIGS_DIR the
+ * file is closed and unlinked through the VFS inside the obd's lvfs
+ * context.  Otherwise the object is destroyed with obd_destroy() inside
+ * an fsfilt transaction, using the log id saved before the close.
+ *
+ * Returns 0 on success, -ENOMEM when the obdo cannot be allocated, or
+ * the first error of close/unlink/transaction start/destroy/commit.
+ */
+ struct dentry *fdentry;
+ struct obdo *oa;
+ struct obd_device *obd = handle->lgh_ctxt->loc_exp->exp_obd;
+ char *dir;
+ void *th;
+ struct inode *inode;
+ int rc, rc1;
+ ENTRY;
+
+ dir = MOUNT_CONFIGS_DIR;
+
+ LASSERT(handle->lgh_file);
+ fdentry = handle->lgh_file->f_dentry;
+ inode = fdentry->d_parent->d_inode; /* inode of the parent directory */
+ if (strcmp(fdentry->d_parent->d_name.name, dir) == 0) {
+ struct lvfs_run_ctxt saved;
+ struct vfsmount *mnt = mntget(handle->lgh_file->f_vfsmnt);
+
+ push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ dget(fdentry); /* extra reference taken before the close, dropped by dput() below */
+ rc = llog_lvfs_close(env, handle);
+ if (rc == 0) {
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+ rc = ll_vfs_unlink(inode, fdentry, mnt);
+ mutex_unlock(&inode->i_mutex);
+ }
+ mntput(mnt);
+
+ dput(fdentry);
+ pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
+ RETURN(rc);
+ }
+
+ OBDO_ALLOC(oa);
+ if (oa == NULL)
+ RETURN(-ENOMEM);
+
+ oa->o_oi = handle->lgh_id.lgl_oi;
+ oa->o_generation = handle->lgh_id.lgl_ogen; /* o_generation is still the macro defined in llog_lvfs_create() */
+#undef o_generation
+ oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLGENER;
+
+ rc = llog_lvfs_close(env, handle);
+ if (rc)
+ GOTO(out, rc);
+
+ th = fsfilt_start_log(obd, inode, FSFILT_OP_UNLINK, NULL, 1);
+ if (IS_ERR(th)) {
+ CERROR("fsfilt_start failed: %ld\n", PTR_ERR(th));
+ GOTO(out, rc = PTR_ERR(th));
+ }
+
+ rc = obd_destroy(NULL, handle->lgh_ctxt->loc_exp, oa,
+ NULL, NULL, NULL, NULL);
+
+ rc1 = fsfilt_commit(obd, inode, th, 0);
+ if (rc == 0 && rc1 != 0) /* report the commit error only if destroy itself succeeded */
+ rc = rc1;
+ out:
+ OBDO_FREE(oa);
+ RETURN(rc);
+}
+
+/*
+ * Declaration step for creating a log.  This implementation has
+ * nothing to declare and always returns 0.
+ */
+static int llog_lvfs_declare_create(const struct lu_env *env,
+				    struct llog_handle *res,
+				    struct thandle *th)
+{
+	/* nothing to reserve or declare here */
+	return 0;
+}
+
+/*
+ * Declaration step for writing a record.  This implementation has
+ * nothing to declare and always returns 0.
+ */
+static int llog_lvfs_declare_write_rec(const struct lu_env *env,
+				       struct llog_handle *loghandle,
+				       struct llog_rec_hdr *rec,
+				       int idx, struct thandle *th)
+{
+	/* nothing to reserve or declare here */
+	return 0;
+}
+
+struct llog_operations llog_lvfs_ops = { /* llog method table backed by the llog_lvfs_* functions of this file */
+ .lop_write_rec = llog_lvfs_write_rec,
+ .lop_next_block = llog_lvfs_next_block,
+ .lop_prev_block = llog_lvfs_prev_block,
+ .lop_read_header = llog_lvfs_read_header,
+ .lop_create = llog_lvfs_create,
+ .lop_destroy = llog_lvfs_destroy,
+ .lop_close = llog_lvfs_close,
+ .lop_open = llog_lvfs_open,
+ .lop_exist = llog_lvfs_exist,
+ .lop_declare_create = llog_lvfs_declare_create,
+ .lop_declare_write_rec = llog_lvfs_declare_write_rec,
+};
+EXPORT_SYMBOL(llog_lvfs_ops);
+#else /* !LLOG_LVFS */
+struct llog_operations llog_lvfs_ops = {}; /* empty table when LLOG_LVFS is not defined */
+#endif /* LLOG_LVFS */
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_obd.c b/drivers/staging/lustre/lustre/obdclass/llog_obd.c
new file mode 100644
index 000000000000..7e2290796315
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_obd.c
@@ -0,0 +1,319 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <obd_class.h>
+#include <lustre_log.h>
+#include "llog_internal.h"
+
+/* helper functions for calling the llog obd methods */
+/*
+ * Allocate a llog context for @obd.
+ *
+ * The new context points back at @obd and starts with a reference count
+ * of one.  Returns NULL when the allocation fails.
+ */
+static struct llog_ctxt* llog_new_ctxt(struct obd_device *obd)
+{
+	struct llog_ctxt *new_ctxt;
+
+	OBD_ALLOC_PTR(new_ctxt);
+	if (new_ctxt != NULL) {
+		new_ctxt->loc_obd = obd;
+		atomic_set(&new_ctxt->loc_refcount, 1);
+	}
+
+	return new_ctxt;
+}
+
+/*
+ * Free @ctxt after dropping the export and import references it holds.
+ * Either reference may be absent (NULL), in which case it is skipped.
+ */
+static void llog_ctxt_destroy(struct llog_ctxt *ctxt)
+{
+	if (ctxt->loc_exp != NULL) {
+		class_export_put(ctxt->loc_exp);
+		ctxt->loc_exp = NULL;
+	}
+
+	if (ctxt->loc_imp != NULL) {
+		class_import_put(ctxt->loc_imp);
+		ctxt->loc_imp = NULL;
+	}
+
+	OBD_FREE_PTR(ctxt);
+}
+/*
+ * Drop one reference on @ctxt and tear the context down if it was the
+ * last one.
+ *
+ * The decrement and the clearing of the group slot olg_ctxts[loc_idx] are
+ * both done under olg_lock.  Once the last reference is gone the function
+ * calls the CTXTP(ctxt, cleanup) hook if it is set, frees the context with
+ * llog_ctxt_destroy() and wakes up waiters on olg_waitq.
+ *
+ * Returns 0 when other references remain or when there is no cleanup
+ * hook, otherwise the value returned by the cleanup hook.
+ */
+int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+ struct obd_llog_group *olg = ctxt->loc_olg;
+ struct obd_device *obd;
+ int rc = 0;
+
+ spin_lock(&olg->olg_lock);
+ if (!atomic_dec_and_test(&ctxt->loc_refcount)) {
+ spin_unlock(&olg->olg_lock);
+ return rc;
+ }
+ olg->olg_ctxts[ctxt->loc_idx] = NULL; /* last reference: empty the group slot */
+ spin_unlock(&olg->olg_lock);
+
+ obd = ctxt->loc_obd;
+ spin_lock(&obd->obd_dev_lock);
+ /* sync with llog ctxt user thread: the lock is taken and released
+ * immediately, nothing is modified while it is held */
+ spin_unlock(&obd->obd_dev_lock);
+
+ /* obd->obd_starting is needed for the case of cleanup
+ * in error case while obd is starting up. */
+ LASSERTF(obd->obd_starting == 1 ||
+ obd->obd_stopping == 1 || obd->obd_set_up == 0,
+ "wrong obd state: %d/%d/%d\n", !!obd->obd_starting,
+ !!obd->obd_stopping, !!obd->obd_set_up);
+
+ /* cleanup the llog ctxt here */
+ if (CTXTP(ctxt, cleanup))
+ rc = CTXTP(ctxt, cleanup)(env, ctxt);
+
+ llog_ctxt_destroy(ctxt);
+ wake_up(&olg->olg_waitq); /* llog_cleanup() waits on this queue */
+ return rc;
+}
+EXPORT_SYMBOL(__llog_ctxt_put);
+/*
+ * Release @ctxt and wait until its slot in the llog group is empty.
+ *
+ * On entry the context must hold more than one reference (asserted).
+ * One reference is dropped with llog_ctxt_put(), a second one with
+ * __llog_ctxt_put(); the function then sleeps on olg_waitq until
+ * llog_group_ctxt_null(olg, idx) becomes true.
+ *
+ * Returns the value of __llog_ctxt_put(); a non-zero value is also logged.
+ */
+int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+ struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL);
+ struct obd_llog_group *olg;
+ int rc, idx;
+ ENTRY;
+
+ LASSERT(ctxt != NULL);
+ LASSERT(ctxt != LP_POISON);
+
+ olg = ctxt->loc_olg;
+ LASSERT(olg != NULL);
+ LASSERT(olg != LP_POISON);
+
+ idx = ctxt->loc_idx; /* saved now: ctxt may be freed by __llog_ctxt_put() below */
+
+ /*
+ * Balance the ctxt get when calling llog_cleanup()
+ */
+ LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON);
+ LASSERT(atomic_read(&ctxt->loc_refcount) > 1);
+ llog_ctxt_put(ctxt);
+
+ /*
+ * Try to free the ctxt.
+ */
+ rc = __llog_ctxt_put(env, ctxt);
+ if (rc)
+ CERROR("Error %d while cleaning up ctxt %p\n",
+ rc, ctxt);
+
+ l_wait_event(olg->olg_waitq,
+ llog_group_ctxt_null(olg, idx), &lwi);
+
+ RETURN(rc);
+}
+EXPORT_SYMBOL(llog_cleanup);
+/*
+ * Create the llog context number @index of group @olg for device @obd.
+ *
+ * Steps:
+ * - reject an @index outside [0, LLOG_MAX_CTXTS) with -EINVAL;
+ * - allocate a context (-ENOMEM on failure), fill it in and take a
+ *   reference on disk_obd->obd_self_export;
+ * - register it with llog_group_set_ctxt().  If that fails the new context
+ *   is destroyed; -EEXIST is turned into success after asserting that the
+ *   already registered context matches the arguments;
+ * - call op->lop_setup() when it is provided (the OBD_FAIL_OBD_LLOG_SETUP
+ *   fail check substitutes -EOPNOTSUPP for the call).  On failure the
+ *   context is unregistered and destroyed, on success its
+ *   LLOG_CTXT_FLAG_UNINITIALIZED flag is cleared.
+ *
+ * Returns 0 on success or a negative error code.
+ */
+int llog_setup(const struct lu_env *env, struct obd_device *obd,
+ struct obd_llog_group *olg, int index,
+ struct obd_device *disk_obd, struct llog_operations *op)
+{
+ struct llog_ctxt *ctxt;
+ int rc = 0;
+ ENTRY;
+
+ if (index < 0 || index >= LLOG_MAX_CTXTS)
+ RETURN(-EINVAL);
+
+ LASSERT(olg != NULL);
+
+ ctxt = llog_new_ctxt(obd);
+ if (!ctxt)
+ RETURN(-ENOMEM);
+
+ ctxt->loc_obd = obd; /* NOTE(review): redundant, llog_new_ctxt() already set it */
+ ctxt->loc_olg = olg;
+ ctxt->loc_idx = index;
+ ctxt->loc_logops = op;
+ mutex_init(&ctxt->loc_mutex);
+ ctxt->loc_exp = class_export_get(disk_obd->obd_self_export);
+ ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED;
+
+ rc = llog_group_set_ctxt(olg, ctxt, index);
+ if (rc) {
+ llog_ctxt_destroy(ctxt); /* also drops the export reference taken above */
+ if (rc == -EEXIST) {
+ ctxt = llog_group_get_ctxt(olg, index);
+ if (ctxt) {
+ /*
+ * mds_lov_update_desc() might call here multiple
+ * times. So if the llog is already set up then
+ * there is no need to do it again.
+ */
+ CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n",
+ obd->obd_name, index);
+ LASSERT(ctxt->loc_olg == olg);
+ LASSERT(ctxt->loc_obd == obd);
+ LASSERT(ctxt->loc_exp == disk_obd->obd_self_export);
+ LASSERT(ctxt->loc_logops == op);
+ llog_ctxt_put(ctxt);
+ }
+ rc = 0;
+ }
+ RETURN(rc);
+ }
+
+ if (op->lop_setup) {
+ if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP))
+ rc = -EOPNOTSUPP;
+ else
+ rc = op->lop_setup(env, obd, olg, index, disk_obd);
+ }
+
+ if (rc) {
+ CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n",
+ obd->obd_name, index, op->lop_setup, rc);
+ llog_group_clear_ctxt(olg, index);
+ llog_ctxt_destroy(ctxt);
+ } else {
+ CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n",
+ obd->obd_name, index);
+ ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED;
+ }
+
+ RETURN(rc);
+}
+EXPORT_SYMBOL(llog_setup);
+
+/*
+ * Call the "sync" method of @ctxt, if there is one.
+ *
+ * A NULL @ctxt and a context without a sync method are both treated as
+ * success (0).  Otherwise the result of the method is returned.
+ */
+int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags)
+{
+	ENTRY;
+
+	if (ctxt == NULL)
+		RETURN(0);
+
+	if (CTXTP(ctxt, sync) == NULL)
+		RETURN(0);
+
+	RETURN(CTXTP(ctxt, sync)(ctxt, exp, flags));
+}
+EXPORT_SYMBOL(llog_sync);
+
+/*
+ * Add record @rec through the "obd_add" method of @ctxt.
+ *
+ * Returns -ENODEV for a NULL context and -ENXIO for a context still
+ * flagged LLOG_CTXT_FLAG_UNINITIALIZED; CTXT_CHECK_OP() is invoked with
+ * -EOPNOTSUPP for a missing method.  CFS_CAP_SYS_RESOURCE is raised around
+ * the call and lowered again afterwards, unless it was already raised on
+ * entry.  Otherwise the result of the method is returned.
+ */
+int llog_obd_add(const struct lu_env *env, struct llog_ctxt *ctxt,
+		 struct llog_rec_hdr *rec, struct lov_stripe_md *lsm,
+		 struct llog_cookie *logcookies, int numcookies)
+{
+	int had_cap;
+	int rc;
+	ENTRY;
+
+	if (ctxt == NULL) {
+		CERROR("No ctxt\n");
+		RETURN(-ENODEV);
+	}
+
+	if ((ctxt->loc_flags & LLOG_CTXT_FLAG_UNINITIALIZED) != 0)
+		RETURN(-ENXIO);
+
+	CTXT_CHECK_OP(ctxt, obd_add, -EOPNOTSUPP);
+
+	/* only touch the capability when this call is the one raising it */
+	had_cap = cfs_cap_raised(CFS_CAP_SYS_RESOURCE);
+	if (had_cap == 0)
+		cfs_cap_raise(CFS_CAP_SYS_RESOURCE);
+
+	rc = CTXTP(ctxt, obd_add)(env, ctxt, rec, lsm, logcookies,
+				  numcookies);
+
+	if (had_cap == 0)
+		cfs_cap_lower(CFS_CAP_SYS_RESOURCE);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_obd_add);
+
+/*
+ * Cancel @count cookies through the "cancel" method of @ctxt.
+ *
+ * Returns -ENODEV for a NULL context; CTXT_CHECK_OP() is invoked with
+ * -EOPNOTSUPP for a missing method.  Otherwise the result of the method
+ * is returned.
+ */
+int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt,
+		struct lov_stripe_md *lsm, int count,
+		struct llog_cookie *cookies, int flags)
+{
+	ENTRY;
+
+	if (ctxt == NULL) {
+		CERROR("No ctxt\n");
+		RETURN(-ENODEV);
+	}
+
+	CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP);
+
+	RETURN(CTXTP(ctxt, cancel)(env, ctxt, lsm, count, cookies, flags));
+}
+EXPORT_SYMBOL(llog_cancel);
+
+/*
+ * Dispatch to the "llog_init" method of @obd and return its result.
+ *
+ * OBD_CHECK_DT_OP() is invoked with 0 before the call - presumably the
+ * value returned when the device has no such method; confirm against the
+ * macro definition.  The per-device llog_init counter is incremented.
+ */
+int obd_llog_init(struct obd_device *obd, struct obd_llog_group *olg,
+		  struct obd_device *disk_obd, int *index)
+{
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, llog_init, 0);
+	OBD_COUNTER_INCREMENT(obd, llog_init);
+
+	RETURN(OBP(obd, llog_init)(obd, olg, disk_obd, index));
+}
+EXPORT_SYMBOL(obd_llog_init);
+
+/*
+ * Dispatch to the "llog_finish" method of @obd and return its result.
+ *
+ * OBD_CHECK_DT_OP() is invoked with 0 before the call - presumably the
+ * value returned when the device has no such method; confirm against the
+ * macro definition.  The per-device llog_finish counter is incremented.
+ */
+int obd_llog_finish(struct obd_device *obd, int count)
+{
+	ENTRY;
+
+	OBD_CHECK_DT_OP(obd, llog_finish, 0);
+	OBD_COUNTER_INCREMENT(obd, llog_finish);
+
+	RETURN(OBP(obd, llog_finish)(obd, count));
+}
+EXPORT_SYMBOL(obd_llog_finish);
+
+/*
+ * Per-thread llog context key, carrying a struct llog_thread_info.
+ * Context key constructor/destructor: llog_key_init, llog_key_fini.
+ */
+LU_KEY_INIT_FINI(llog, struct llog_thread_info);
+/* context key: llog_thread_key (exported below) */
+LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL);
+LU_KEY_INIT_GENERIC(llog); /* llog_key_init_generic(), called from llog_info_init() */
+EXPORT_SYMBOL(llog_thread_key);
+
+/*
+ * Set up the llog per-thread context key.
+ *
+ * Runs the generic key initializer on llog_thread_key and registers the
+ * key with the lu_context machinery.
+ *
+ * Returns the result of lu_context_key_register(): 0 on success or a
+ * negative errno.  The result used to be discarded, so a failed
+ * registration was reported to the caller as success.
+ */
+int llog_info_init(void)
+{
+	llog_key_init_generic(&llog_thread_key, NULL);
+	return lu_context_key_register(&llog_thread_key);
+}
+/*
+ * Counterpart of llog_info_init(): hands llog_thread_key to
+ * lu_context_key_degister().
+ */
+void llog_info_fini(void)
+{
+ lu_context_key_degister(&llog_thread_key);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_osd.c b/drivers/staging/lustre/lustre/obdclass/llog_osd.c
new file mode 100644
index 000000000000..6dbd21a863c2
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_osd.c
@@ -0,0 +1,1323 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_osd.c - low level llog routines on top of OSD API
+ *
+ * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+#ifndef EXPORT_SYMTAB
+#define EXPORT_SYMTAB
+#endif
+
+#include <obd.h>
+#include <obd_class.h>
+#include <lustre_fid.h>
+#include <dt_object.h>
+
+#include "llog_internal.h"
+#include "local_storage.h"
+
+/*
+ * - multi-chunks or big-declaration approach
+ * - use unique sequence instead of llog sb tracking unique ids
+ * - re-use existing environment
+ * - named llog support (can be used for testing only at the present)
+ * - llog_origin_connect() work with OSD API
+ */
+
+/*
+ * Declare, in transaction @th, the creation of llog object @o inside
+ * local storage @los.
+ *
+ * The attributes and object format are staged in the per-thread llog
+ * info: a regular file (S_IFREG) with mode S_IRUGO | S_IWUSR, only
+ * LA_MODE being marked valid.  Returns the result of
+ * local_object_declare_create().
+ */
+static int llog_osd_declare_new_object(const struct lu_env *env,
+				       struct local_oid_storage *los,
+				       struct dt_object *o,
+				       struct thandle *th)
+{
+	struct llog_thread_info *info = llog_info(env);
+
+	info->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+	info->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+	info->lgi_attr.la_valid = LA_MODE;
+
+	return local_object_declare_create(env, los, o, &info->lgi_attr,
+					   &info->lgi_dof, th);
+}
+
+/*
+ * Create llog object @o inside local storage @los within transaction @th.
+ *
+ * Uses the same attributes as llog_osd_declare_new_object(): a regular
+ * file (S_IFREG) with mode S_IRUGO | S_IWUSR, only LA_MODE being marked
+ * valid.  Returns the result of local_object_create().
+ */
+static int llog_osd_create_new_object(const struct lu_env *env,
+				      struct local_oid_storage *los,
+				      struct dt_object *o,
+				      struct thandle *th)
+{
+	struct llog_thread_info *info = llog_info(env);
+
+	info->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+	info->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+	info->lgi_attr.la_valid = LA_MODE;
+
+	return local_object_create(env, los, o, &info->lgi_attr,
+				   &info->lgi_dof, th);
+}
+/*
+ * Write a padding record of @len bytes carrying index @index at *@off.
+ *
+ * Only the record header (type LLOG_PAD_MAGIC) and the record tail are
+ * written; the bytes between them are not.  Both writes are done under the
+ * write lock of @o.  @len must be at least LLOG_MIN_REC_SIZE and a
+ * multiple of 8 (asserted).
+ *
+ * Returns 0 on success or the error returned by dt_record_write().
+ */
+static int llog_osd_pad(const struct lu_env *env, struct dt_object *o,
+ loff_t *off, int len, int index, struct thandle *th)
+{
+ struct llog_thread_info *lgi = llog_info(env);
+ int rc;
+
+ ENTRY;
+
+ LASSERT(th);
+ LASSERT(off);
+ LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0);
+
+ lgi->lgi_tail.lrt_len = lgi->lgi_lrh.lrh_len = len;
+ lgi->lgi_tail.lrt_index = lgi->lgi_lrh.lrh_index = index;
+ lgi->lgi_lrh.lrh_type = LLOG_PAD_MAGIC;
+
+ lgi->lgi_buf.lb_buf = &lgi->lgi_lrh;
+ lgi->lgi_buf.lb_len = sizeof(lgi->lgi_lrh);
+ dt_write_lock(env, o, 0);
+ rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+ if (rc) {
+ CERROR("%s: error writing padding record: rc = %d\n",
+ o->do_lu.lo_dev->ld_obd->obd_name, rc);
+ GOTO(out, rc);
+ }
+
+ lgi->lgi_buf.lb_buf = &lgi->lgi_tail;
+ lgi->lgi_buf.lb_len = sizeof(lgi->lgi_tail);
+ /* skip the unwritten middle of the record; presumably dt_record_write()
+ * already advanced *off past the header - TODO confirm */
+ *off += len - sizeof(lgi->lgi_lrh) - sizeof(lgi->lgi_tail);
+ rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+ if (rc)
+ CERROR("%s: error writing padding record: rc = %d\n",
+ o->do_lu.lo_dev->ld_obd->obd_name, rc);
+out:
+ dt_write_unlock(env, o);
+ RETURN(rc);
+}
+/*
+ * Write one llog record to @o at *@off within transaction @th.
+ *
+ * Two modes:
+ * - @buf == NULL: @rec is a complete record and is written as a single
+ *   buffer of rec->lrh_len bytes (no object lock is taken here);
+ * - @buf != NULL: rec->lrh_len on entry is the payload length.  It is
+ *   rewritten to header + payload + tail, and header, payload and tail are
+ *   written by three consecutive calls under the write lock of @o.
+ *
+ * The offset on entry is remembered in lgi_attr.la_size.  If any write
+ * fails, the object is punched from that offset to OBD_OBJECT_EOF and
+ * dt_attr_set() is called with that size (LA_SIZE).
+ *
+ * Returns 0 on success or the error returned by dt_record_write().
+ */
+static int llog_osd_write_blob(const struct lu_env *env, struct dt_object *o,
+ struct llog_rec_hdr *rec, void *buf,
+ loff_t *off, struct thandle *th)
+{
+ struct llog_thread_info *lgi = llog_info(env);
+ int buflen = rec->lrh_len;
+ int rc;
+
+ ENTRY;
+
+ LASSERT(env);
+ LASSERT(o);
+
+ if (buflen == 0)
+ CWARN("0-length record\n");
+
+ CDEBUG(D_OTHER, "write blob with type %x, buf %p/%u at off %llu\n",
+ rec->lrh_type, buf, buflen, *off);
+
+ /* remember the starting offset for the error rollback at "out" */
+ lgi->lgi_attr.la_valid = LA_SIZE;
+ lgi->lgi_attr.la_size = *off;
+
+ if (!buf) {
+ lgi->lgi_buf.lb_len = buflen;
+ lgi->lgi_buf.lb_buf = rec;
+ rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+ if (rc)
+ CERROR("%s: error writing log record: rc = %d\n",
+ o->do_lu.lo_dev->ld_obd->obd_name, rc);
+ GOTO(out, rc);
+ }
+
+ /* the buf case */
+ /* protect the following 3 writes from concurrent read */
+ dt_write_lock(env, o, 0);
+ rec->lrh_len = sizeof(*rec) + buflen + sizeof(lgi->lgi_tail);
+ lgi->lgi_buf.lb_len = sizeof(*rec);
+ lgi->lgi_buf.lb_buf = rec;
+ rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+ if (rc) {
+ CERROR("%s: error writing log hdr: rc = %d\n",
+ o->do_lu.lo_dev->ld_obd->obd_name, rc);
+ GOTO(out_unlock, rc);
+ }
+
+ lgi->lgi_buf.lb_len = buflen;
+ lgi->lgi_buf.lb_buf = buf;
+ rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+ if (rc) {
+ CERROR("%s: error writing log buffer: rc = %d\n",
+ o->do_lu.lo_dev->ld_obd->obd_name, rc);
+ GOTO(out_unlock, rc);
+ }
+
+ lgi->lgi_tail.lrt_len = rec->lrh_len;
+ lgi->lgi_tail.lrt_index = rec->lrh_index;
+ lgi->lgi_buf.lb_len = sizeof(lgi->lgi_tail);
+ lgi->lgi_buf.lb_buf = &lgi->lgi_tail;
+ rc = dt_record_write(env, o, &lgi->lgi_buf, off, th);
+ if (rc)
+ CERROR("%s: error writing log tail: rc = %d\n",
+ o->do_lu.lo_dev->ld_obd->obd_name, rc);
+
+out_unlock:
+ dt_write_unlock(env, o);
+
+out:
+ /* cleanup the content written above: truncate back to the starting
+ * offset; the results of dt_punch()/dt_attr_set() are not checked */
+ if (rc) {
+ dt_punch(env, o, lgi->lgi_attr.la_size, OBD_OBJECT_EOF, th,
+ BYPASS_CAPA);
+ dt_attr_set(env, o, &lgi->lgi_attr, th, BYPASS_CAPA);
+ }
+
+ RETURN(rc);
+}
+/*
+ * Read and validate the llog header of @handle.
+ *
+ * Reads LLOG_CHUNK_SIZE bytes from offset 0 of handle->lgh_obj into
+ * handle->lgh_hdr, byte-swaps the header when needed and checks its magic
+ * and length.  On success handle->lgh_last_idx is set from the index
+ * stored in the header tail.
+ *
+ * Returns:
+ *   0            on success;
+ *   LLOG_EEMPTY  when the object has size 0;
+ *   -EIO         when the magic or the header length is wrong;
+ *   otherwise the error from dt_attr_get() or dt_record_read().
+ */
+static int llog_osd_read_header(const struct lu_env *env,
+ struct llog_handle *handle)
+{
+ struct llog_rec_hdr *llh_hdr;
+ struct dt_object *o;
+ struct llog_thread_info *lgi;
+ int rc;
+
+ ENTRY;
+
+ LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE);
+
+ o = handle->lgh_obj;
+ LASSERT(o);
+
+ lgi = llog_info(env);
+
+ rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
+ if (rc)
+ RETURN(rc);
+
+ LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+
+ if (lgi->lgi_attr.la_size == 0) {
+ CDEBUG(D_HA, "not reading header from 0-byte log\n");
+ RETURN(LLOG_EEMPTY);
+ }
+
+ lgi->lgi_off = 0;
+ lgi->lgi_buf.lb_buf = handle->lgh_hdr;
+ lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE;
+
+ rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off);
+ if (rc) {
+ CERROR("%s: error reading log header from "DFID": rc = %d\n",
+ o->do_lu.lo_dev->ld_obd->obd_name,
+ PFID(lu_object_fid(&o->do_lu)), rc);
+ RETURN(rc);
+ }
+
+ llh_hdr = &handle->lgh_hdr->llh_hdr;
+ if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
+ lustre_swab_llog_hdr(handle->lgh_hdr);
+
+ if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
+ CERROR("%s: bad log %s "DFID" header magic: %#x "
+ "(expected %#x)\n", o->do_lu.lo_dev->ld_obd->obd_name,
+ handle->lgh_name ? handle->lgh_name : "",
+ PFID(lu_object_fid(&o->do_lu)),
+ llh_hdr->lrh_type, LLOG_HDR_MAGIC);
+ RETURN(-EIO);
+ } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
+ CERROR("%s: incorrectly sized log %s "DFID" header: "
+ "%#x (expected %#x)\n"
+ "you may need to re-run lconf --write_conf.\n",
+ o->do_lu.lo_dev->ld_obd->obd_name,
+ handle->lgh_name ? handle->lgh_name : "",
+ PFID(lu_object_fid(&o->do_lu)),
+ llh_hdr->lrh_len, LLOG_CHUNK_SIZE);
+ RETURN(-EIO);
+ }
+
+ handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index;
+
+ RETURN(0);
+}
+/*
+ * Declare, in transaction @th, the updates needed to write a record to
+ * @loghandle.
+ *
+ * A rewrite of the llog header (sizeof(struct llog_log_hdr) at offset 0)
+ * is always declared.  When @idx is 0 only the header is written and the
+ * function stops there.  Otherwise, if the object exists, a punch from
+ * the current size to OBD_OBJECT_EOF is declared, and finally a record
+ * write of 32 KiB at the current end of the object (offset 0 when the
+ * object does not exist yet).  @rec is not used.
+ *
+ * Returns 0 on success or the first error from the dt_* calls.
+ */
+static int llog_osd_declare_write_rec(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ struct llog_rec_hdr *rec,
+ int idx, struct thandle *th)
+{
+ struct llog_thread_info *lgi = llog_info(env);
+ struct dt_object *o;
+ int rc;
+
+ ENTRY;
+
+ LASSERT(env);
+ LASSERT(th);
+ LASSERT(loghandle);
+
+ o = loghandle->lgh_obj;
+ LASSERT(o);
+
+ /* each time we update header */
+ rc = dt_declare_record_write(env, o, sizeof(struct llog_log_hdr), 0,
+ th);
+ if (rc || idx == 0) /* if error or just header */
+ RETURN(rc);
+
+ if (dt_object_exists(o)) {
+ rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+ lgi->lgi_off = lgi->lgi_attr.la_size; /* only used if rc == 0, see below */
+ LASSERT(ergo(rc == 0, lgi->lgi_attr.la_valid & LA_SIZE));
+ if (rc)
+ RETURN(rc);
+
+ rc = dt_declare_punch(env, o, lgi->lgi_off, OBD_OBJECT_EOF, th);
+ if (rc)
+ RETURN(rc);
+ } else {
+ lgi->lgi_off = 0;
+ }
+
+ /* XXX: implement declared window or multi-chunks approach */
+ rc = dt_declare_record_write(env, o, 32 * 1024, lgi->lgi_off, th);
+
+ RETURN(rc);
+}
+
+/*
+ * Write record @rec (with optional payload @buf) to @loghandle within
+ * transaction @th.
+ *
+ * Returns a negative value on error; on success 1 if @reccookie is
+ * non-NULL (the cookie is then filled in) and 0 otherwise.
+ * -E2BIG is returned when the record does not fit in LLOG_CHUNK_SIZE and
+ * -ENOSPC when the log has no free index left.  @cookiecount is not used.
+ *
+ * In append mode the header bitmap, record count and tail index are
+ * updated in memory first and the header is rewritten before the record;
+ * if anything fails afterwards these changes are undone and the header is
+ * written once more.
+ */
+/* appends if idx == -1, otherwise overwrites record idx. */
+static int llog_osd_write_rec(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ struct llog_rec_hdr *rec,
+ struct llog_cookie *reccookie, int cookiecount,
+ void *buf, int idx, struct thandle *th)
+{
+ struct llog_thread_info *lgi = llog_info(env);
+ struct llog_log_hdr *llh;
+ int reclen = rec->lrh_len;
+ int index, rc, old_tail_idx;
+ struct llog_rec_tail *lrt;
+ struct dt_object *o;
+ size_t left;
+
+ ENTRY;
+
+ LASSERT(env);
+ llh = loghandle->lgh_hdr;
+ LASSERT(llh);
+ o = loghandle->lgh_obj;
+ LASSERT(o);
+ LASSERT(th);
+
+ CDEBUG(D_OTHER, "new record %x to "DFID"\n",
+ rec->lrh_type, PFID(lu_object_fid(&o->do_lu)));
+
+ /* record length should not bigger than LLOG_CHUNK_SIZE */
+ if (buf)
+ rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
+ sizeof(struct llog_rec_tail)) ? -E2BIG : 0;
+ else
+ rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0;
+ if (rc)
+ RETURN(rc);
+
+ rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
+ if (rc)
+ RETURN(rc);
+
+ if (buf)
+ /* write_blob adds header and tail to lrh_len. */
+ reclen = sizeof(*rec) + rec->lrh_len +
+ sizeof(struct llog_rec_tail);
+
+ /* overwrite mode: rewrite the header, then record idx in place */
+ if (idx != -1) {
+ /* no header: only allowed to insert record 1 */
+ if (idx != 1 && lgi->lgi_attr.la_size == 0)
+ LBUG();
+
+ if (idx && llh->llh_size && llh->llh_size != rec->lrh_len)
+ RETURN(-EINVAL);
+
+ if (!ext2_test_bit(idx, llh->llh_bitmap))
+ CERROR("%s: modify unset record %u\n",
+ o->do_lu.lo_dev->ld_obd->obd_name, idx);
+ if (idx != rec->lrh_index)
+ CERROR("%s: index mismatch %d %u\n",
+ o->do_lu.lo_dev->ld_obd->obd_name, idx,
+ rec->lrh_index);
+
+ lgi->lgi_off = 0;
+ rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL,
+ &lgi->lgi_off, th);
+ /* we are done if we only write the header or on error */
+ if (rc || idx == 0)
+ RETURN(rc);
+
+ if (buf) {
+ /* We assume that caller has set lgh_cur_* */
+ lgi->lgi_off = loghandle->lgh_cur_offset;
+ CDEBUG(D_OTHER,
+ "modify record "DOSTID": idx:%d/%u/%d, len:%u "
+ "offset %llu\n",
+ POSTID(&loghandle->lgh_id.lgl_oi), idx,
+ rec->lrh_index,
+ loghandle->lgh_cur_idx, rec->lrh_len,
+ (long long)(lgi->lgi_off - sizeof(*llh)));
+ if (rec->lrh_index != loghandle->lgh_cur_idx) {
+ CERROR("%s: modify idx mismatch %u/%d\n",
+ o->do_lu.lo_dev->ld_obd->obd_name, idx,
+ loghandle->lgh_cur_idx);
+ RETURN(-EFAULT);
+ }
+ } else {
+ /* Assumes constant lrh_len */
+ lgi->lgi_off = sizeof(*llh) + (idx - 1) * reclen;
+ }
+
+ rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th);
+ if (rc == 0 && reccookie) {
+ reccookie->lgc_lgl = loghandle->lgh_id;
+ reccookie->lgc_index = idx;
+ rc = 1;
+ }
+ RETURN(rc);
+ }
+
+ /* Make sure that records don't cross a chunk boundary, so we can
+ * process them page-at-a-time if needed. If it will cross a chunk
+ * boundary, write in a fake (but referenced) entry to pad the chunk.
+ *
+ * We know that llog_current_log() will return a loghandle that is
+ * big enough to hold reclen, so all we care about is padding here.
+ */
+ LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+ lgi->lgi_off = lgi->lgi_attr.la_size;
+ left = LLOG_CHUNK_SIZE - (lgi->lgi_off & (LLOG_CHUNK_SIZE - 1));
+ /* NOTE: padding is a record, but no bit is set */
+ if (left != 0 && left != reclen &&
+ left < (reclen + LLOG_MIN_REC_SIZE)) {
+ index = loghandle->lgh_last_idx + 1;
+ rc = llog_osd_pad(env, o, &lgi->lgi_off, left, index, th);
+ if (rc)
+ RETURN(rc);
+ loghandle->lgh_last_idx++; /*for pad rec*/
+ }
+ /* if it's the last idx in log file, then return -ENOSPC */
+ if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1)
+ RETURN(-ENOSPC);
+
+ loghandle->lgh_last_idx++;
+ index = loghandle->lgh_last_idx;
+ LASSERT(index < LLOG_BITMAP_SIZE(llh));
+ rec->lrh_index = index;
+ if (buf == NULL) {
+ /* @rec is a complete record: fill in the tail stored at its end */
+ lrt = (struct llog_rec_tail *)((char *)rec + rec->lrh_len -
+ sizeof(*lrt));
+ lrt->lrt_len = rec->lrh_len;
+ lrt->lrt_index = rec->lrh_index;
+ }
+ /* The caller should make sure only 1 process access the lgh_last_idx,
+ * Otherwise it might hit the assert.*/
+ LASSERT(index < LLOG_BITMAP_SIZE(llh));
+ spin_lock(&loghandle->lgh_hdr_lock);
+ if (ext2_set_bit(index, llh->llh_bitmap)) {
+ CERROR("%s: index %u already set in log bitmap\n",
+ o->do_lu.lo_dev->ld_obd->obd_name, index);
+ spin_unlock(&loghandle->lgh_hdr_lock);
+ LBUG(); /* should never happen */
+ }
+ llh->llh_count++;
+ spin_unlock(&loghandle->lgh_hdr_lock);
+ old_tail_idx = llh->llh_tail.lrt_index; /* kept for the rollback at "out" */
+ llh->llh_tail.lrt_index = index;
+
+ lgi->lgi_off = 0;
+ rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL, &lgi->lgi_off,
+ th);
+ if (rc)
+ GOTO(out, rc);
+
+ rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL);
+ if (rc)
+ GOTO(out, rc);
+
+ LASSERT(lgi->lgi_attr.la_valid & LA_SIZE);
+ lgi->lgi_off = lgi->lgi_attr.la_size;
+
+ rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th);
+
+out:
+ /* cleanup llog for error case */
+ if (rc) {
+ spin_lock(&loghandle->lgh_hdr_lock);
+ ext2_clear_bit(index, llh->llh_bitmap);
+ llh->llh_count--;
+ spin_unlock(&loghandle->lgh_hdr_lock);
+
+ /* restore the header; NOTE(review): only the record index is
+ * rolled back here, not the one consumed by a pad record, and
+ * the result of this header write is not checked */
+ loghandle->lgh_last_idx--;
+ llh->llh_tail.lrt_index = old_tail_idx;
+ lgi->lgi_off = 0;
+ llog_osd_write_blob(env, o, &llh->llh_hdr, NULL,
+ &lgi->lgi_off, th);
+ }
+
+ /* NOTE(review): this message is emitted on the error path as well */
+ CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u\n",
+ POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len);
+ if (rc == 0 && reccookie) {
+ reccookie->lgc_lgl = loghandle->lgh_id;
+ reccookie->lgc_index = index;
+ if ((rec->lrh_type == MDS_UNLINK_REC) ||
+ (rec->lrh_type == MDS_SETATTR64_REC))
+ reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT;
+ else if (rec->lrh_type == OST_SZ_REC)
+ reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT;
+ else
+ reccookie->lgc_subsys = -1;
+ rc = 1;
+ }
+ RETURN(rc);
+}
+
+/*
+ * Advance *@off towards record index @goal, given that the reader is
+ * currently at index @curr.
+ *
+ * Every record takes at least LLOG_MIN_REC_SIZE bytes, so the
+ * (goal - curr - 1) records in between can be skipped without reading
+ * them.  The result is rounded down to a LLOG_CHUNK_SIZE boundary.  If the
+ * records turn out to be larger than the minimum the caller simply has
+ * to skip some more.  Nothing is changed when @goal <= @curr.
+ */
+static void llog_skip_over(__u64 *off, int curr, int goal)
+{
+	__u64 target;
+
+	if (goal > curr) {
+		target = *off + (goal - curr - 1) * LLOG_MIN_REC_SIZE;
+		*off = target & ~(LLOG_CHUNK_SIZE - 1);
+	}
+}
+
+/*
+ * Read into @buf the llog block that contains record index @next_idx.
+ *
+ * @len must be a non-zero multiple of LLOG_CHUNK_SIZE.  Each pass reads
+ * from *@cur_offset up to the next LLOG_CHUNK_SIZE boundary and zero-fills
+ * the rest of @buf.
+ *
+ * sets:
+ * - cur_offset to the furthest point read in the log file
+ * - cur_idx to the log index preceding cur_offset
+ * returns 0 on success and when the read hits end of file,
+ * -EINVAL for a bad @len, a short block or a zero tail index,
+ * -ENOENT when the block starts past @next_idx, -EIO when the end of the
+ * object is reached without finding the record, or the error from
+ * dt_attr_get()/dt_read()
+ */
+static int llog_osd_next_block(const struct lu_env *env,
+ struct llog_handle *loghandle, int *cur_idx,
+ int next_idx, __u64 *cur_offset, void *buf,
+ int len)
+{
+ struct llog_thread_info *lgi = llog_info(env);
+ struct dt_object *o;
+ struct dt_device *dt;
+ int rc;
+
+ ENTRY;
+
+ LASSERT(env);
+ LASSERT(lgi);
+
+ if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+ RETURN(-EINVAL);
+
+ CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n",
+ next_idx, *cur_idx, *cur_offset);
+
+ LASSERT(loghandle);
+ LASSERT(loghandle->lgh_ctxt);
+
+ o = loghandle->lgh_obj;
+ LASSERT(o);
+ LASSERT(dt_object_exists(o));
+ dt = lu2dt_dev(o->do_lu.lo_dev);
+ LASSERT(dt);
+
+ rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+ if (rc)
+ GOTO(out, rc);
+
+ while (*cur_offset < lgi->lgi_attr.la_size) {
+ struct llog_rec_hdr *rec, *last_rec;
+ struct llog_rec_tail *tail;
+
+ llog_skip_over(cur_offset, *cur_idx, next_idx);
+
+ /* read up to next LLOG_CHUNK_SIZE block */
+ lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE -
+ (*cur_offset & (LLOG_CHUNK_SIZE - 1));
+ lgi->lgi_buf.lb_buf = buf;
+
+ /* Note: read lock is not needed around la_size get above at
+ * the time of dt_attr_get(). There are only two cases that
+ * matter. Either la_size == cur_offset, in which case the
+ * entire read is skipped, or la_size > cur_offset and the loop
+ * is entered and this thread is blocked at dt_read_lock()
+ * until the write is completed. When the write completes, then
+ * the dt_read() will be done with the full length, and will
+ * get the full data.
+ */
+ dt_read_lock(env, o, 0);
+ rc = dt_read(env, o, &lgi->lgi_buf, cur_offset);
+ dt_read_unlock(env, o);
+ if (rc < 0) {
+ CERROR("%s: can't read llog block from log "DFID
+ " offset "LPU64": rc = %d\n",
+ o->do_lu.lo_dev->ld_obd->obd_name,
+ PFID(lu_object_fid(&o->do_lu)), *cur_offset,
+ rc);
+ GOTO(out, rc);
+ }
+
+ if (rc < len) {
+ /* signal the end of the valid buffer to
+ * llog_process */
+ memset(buf + rc, 0, len - rc);
+ }
+
+ if (rc == 0) /* end of file, nothing to do */
+ GOTO(out, rc);
+
+ if (rc < sizeof(*tail)) {
+ CERROR("%s: invalid llog block at log id "DOSTID"/%u "
+ "offset "LPU64"\n",
+ o->do_lu.lo_dev->ld_obd->obd_name,
+ POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen, *cur_offset);
+ GOTO(out, rc = -EINVAL);
+ }
+
+ rec = buf;
+ if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+ lustre_swab_llog_rec(rec);
+
+ tail = (struct llog_rec_tail *)((char *)buf + rc -
+ sizeof(struct llog_rec_tail));
+ /* get the last record in block
+ * NOTE(review): lrt_len comes straight from the data just read and
+ * is not checked against rc before being used as an offset */
+ last_rec = (struct llog_rec_hdr *)((char *)buf + rc -
+ le32_to_cpu(tail->lrt_len));
+
+ if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+ lustre_swab_llog_rec(last_rec);
+ LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+ *cur_idx = tail->lrt_index;
+
+ /* this shouldn't happen */
+ if (tail->lrt_index == 0) {
+ CERROR("%s: invalid llog tail at log id "DOSTID"/%u "
+ "offset "LPU64"\n",
+ o->do_lu.lo_dev->ld_obd->obd_name,
+ POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen, *cur_offset);
+ GOTO(out, rc = -EINVAL);
+ }
+ if (tail->lrt_index < next_idx) /* wanted record is further on */
+ continue;
+
+ /* sanity check that the start of the new buffer is no farther
+ * than the record that we wanted. This shouldn't happen. */
+ if (rec->lrh_index > next_idx) {
+ CERROR("%s: missed desired record? %u > %u\n",
+ o->do_lu.lo_dev->ld_obd->obd_name,
+ rec->lrh_index, next_idx);
+ GOTO(out, rc = -ENOENT);
+ }
+ GOTO(out, rc = 0);
+ }
+ GOTO(out, rc = -EIO);
+out:
+ return rc;
+}
+/*
+ * Read into @buf the llog block that contains record index @prev_idx.
+ *
+ * @len must be a non-zero multiple of LLOG_CHUNK_SIZE.  The search starts
+ * at offset LLOG_CHUNK_SIZE, advanced once by llog_skip_over() for
+ * @prev_idx, and then reads @len bytes per pass until a block whose tail
+ * index is not below @prev_idx is found.
+ *
+ * Returns 0 on success and when the read hits end of file, -EINVAL for a
+ * bad @len, a short block or a zero tail index, -ENOENT when the block
+ * starts past @prev_idx, -EIO when the end of the object is reached
+ * without finding the record, or the error from dt_attr_get()/dt_read().
+ */
+static int llog_osd_prev_block(const struct lu_env *env,
+ struct llog_handle *loghandle,
+ int prev_idx, void *buf, int len)
+{
+ struct llog_thread_info *lgi = llog_info(env);
+ struct dt_object *o;
+ struct dt_device *dt;
+ loff_t cur_offset;
+ int rc;
+
+ ENTRY;
+
+ if (len == 0 || len & (LLOG_CHUNK_SIZE - 1))
+ RETURN(-EINVAL);
+
+ CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx);
+
+ LASSERT(loghandle);
+ LASSERT(loghandle->lgh_ctxt);
+
+ o = loghandle->lgh_obj;
+ LASSERT(o);
+ LASSERT(dt_object_exists(o));
+ dt = lu2dt_dev(o->do_lu.lo_dev);
+ LASSERT(dt);
+
+ cur_offset = LLOG_CHUNK_SIZE;
+ /* NOTE(review): cur_offset is a loff_t but llog_skip_over() takes a
+ * __u64 pointer */
+ llog_skip_over(&cur_offset, 0, prev_idx);
+
+ rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+ if (rc)
+ GOTO(out, rc);
+
+ while (cur_offset < lgi->lgi_attr.la_size) {
+ struct llog_rec_hdr *rec, *last_rec;
+ struct llog_rec_tail *tail;
+
+ lgi->lgi_buf.lb_len = len;
+ lgi->lgi_buf.lb_buf = buf;
+ /* It is OK to have locking around dt_read() only, see
+ * comment in llog_osd_next_block for details
+ */
+ dt_read_lock(env, o, 0);
+ rc = dt_read(env, o, &lgi->lgi_buf, &cur_offset);
+ dt_read_unlock(env, o);
+ if (rc < 0) {
+ CERROR("%s: can't read llog block from log "DFID
+ " offset "LPU64": rc = %d\n",
+ o->do_lu.lo_dev->ld_obd->obd_name,
+ PFID(lu_object_fid(&o->do_lu)), cur_offset, rc);
+ GOTO(out, rc);
+ }
+
+ if (rc == 0) /* end of file, nothing to do */
+ GOTO(out, rc);
+
+ if (rc < sizeof(*tail)) {
+ CERROR("%s: invalid llog block at log id "DOSTID"/%u "
+ "offset "LPU64"\n",
+ o->do_lu.lo_dev->ld_obd->obd_name,
+ POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen, cur_offset);
+ GOTO(out, rc = -EINVAL);
+ }
+
+ rec = buf;
+ if (LLOG_REC_HDR_NEEDS_SWABBING(rec))
+ lustre_swab_llog_rec(rec);
+
+ tail = (struct llog_rec_tail *)((char *)buf + rc -
+ sizeof(struct llog_rec_tail));
+ /* get the last record in block
+ * NOTE(review): lrt_len comes straight from the data just read and
+ * is not checked against rc before being used as an offset */
+ last_rec = (struct llog_rec_hdr *)((char *)buf + rc -
+ le32_to_cpu(tail->lrt_len));
+
+ if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec))
+ lustre_swab_llog_rec(last_rec);
+ LASSERT(last_rec->lrh_index == tail->lrt_index);
+
+ /* this shouldn't happen */
+ if (tail->lrt_index == 0) {
+ CERROR("%s: invalid llog tail at log id "DOSTID"/%u "
+ "offset "LPU64"\n",
+ o->do_lu.lo_dev->ld_obd->obd_name,
+ POSTID(&loghandle->lgh_id.lgl_oi),
+ loghandle->lgh_id.lgl_ogen, cur_offset);
+ GOTO(out, rc = -EINVAL);
+ }
+ if (tail->lrt_index < prev_idx) /* wanted record is further on */
+ continue;
+
+ /* sanity check that the start of the new buffer is no farther
+ * than the record that we wanted. This shouldn't happen. */
+ if (rec->lrh_index > prev_idx) {
+ CERROR("%s: missed desired record? %u > %u\n",
+ o->do_lu.lo_dev->ld_obd->obd_name,
+ rec->lrh_index, prev_idx);
+ GOTO(out, rc = -ENOENT);
+ }
+ GOTO(out, rc = 0);
+ }
+ GOTO(out, rc = -EIO);
+out:
+ return rc;
+}
+
+/*
+ * Return the directory object used for named llogs of @ctxt.
+ *
+ * When the context has a directory of its own (loc_dir), an extra
+ * reference is taken on it and it is returned.  Otherwise the root of the
+ * backing dt device is looked up and located.
+ *
+ * Returns the object, ERR_PTR(rc) when dt_root_get() fails, or whatever
+ * dt_locate() returns.
+ */
+struct dt_object *llog_osd_dir_get(const struct lu_env *env,
+				   struct llog_ctxt *ctxt)
+{
+	struct dt_thread_info *info = dt_info(env);
+	struct dt_device *dev = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+	int rc;
+
+	if (ctxt->loc_dir != NULL) {
+		lu_object_get(&ctxt->loc_dir->do_lu);
+		return ctxt->loc_dir;
+	}
+
+	rc = dt_root_get(env, dev, &info->dti_fid);
+	if (rc != 0)
+		return ERR_PTR(rc);
+
+	return dt_locate(env, dev, &info->dti_fid);
+}
+
+/*
+ * Bind @handle to the llog object selected by @logid or @name.
+ *
+ * The FID of the object is obtained in one of three ways:
+ * - @logid != NULL: converted directly from the log id;
+ * - @name != NULL: looked up in the llog directory; when the name does
+ *   not exist and @open_param is LLOG_OPEN_NEW a fresh FID is generated.
+ *   A private copy of @name is stored in handle->lgh_name;
+ * - neither: a fresh FID is generated (@open_param must contain
+ *   LLOG_OPEN_NEW, asserted).
+ *
+ * On success handle->lgh_id, handle->lgh_obj and handle->private_data
+ * (the local oid storage) are set.  On failure the local oid storage
+ * reference is dropped and handle->lgh_name is freed and reset to NULL.
+ *
+ * Returns 0 on success, -ENOENT when an existing llog was requested but
+ * the object does not exist, -ENOMEM when the name cannot be copied, or
+ * the error from the lookup, FID generation or object location.
+ */
+static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle,
+			 struct llog_logid *logid, char *name,
+			 enum llog_open_param open_param)
+{
+	struct llog_thread_info *lgi = llog_info(env);
+	struct llog_ctxt *ctxt;
+	struct dt_object *o;
+	struct dt_device *dt;
+	struct ls_device *ls;
+	struct local_oid_storage *los;
+	int rc = 0;
+
+	ENTRY;
+
+	LASSERT(env);
+	/* check the handle before its first dereference, not after */
+	LASSERT(handle);
+	ctxt = handle->lgh_ctxt;
+	LASSERT(ctxt);
+	LASSERT(ctxt->loc_exp);
+	LASSERT(ctxt->loc_exp->exp_obd);
+	dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+	LASSERT(dt);
+
+	ls = ls_device_get(dt);
+	if (IS_ERR(ls))
+		RETURN(PTR_ERR(ls));
+
+	mutex_lock(&ls->ls_los_mutex);
+	los = dt_los_find(ls, name != NULL ? FID_SEQ_LLOG_NAME : FID_SEQ_LLOG);
+	mutex_unlock(&ls->ls_los_mutex);
+	LASSERT(los);
+	/* NOTE(review): ls is still passed to ls_locate() below after this
+	 * put; presumably the reference held through los keeps it alive -
+	 * confirm */
+	ls_device_put(env, ls);
+
+	if (logid != NULL) {
+		logid_to_fid(logid, &lgi->lgi_fid);
+	} else if (name) {
+		struct dt_object *llog_dir;
+
+		llog_dir = llog_osd_dir_get(env, ctxt);
+		if (IS_ERR(llog_dir))
+			GOTO(out, rc = PTR_ERR(llog_dir));
+		dt_read_lock(env, llog_dir, 0);
+		rc = dt_lookup_dir(env, llog_dir, name, &lgi->lgi_fid);
+		dt_read_unlock(env, llog_dir);
+		lu_object_put(env, &llog_dir->do_lu);
+		if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) {
+			/* generate fid for new llog */
+			rc = local_object_fid_generate(env, los,
+						       &lgi->lgi_fid);
+		}
+		if (rc < 0)
+			GOTO(out, rc);
+		OBD_ALLOC(handle->lgh_name, strlen(name) + 1);
+		if (handle->lgh_name)
+			strcpy(handle->lgh_name, name);
+		else
+			GOTO(out, rc = -ENOMEM);
+	} else {
+		LASSERTF(open_param & LLOG_OPEN_NEW, "%#x\n", open_param);
+		/* generate fid for new llog */
+		rc = local_object_fid_generate(env, los, &lgi->lgi_fid);
+		if (rc < 0)
+			GOTO(out, rc);
+	}
+
+	o = ls_locate(env, ls, &lgi->lgi_fid);
+	if (IS_ERR(o))
+		GOTO(out_name, rc = PTR_ERR(o));
+
+	/* No new llog is expected but doesn't exist */
+	if (open_param != LLOG_OPEN_NEW && !dt_object_exists(o))
+		GOTO(out_put, rc = -ENOENT);
+
+	fid_to_logid(&lgi->lgi_fid, &handle->lgh_id);
+	handle->lgh_obj = o;
+	handle->private_data = los;
+	LASSERT(handle->lgh_ctxt);
+
+	RETURN(rc);
+
+out_put:
+	lu_object_put(env, &o->do_lu);
+out_name:
+	if (handle->lgh_name != NULL) {
+		/*
+		 * Size the free from the stored copy rather than from @name,
+		 * which may be NULL, and do not leave a stale pointer behind
+		 * in the handle.
+		 */
+		size_t name_len = strlen(handle->lgh_name) + 1;
+
+		OBD_FREE(handle->lgh_name, name_len);
+		handle->lgh_name = NULL;
+	}
+out:
+	dt_los_put(los);
+	RETURN(rc);
+}
+
+/* Report whether the object behind @handle exists and is not dying.
+ * Returns non-zero only when both conditions hold. */
+static int llog_osd_exist(struct llog_handle *handle)
+{
+	LASSERT(handle->lgh_obj);
+
+	if (!dt_object_exists(handle->lgh_obj))
+		return 0;
+
+	return !lu_object_is_dying(handle->lgh_obj->do_lu.lo_header);
+}
+
+/* Declare, within transaction @th, everything llog_osd_create() will do for
+ * @res: the new object, a write of LLOG_CHUNK_SIZE bytes at offset 0 and,
+ * for a named llog, the insertion of its name into the llog directory.
+ * Nothing is declared when the object already exists.
+ * Returns 0 on success or the first error encountered. */
+static int llog_osd_declare_create(const struct lu_env *env,
+				   struct llog_handle *res, struct thandle *th)
+{
+	struct llog_thread_info		*lgi = llog_info(env);
+	struct local_oid_storage	*los;
+	struct dt_object		*dir;
+	struct dt_object		*o;
+	int				 rc;
+
+	ENTRY;
+
+	LASSERT(res->lgh_obj);
+	LASSERT(th);
+
+	/* object can be created by another thread */
+	o = res->lgh_obj;
+	if (dt_object_exists(o))
+		RETURN(0);
+
+	los = res->private_data;
+	LASSERT(los);
+
+	rc = llog_osd_declare_new_object(env, los, o, th);
+	if (rc != 0)
+		RETURN(rc);
+
+	rc = dt_declare_record_write(env, o, LLOG_CHUNK_SIZE, 0, th);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* anonymous llogs have no directory entry to declare */
+	if (res->lgh_name == NULL)
+		RETURN(rc);
+
+	dir = llog_osd_dir_get(env, res->lgh_ctxt);
+	if (IS_ERR(dir))
+		RETURN(PTR_ERR(dir));
+
+	logid_to_fid(&res->lgh_id, &lgi->lgi_fid);
+	rc = dt_declare_insert(env, dir, (struct dt_rec *)&lgi->lgi_fid,
+			       (struct dt_key *)res->lgh_name, th);
+	lu_object_put(env, &dir->do_lu);
+	if (rc != 0)
+		CERROR("%s: can't declare named llog %s: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       res->lgh_name, rc);
+
+	RETURN(rc);
+}
+
+/* Callback from the llog_* functions: create the object behind @res inside
+ * transaction @th and, for a named llog, insert the name into the llog
+ * directory.
+ * Assumes caller has already pushed us into the kernel context.
+ * Returns -EEXIST when the object already exists (checked both before and
+ * after taking the object write lock), 0 on success, or another error. */
+static int llog_osd_create(const struct lu_env *env, struct llog_handle *res,
+			   struct thandle *th)
+{
+	struct llog_thread_info		*lgi = llog_info(env);
+	struct local_oid_storage	*los;
+	struct dt_object		*dir;
+	struct dt_object		*o;
+	int				 rc;
+
+	ENTRY;
+
+	LASSERT(env);
+	o = res->lgh_obj;
+	LASSERT(o);
+
+	/* llog can be already created */
+	if (dt_object_exists(o))
+		RETURN(-EEXIST);
+
+	los = res->private_data;
+	LASSERT(los);
+
+	/* re-check under the write lock before creating */
+	dt_write_lock(env, o, 0);
+	if (dt_object_exists(o))
+		rc = -EEXIST;
+	else
+		rc = llog_osd_create_new_object(env, los, o, th);
+	dt_write_unlock(env, o);
+
+	if (rc != 0 || res->lgh_name == NULL)
+		RETURN(rc);
+
+	dir = llog_osd_dir_get(env, res->lgh_ctxt);
+	if (IS_ERR(dir))
+		RETURN(PTR_ERR(dir));
+
+	logid_to_fid(&res->lgh_id, &lgi->lgi_fid);
+	dt_read_lock(env, dir, 0);
+	rc = dt_insert(env, dir, (struct dt_rec *)&lgi->lgi_fid,
+		       (struct dt_key *)res->lgh_name, th, BYPASS_CAPA, 1);
+	dt_read_unlock(env, dir);
+	lu_object_put(env, &dir->do_lu);
+	if (rc != 0)
+		CERROR("%s: can't create named llog %s: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       res->lgh_name, rc);
+
+	RETURN(rc);
+}
+
+/* Release what llog_osd_open() attached to @handle: the object reference,
+ * the local_oid_storage reference and the copy of the name, if any.
+ * Always returns 0. */
+static int llog_osd_close(const struct lu_env *env, struct llog_handle *handle)
+{
+	struct local_oid_storage *los;
+
+	ENTRY;
+
+	LASSERT(handle->lgh_obj);
+	lu_object_put(env, &handle->lgh_obj->do_lu);
+
+	los = handle->private_data;
+	LASSERT(los);
+	dt_los_put(los);
+
+	if (handle->lgh_name != NULL)
+		OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1);
+
+	RETURN(0);
+}
+
+/* Destroy the llog object behind @loghandle in a local transaction.
+ *
+ * For a named llog the directory entry is deleted as well.  The deletion,
+ * the reference drop and the destroy are declared first, then executed
+ * under the object write lock, and only if the object still exists.
+ *
+ * Returns 0 on success (including when the object no longer exists) or a
+ * negative errno. */
+static int llog_osd_destroy(const struct lu_env *env,
+			    struct llog_handle *loghandle)
+{
+	struct llog_ctxt	*ctxt;
+	struct dt_object	*o, *llog_dir = NULL;
+	struct dt_device	*d;
+	struct thandle		*th;
+	char			*name = NULL;
+	int			 rc;
+
+	ENTRY;
+
+	ctxt = loghandle->lgh_ctxt;
+	LASSERT(ctxt);
+
+	o = loghandle->lgh_obj;
+	LASSERT(o);
+
+	d = lu2dt_dev(o->do_lu.lo_dev);
+	LASSERT(d);
+	LASSERT(d == ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt);
+
+	th = dt_trans_create(env, d);
+	if (IS_ERR(th))
+		RETURN(PTR_ERR(th));
+
+	if (loghandle->lgh_name) {
+		llog_dir = llog_osd_dir_get(env, ctxt);
+		if (IS_ERR(llog_dir)) {
+			rc = PTR_ERR(llog_dir);
+			/* the cleanup path puts any non-NULL llog_dir, so
+			 * never leave an error pointer in it */
+			llog_dir = NULL;
+			GOTO(out_trans, rc);
+		}
+
+		name = loghandle->lgh_name;
+		rc = dt_declare_delete(env, llog_dir,
+				       (struct dt_key *)name, th);
+		if (rc)
+			GOTO(out_trans, rc);
+	}
+
+	dt_declare_ref_del(env, o, th);
+
+	rc = dt_declare_destroy(env, o, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	rc = dt_trans_start_local(env, d, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	dt_write_lock(env, o, 0);
+	if (dt_object_exists(o)) {
+		if (name) {
+			/* remove the directory entry before the object */
+			dt_read_lock(env, llog_dir, 0);
+			rc = dt_delete(env, llog_dir,
+				       (struct dt_key *) name,
+				       th, BYPASS_CAPA);
+			dt_read_unlock(env, llog_dir);
+			if (rc) {
+				CERROR("%s: can't remove llog %s: rc = %d\n",
+				       o->do_lu.lo_dev->ld_obd->obd_name,
+				       name, rc);
+				GOTO(out_unlock, rc);
+			}
+		}
+		dt_ref_del(env, o, th);
+		rc = dt_destroy(env, o, th);
+		if (rc)
+			GOTO(out_unlock, rc);
+	}
+out_unlock:
+	dt_write_unlock(env, o);
+out_trans:
+	dt_trans_stop(env, d, th);
+	if (llog_dir != NULL)
+		lu_object_put(env, &llog_dir->do_lu);
+	RETURN(rc);
+}
+
+/* Set up the local OID storages used to generate FIDs for llogs on
+ * @disk_obd: one for the FID_SEQ_LLOG sequence and one for the
+ * FID_SEQ_LLOG_NAME sequence.
+ *
+ * The context reference taken on olg->olg_ctxts[ctxt_idx] is dropped on
+ * every exit path.  Returns 0 on success or the error from
+ * local_oid_storage_init(). */
+static int llog_osd_setup(const struct lu_env *env, struct obd_device *obd,
+			  struct obd_llog_group *olg, int ctxt_idx,
+			  struct obd_device *disk_obd)
+{
+	struct local_oid_storage	*los;
+	struct llog_thread_info		*lgi = llog_info(env);
+	struct llog_ctxt		*ctxt;
+	int				 rc = 0;
+
+	ENTRY;
+
+	LASSERT(obd);
+	LASSERT(olg->olg_ctxts[ctxt_idx]);
+
+	ctxt = llog_ctxt_get(olg->olg_ctxts[ctxt_idx]);
+	LASSERT(ctxt);
+
+	/* initialize data allowing to generate new fids,
+	 * literally we need a sequence */
+	lgi->lgi_fid.f_seq = FID_SEQ_LLOG;
+	lgi->lgi_fid.f_oid = 1;
+	lgi->lgi_fid.f_ver = 0;
+	rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt,
+				    &lgi->lgi_fid, &los);
+	if (rc < 0)
+		GOTO(out, rc);
+
+	/* NOTE(review): if this second init fails, the storage set up above
+	 * is left in place - confirm whether it should be finalized here */
+	lgi->lgi_fid.f_seq = FID_SEQ_LLOG_NAME;
+	lgi->lgi_fid.f_oid = 1;
+	lgi->lgi_fid.f_ver = 0;
+	rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt,
+				    &lgi->lgi_fid, &los);
+out:
+	/* drop the reference taken by llog_ctxt_get() on all paths */
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+
+/* Finalize the FID_SEQ_LLOG and FID_SEQ_LLOG_NAME local OID storages of the
+ * device behind @ctxt, whichever of them can be found.
+ * Returns 0, or the error from ls_device_get(). */
+static int llog_osd_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+	struct local_oid_storage	*found[2];
+	struct ls_device		*ls;
+	struct dt_device		*dt;
+	int				 i;
+
+	LASSERT(ctxt->loc_exp->exp_obd);
+	dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
+
+	ls = ls_device_get(dt);
+	if (IS_ERR(ls))
+		RETURN(PTR_ERR(ls));
+
+	mutex_lock(&ls->ls_los_mutex);
+	found[0] = dt_los_find(ls, FID_SEQ_LLOG);
+	found[1] = dt_los_find(ls, FID_SEQ_LLOG_NAME);
+	mutex_unlock(&ls->ls_los_mutex);
+
+	for (i = 0; i < 2; i++) {
+		if (found[i] == NULL)
+			continue;
+		/* drop the reference from the lookup, then finalize */
+		dt_los_put(found[i]);
+		local_oid_storage_fini(env, found[i]);
+	}
+
+	ls_device_put(env, ls);
+	return 0;
+}
+
+/* Exported llog operation table: binds each lop_* method to the
+ * corresponding llog_osd_* implementation. */
+struct llog_operations llog_osd_ops = {
+ .lop_next_block = llog_osd_next_block,
+ .lop_prev_block = llog_osd_prev_block,
+ .lop_read_header = llog_osd_read_header,
+ .lop_destroy = llog_osd_destroy,
+ .lop_setup = llog_osd_setup,
+ .lop_cleanup = llog_osd_cleanup,
+ .lop_open = llog_osd_open,
+ .lop_exist = llog_osd_exist,
+ .lop_declare_create = llog_osd_declare_create,
+ .lop_create = llog_osd_create,
+ .lop_declare_write_rec = llog_osd_declare_write_rec,
+ .lop_write_rec = llog_osd_write_rec,
+ .lop_close = llog_osd_close,
+};
+EXPORT_SYMBOL(llog_osd_ops);
+
+/* Reads the catalog list.
+ *
+ * Reads @count llog_catid entries starting at entry @idx of the CATALOGS
+ * object on device @d into @idarray, creating the object as a regular file
+ * first if it does not exist yet.
+ *
+ * @idarray is zeroed before reading.  If the file ends at or before the
+ * requested offset nothing is read; if it ends inside the requested range
+ * only the bytes present on disk are read and the rest stays zeroed.
+ *
+ * When @idarray is NULL nothing is read and the number of entries held by
+ * the file (size / entry size) is returned instead.
+ *
+ * Returns 0 on success (or the entry count, see above) or a negative
+ * errno; -ENOENT is returned when CATALOGS is not a regular file. */
+int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d,
+			  int idx, int count, struct llog_catid *idarray)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct dt_object	*o = NULL;
+	struct thandle		*th;
+	int			 rc, size;
+
+	ENTRY;
+
+	LASSERT(d);
+
+	size = sizeof(*idarray) * count;
+	lgi->lgi_off = idx * sizeof(*idarray);
+
+	lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID);
+
+	o = dt_locate(env, d, &lgi->lgi_fid);
+	if (IS_ERR(o))
+		RETURN(PTR_ERR(o));
+
+	if (!dt_object_exists(o)) {
+		/* first use: create CATALOGS in its own local transaction */
+		th = dt_trans_create(env, d);
+		if (IS_ERR(th))
+			GOTO(out, rc = PTR_ERR(th));
+
+		lgi->lgi_attr.la_valid = LA_MODE;
+		lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+		lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+		rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL,
+				       &lgi->lgi_dof, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		rc = dt_trans_start_local(env, d, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		/* re-check under the lock, another thread may have won */
+		dt_write_lock(env, o, 0);
+		if (!dt_object_exists(o))
+			rc = dt_create(env, o, &lgi->lgi_attr, NULL,
+				       &lgi->lgi_dof, th);
+		dt_write_unlock(env, o);
+out_trans:
+		dt_trans_stop(env, d, th);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+	if (rc)
+		GOTO(out, rc);
+
+	if (!S_ISREG(lgi->lgi_attr.la_mode)) {
+		CERROR("%s: CATALOGS is not a regular file!: mode = %o\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       lgi->lgi_attr.la_mode);
+		GOTO(out, rc = -ENOENT);
+	}
+
+	CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n",
+	       (int)lgi->lgi_attr.la_size, size);
+
+	/* return just number of llogs */
+	if (idarray == NULL) {
+		rc = lgi->lgi_attr.la_size / sizeof(*idarray);
+		GOTO(out, rc);
+	}
+
+	/* read for new ost index or for empty file */
+	memset(idarray, 0, size);
+	/* nothing on disk at or beyond the requested offset */
+	if (lgi->lgi_attr.la_size <= lgi->lgi_off)
+		GOTO(out, rc = 0);
+	/* file ends inside the requested range: read only what exists */
+	if (lgi->lgi_attr.la_size < lgi->lgi_off + size)
+		size = lgi->lgi_attr.la_size - lgi->lgi_off;
+
+	lgi->lgi_buf.lb_buf = idarray;
+	lgi->lgi_buf.lb_len = size;
+	rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off);
+	if (rc) {
+		CERROR("%s: error reading CATALOGS: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, rc);
+		GOTO(out, rc);
+	}
+
+out:
+	lu_object_put(env, &o->do_lu);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_osd_get_cat_list);
+
+/* Writes the cat list.
+ *
+ * Writes @count llog_catid entries from @idarray at entry @idx of the
+ * CATALOGS object on device @d, inside a local transaction.  The object
+ * must already exist and be a regular file, otherwise -ENOENT is returned.
+ * A @count of zero is a no-op.
+ *
+ * Returns 0 on success or a negative errno. */
+int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d,
+			  int idx, int count, struct llog_catid *idarray)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct dt_object	*o = NULL;
+	struct thandle		*th;
+	int			 rc, size;
+
+	ENTRY;
+
+	if (!count)
+		RETURN(0);
+
+	LASSERT(d);
+
+	size = sizeof(*idarray) * count;
+	lgi->lgi_off = idx * sizeof(*idarray);
+
+	lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID);
+
+	o = dt_locate(env, d, &lgi->lgi_fid);
+	if (IS_ERR(o))
+		RETURN(PTR_ERR(o));
+
+	if (!dt_object_exists(o))
+		GOTO(out, rc = -ENOENT);
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+	if (rc)
+		GOTO(out, rc);
+
+	if (!S_ISREG(lgi->lgi_attr.la_mode)) {
+		CERROR("%s: CATALOGS is not a regular file!: mode = %o\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       lgi->lgi_attr.la_mode);
+		GOTO(out, rc = -ENOENT);
+	}
+
+	th = dt_trans_create(env, d);
+	if (IS_ERR(th))
+		GOTO(out, rc = PTR_ERR(th));
+
+	rc = dt_declare_record_write(env, o, size, lgi->lgi_off, th);
+	if (rc)
+		/* the handle exists by now, so it must be stopped */
+		GOTO(out_trans, rc);
+
+	rc = dt_trans_start_local(env, d, th);
+	if (rc)
+		GOTO(out_trans, rc);
+
+	lgi->lgi_buf.lb_buf = idarray;
+	lgi->lgi_buf.lb_len = size;
+	rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th);
+	if (rc)
+		CDEBUG(D_INODE, "error writeing CATALOGS: rc = %d\n", rc);
+out_trans:
+	dt_trans_stop(env, d, th);
+out:
+	lu_object_put(env, &o->do_lu);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_osd_put_cat_list);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_swab.c b/drivers/staging/lustre/lustre/obdclass/llog_swab.c
new file mode 100644
index 000000000000..dedfecff95bc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_swab.c
@@ -0,0 +1,407 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_swab.c
+ *
+ * Swabbing of llog datatypes (from disk or over the wire).
+ *
+ * Author: jacob berkman <jacob@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_LOG
+
+
+#include <lustre_log.h>
+
+/* Dump every field of the llogd_body @d at D_OTHER debug level; called by
+ * lustre_swab_llogd_body() before and after swabbing. */
+static void print_llogd_body(struct llogd_body *d)
+{
+ CDEBUG(D_OTHER, "llogd body: %p\n", d);
+ CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi: "DOSTID"\n",
+ POSTID(&d->lgd_logid.lgl_oi));
+ CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen);
+ CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx);
+ CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags);
+ CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index);
+ CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index);
+ CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len);
+ CDEBUG(D_OTHER, "\tlgd_cur_offset: "LPX64"\n", d->lgd_cur_offset);
+}
+
+/* Byte-swap in place the three members of @fid: the 64-bit sequence and the
+ * 32-bit object id and version. */
+void lustre_swab_lu_fid(struct lu_fid *fid)
+{
+	__swab64s(&fid->f_seq);
+	__swab32s(&fid->f_oid);
+	__swab32s(&fid->f_ver);
+}
+EXPORT_SYMBOL(lustre_swab_lu_fid);
+
+/* Byte-swap @oid in place.  When fid_seq_is_mdt0() accepts the (not yet
+ * swabbed) oi_seq the id/seq pair is swabbed as two 64-bit values,
+ * otherwise the contents are swabbed as an lu_fid. */
+void lustre_swab_ost_id(struct ost_id *oid)
+{
+	if (!fid_seq_is_mdt0(oid->oi.oi_seq)) {
+		lustre_swab_lu_fid(&oid->oi_fid);
+		return;
+	}
+
+	__swab64s(&oid->oi.oi_id);
+	__swab64s(&oid->oi.oi_seq);
+}
+EXPORT_SYMBOL(lustre_swab_ost_id);
+
+/* Byte-swap an llog_logid in place: the 64-bit object id and sequence of
+ * lgl_oi, then the 32-bit generation. */
+void lustre_swab_llog_id(struct llog_logid *log_id)
+{
+	/* object identifier */
+	__swab64s(&log_id->lgl_oi.oi.oi_id);
+	__swab64s(&log_id->lgl_oi.oi.oi_seq);
+
+	/* generation */
+	__swab32s(&log_id->lgl_ogen);
+}
+EXPORT_SYMBOL(lustre_swab_llog_id);
+
+/* Byte-swap every field of the llogd_body @d in place, dumping the body to
+ * the debug log both before and after the conversion. */
+void lustre_swab_llogd_body(struct llogd_body *d)
+{
+	ENTRY;
+
+	print_llogd_body(d);
+
+	lustre_swab_llog_id(&d->lgd_logid);
+	__swab32s(&d->lgd_ctxt_idx);
+	__swab32s(&d->lgd_llh_flags);
+	__swab32s(&d->lgd_index);
+	__swab32s(&d->lgd_saved_index);
+	__swab32s(&d->lgd_len);
+	__swab64s(&d->lgd_cur_offset);
+
+	print_llogd_body(d);
+
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_llogd_body);
+
+/* Byte-swap an llogd_conn_body in place: the two generation counters, the
+ * log id and the context index. */
+void lustre_swab_llogd_conn_body(struct llogd_conn_body *d)
+{
+	/* generation */
+	__swab64s(&d->lgdc_gen.mnt_cnt);
+	__swab64s(&d->lgdc_gen.conn_cnt);
+
+	lustre_swab_llog_id(&d->lgdc_logid);
+	__swab32s(&d->lgdc_ctxt_idx);
+}
+EXPORT_SYMBOL(lustre_swab_llogd_conn_body);
+
+/* Byte-swap in place the 64-bit id and the 32-bit generation and type of
+ * @fid. */
+void lustre_swab_ll_fid(struct ll_fid *fid)
+{
+	__swab64s(&fid->id);
+	__swab32s(&fid->generation);
+	__swab32s(&fid->f_type);
+}
+EXPORT_SYMBOL(lustre_swab_ll_fid);
+
+/* Byte-swap in place the 64-bit start/end and the 32-bit index/flags of
+ * @range. */
+void lustre_swab_lu_seq_range(struct lu_seq_range *range)
+{
+	/* 64-bit bounds */
+	__swab64s(&range->lsr_start);
+	__swab64s(&range->lsr_end);
+
+	/* 32-bit members */
+	__swab32s(&range->lsr_index);
+	__swab32s(&range->lsr_flags);
+}
+EXPORT_SYMBOL(lustre_swab_lu_seq_range);
+
+/* Byte-swap one llog record in place.
+ *
+ * The common header (lrh_len, lrh_index, lrh_type, lrh_id) is swabbed first;
+ * the already-swabbed lrh_type then selects which record layout is used to
+ * swab the payload.  Each case that knows its layout also records where the
+ * record tail lives, and the tail (lrt_len, lrt_index) is swabbed at the
+ * end.  OBD_CFG_REC and LLOG_PAD_MAGIC records get neither payload nor tail
+ * swabbing here; an unknown type is reported with CERROR and likewise only
+ * has its header swabbed. */
+void lustre_swab_llog_rec(struct llog_rec_hdr *rec)
+{
+ struct llog_rec_tail *tail = NULL;
+
+ __swab32s(&rec->lrh_len);
+ __swab32s(&rec->lrh_index);
+ __swab32s(&rec->lrh_type);
+ __swab32s(&rec->lrh_id);
+
+ /* lrh_type is in host byte order from here on */
+ switch (rec->lrh_type) {
+ case OST_SZ_REC:
+ {
+ struct llog_size_change_rec *lsc =
+ (struct llog_size_change_rec *)rec;
+
+ lustre_swab_ll_fid(&lsc->lsc_fid);
+ __swab32s(&lsc->lsc_ioepoch);
+ tail = &lsc->lsc_tail;
+ break;
+ }
+ case MDS_UNLINK_REC:
+ {
+ struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec;
+
+ __swab64s(&lur->lur_oid);
+ __swab32s(&lur->lur_oseq);
+ __swab32s(&lur->lur_count);
+ tail = &lur->lur_tail;
+ break;
+ }
+ case MDS_UNLINK64_REC:
+ {
+ struct llog_unlink64_rec *lur =
+ (struct llog_unlink64_rec *)rec;
+
+ lustre_swab_lu_fid(&lur->lur_fid);
+ __swab32s(&lur->lur_count);
+ tail = &lur->lur_tail;
+ break;
+ }
+ case CHANGELOG_REC:
+ {
+ struct llog_changelog_rec *cr = (struct llog_changelog_rec*)rec;
+
+ __swab16s(&cr->cr.cr_namelen);
+ __swab16s(&cr->cr.cr_flags);
+ __swab32s(&cr->cr.cr_type);
+ __swab64s(&cr->cr.cr_index);
+ __swab64s(&cr->cr.cr_prev);
+ __swab64s(&cr->cr.cr_time);
+ lustre_swab_lu_fid(&cr->cr.cr_tfid);
+ lustre_swab_lu_fid(&cr->cr.cr_pfid);
+ /* evaluated after cr_flags has been swabbed above; the
+ * extended layout has two more fids and its own tail */
+ if (CHANGELOG_REC_EXTENDED(&cr->cr)) {
+ struct llog_changelog_ext_rec *ext =
+ (struct llog_changelog_ext_rec *)rec;
+
+ lustre_swab_lu_fid(&ext->cr.cr_sfid);
+ lustre_swab_lu_fid(&ext->cr.cr_spfid);
+ tail = &ext->cr_tail;
+ } else {
+ tail = &cr->cr_tail;
+ }
+ break;
+ }
+ case CHANGELOG_USER_REC:
+ {
+ struct llog_changelog_user_rec *cur =
+ (struct llog_changelog_user_rec*)rec;
+
+ __swab32s(&cur->cur_id);
+ __swab64s(&cur->cur_endrec);
+ tail = &cur->cur_tail;
+ break;
+ }
+
+ case MDS_SETATTR64_REC:
+ {
+ struct llog_setattr64_rec *lsr =
+ (struct llog_setattr64_rec *)rec;
+
+ lustre_swab_ost_id(&lsr->lsr_oi);
+ __swab32s(&lsr->lsr_uid);
+ __swab32s(&lsr->lsr_uid_h);
+ __swab32s(&lsr->lsr_gid);
+ __swab32s(&lsr->lsr_gid_h);
+ tail = &lsr->lsr_tail;
+ break;
+ }
+ case OBD_CFG_REC:
+ /* these are swabbed as they are consumed */
+ break;
+ case LLOG_HDR_MAGIC:
+ {
+ struct llog_log_hdr *llh = (struct llog_log_hdr *)rec;
+
+ __swab64s(&llh->llh_timestamp);
+ __swab32s(&llh->llh_count);
+ __swab32s(&llh->llh_bitmap_offset);
+ __swab32s(&llh->llh_flags);
+ __swab32s(&llh->llh_size);
+ __swab32s(&llh->llh_cat_idx);
+ tail = &llh->llh_tail;
+ break;
+ }
+ case LLOG_LOGID_MAGIC:
+ {
+ struct llog_logid_rec *lid = (struct llog_logid_rec *)rec;
+
+ lustre_swab_llog_id(&lid->lid_id);
+ tail = &lid->lid_tail;
+ break;
+ }
+ case LLOG_GEN_REC:
+ {
+ struct llog_gen_rec *lgr = (struct llog_gen_rec *)rec;
+
+ __swab64s(&lgr->lgr_gen.mnt_cnt);
+ __swab64s(&lgr->lgr_gen.conn_cnt);
+ tail = &lgr->lgr_tail;
+ break;
+ }
+ case LLOG_PAD_MAGIC:
+ break;
+ default:
+ CERROR("Unknown llog rec type %#x swabbing rec %p\n",
+ rec->lrh_type, rec);
+ }
+
+ /* tail stays NULL for the types that did not set it above */
+ if (tail) {
+ __swab32s(&tail->lrt_len);
+ __swab32s(&tail->lrt_index);
+ }
+}
+EXPORT_SYMBOL(lustre_swab_llog_rec);
+
+/* Dump the fields of the llog header @h (record header, counters, flags and
+ * tail) at D_OTHER debug level; called by lustre_swab_llog_hdr() before and
+ * after swabbing. */
+static void print_llog_hdr(struct llog_log_hdr *h)
+{
+ CDEBUG(D_OTHER, "llog header: %p\n", h);
+ CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index);
+ CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len);
+ CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type);
+ CDEBUG(D_OTHER, "\tllh_timestamp: "LPX64"\n", h->llh_timestamp);
+ CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count);
+ CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset);
+ CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags);
+ CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size);
+ CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx);
+ CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", h->llh_tail.lrt_index);
+ CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", h->llh_tail.lrt_len);
+}
+
+/* Byte-swap an llog header in place by handing its embedded record header
+ * to lustre_swab_llog_rec(); the header is dumped to the debug log before
+ * and after the conversion. */
+void lustre_swab_llog_hdr(struct llog_log_hdr *h)
+{
+	ENTRY;
+
+	print_llog_hdr(h);
+	lustre_swab_llog_rec(&h->llh_hdr);
+	print_llog_hdr(h);
+
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_llog_hdr);
+
+/* Dump the fixed fields of @lcfg and, when lcfg_bufcount is below
+ * LUSTRE_CFG_MAX_BUFCOUNT, each buffer length, at D_OTHER debug level.
+ * Does nothing when D_OTHER debugging is disabled. */
+static void print_lustre_cfg(struct lustre_cfg *lcfg)
+{
+	int i;
+	ENTRY;
+
+	if (!(libcfs_debug & D_OTHER)) { /* don't loop on nothing */
+		/* pair the ENTRY above on the early-out path as well */
+		EXIT;
+		return;
+	}
+	CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg);
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version);
+
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command);
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num);
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags);
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid));
+
+	CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", lcfg->lcfg_bufcount);
+	/* skip the lengths entirely if the count is out of range */
+	if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT)
+		for (i = 0; i < lcfg->lcfg_bufcount; i++)
+			CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d\n",
+			       i, lcfg->lcfg_buflens[i]);
+	EXIT;
+}
+
+/* Byte-swap a lustre_cfg in place.  The version is swabbed first; if it
+ * then differs from LUSTRE_CFG_VERSION an error is logged and the rest of
+ * the structure is left untouched.  Otherwise the fixed fields and at most
+ * LUSTRE_CFG_MAX_BUFCOUNT buffer lengths are swabbed and the result is
+ * dumped to the debug log. */
+void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg)
+{
+	int i;
+	ENTRY;
+
+	__swab32s(&lcfg->lcfg_version);
+	if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) {
+		CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n",
+		       lcfg->lcfg_version, LUSTRE_CFG_VERSION);
+		EXIT;
+		return;
+	}
+
+	__swab32s(&lcfg->lcfg_command);
+	__swab32s(&lcfg->lcfg_num);
+	__swab32s(&lcfg->lcfg_flags);
+	__swab64s(&lcfg->lcfg_nid);
+	__swab32s(&lcfg->lcfg_bufcount);
+
+	/* bounded by both the (now swabbed) count and the array limit */
+	i = 0;
+	while (i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT) {
+		__swab32s(&lcfg->lcfg_buflens[i]);
+		i++;
+	}
+
+	print_lustre_cfg(lcfg);
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_swab_lustre_cfg);
+
+/* used only for compatibility with old on-disk cfg_marker data;
+ * in this layout both timestamps are 32 bits wide */
+struct cfg_marker32 {
+ __u32 cm_step;
+ __u32 cm_flags;
+ __u32 cm_vers;
+ __u32 padding;
+ __u32 cm_createtime;
+ __u32 cm_canceltime;
+ char cm_tgtname[MTI_NAME_MAXLEN];
+ char cm_comment[MTI_NAME_MAXLEN];
+};
+
+/* MTI_NAME_MAXLEN reduced by the size difference between cfg_marker and
+ * cfg_marker32; used as the comment length when converting the old layout */
+#define MTI_NAMELEN32 (MTI_NAME_MAXLEN - \
+ (sizeof(struct cfg_marker) - sizeof(struct cfg_marker32)))
+
+/* Convert a cfg_marker in place.
+ *
+ * @marker: buffer holding either a cfg_marker or an old cfg_marker32
+ * @swab:   non-zero if the fields need byte-swapping
+ * @size:   size of the data in @marker; when it equals
+ *          sizeof(struct cfg_marker32) the buffer is treated as the old
+ *          layout and rewritten into the cfg_marker layout
+ *
+ * The first three 32-bit fields are swabbed in place.  For the old layout
+ * the comment and target name are moved to their new offsets (the comment
+ * is truncated to MTI_NAMELEN32 bytes and NUL-terminated) and the two
+ * 32-bit timestamps are swabbed if requested and stored into the
+ * cfg_marker timestamp fields.  Otherwise the timestamps are swabbed as
+ * 64-bit values when @swab is set. */
+void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size)
+{
+ struct cfg_marker32 *cm32 = (struct cfg_marker32*)marker;
+ ENTRY;
+
+ if (swab) {
+ __swab32s(&marker->cm_step);
+ __swab32s(&marker->cm_flags);
+ __swab32s(&marker->cm_vers);
+ }
+ if (size == sizeof(*cm32)) {
+ __u32 createtime, canceltime;
+ /* There was a problem with the original declaration of
+ * cfg_marker on 32-bit systems because it used time_t as
+ * a wire protocol structure, and didn't verify this in
+ * wirecheck. We now have to convert the offsets of the
+ * later fields in order to work on 32- and 64-bit systems.
+ *
+ * Fortunately, the cm_comment field has no functional use
+ * so can be sacrificed when converting the timestamp size.
+ *
+ * Overwrite fields from the end first, so they are not
+ * clobbered, and use memmove() instead of memcpy() because
+ * the source and target buffers overlap. bug 16771 */
+ /* save the old timestamps before the moves below run */
+ createtime = cm32->cm_createtime;
+ canceltime = cm32->cm_canceltime;
+ memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32);
+ marker->cm_comment[MTI_NAMELEN32 - 1] = '\0';
+ memmove(marker->cm_tgtname, cm32->cm_tgtname,
+ sizeof(marker->cm_tgtname));
+ if (swab) {
+ __swab32s(&createtime);
+ __swab32s(&canceltime);
+ }
+ marker->cm_createtime = createtime;
+ marker->cm_canceltime = canceltime;
+ CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) "
+ "for target %s, converting\n",
+ marker->cm_tgtname);
+ } else if (swab) {
+ __swab64s(&marker->cm_createtime);
+ __swab64s(&marker->cm_canceltime);
+ }
+
+ EXIT;
+ return;
+}
+EXPORT_SYMBOL(lustre_swab_cfg_marker);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_test.c b/drivers/staging/lustre/lustre/obdclass/llog_test.c
new file mode 100644
index 000000000000..d397f781ec43
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_test.c
@@ -0,0 +1,1087 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/llog_test.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include <obd_class.h>
+#include <lustre_fid.h>
+#include <lustre_log.h>
+
+/* This is slightly more than the number of records that can fit into a
+ * single llog file, because the llog_log_header takes up some of the
+ * space in the first block that cannot be used for the bitmap. */
+#define LLOG_TEST_RECNUM (LLOG_CHUNK_SIZE * 8)
+
+/* used by llog_test_4() to derive the catalog name */
+static int llog_test_rand;
+/* uuid passed to llog_init_handle() by the tests */
+static struct obd_uuid uuid = { .uuid = "test_uuid" };
+/* id of the catalog created by llog_test_4() */
+static struct llog_logid cat_logid;
+
+/* smallest record used by the tests: a header directly followed by a tail */
+struct llog_mini_rec {
+ struct llog_rec_hdr lmr_hdr;
+ struct llog_rec_tail lmr_tail;
+} __attribute__((packed));
+
+/* Check the bookkeeping of @llh against the header bitmap.
+ *
+ * Counts the bits set in llh_bitmap and fails with -ERANGE, logging a
+ * message prefixed with @test, when the count differs from @num_recs, when
+ * llh_count differs from @num_recs, or when lgh_last_idx is below the
+ * highest set bit.  Returns 0 otherwise. */
+static int verify_handle(char *test, struct llog_handle *llh, int num_recs)
+{
+	int bit;
+	int highest = 0;
+	int set_bits = 0;
+
+	for (bit = 0; bit < LLOG_BITMAP_BYTES * 8; bit++) {
+		if (!ext2_test_bit(bit, llh->lgh_hdr->llh_bitmap))
+			continue;
+		highest = bit;
+		set_bits++;
+	}
+
+	if (set_bits != num_recs) {
+		CERROR("%s: expected %d active recs after write, found %d\n",
+		       test, num_recs, set_bits);
+		RETURN(-ERANGE);
+	}
+
+	if (llh->lgh_hdr->llh_count != num_recs) {
+		CERROR("%s: handle->count is %d, expected %d after write\n",
+		       test, llh->lgh_hdr->llh_count, num_recs);
+		RETURN(-ERANGE);
+	}
+
+	if (llh->lgh_last_idx < highest) {
+		CERROR("%s: handle->last_idx is %d, expected %d after write\n",
+		       test, llh->lgh_last_idx, highest);
+		RETURN(-ERANGE);
+	}
+
+	RETURN(0);
+}
+
+/* Test 1: create a named log called @name, initialise it as a plain llog,
+ * verify it holds exactly one record, then close it.
+ * Returns the first error seen, including one from the close. */
+static int llog_test_1(const struct lu_env *env,
+		       struct obd_device *obd, char *name)
+{
+	struct llog_ctxt	*ctxt;
+	struct llog_handle	*llh;
+	int			 close_rc;
+	int			 rc;
+
+	ENTRY;
+
+	CWARN("1a: create a log with name: %s\n", name);
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	rc = llog_open_create(env, ctxt, &llh, NULL, name);
+	if (rc != 0) {
+		CERROR("1a: llog_create with name %s failed: %d\n", name, rc);
+		GOTO(out, rc);
+	}
+
+	rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, &uuid);
+	if (rc != 0) {
+		CERROR("1a: can't init llog handle: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+
+	rc = verify_handle("1", llh, 1);
+
+	CWARN("1b: close newly-created log\n");
+out_close:
+	close_rc = llog_close(env, llh);
+	if (close_rc != 0) {
+		CERROR("1b: close log %s failed: %d\n", name, close_rc);
+		/* keep the earlier error, if there was one */
+		if (rc == 0)
+			rc = close_rc;
+	}
+out:
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+
+/* Test named-log reopen; returns opened log on success.
+ *
+ * 2a re-opens the existing log @name into *@llh, initialises it as a plain
+ * llog and verifies it holds one record.  On success *@llh is left open for
+ * the caller; on failure after the open it is closed here.
+ *
+ * Steps 2b-2d (anonymous create, re-open by logid, destroy) are currently
+ * unreachable because of the unconditional GOTO that follows 2a. */
+static int llog_test_2(const struct lu_env *env, struct obd_device *obd,
+ char *name, struct llog_handle **llh)
+{
+ struct llog_ctxt *ctxt;
+ struct llog_handle *loghandle;
+ struct llog_logid logid;
+ int rc;
+
+ ENTRY;
+
+ CWARN("2a: re-open a log with name: %s\n", name);
+ ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+ LASSERT(ctxt);
+
+ rc = llog_open(env, ctxt, llh, NULL, name, LLOG_OPEN_EXISTS);
+ if (rc) {
+ CERROR("2a: re-open log with name %s failed: %d\n", name, rc);
+ GOTO(out_put, rc);
+ }
+
+ rc = llog_init_handle(env, *llh, LLOG_F_IS_PLAIN, &uuid);
+ if (rc) {
+ CERROR("2a: can't init llog handle: %d\n", rc);
+ GOTO(out_close_llh, rc);
+ }
+
+ rc = verify_handle("2", *llh, 1);
+ if (rc)
+ GOTO(out_close_llh, rc);
+
+ /* XXX: there is known issue with tests 2b, MGS is not able to create
+ * anonymous llog, exit now to allow following tests run.
+ * It is fixed in upcoming llog over OSD code */
+ GOTO(out_put, rc);
+
+ /* everything from here to the out_close label is dead code for now */
+ CWARN("2b: create a log without specified NAME & LOGID\n");
+ rc = llog_open_create(env, ctxt, &loghandle, NULL, NULL);
+ if (rc) {
+ CERROR("2b: create log failed\n");
+ GOTO(out_close_llh, rc);
+ }
+ rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid);
+ if (rc) {
+ CERROR("2b: can't init llog handle: %d\n", rc);
+ GOTO(out_close, rc);
+ }
+
+ /* remember the id so the log can be re-opened by it in 2c */
+ logid = loghandle->lgh_id;
+ llog_close(env, loghandle);
+
+ CWARN("2c: re-open the log by LOGID\n");
+ rc = llog_open(env, ctxt, &loghandle, &logid, NULL, LLOG_OPEN_EXISTS);
+ if (rc) {
+ CERROR("2c: re-open log by LOGID failed\n");
+ GOTO(out_close_llh, rc);
+ }
+
+ rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid);
+ if (rc) {
+ CERROR("2c: can't init llog handle: %d\n", rc);
+ GOTO(out_close, rc);
+ }
+
+ CWARN("2b: destroy this log\n");
+ rc = llog_destroy(env, loghandle);
+ if (rc)
+ CERROR("2d: destroy log failed\n");
+out_close:
+ llog_close(env, loghandle);
+out_close_llh:
+ /* the named log stays open for the caller unless something failed */
+ if (rc)
+ llog_close(env, *llh);
+out_put:
+ llog_ctxt_put(ctxt);
+
+ RETURN(rc);
+}
+
+/* Test record writing, single and in bulk.
+ *
+ * 3a writes one LLOG_GEN_REC, 3b ten 8-byte OBD_CFG_REC records, 3c another
+ * 1000 LLOG_GEN_REC records, and 3d keeps writing alternating 24- and
+ * 32-byte OBD_CFG_REC records until llog_write() returns -ENOSPC.  After
+ * each step the handle is checked with verify_handle() against the running
+ * record count.
+ *
+ * Returns 0 on success, the first write or verification error, or -EINVAL
+ * if step 3d never hits -ENOSPC. */
+static int llog_test_3(const struct lu_env *env, struct obd_device *obd,
+		       struct llog_handle *llh)
+{
+	struct llog_gen_rec	lgr;
+	int			rc, i;
+	int			num_recs = 1; /* 1 for the header */
+
+	ENTRY;
+
+	/* zero the fields this test does not set, so no uninitialized
+	 * stack bytes are handed to llog_write() */
+	memset(&lgr, 0, sizeof(lgr));
+	lgr.lgr_hdr.lrh_len = lgr.lgr_tail.lrt_len = sizeof(lgr);
+	lgr.lgr_hdr.lrh_type = LLOG_GEN_REC;
+
+	CWARN("3a: write one create_rec\n");
+	rc = llog_write(env, llh, &lgr.lgr_hdr, NULL, 0, NULL, -1);
+	if (rc < 0) {
+		CERROR("3a: write one log record failed: %d\n", rc);
+		RETURN(rc);
+	}
+	/* count the record only once it has been written */
+	num_recs++;
+
+	rc = verify_handle("3a", llh, num_recs);
+	if (rc)
+		RETURN(rc);
+
+	CWARN("3b: write 10 cfg log records with 8 bytes bufs\n");
+	for (i = 0; i < 10; i++) {
+		struct llog_rec_hdr	hdr;
+		char			buf[8];
+
+		memset(&hdr, 0, sizeof(hdr));
+		hdr.lrh_len = 8;
+		hdr.lrh_type = OBD_CFG_REC;
+		memset(buf, 0, sizeof buf);
+		rc = llog_write(env, llh, &hdr, NULL, 0, buf, -1);
+		if (rc < 0) {
+			CERROR("3b: write 10 records failed at #%d: %d\n",
+			       i + 1, rc);
+			RETURN(rc);
+		}
+		num_recs++;
+	}
+
+	rc = verify_handle("3b", llh, num_recs);
+	if (rc)
+		RETURN(rc);
+
+	CWARN("3c: write 1000 more log records\n");
+	for (i = 0; i < 1000; i++) {
+		rc = llog_write(env, llh, &lgr.lgr_hdr, NULL, 0, NULL, -1);
+		if (rc < 0) {
+			CERROR("3c: write 1000 records failed at #%d: %d\n",
+			       i + 1, rc);
+			RETURN(rc);
+		}
+		num_recs++;
+	}
+
+	rc = verify_handle("3c", llh, num_recs);
+	if (rc)
+		RETURN(rc);
+
+	CWARN("3d: write log more than BITMAP_SIZE, return -ENOSPC\n");
+	for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr) + 1; i++) {
+		struct llog_rec_hdr	hdr;
+		char			buf_even[24];
+		char			buf_odd[32];
+
+		memset(&hdr, 0, sizeof(hdr));
+		memset(buf_odd, 0, sizeof buf_odd);
+		memset(buf_even, 0, sizeof buf_even);
+		/* alternate between the two record sizes */
+		if ((i % 2) == 0) {
+			hdr.lrh_len = 24;
+			hdr.lrh_type = OBD_CFG_REC;
+			rc = llog_write(env, llh, &hdr, NULL, 0, buf_even, -1);
+		} else {
+			hdr.lrh_len = 32;
+			hdr.lrh_type = OBD_CFG_REC;
+			rc = llog_write(env, llh, &hdr, NULL, 0, buf_odd, -1);
+		}
+		if (rc == -ENOSPC) {
+			break;
+		} else if (rc < 0) {
+			CERROR("3d: write recs failed at #%d: %d\n",
+			       i + 1, rc);
+			RETURN(rc);
+		}
+		num_recs++;
+	}
+	/* the loop must have stopped because the llog filled up */
+	if (rc != -ENOSPC) {
+		CWARN("3d: write record more than BITMAP size!\n");
+		RETURN(-EINVAL);
+	}
+	CWARN("3d: wrote %d more records before end of llog is reached\n",
+	      num_recs);
+
+	rc = verify_handle("3d", llh, num_recs);
+
+	RETURN(rc);
+}
+
+/*
+ * Test 4: catalogue additions.
+ *
+ * 4a creates a catalog named after llog_test_rand + 1 and remembers its id
+ *    in cat_logid so that test 5 can re-open it;
+ * 4b adds one record and checks both the catalog and its current plain log;
+ * 4c cancels that record through its cookie;
+ * 4d adds LLOG_TEST_RECNUM records and expects a second plain log to appear;
+ * 4e adds five records sized to fill an llog chunk each;
+ * 4f closes the catalog.
+ *
+ * Returns 0 on success or a negative errno on the first failure.
+ */
+static int llog_test_4(const struct lu_env *env, struct obd_device *obd)
+{
+	struct llog_handle *cath;
+	char name[10];
+	int rc, rc2, i, buflen;
+	struct llog_mini_rec lmr;
+	struct llog_cookie cookie;
+	struct llog_ctxt *ctxt;
+	int num_recs = 0;
+	char *buf;
+	struct llog_rec_hdr rec;
+
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE;
+	lmr.lmr_hdr.lrh_type = 0xf00f00;
+
+	sprintf(name, "%x", llog_test_rand + 1);
+	CWARN("4a: create a catalog log with name: %s\n", name);
+	rc = llog_open_create(env, ctxt, &cath, NULL, name);
+	if (rc) {
+		CERROR("4a: llog_create with name %s failed: %d\n", name, rc);
+		GOTO(ctxt_release, rc);
+	}
+	rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid);
+	if (rc) {
+		CERROR("4a: can't init llog handle: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	num_recs++;
+	cat_logid = cath->lgh_id;
+
+	CWARN("4b: write 1 record into the catalog\n");
+	rc = llog_cat_add(env, cath, &lmr.lmr_hdr, &cookie, NULL);
+	if (rc != 1) {
+		CERROR("4b: write 1 catalog record failed at: %d\n", rc);
+		/* any value other than 1 is a failure here; make sure a
+		 * non-negative rc is not handed back as success */
+		if (rc >= 0)
+			rc = -EINVAL;
+		GOTO(out, rc);
+	}
+	num_recs++;
+	rc = verify_handle("4b", cath, 2);
+	if (rc)
+		GOTO(out, rc);
+
+	rc = verify_handle("4b", cath->u.chd.chd_current_log, num_recs);
+	if (rc)
+		GOTO(out, rc);
+
+	CWARN("4c: cancel 1 log record\n");
+	rc = llog_cat_cancel_records(env, cath, 1, &cookie);
+	if (rc) {
+		CERROR("4c: cancel 1 catalog based record failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+	num_recs--;
+
+	rc = verify_handle("4c", cath->u.chd.chd_current_log, num_recs);
+	if (rc)
+		GOTO(out, rc);
+
+	CWARN("4d: write %d more log records\n", LLOG_TEST_RECNUM);
+	for (i = 0; i < LLOG_TEST_RECNUM; i++) {
+		rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL, NULL);
+		if (rc) {
+			CERROR("4d: write %d records failed at #%d: %d\n",
+			       LLOG_TEST_RECNUM, i + 1, rc);
+			GOTO(out, rc);
+		}
+		num_recs++;
+	}
+
+	/* make sure new plain llog appears */
+	rc = verify_handle("4d", cath, 3);
+	if (rc)
+		GOTO(out, rc);
+
+	CWARN("4e: add 5 large records, one record per block\n");
+	buflen = LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) -
+		 sizeof(struct llog_rec_tail);
+	OBD_ALLOC(buf, buflen);
+	if (buf == NULL)
+		GOTO(out, rc = -ENOMEM);
+	for (i = 0; i < 5; i++) {
+		rec.lrh_len = buflen;
+		rec.lrh_type = OBD_CFG_REC;
+		rc = llog_cat_add(env, cath, &rec, NULL, buf);
+		if (rc) {
+			CERROR("4e: write 5 records failed at #%d: %d\n",
+			       i + 1, rc);
+			GOTO(out_free, rc);
+		}
+		num_recs++;
+	}
+out_free:
+	OBD_FREE(buf, buflen);
+out:
+	CWARN("4f: put newly-created catalog\n");
+	rc2 = llog_cat_close(env, cath);
+	if (rc2) {
+		CERROR("4: close log %s failed: %d\n", name, rc2);
+		/* keep the first error, report the close failure otherwise */
+		if (rc == 0)
+			rc = rc2;
+	}
+ctxt_release:
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+
+/* number of catalog entries visited by cat_print_cb() */
+static int cat_counter;
+
+/*
+ * llog processing callback for a catalog: every record must be a logid
+ * record (LLOG_LOGID_MAGIC). Prints the record and bumps cat_counter.
+ * Returns 0, or -EINVAL for a record of any other type.
+ */
+static int cat_print_cb(const struct lu_env *env, struct llog_handle *llh,
+			struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_logid_rec *entry;
+	const struct lu_fid *cat_fid;
+	struct lu_fid entry_fid = {0};
+
+	if (rec->lrh_type == LLOG_LOGID_MAGIC) {
+		entry = (struct llog_logid_rec *)rec;
+		logid_to_fid(&entry->lid_id, &entry_fid);
+		cat_fid = lu_object_fid(&llh->lgh_obj->do_lu);
+
+		CWARN("seeing record at index %d - "DFID" in log "DFID"\n",
+		      rec->lrh_index, PFID(&entry_fid), PFID(cat_fid));
+
+		cat_counter += 1;
+		RETURN(0);
+	}
+
+	CERROR("invalid record in catalog\n");
+	RETURN(-EINVAL);
+}
+
+/* number of plain-log records visited by the print callbacks */
+static int plain_counter;
+
+/*
+ * llog processing callback for plain logs: checks that the log header is
+ * flagged LLOG_F_IS_PLAIN, traces the record and bumps plain_counter.
+ * Returns 0, or -EINVAL when the log is not a plain one.
+ */
+static int plain_print_cb(const struct lu_env *env, struct llog_handle *llh,
+			  struct llog_rec_hdr *rec, void *data)
+{
+	struct lu_fid log_fid = {0};
+	int is_plain = (llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) != 0;
+
+	if (is_plain) {
+		logid_to_fid(&llh->lgh_id, &log_fid);
+
+		CDEBUG(D_INFO, "seeing record at index %d in log "DFID"\n",
+		       rec->lrh_index, PFID(&log_fid));
+
+		plain_counter += 1;
+		RETURN(0);
+	}
+
+	CERROR("log is not plain\n");
+	RETURN(-EINVAL);
+}
+
+/* number of records cancelled so far by llog_cancel_rec_cb() */
+static int cancel_count;
+
+/*
+ * llog processing callback that cancels the record it is called for.
+ *
+ * The cookie is built from the plain log id and the record index and is
+ * cancelled through the catalog the plain log belongs to. Processing is
+ * stopped with -LLOG_EEMPTY once LLOG_TEST_RECNUM records were cancelled.
+ *
+ * Returns 0 to continue, -LLOG_EEMPTY to stop, -EINVAL if the log is not
+ * plain, or the negative error reported by llog_cat_cancel_records().
+ */
+static int llog_cancel_rec_cb(const struct lu_env *env,
+			      struct llog_handle *llh,
+			      struct llog_rec_hdr *rec, void *data)
+{
+	struct llog_cookie cookie;
+	int rc;
+
+	if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) {
+		CERROR("log is not plain\n");
+		RETURN(-EINVAL);
+	}
+
+	cookie.lgc_lgl = llh->lgh_id;
+	cookie.lgc_index = rec->lrh_index;
+
+	rc = llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1,
+				     &cookie);
+	if (rc < 0) {
+		/* do not count a record that was not cancelled, and let the
+		 * caller see the real error instead of a bogus record count */
+		CERROR("cancel of record at index %d failed: %d\n",
+		       rec->lrh_index, rc);
+		RETURN(rc);
+	}
+
+	cancel_count++;
+	if (cancel_count == LLOG_TEST_RECNUM)
+		RETURN(-LLOG_EEMPTY);
+	RETURN(0);
+}
+
+/*
+ * Test 5: log and catalogue processing.
+ *
+ * 5a re-opens the catalog created by test 4 through cat_logid;
+ * 5b expects two entries in the catalog;
+ * 5c cancels LLOG_TEST_RECNUM records and expects one entry to remain;
+ * 5d adds one more record;
+ * 5e/5f walk the plain logs forward and backward, expecting 6 records;
+ * 5g closes the catalog.
+ *
+ * Returns 0 on success or a negative errno on the first failure.
+ */
+static int llog_test_5(const struct lu_env *env, struct obd_device *obd)
+{
+	struct llog_handle *llh = NULL;
+	int rc, rc2;
+	struct llog_mini_rec lmr;
+	struct llog_ctxt *ctxt;
+
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE;
+	lmr.lmr_hdr.lrh_type = 0xf00f00;
+
+	CWARN("5a: re-open catalog by id\n");
+	rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS);
+	if (rc) {
+		CERROR("5a: llog_create with logid failed: %d\n", rc);
+		GOTO(out_put, rc);
+	}
+
+	rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid);
+	if (rc) {
+		CERROR("5a: can't init llog handle: %d\n", rc);
+		GOTO(out, rc);
+	}
+
+	CWARN("5b: print the catalog entries.. we expect 2\n");
+	cat_counter = 0;
+	rc = llog_process(env, llh, cat_print_cb, "test 5", NULL);
+	if (rc) {
+		CERROR("5b: process with cat_print_cb failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+	if (cat_counter != 2) {
+		CERROR("5b: %d entries in catalog\n", cat_counter);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CWARN("5c: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM);
+	cancel_count = 0;
+	rc = llog_cat_process(env, llh, llog_cancel_rec_cb, "foobar", 0, 0);
+	if (rc != -LLOG_EEMPTY) {
+		CERROR("5c: process with cat_cancel_cb failed: %d\n", rc);
+		/* the callback is expected to stop processing with
+		 * -LLOG_EEMPTY; a non-negative rc is still a test failure */
+		if (rc >= 0)
+			rc = -EINVAL;
+		GOTO(out, rc);
+	}
+
+	CWARN("5c: print the catalog entries.. we expect 1\n");
+	cat_counter = 0;
+	rc = llog_process(env, llh, cat_print_cb, "test 5", NULL);
+	if (rc) {
+		CERROR("5c: process with cat_print_cb failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+	if (cat_counter != 1) {
+		CERROR("5c: %d entries in catalog\n", cat_counter);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CWARN("5d: add 1 record to the log with many canceled empty pages\n");
+	rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL, NULL);
+	if (rc) {
+		CERROR("5d: add record to the log with many canceled empty "
+		       "pages failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("5e: print plain log entries.. expect 6\n");
+	plain_counter = 0;
+	rc = llog_cat_process(env, llh, plain_print_cb, "foobar", 0, 0);
+	if (rc) {
+		CERROR("5e: process with plain_print_cb failed: %d\n", rc);
+		GOTO(out, rc);
+	}
+	if (plain_counter != 6) {
+		CERROR("5e: found %d records\n", plain_counter);
+		GOTO(out, rc = -EINVAL);
+	}
+
+	CWARN("5f: print plain log entries reversely.. expect 6\n");
+	plain_counter = 0;
+	rc = llog_cat_reverse_process(env, llh, plain_print_cb, "foobar");
+	if (rc) {
+		CERROR("5f: reversely process with plain_print_cb failed:"
+		       "%d\n", rc);
+		GOTO(out, rc);
+	}
+	if (plain_counter != 6) {
+		CERROR("5f: found %d records\n", plain_counter);
+		GOTO(out, rc = -EINVAL);
+	}
+
+out:
+	CWARN("5g: close re-opened catalog\n");
+	rc2 = llog_cat_close(env, llh);
+	if (rc2) {
+		/* the catalog was opened by id, there is no name to print */
+		CERROR("5g: close log failed: %d\n", rc2);
+		if (rc == 0)
+			rc = rc2;
+	}
+out_put:
+	llog_ctxt_put(ctxt);
+
+	RETURN(rc);
+}
+
+/*
+ * Test 6: client API; open the log \a name through the MGC and process it.
+ *
+ * 6a finds the MGC connected to the target behind LLOG_TEST_ORIG_CTXT,
+ *    checks that a second connect is refused with -EALREADY and opens the
+ *    log through the MGC's LLOG_CONFIG_REPL_CTXT context;
+ * 6b processes the log forward and verifies the record count;
+ * 6c does the same in reverse order.
+ *
+ * Returns 0 on success or a negative errno on the first failure.
+ */
+static int llog_test_6(const struct lu_env *env, struct obd_device *obd,
+		       char *name)
+{
+	struct obd_device *mgc_obd;
+	struct llog_ctxt *ctxt;
+	struct obd_uuid *mgs_uuid;
+	struct obd_export *exp;
+	struct obd_uuid uuid = { "LLOG_TEST6_UUID" };
+	struct llog_handle *llh = NULL;
+	struct llog_ctxt *nctxt;
+	int rc, rc2;
+
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+	mgs_uuid = &ctxt->loc_exp->exp_obd->obd_uuid;
+
+	CWARN("6a: re-open log %s using client API\n", name);
+	mgc_obd = class_find_client_obd(mgs_uuid, LUSTRE_MGC_NAME, NULL);
+	if (mgc_obd == NULL) {
+		CERROR("6a: no MGC devices connected to %s found.\n",
+		       mgs_uuid->uuid);
+		GOTO(ctxt_release, rc = -ENOENT);
+	}
+
+	rc = obd_connect(NULL, &exp, mgc_obd, &uuid,
+			 NULL /* obd_connect_data */, NULL);
+	if (rc != -EALREADY) {
+		CERROR("6a: connect on connected MGC (%s) failed to return"
+		       " -EALREADY\n", mgc_obd->obd_name);
+		/* an unexpected successful connect must be undone */
+		if (rc == 0)
+			obd_disconnect(exp);
+		GOTO(ctxt_release, rc = -EINVAL);
+	}
+
+	nctxt = llog_get_context(mgc_obd, LLOG_CONFIG_REPL_CTXT);
+	rc = llog_open(env, nctxt, &llh, NULL, name, LLOG_OPEN_EXISTS);
+	if (rc) {
+		CERROR("6a: llog_open failed %d\n", rc);
+		GOTO(nctxt_put, rc);
+	}
+
+	rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL);
+	if (rc) {
+		CERROR("6a: llog_init_handle failed %d\n", rc);
+		GOTO(parse_out, rc);
+	}
+
+	plain_counter = 1; /* llog header is first record */
+	CWARN("6b: process log %s using client API\n", name);
+	rc = llog_process(env, llh, plain_print_cb, NULL, NULL);
+	CWARN("6b: processed %d records\n", plain_counter);
+	if (rc) {
+		/* fail the test here; rc would otherwise be overwritten by
+		 * verify_handle() and the processing error lost */
+		CERROR("6b: llog_process failed %d\n", rc);
+		GOTO(parse_out, rc);
+	}
+
+	rc = verify_handle("6b", llh, plain_counter);
+	if (rc)
+		GOTO(parse_out, rc);
+
+	plain_counter = 1; /* llog header is first record */
+	CWARN("6c: process log %s reversely using client API\n", name);
+	rc = llog_reverse_process(env, llh, plain_print_cb, NULL, NULL);
+	CWARN("6c: processed %d records\n", plain_counter);
+	if (rc) {
+		CERROR("6c: llog_reverse_process failed %d\n", rc);
+		GOTO(parse_out, rc);
+	}
+
+	rc = verify_handle("6c", llh, plain_counter);
+	if (rc)
+		GOTO(parse_out, rc);
+
+parse_out:
+	rc2 = llog_close(env, llh);
+	if (rc2) {
+		CERROR("6: llog_close failed: rc = %d\n", rc2);
+		if (rc == 0)
+			rc = rc2;
+	}
+nctxt_put:
+	llog_ctxt_put(nctxt);
+ctxt_release:
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+
+/*
+ * Scratch record shared by all sub-tests of test 7. llog_test_7() fills in
+ * the header/tail of one member at a time and llog_test_7_sub() writes the
+ * record through the common header view (lrh).
+ */
+static union {
+ struct llog_rec_hdr lrh; /* common header */
+ struct llog_logid_rec llr; /* LLOG_LOGID_MAGIC */
+ struct llog_unlink64_rec lur; /* MDS_UNLINK64_REC */
+ struct llog_setattr64_rec lsr64; /* MDS_SETATTR64_REC */
+ struct llog_size_change_rec lscr; /* OST_SZ_REC */
+ struct llog_changelog_rec lcr; /* CHANGELOG_REC */
+ struct llog_changelog_user_rec lcur; /* CHANGELOG_USER_REC */
+ struct llog_gen_rec lgr; /* LLOG_GEN_REC */
+} llog_records;
+
+/*
+ * Test 7 processing callback: traces type and index of every record and
+ * counts it in plain_counter. Always returns 0.
+ */
+static int test_7_print_cb(const struct lu_env *env, struct llog_handle *llh,
+			   struct llog_rec_hdr *rec, void *data)
+{
+	struct lu_fid log_fid = {0};
+
+	logid_to_fid(&llh->lgh_id, &log_fid);
+	CDEBUG(D_OTHER, "record type %#x at index %d in log "DFID"\n",
+	       rec->lrh_type, rec->lrh_index, PFID(&log_fid));
+
+	plain_counter += 1;
+
+	return 0;
+}
+
+/*
+ * Test 7 reverse-processing callback: counts the record in plain_counter
+ * and answers LLOG_DEL_RECORD for each one, to exercise that return code.
+ */
+static int test_7_cancel_cb(const struct lu_env *env, struct llog_handle *llh,
+			    struct llog_rec_hdr *rec, void *data)
+{
+	const int verdict = LLOG_DEL_RECORD;
+
+	plain_counter += 1;
+
+	/* test LLOG_DEL_RECORD is working */
+	return verdict;
+}
+
+/*
+ * Body shared by all test 7 cases: create an anonymous plain log flagged
+ * LLOG_F_ZAP_WHEN_EMPTY, fill it with copies of llog_records until the
+ * write returns -ENOSPC, process it forward, then process it in reverse
+ * with a callback that deletes every record, and finally check that the
+ * log no longer exists.
+ *
+ * Returns 0 on success or a negative errno; on failure the log is
+ * destroyed before it is closed.
+ */
+static int llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt)
+{
+	struct llog_handle *llh;
+	int rc = 0, i, process_count;
+	int num_recs = 0;
+
+	ENTRY;
+
+	rc = llog_open_create(env, ctxt, &llh, NULL, NULL);
+	if (rc) {
+		CERROR("7_sub: create log failed\n");
+		RETURN(rc);
+	}
+
+	rc = llog_init_handle(env, llh,
+			      LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY,
+			      &uuid);
+	if (rc) {
+		CERROR("7_sub: can't init llog handle: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+	/* fill the log; the write is expected to hit -ENOSPC within the
+	 * loop, anything else is treated as an error below */
+	for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr); i++) {
+		rc = llog_write(env, llh, &llog_records.lrh, NULL, 0,
+				NULL, -1);
+		if (rc == -ENOSPC) {
+			break;
+		} else if (rc < 0) {
+			CERROR("7_sub: write recs failed at #%d: %d\n",
+			       i + 1, rc);
+			GOTO(out_close, rc);
+		}
+		num_recs++;
+	}
+	if (rc != -ENOSPC) {
+		CWARN("7_sub: write record more than BITMAP size!\n");
+		GOTO(out_close, rc = -EINVAL);
+	}
+
+	/* +1: verify_handle() is given the written records plus one */
+	rc = verify_handle("7_sub", llh, num_recs + 1);
+	if (rc) {
+		CERROR("7_sub: verify handle failed: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+	if (num_recs < LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1)
+		CWARN("7_sub: records are not aligned, written %d from %u\n",
+		      num_recs, LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1);
+
+	plain_counter = 0;
+	rc = llog_process(env, llh, test_7_print_cb, "test 7", NULL);
+	if (rc) {
+		CERROR("7_sub: llog process failed: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+	process_count = plain_counter;
+	if (process_count != num_recs) {
+		CERROR("7_sub: processed %d records from %d total\n",
+		       process_count, num_recs);
+		GOTO(out_close, rc = -EINVAL);
+	}
+
+	plain_counter = 0;
+	rc = llog_reverse_process(env, llh, test_7_cancel_cb, "test 7", NULL);
+	if (rc) {
+		CERROR("7_sub: reverse llog process failed: %d\n", rc);
+		GOTO(out_close, rc);
+	}
+	if (process_count != plain_counter) {
+		CERROR("7_sub: Reverse/direct processing found different "
+		       "number of records: %d/%d\n",
+		       plain_counter, process_count);
+		GOTO(out_close, rc = -EINVAL);
+	}
+	if (llog_exist(llh)) {
+		CERROR("7_sub: llog exists but should be zapped\n");
+		GOTO(out_close, rc = -EEXIST);
+	}
+
+	rc = verify_handle("7_sub", llh, 1);
+out_close:
+	/* on failure the log may still be on disk, remove it */
+	if (rc)
+		llog_destroy(env, llh);
+	llog_close(env, llh);
+	RETURN(rc);
+}
+
+/*
+ * Test 7: writing and processing of every llog record type.
+ *
+ * For each record type the matching member of llog_records gets its header
+ * length, tail length and type set, then llog_test_7_sub() is run on it.
+ * Returns 0 on success or the error of the first failing sub-test.
+ */
+static int llog_test_7(const struct lu_env *env, struct obd_device *obd)
+{
+	struct llog_ctxt *ctxt;
+	int rc;
+
+	ENTRY;
+
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	/* same check as in the other tests using this context */
+	LASSERT(ctxt);
+
+	CWARN("7a: test llog_logid_rec\n");
+	llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr);
+	llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr);
+	llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7a: llog_logid_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7b: test llog_unlink64_rec\n");
+	llog_records.lur.lur_hdr.lrh_len = sizeof(llog_records.lur);
+	llog_records.lur.lur_tail.lrt_len = sizeof(llog_records.lur);
+	llog_records.lur.lur_hdr.lrh_type = MDS_UNLINK64_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7b: llog_unlink64_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7c: test llog_setattr64_rec\n");
+	llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64);
+	llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64);
+	llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7c: llog_setattr64_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7d: test llog_size_change_rec\n");
+	llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr);
+	llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr);
+	llog_records.lscr.lsc_hdr.lrh_type = OST_SZ_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7d: llog_size_change_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7e: test llog_changelog_rec\n");
+	llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr);
+	llog_records.lcr.cr_tail.lrt_len = sizeof(llog_records.lcr);
+	llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7e: llog_changelog_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7f: test llog_changelog_user_rec\n");
+	llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur);
+	llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur);
+	llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7f: llog_changelog_user_rec test failed\n");
+		GOTO(out, rc);
+	}
+
+	CWARN("7g: test llog_gen_rec\n");
+	llog_records.lgr.lgr_hdr.lrh_len = sizeof(llog_records.lgr);
+	llog_records.lgr.lgr_tail.lrt_len = sizeof(llog_records.lgr);
+	llog_records.lgr.lgr_hdr.lrh_type = LLOG_GEN_REC;
+
+	rc = llog_test_7_sub(env, ctxt);
+	if (rc) {
+		CERROR("7g: llog_gen_rec test failed\n");
+		GOTO(out, rc);
+	}
+out:
+	llog_ctxt_put(ctxt);
+	RETURN(rc);
+}
+
+/* -------------------------------------------------------------------------
+ * Tests above, boring obd functions below
+ * ------------------------------------------------------------------------- */
+/*
+ * Run tests 1 to 7 in order on \a obd and stop at the first failure.
+ *
+ * The log name is the hex form of llog_test_rand. The handle returned by
+ * test 2 is destroyed and closed on the way out once test 2 has succeeded.
+ * Returns 0 on success, the first test error, or the llog_destroy() error
+ * if all tests passed but cleanup failed.
+ */
+static int llog_run_tests(const struct lu_env *env, struct obd_device *obd)
+{
+	struct llog_handle *llh = NULL;
+	struct llog_ctxt *ctxt;
+	int rc, err;
+	char name[10];
+
+	ENTRY;
+	ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT);
+	LASSERT(ctxt);
+
+	sprintf(name, "%x", llog_test_rand);
+
+	rc = llog_test_1(env, obd, name);
+	if (rc)
+		GOTO(cleanup_ctxt, rc);
+
+	rc = llog_test_2(env, obd, name, &llh);
+	if (rc)
+		GOTO(cleanup_ctxt, rc);
+
+	rc = llog_test_3(env, obd, llh);
+	if (rc)
+		GOTO(cleanup, rc);
+
+	rc = llog_test_4(env, obd);
+	if (rc)
+		GOTO(cleanup, rc);
+
+	rc = llog_test_5(env, obd);
+	if (rc)
+		GOTO(cleanup, rc);
+
+	rc = llog_test_6(env, obd, name);
+	if (rc)
+		GOTO(cleanup, rc);
+
+	rc = llog_test_7(env, obd);
+	if (rc)
+		GOTO(cleanup, rc);
+
+cleanup:
+	err = llog_destroy(env, llh);
+	if (err)
+		CERROR("cleanup: llog_destroy failed: %d\n", err);
+	llog_close(env, llh);
+	/* a test failure takes precedence over a cleanup failure */
+	if (rc == 0)
+		rc = err;
+cleanup_ctxt:
+	llog_ctxt_put(ctxt);
+	/* pair the ENTRY above, as every other function in this file does */
+	RETURN(rc);
+}
+
+#ifdef LPROCFS
+/* the test device exports no procfs entries: both tables are empty */
+static struct lprocfs_vars lprocfs_llog_test_obd_vars[] = { {0} };
+static struct lprocfs_vars lprocfs_llog_test_module_vars[] = { {0} };
+
+/* point \a lvars at the (empty) module and obd variable tables */
+static void lprocfs_llog_test_init_vars(struct lprocfs_static_vars *lvars)
+{
+	lvars->module_vars = lprocfs_llog_test_module_vars;
+	lvars->obd_vars = lprocfs_llog_test_obd_vars;
+}
+#else
+/*
+ * llog_test_init() calls this helper unconditionally, so it must exist
+ * without LPROCFS as well; hand back NULL tables in that case.
+ */
+static inline void
+lprocfs_llog_test_init_vars(struct lprocfs_static_vars *lvars)
+{
+	memset(lvars, 0, sizeof(*lvars));
+}
+#endif
+
+/*
+ * obd cleanup method: tear down the LLOG_TEST_ORIG_CTXT llog context on
+ * the target device that sits behind obd->obd_lvfs_ctxt.dt.
+ * Returns 0 or the error from lu_env_init() / llog_cleanup().
+ */
+static int llog_test_cleanup(struct obd_device *obd)
+{
+	struct llog_ctxt *test_ctxt;
+	struct obd_device *target;
+	struct lu_env env;
+	int rc;
+
+	ENTRY;
+
+	rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+	if (rc != 0)
+		RETURN(rc);
+
+	target = obd->obd_lvfs_ctxt.dt->dd_lu_dev.ld_obd;
+	test_ctxt = llog_get_context(target, LLOG_TEST_ORIG_CTXT);
+	rc = llog_cleanup(&env, test_ctxt);
+	if (rc != 0)
+		CERROR("failed to llog_test_llog_finish: %d\n", rc);
+
+	lu_env_fini(&env);
+	RETURN(rc);
+}
+
+/*
+ * obd setup method: the whole test suite runs from here.
+ *
+ * lcfg buffer 1 must name an attached and set-up target obd. An llog
+ * context LLOG_TEST_ORIG_CTXT is set up on that target with llog_osd_ops,
+ * it borrows the llog directory of the target's LLOG_CONFIG_ORIG_CTXT, and
+ * llog_run_tests() is then executed with a local env and session context.
+ *
+ * Returns 0 on success, -EINVAL for a bad configuration, or the first
+ * error from environment setup, llog_setup() or the tests themselves.
+ */
+static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ struct obd_device *tgt;
+ struct llog_ctxt *ctxt;
+ struct dt_object *o;
+ struct lu_env env;
+ struct lu_context test_session;
+ int rc;
+
+ ENTRY;
+
+ if (lcfg->lcfg_bufcount < 2) {
+ CERROR("requires a TARGET OBD name\n");
+ RETURN(-EINVAL);
+ }
+
+ if (lcfg->lcfg_buflens[1] < 1) {
+ CERROR("requires a TARGET OBD name\n");
+ RETURN(-EINVAL);
+ }
+
+ /* disk obd */
+ tgt = class_name2obd(lustre_cfg_string(lcfg, 1));
+ if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) {
+ CERROR("target device not attached or not set up (%s)\n",
+ lustre_cfg_string(lcfg, 1));
+ RETURN(-EINVAL);
+ }
+
+ rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD);
+ if (rc)
+ RETURN(rc);
+
+ /* attach a session context to the env for the duration of the tests */
+ rc = lu_context_init(&test_session, LCT_SESSION);
+ if (rc)
+ GOTO(cleanup_env, rc);
+ test_session.lc_thread = (struct ptlrpc_thread *)current;
+ lu_context_enter(&test_session);
+ env.le_ses = &test_session;
+
+ CWARN("Setup llog-test device over %s device\n",
+ lustre_cfg_string(lcfg, 1));
+
+ /* remember the target's dt device; llog_test_cleanup() finds the
+ * target obd again through it */
+ OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt);
+ obd->obd_lvfs_ctxt.dt = lu2dt_dev(tgt->obd_lu_dev);
+
+ rc = llog_setup(&env, tgt, &tgt->obd_olg, LLOG_TEST_ORIG_CTXT, tgt,
+ &llog_osd_ops);
+ if (rc)
+ GOTO(cleanup_session, rc);
+
+ /* use MGS llog dir for tests */
+ ctxt = llog_get_context(tgt, LLOG_CONFIG_ORIG_CTXT);
+ LASSERT(ctxt);
+ o = ctxt->loc_dir;
+ llog_ctxt_put(ctxt);
+
+ ctxt = llog_get_context(tgt, LLOG_TEST_ORIG_CTXT);
+ LASSERT(ctxt);
+ ctxt->loc_dir = o;
+ llog_ctxt_put(ctxt);
+
+ /* seed for the log names used by llog_run_tests() and test 4 */
+ llog_test_rand = cfs_rand();
+
+ rc = llog_run_tests(&env, tgt);
+ /* on test failure undo llog_setup() right away */
+ if (rc)
+ llog_test_cleanup(obd);
+cleanup_session:
+ lu_context_exit(&test_session);
+ lu_context_fini(&test_session);
+cleanup_env:
+ lu_env_fini(&env);
+ RETURN(rc);
+}
+
+/* obd methods of the llog_test device type; only setup/cleanup are set */
+static struct obd_ops llog_obd_ops = {
+ .o_owner = THIS_MODULE,
+ .o_setup = llog_test_setup,
+ .o_cleanup = llog_test_cleanup,
+};
+
+/* module init: register the "llog_test" obd type with its procfs vars */
+static int __init llog_test_init(void)
+{
+	struct lprocfs_static_vars vars;
+	int rc;
+
+	lprocfs_llog_test_init_vars(&vars);
+	rc = class_register_type(&llog_obd_ops, NULL, vars.module_vars,
+				 "llog_test", NULL);
+
+	return rc;
+}
+
+/* module exit: unregister the "llog_test" obd type */
+static void __exit llog_test_exit(void)
+{
+ class_unregister_type("llog_test");
+}
+
+/* module metadata and entry points */
+MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>");
+MODULE_DESCRIPTION("llog test module");
+MODULE_LICENSE("GPL");
+
+module_init(llog_test_init);
+module_exit(llog_test_exit);
diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.c b/drivers/staging/lustre/lustre/obdclass/local_storage.c
new file mode 100644
index 000000000000..3be35a83a495
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/local_storage.c
@@ -0,0 +1,903 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/obdclass/local_storage.c
+ *
+ * Local storage for file/objects with fid generation. Works on top of OSD.
+ *
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include "local_storage.h"
+
+/* all initialized local storages on this node are linked on this */
+static LIST_HEAD(ls_list_head);
+static DEFINE_MUTEX(ls_list_mutex);
+
+/*
+ * lu_object init method of the local storage layer: allocate the object
+ * of the underlying OSD device for the same header and stack it below
+ * \a o. Returns 0, or -ENOMEM if the lower object cannot be allocated.
+ */
+static int ls_object_init(const struct lu_env *env, struct lu_object *o,
+			  const struct lu_object_conf *unused)
+{
+	struct ls_device *ls_dev;
+	struct lu_device *osd_dev;
+	struct lu_object *lower;
+
+	ENTRY;
+
+	ls_dev = container_of0(o->lo_dev, struct ls_device,
+			       ls_top_dev.dd_lu_dev);
+	osd_dev = &ls_dev->ls_osd->dd_lu_dev;
+
+	lower = osd_dev->ld_ops->ldo_object_alloc(env, o->lo_header, osd_dev);
+	if (lower != NULL) {
+		lu_object_add(o, lower);
+		RETURN(0);
+	}
+
+	RETURN(-ENOMEM);
+}
+
+/*
+ * lu_object free method: finalize the dt object and its header, then
+ * release the ls_object that holds them.
+ */
+static void ls_object_free(const struct lu_env *env, struct lu_object *o)
+{
+	struct lu_object_header *hdr = o->lo_header;
+	struct ls_object *lso = lu2ls_obj(o);
+
+	dt_object_fini(&lso->ls_obj);
+	lu_object_header_fini(hdr);
+
+	OBD_FREE_PTR(lso);
+}
+
+/* lu_object methods installed on objects made by ls_object_alloc() */
+struct lu_object_operations ls_lu_obj_ops = {
+ .loo_object_init = ls_object_init,
+ .loo_object_free = ls_object_free,
+};
+
+/*
+ * Device method allocating a top-level local storage object on \a d.
+ *
+ * The object header is embedded in the ls_object, so \a _h must be NULL.
+ * Returns the new lu_object, or NULL when the allocation fails.
+ */
+struct lu_object *ls_object_alloc(const struct lu_env *env,
+				  const struct lu_object_header *_h,
+				  struct lu_device *d)
+{
+	struct ls_object *lso;
+	struct lu_object_header *hdr;
+	struct lu_object *top;
+
+	LASSERT(_h == NULL);
+
+	OBD_ALLOC_PTR(lso);
+	if (lso == NULL)
+		return NULL;
+
+	top = &lso->ls_obj.do_lu;
+	hdr = &lso->ls_header;
+
+	lu_object_header_init(hdr);
+	dt_object_init(&lso->ls_obj, hdr, d);
+	lu_object_add_top(hdr, top);
+
+	top->lo_ops = &ls_lu_obj_ops;
+
+	return top;
+}
+
+/* lu_device methods of the local storage top device */
+static struct lu_device_operations ls_lu_dev_ops = {
+ .ldo_object_alloc = ls_object_alloc
+};
+
+/*
+ * Look up the ls_device layered over \a dev in ls_list_head and take a
+ * reference on it. Returns NULL when there is none. Both callers in this
+ * file hold ls_list_mutex around the call.
+ */
+static struct ls_device *__ls_find_dev(struct dt_device *dev)
+{
+	struct ls_device *cur;
+
+	list_for_each_entry(cur, &ls_list_head, ls_linkage) {
+		if (cur->ls_osd != dev)
+			continue;
+
+		atomic_inc(&cur->ls_refcount);
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Locked wrapper around __ls_find_dev(): returns the referenced ls_device
+ * for \a dev, or NULL if none is registered.
+ */
+struct ls_device *ls_find_dev(struct dt_device *dev)
+{
+	struct ls_device *found;
+
+	mutex_lock(&ls_list_mutex);
+	found = __ls_find_dev(dev);
+	mutex_unlock(&ls_list_mutex);
+
+	return found;
+}
+
+/* device type methods: no start/stop hooks are provided */
+static struct lu_device_type_operations ls_device_type_ops = {
+ .ldto_start = NULL,
+ .ldto_stop = NULL,
+};
+
+/* device type passed to lu_device_init() for every ls top device */
+static struct lu_device_type ls_lu_type = {
+ .ldt_name = "local_storage",
+ .ldt_ops = &ls_device_type_ops,
+};
+
+/*
+ * Get the ls_device layered over \a dev, creating it on first use.
+ *
+ * An existing device is returned with an extra reference; a new one starts
+ * with a refcount of 1, shares the lu_site of \a dev and is linked on
+ * ls_list_head. The whole lookup-or-create runs under ls_list_mutex.
+ *
+ * Returns the device, or ERR_PTR(-ENOMEM) if the allocation fails.
+ */
+struct ls_device *ls_device_get(struct dt_device *dev)
+{
+ struct ls_device *ls;
+
+ ENTRY;
+
+ mutex_lock(&ls_list_mutex);
+ ls = __ls_find_dev(dev);
+ if (ls)
+ GOTO(out_ls, ls);
+
+ /* not found, then create */
+ OBD_ALLOC_PTR(ls);
+ if (ls == NULL)
+ GOTO(out_ls, ls = ERR_PTR(-ENOMEM));
+
+ atomic_set(&ls->ls_refcount, 1);
+ INIT_LIST_HEAD(&ls->ls_los_list);
+ mutex_init(&ls->ls_los_mutex);
+
+ ls->ls_osd = dev;
+
+ /* the top device reuses the site of the underlying device */
+ LASSERT(dev->dd_lu_dev.ld_site);
+ lu_device_init(&ls->ls_top_dev.dd_lu_dev, &ls_lu_type);
+ ls->ls_top_dev.dd_lu_dev.ld_ops = &ls_lu_dev_ops;
+ ls->ls_top_dev.dd_lu_dev.ld_site = dev->dd_lu_dev.ld_site;
+
+ /* finally add ls to the list */
+ list_add(&ls->ls_linkage, &ls_list_head);
+out_ls:
+ mutex_unlock(&ls_list_mutex);
+ RETURN(ls);
+}
+
+/*
+ * Drop one reference on \a ls and free it when that was the last one.
+ *
+ * The final decrement is done with ls_list_mutex held. __ls_find_dev()
+ * takes its reference under the same mutex, so a device whose count has
+ * reached zero can no longer be found and re-referenced, and it cannot be
+ * freed by another thread while this one is still about to look at it.
+ * NOTE(review): this relies on every reference being taken through
+ * __ls_find_dev()/ls_device_get() - confirm there is no other path.
+ */
+void ls_device_put(const struct lu_env *env, struct ls_device *ls)
+{
+	LASSERT(env);
+
+	/* returns with the mutex held only if the count dropped to zero */
+	if (!atomic_dec_and_mutex_lock(&ls->ls_refcount, &ls_list_mutex))
+		return;
+
+	LASSERT(list_empty(&ls->ls_los_list));
+	list_del(&ls->ls_linkage);
+	lu_site_purge(env, ls->ls_top_dev.dd_lu_dev.ld_site, ~0);
+	lu_device_fini(&ls->ls_top_dev.dd_lu_dev);
+	OBD_FREE_PTR(ls);
+
+	mutex_unlock(&ls_list_mutex);
+}
+
+/**
+ * Generate the next local FID of storage \a los into \a fid.
+ *
+ * The sequence is los_seq, the object id is the incremented los_last_oid
+ * and the version is 0. The counter is advanced under los_id_lock.
+ * Always returns 0.
+ */
+int local_object_fid_generate(const struct lu_env *env,
+			      struct local_oid_storage *los,
+			      struct lu_fid *fid)
+{
+	LASSERT(los->los_dev);
+	LASSERT(los->los_obj);
+
+	/* to make it unique after reboot we store
+	 * the latest generated fid atomically with
+	 * object creation see local_object_create() */
+
+	mutex_lock(&los->los_id_lock);
+	/* take next OID */
+	los->los_last_oid++;
+	fid->f_seq = los->los_seq;
+	fid->f_oid = los->los_last_oid;
+	fid->f_ver = 0;
+	mutex_unlock(&los->los_id_lock);
+
+	return 0;
+}
+
+/*
+ * Declare in transaction \a th everything local_object_create() will do:
+ * the update of the fid generation file (only when \a los is given), the
+ * creation of \a o and the setting of its LMA xattr.
+ * Returns 0 or the first declaration error.
+ */
+int local_object_declare_create(const struct lu_env *env,
+				struct local_oid_storage *los,
+				struct dt_object *o, struct lu_attr *attr,
+				struct dt_object_format *dof,
+				struct thandle *th)
+{
+	struct dt_thread_info *info = dt_info(env);
+	int rc;
+
+	ENTRY;
+
+	if (los != NULL) {
+		/* update fid generation file */
+		LASSERT(dt_object_exists(los->los_obj));
+		rc = dt_declare_record_write(env, los->los_obj,
+					     sizeof(struct los_ondisk), 0, th);
+		if (rc != 0)
+			RETURN(rc);
+	}
+
+	rc = dt_declare_create(env, o, attr, NULL, dof, th);
+	if (rc != 0)
+		RETURN(rc);
+
+	/* only the xattr size is declared, there is no value buffer yet */
+	info->dti_lb.lb_buf = NULL;
+	info->dti_lb.lb_len = sizeof(info->dti_lma);
+	rc = dt_declare_xattr_set(env, o, &info->dti_lb, XATTR_NAME_LMA, 0,
+				  th);
+
+	RETURN(rc);
+}
+
+/*
+ * Create object \a o within transaction \a th and, when \a los is given,
+ * store the last used object id of that storage in its los_obj file.
+ *
+ * Returns 0 on success, or the error from dt_create() / dt_record_write().
+ */
+int local_object_create(const struct lu_env *env,
+ struct local_oid_storage *los,
+ struct dt_object *o, struct lu_attr *attr,
+ struct dt_object_format *dof, struct thandle *th)
+{
+ struct dt_thread_info *dti = dt_info(env);
+ obd_id lastid;
+ int rc;
+
+ ENTRY;
+
+ rc = dt_create(env, o, attr, NULL, dof, th);
+ if (rc)
+ RETURN(rc);
+
+ /* no storage to account the id in, nothing more to do */
+ if (los == NULL)
+ RETURN(rc);
+
+ LASSERT(los->los_obj);
+ LASSERT(dt_object_exists(los->los_obj));
+
+ /* many threads can be updated this, serialize
+ * them here to avoid the race where one thread
+ * takes the value first, but writes it last */
+ mutex_lock(&los->los_id_lock);
+
+ /* update local oid number on disk so that
+ * we know the last one used after reboot */
+ lastid = cpu_to_le64(los->los_last_oid);
+
+ /* NOTE(review): sizeof(lastid) bytes are written at offset 0, while
+ * local_object_declare_create() declares sizeof(struct los_ondisk) -
+ * confirm both agree with the on-disk layout */
+ dti->dti_off = 0;
+ dti->dti_lb.lb_buf = &lastid;
+ dti->dti_lb.lb_len = sizeof(lastid);
+ rc = dt_record_write(env, los->los_obj, &dti->dti_lb, &dti->dti_off,
+ th);
+ mutex_unlock(&los->los_id_lock);
+
+ RETURN(rc);
+}
+
+/*
+ * Create local named object (file, directory or index) in parent directory.
+ *
+ * The object with \a fid is located through \a ls, created in a local
+ * transaction on ls->ls_osd and inserted into \a parent under \a name.
+ * For a directory, "." and ".." are inserted as well and link counts are
+ * raised. \a los may be NULL, in which case no fid generation file is
+ * updated (see local_object_create()).
+ *
+ * Returns the object, or an ERR_PTR: -EEXIST if the object already exists
+ * before the transaction is created, otherwise the first error hit. If the
+ * object turns out to exist once the write lock is taken, it is returned
+ * as is.
+ *
+ * NOTE(review): the directory checks read dti->dti_dof rather than \a dof;
+ * the callers visible in this file pass &dti->dti_dof, so both agree there.
+ */
+struct dt_object *__local_file_create(const struct lu_env *env,
+ const struct lu_fid *fid,
+ struct local_oid_storage *los,
+ struct ls_device *ls,
+ struct dt_object *parent,
+ const char *name, struct lu_attr *attr,
+ struct dt_object_format *dof)
+{
+ struct dt_thread_info *dti = dt_info(env);
+ struct dt_object *dto;
+ struct thandle *th;
+ int rc;
+
+ dto = ls_locate(env, ls, fid);
+ if (unlikely(IS_ERR(dto)))
+ RETURN(dto);
+
+ LASSERT(dto != NULL);
+ if (dt_object_exists(dto))
+ GOTO(out, rc = -EEXIST);
+
+ th = dt_trans_create(env, ls->ls_osd);
+ if (IS_ERR(th))
+ GOTO(out, rc = PTR_ERR(th));
+
+ /* declare phase: everything done below must be declared first */
+ rc = local_object_declare_create(env, los, dto, attr, dof, th);
+ if (rc)
+ GOTO(trans_stop, rc);
+
+ /* NOTE(review): results of dt_declare_ref_add() are not checked */
+ if (dti->dti_dof.dof_type == DFT_DIR) {
+ dt_declare_ref_add(env, dto, th);
+ dt_declare_ref_add(env, parent, th);
+ }
+
+ rc = dt_declare_insert(env, parent, (void *)fid, (void *)name, th);
+ if (rc)
+ GOTO(trans_stop, rc);
+
+ rc = dt_trans_start_local(env, ls->ls_osd, th);
+ if (rc)
+ GOTO(trans_stop, rc);
+
+ dt_write_lock(env, dto, 0);
+ /* re-check under the lock: the object may exist by now */
+ if (dt_object_exists(dto))
+ GOTO(unlock, rc = 0);
+
+ CDEBUG(D_OTHER, "create new object "DFID"\n",
+ PFID(lu_object_fid(&dto->do_lu)));
+ rc = local_object_create(env, los, dto, attr, dof, th);
+ if (rc)
+ GOTO(unlock, rc);
+ LASSERT(dt_object_exists(dto));
+
+ if (dti->dti_dof.dof_type == DFT_DIR) {
+ if (!dt_try_as_dir(env, dto))
+ GOTO(destroy, rc = -ENOTDIR);
+ /* Add "." and ".." for newly created dir */
+ rc = dt_insert(env, dto, (void *)fid, (void *)".", th,
+ BYPASS_CAPA, 1);
+ if (rc)
+ GOTO(destroy, rc);
+ dt_ref_add(env, dto, th);
+ rc = dt_insert(env, dto, (void *)lu_object_fid(&parent->do_lu),
+ (void *)"..", th, BYPASS_CAPA, 1);
+ if (rc)
+ GOTO(destroy, rc);
+ }
+
+ /* link the new object into the parent; note that the parent's
+ * reference is added before rc of the insert is looked at */
+ dt_write_lock(env, parent, 0);
+ rc = dt_insert(env, parent, (const struct dt_rec *)fid,
+ (const struct dt_key *)name, th, BYPASS_CAPA, 1);
+ if (dti->dti_dof.dof_type == DFT_DIR)
+ dt_ref_add(env, parent, th);
+ dt_write_unlock(env, parent);
+ if (rc)
+ GOTO(destroy, rc);
+destroy:
+ /* reached on success too; the object is destroyed on error only */
+ if (rc)
+ dt_destroy(env, dto, th);
+unlock:
+ dt_write_unlock(env, dto);
+trans_stop:
+ dt_trans_stop(env, ls->ls_osd, th);
+out:
+ if (rc) {
+ lu_object_put_nocache(env, &dto->do_lu);
+ dto = ERR_PTR(rc);
+ }
+ RETURN(dto);
+}
+
+/*
+ * Look up \a name in \a parent and return the object it refers to; if the
+ * name does not exist, generate a new local FID from \a los and create a
+ * file or directory of the given \a mode under that name.
+ *
+ * Returns the object or an ERR_PTR carrying the lookup, fid generation or
+ * creation error.
+ */
+struct dt_object *local_file_find_or_create(const struct lu_env *env,
+					    struct local_oid_storage *los,
+					    struct dt_object *parent,
+					    const char *name, __u32 mode)
+{
+	struct dt_thread_info *info = dt_info(env);
+	int rc;
+
+	LASSERT(parent);
+
+	rc = dt_lookup_dir(env, parent, name, &info->dti_fid);
+	if (rc == 0)
+		/* name is found, get the object */
+		return ls_locate(env, dt2ls_dev(los->los_dev),
+				 &info->dti_fid);
+	if (rc != -ENOENT)
+		return ERR_PTR(rc);
+
+	/* no such name yet: pick a new fid for it */
+	rc = local_object_fid_generate(env, los, &info->dti_fid);
+	if (rc < 0)
+		return ERR_PTR(rc);
+
+	/* create the object */
+	info->dti_attr.la_valid = LA_MODE;
+	info->dti_attr.la_mode = mode;
+	info->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT);
+
+	return __local_file_create(env, &info->dti_fid, los,
+				   dt2ls_dev(los->los_dev), parent, name,
+				   &info->dti_attr, &info->dti_dof);
+}
+EXPORT_SYMBOL(local_file_find_or_create);
+
+/*
+ * Look up \a name in \a parent and return the object it refers to; if the
+ * name does not exist, create a file or directory of the given \a mode
+ * with the caller-supplied \a fid.
+ *
+ * The creation goes through a temporary ls_device reference; the object
+ * handed back is always located through \a dt itself.
+ * Returns the object or an ERR_PTR.
+ */
+struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env,
+						      struct dt_device *dt,
+						      const struct lu_fid *fid,
+						      struct dt_object *parent,
+						      const char *name,
+						      __u32 mode)
+{
+	struct dt_thread_info *info = dt_info(env);
+	struct dt_object *obj;
+	struct ls_device *ls;
+	int rc;
+
+	LASSERT(parent);
+
+	rc = dt_lookup_dir(env, parent, name, &info->dti_fid);
+	if (rc == 0)
+		return dt_locate(env, dt, &info->dti_fid);
+	if (rc != -ENOENT)
+		return ERR_PTR(rc);
+
+	ls = ls_device_get(dt);
+	if (IS_ERR(ls))
+		return ERR_PTR(PTR_ERR(ls));
+
+	/* create the object */
+	info->dti_attr.la_valid = LA_MODE;
+	info->dti_attr.la_mode = mode;
+	info->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT);
+	obj = __local_file_create(env, fid, NULL, ls, parent, name,
+				  &info->dti_attr, &info->dti_dof);
+	if (!IS_ERR(obj)) {
+		/* ls_device_put() will finalize the ls device, we
+		 * have to open the object in other device stack */
+		info->dti_fid = obj->do_lu.lo_header->loh_fid;
+		lu_object_put_nocache(env, &obj->do_lu);
+		obj = dt_locate(env, dt, &info->dti_fid);
+	}
+	ls_device_put(env, ls);
+
+	return obj;
+}
+EXPORT_SYMBOL(local_file_find_or_create_with_fid);
+
+/*
+ * Look up \a name in \a parent and return the object it refers to; if the
+ * name does not exist, generate a new local FID from \a los and create an
+ * index object with features \a ft under that name.
+ *
+ * Returns the object or an ERR_PTR carrying the lookup, fid generation or
+ * creation error.
+ */
+struct dt_object *local_index_find_or_create(const struct lu_env *env,
+					     struct local_oid_storage *los,
+					     struct dt_object *parent,
+					     const char *name, __u32 mode,
+					     const struct dt_index_features *ft)
+{
+	struct dt_thread_info *info = dt_info(env);
+	int rc;
+
+	LASSERT(parent);
+
+	rc = dt_lookup_dir(env, parent, name, &info->dti_fid);
+	if (rc == 0)
+		/* name is found, get the object */
+		return ls_locate(env, dt2ls_dev(los->los_dev),
+				 &info->dti_fid);
+	if (rc != -ENOENT)
+		return ERR_PTR(rc);
+
+	/* no such name yet: pick a new fid for it */
+	rc = local_object_fid_generate(env, los, &info->dti_fid);
+	if (rc < 0)
+		return ERR_PTR(rc);
+
+	/* create the object */
+	info->dti_attr.la_valid = LA_MODE;
+	info->dti_attr.la_mode = mode;
+	info->dti_dof.dof_type = DFT_INDEX;
+	info->dti_dof.u.dof_idx.di_feat = ft;
+
+	return __local_file_create(env, &info->dti_fid, los,
+				   dt2ls_dev(los->los_dev), parent, name,
+				   &info->dti_attr, &info->dti_dof);
+}
+EXPORT_SYMBOL(local_index_find_or_create);
+
+/*
+ * Find the index file called "name" in the parent directory, or create it
+ * with the FID supplied by the caller.  An existing entry that refers to a
+ * different FID is rejected with -EINVAL.
+ *
+ * Returns the object, or an ERR_PTR() value on failure.
+ */
+struct dt_object *
+local_index_find_or_create_with_fid(const struct lu_env *env,
+				    struct dt_device *dt,
+				    const struct lu_fid *fid,
+				    struct dt_object *parent,
+				    const char *name, __u32 mode,
+				    const struct dt_index_features *ft)
+{
+	struct dt_thread_info *info = dt_info(env);
+	struct dt_object *obj;
+	struct ls_device *ls;
+	int rc;
+
+	LASSERT(parent);
+
+	rc = dt_lookup_dir(env, parent, name, &info->dti_fid);
+	if (rc == 0) {
+		/* name is found; it has to match the requested FID */
+		if (lu_fid_eq(fid, &info->dti_fid))
+			return dt_locate(env, dt, fid);
+		return ERR_PTR(-EINVAL);
+	}
+	if (rc != -ENOENT)
+		return ERR_PTR(rc);
+
+	ls = ls_device_get(dt);
+	if (IS_ERR(ls))
+		return ERR_PTR(PTR_ERR(ls));
+
+	/* create the object */
+	info->dti_attr.la_valid = LA_MODE;
+	info->dti_attr.la_mode = mode;
+	info->dti_dof.dof_type = DFT_INDEX;
+	info->dti_dof.u.dof_idx.di_feat = ft;
+	obj = __local_file_create(env, fid, NULL, ls, parent, name,
+				  &info->dti_attr, &info->dti_dof);
+	if (!IS_ERR(obj)) {
+		/* ls_device_put() will finalize the ls device, we
+		 * have to open the object in other device stack */
+		info->dti_fid = obj->do_lu.lo_header->loh_fid;
+		lu_object_put_nocache(env, &obj->do_lu);
+		obj = dt_locate(env, dt, &info->dti_fid);
+	}
+	ls_device_put(env, ls);
+
+	return obj;
+}
+EXPORT_SYMBOL(local_index_find_or_create_with_fid);
+
+/*
+ * Declare, within transaction "th", the three updates an unlink performs:
+ * deleting "name" from directory "p", dropping a reference on "c" and
+ * destroying "c".  Stops at the first declaration returning a negative value.
+ */
+static int local_object_declare_unlink(const struct lu_env *env,
+				       struct dt_device *dt,
+				       struct dt_object *p,
+				       struct dt_object *c, const char *name,
+				       struct thandle *th)
+{
+	int rc = dt_declare_delete(env, p, (const struct dt_key *)name, th);
+
+	if (rc >= 0)
+		rc = dt_declare_ref_del(env, c, th);
+	if (rc >= 0)
+		rc = dt_declare_destroy(env, c, th);
+
+	return rc;
+}
+
+/*
+ * Remove the entry "name" from directory "parent" and destroy the object it
+ * refers to, all inside one local transaction on "dt".
+ *
+ * Returns 0 when the name does not exist or the unlink succeeded, and a
+ * negative errno otherwise.  If dropping the object reference fails, the
+ * directory entry is re-inserted and the original failure is still returned
+ * (previously a successful rollback turned the failure into 0).
+ */
+int local_object_unlink(const struct lu_env *env, struct dt_device *dt,
+			struct dt_object *parent, const char *name)
+{
+	struct dt_thread_info *dti = dt_info(env);
+	struct dt_object *dto;
+	struct thandle *th;
+	int rc;
+
+	ENTRY;
+
+	rc = dt_lookup_dir(env, parent, name, &dti->dti_fid);
+	if (rc == -ENOENT)
+		RETURN(0);
+	else if (rc < 0)
+		RETURN(rc);
+
+	dto = dt_locate(env, dt, &dti->dti_fid);
+	if (unlikely(IS_ERR(dto)))
+		RETURN(PTR_ERR(dto));
+
+	th = dt_trans_create(env, dt);
+	if (IS_ERR(th))
+		GOTO(out, rc = PTR_ERR(th));
+
+	rc = local_object_declare_unlink(env, dt, parent, dto, name, th);
+	if (rc < 0)
+		GOTO(stop, rc);
+
+	rc = dt_trans_start_local(env, dt, th);
+	if (rc < 0)
+		GOTO(stop, rc);
+
+	dt_write_lock(env, dto, 0);
+	rc = dt_delete(env, parent, (struct dt_key *)name, th, BYPASS_CAPA);
+	if (rc < 0)
+		GOTO(unlock, rc);
+
+	rc = dt_ref_del(env, dto, th);
+	if (rc < 0) {
+		int rc2;
+
+		/* put the name back, but keep reporting why the unlink
+		 * failed: the object still exists for the caller */
+		rc2 = dt_insert(env, parent,
+				(const struct dt_rec *)&dti->dti_fid,
+				(const struct dt_key *)name, th, BYPASS_CAPA,
+				1);
+		if (rc2 < 0)
+			CERROR("cannot restore entry %s after failed unlink: rc = %d\n",
+			       name, rc2);
+		GOTO(unlock, rc);
+	}
+
+	rc = dt_destroy(env, dto, th);
+unlock:
+	dt_write_unlock(env, dto);
+stop:
+	dt_trans_stop(env, dt, th);
+out:
+	lu_object_put_nocache(env, &dto->do_lu);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(local_object_unlink);
+
+/*
+ * Walk ls_los_list looking for the storage that serves sequence "seq".
+ * A match is returned with its reference count raised by one; NULL means
+ * no storage for that sequence is on the list.
+ */
+struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq)
+{
+	struct local_oid_storage *cur;
+
+	list_for_each_entry(cur, &ls->ls_los_list, los_list) {
+		if (cur->los_seq != seq)
+			continue;
+
+		atomic_inc(&cur->los_refcount);
+		return cur;
+	}
+
+	return NULL;
+}
+
+/* Drop one reference on "los"; reaching zero here is treated as a bug. */
+void dt_los_put(struct local_oid_storage *los)
+{
+	if (!atomic_dec_and_test(&los->los_refcount))
+		return;
+
+	/* should never happen, only local_oid_storage_fini should
+	 * drop refcount to zero */
+	LBUG();
+}
+
+/* After the Lustre 2.3 release there may be an old file storing the last
+ * generated FID.  If such a file exists then we have to read its content.
+ *
+ * The file is looked up by name ("seq-<seq>-lastid") in the root directory;
+ * for FID_SEQ_LLOG it is additionally looked up by the fixed FID
+ * {FID_SEQ_LLOG, 1, 0}.  When a file is found and carries LOS_MAGIC,
+ * *first_oid is set from its lso_next_oid field.
+ *
+ * Returns 0 when no old file exists or it was read successfully, -EINVAL on
+ * a magic mismatch, or the negative errno of the failing operation.
+ */
+int lastid_compat_check(const struct lu_env *env, struct dt_device *dev,
+			__u64 lastid_seq, __u32 *first_oid,
+			struct ls_device *ls)
+{
+	struct dt_thread_info *dti = dt_info(env);
+	struct dt_object *root = NULL;
+	struct los_ondisk losd;
+	struct dt_object *o = NULL;
+	int rc = 0;
+
+	rc = dt_root_get(env, dev, &dti->dti_fid);
+	if (rc)
+		return rc;
+
+	root = ls_locate(env, ls, &dti->dti_fid);
+	if (IS_ERR(root))
+		return PTR_ERR(root);
+
+	/* find old last_id file */
+	snprintf(dti->dti_buf, sizeof(dti->dti_buf), "seq-"LPX64"-lastid",
+		 lastid_seq);
+	rc = dt_lookup_dir(env, root, dti->dti_buf, &dti->dti_fid);
+	lu_object_put_nocache(env, &root->do_lu);
+	if (rc == -ENOENT) {
+		/* old llog lastid accessed by FID only */
+		if (lastid_seq != FID_SEQ_LLOG)
+			return 0;
+		dti->dti_fid.f_seq = FID_SEQ_LLOG;
+		dti->dti_fid.f_oid = 1;
+		dti->dti_fid.f_ver = 0;
+		o = ls_locate(env, ls, &dti->dti_fid);
+		if (IS_ERR(o))
+			return PTR_ERR(o);
+
+		if (!dt_object_exists(o)) {
+			lu_object_put_nocache(env, &o->do_lu);
+			return 0;
+		}
+		CDEBUG(D_INFO, "Found old llog lastid file\n");
+	} else if (rc < 0) {
+		return rc;
+	} else {
+		CDEBUG(D_INFO, "Found old lastid file for sequence "LPX64"\n",
+		       lastid_seq);
+		o = ls_locate(env, ls, &dti->dti_fid);
+		if (IS_ERR(o))
+			return PTR_ERR(o);
+	}
+	/* let's read seq-NNNNNN-lastid file value */
+	LASSERT(dt_object_exists(o));
+	dti->dti_off = 0;
+	dti->dti_lb.lb_buf = &losd;
+	dti->dti_lb.lb_len = sizeof(losd);
+	dt_read_lock(env, o, 0);
+	rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off);
+	dt_read_unlock(env, o);
+	if (rc == 0 && le32_to_cpu(losd.lso_magic) != LOS_MAGIC) {
+		CERROR("%s: wrong content of seq-"LPX64"-lastid file, magic %x\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq,
+		       le32_to_cpu(losd.lso_magic));
+		rc = -EINVAL;
+	} else if (rc < 0) {
+		CERROR("%s: failed to read seq-"LPX64"-lastid: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq, rc);
+	}
+	/* the error messages above dereference "o", so the reference may only
+	 * be dropped once they have been printed */
+	lu_object_put_nocache(env, &o->do_lu);
+	if (rc < 0)
+		return rc;
+
+	*first_oid = le32_to_cpu(losd.lso_next_oid);
+	return rc;
+}
+
+/**
+ * Initialize local OID storage for required sequence.
+ * That may be needed for services that use local files and require
+ * dynamic OID allocation for them.
+ *
+ * Per each sequence we have an object with 'first_fid' identifier
+ * containing the counter for OIDs of locally created files with that
+ * sequence.
+ *
+ * It is used now by llog subsystem and MGS for NID tables
+ *
+ * Function gets first_fid to create counter object.
+ * All dynamic fids will be generated with the same sequence and incremented
+ * OIDs
+ *
+ * Returned local_oid_storage is in-memory representation of OID storage
+ *
+ * If a storage for the sequence of first_fid is already linked to the
+ * ls_device, it is returned with one more reference taken by dt_los_find().
+ * Otherwise a new one is allocated, linked, and backed by the on-disk
+ * LAST_ID object, which is created if it does not exist yet.
+ *
+ * Returns 0 with *los set on success or a negative errno; when setting up a
+ * newly allocated storage fails, *los is reset to NULL.
+ */
+int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev,
+ const struct lu_fid *first_fid,
+ struct local_oid_storage **los)
+{
+ struct dt_thread_info *dti = dt_info(env);
+ struct ls_device *ls;
+ obd_id lastid;
+ struct dt_object *o = NULL;
+ struct thandle *th;
+ __u32 first_oid = fid_oid(first_fid);
+ int rc = 0;
+
+ ENTRY;
+
+ ls = ls_device_get(dev);
+ if (IS_ERR(ls))
+ RETURN(PTR_ERR(ls));
+
+ /* ls_los_mutex is held from the lookup until the "out" label */
+ mutex_lock(&ls->ls_los_mutex);
+ *los = dt_los_find(ls, fid_seq(first_fid));
+ if (*los != NULL)
+ GOTO(out, rc = 0);
+
+ /* not found, then create */
+ OBD_ALLOC_PTR(*los);
+ if (*los == NULL)
+ GOTO(out, rc = -ENOMEM);
+
+ atomic_set(&(*los)->los_refcount, 1);
+ mutex_init(&(*los)->los_id_lock);
+ (*los)->los_dev = &ls->ls_top_dev;
+ /* the new storage holds its own reference on the ls device; it is
+ * undone at out_los when the setup below fails */
+ atomic_inc(&ls->ls_refcount);
+ list_add(&(*los)->los_list, &ls->ls_los_list);
+
+ /* Use {seq, 0, 0} to create the LAST_ID file for every
+ * sequence. OIDs start at LUSTRE_FID_INIT_OID.
+ */
+ dti->dti_fid.f_seq = fid_seq(first_fid);
+ dti->dti_fid.f_oid = LUSTRE_FID_LASTID_OID;
+ dti->dti_fid.f_ver = 0;
+ o = ls_locate(env, ls, &dti->dti_fid);
+ if (IS_ERR(o))
+ GOTO(out_los, rc = PTR_ERR(o));
+
+ if (!dt_object_exists(o)) {
+ /* an old-format lastid file may raise first_oid */
+ rc = lastid_compat_check(env, dev, fid_seq(first_fid),
+ &first_oid, ls);
+ if (rc < 0)
+ GOTO(out_los, rc);
+
+ th = dt_trans_create(env, dev);
+ if (IS_ERR(th))
+ GOTO(out_los, rc = PTR_ERR(th));
+
+ dti->dti_attr.la_valid = LA_MODE | LA_TYPE;
+ dti->dti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+ dti->dti_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+ rc = dt_declare_create(env, o, &dti->dti_attr, NULL,
+ &dti->dti_dof, th);
+ if (rc)
+ GOTO(out_trans, rc);
+
+ rc = dt_declare_record_write(env, o, sizeof(lastid), 0, th);
+ if (rc)
+ GOTO(out_trans, rc);
+
+ rc = dt_trans_start_local(env, dev, th);
+ if (rc)
+ GOTO(out_trans, rc);
+
+ dt_write_lock(env, o, 0);
+ /* NOTE(review): when the object shows up here, this path leaves
+ * with rc = 0 without ever assigning "lastid", which is then
+ * read below to set los_last_oid - confirm this cannot happen
+ * or read the stored value first */
+ if (dt_object_exists(o))
+ GOTO(out_lock, rc = 0);
+
+ rc = dt_create(env, o, &dti->dti_attr, NULL, &dti->dti_dof,
+ th);
+ if (rc)
+ GOTO(out_lock, rc);
+
+ /* the counter is stored on disk as a little-endian 64-bit value */
+ lastid = cpu_to_le64(first_oid);
+
+ dti->dti_off = 0;
+ dti->dti_lb.lb_buf = &lastid;
+ dti->dti_lb.lb_len = sizeof(lastid);
+ rc = dt_record_write(env, o, &dti->dti_lb, &dti->dti_off, th);
+ if (rc)
+ GOTO(out_lock, rc);
+out_lock:
+ dt_write_unlock(env, o);
+out_trans:
+ dt_trans_stop(env, dev, th);
+ } else {
+ /* LAST_ID already exists: load the stored counter */
+ dti->dti_off = 0;
+ dti->dti_lb.lb_buf = &lastid;
+ dti->dti_lb.lb_len = sizeof(lastid);
+ dt_read_lock(env, o, 0);
+ rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off);
+ dt_read_unlock(env, o);
+ if (rc == 0 && le64_to_cpu(lastid) > OBIF_MAX_OID) {
+ CERROR("%s: bad oid "LPU64" is read from LAST_ID\n",
+ o->do_lu.lo_dev->ld_obd->obd_name,
+ le64_to_cpu(lastid));
+ rc = -EINVAL;
+ }
+ }
+out_los:
+ if (rc != 0) {
+ /* undo the linking done above and release the object, if any */
+ list_del(&(*los)->los_list);
+ atomic_dec(&ls->ls_refcount);
+ OBD_FREE_PTR(*los);
+ *los = NULL;
+ if (o != NULL && !IS_ERR(o))
+ lu_object_put_nocache(env, &o->do_lu);
+ } else {
+ /* on success the storage keeps the reference on "o" in los_obj */
+ (*los)->los_seq = fid_seq(first_fid);
+ (*los)->los_last_oid = le64_to_cpu(lastid);
+ (*los)->los_obj = o;
+ /* read value should not be less than initial one */
+ LASSERTF((*los)->los_last_oid >= first_oid, "%u < %u\n",
+ (*los)->los_last_oid, first_oid);
+ }
+out:
+ mutex_unlock(&ls->ls_los_mutex);
+ /* pairs with ls_device_get() at the top of the function */
+ ls_device_put(env, ls);
+ return rc;
+}
+EXPORT_SYMBOL(local_oid_storage_init);
+
+/*
+ * Drop one reference on "los".  When that was the last one, the storage is
+ * unlinked from its ls_device and freed under ls_los_mutex, its backing
+ * object (if set) is released, and ls_device_put() is called on the device.
+ */
+void local_oid_storage_fini(const struct lu_env *env,
+ struct local_oid_storage *los)
+{
+ struct ls_device *ls;
+
+ if (!atomic_dec_and_test(&los->los_refcount))
+ return;
+
+ LASSERT(env);
+ LASSERT(los->los_dev);
+ ls = dt2ls_dev(los->los_dev);
+
+ mutex_lock(&ls->ls_los_mutex);
+ /* re-check under the mutex: dt_los_find() may have taken a new
+ * reference between the decrement above and acquiring the lock */
+ if (atomic_read(&los->los_refcount) == 0) {
+ if (los->los_obj)
+ lu_object_put_nocache(env, &los->los_obj->do_lu);
+ list_del(&los->los_list);
+ OBD_FREE_PTR(los);
+ }
+ mutex_unlock(&ls->ls_los_mutex);
+ /* NOTE(review): the device reference is dropped even when the re-check
+ * above found the storage in use again - confirm this is intended */
+ ls_device_put(env, ls);
+}
+EXPORT_SYMBOL(local_oid_storage_fini);
diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.h b/drivers/staging/lustre/lustre/obdclass/local_storage.h
new file mode 100644
index 000000000000..d553c3752703
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/local_storage.h
@@ -0,0 +1,88 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License version 2 for more details. A copy is
+ * included in the COPYING file that accompanied this code.
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/obdclass/local_storage.h
+ *
+ * Local storage for file/objects with fid generation. Works on top of OSD.
+ *
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+#include <dt_object.h>
+#include <obd.h>
+#include <lustre_fid.h>
+#include <lustre_disk.h>
+
+/* Local storage device; local OID storages are attached to it. */
+struct ls_device {
+ /* embedded dt_device; dt2ls_dev() maps it back to the ls_device */
+ struct dt_device ls_top_dev;
+ /* all initialized ls_devices on this node linked by this */
+ struct list_head ls_linkage;
+ /* how many handles reference this local storage */
+ atomic_t ls_refcount;
+ /* underlying OSD device */
+ struct dt_device *ls_osd;
+ /* list of all local OID storages */
+ struct list_head ls_los_list;
+ /* held by local_oid_storage_init()/_fini() around ls_los_list updates */
+ struct mutex ls_los_mutex;
+};
+
+/* Map the embedded ls_top_dev back to the ls_device that contains it. */
+static inline struct ls_device *dt2ls_dev(struct dt_device *d)
+{
+	struct ls_device *ls;
+
+	ls = container_of0(d, struct ls_device, ls_top_dev);
+	return ls;
+}
+
+/* Local storage object: an object header together with its dt_object. */
+struct ls_object {
+ struct lu_object_header ls_header;
+ /* lu2ls_obj() maps ls_obj.do_lu back to the enclosing ls_object */
+ struct dt_object ls_obj;
+};
+
+/* Map the lu_object embedded in ls_obj back to its enclosing ls_object. */
+static inline struct ls_object *lu2ls_obj(struct lu_object *o)
+{
+	struct ls_object *lso;
+
+	lso = container_of0(o, struct ls_object, ls_obj.do_lu);
+	return lso;
+}
+
+/*
+ * Locate the object with the given FID through dt_locate_at(), starting at
+ * the underlying OSD device and asking for the layer of the ls top device.
+ */
+static inline struct dt_object *ls_locate(const struct lu_env *env,
+					  struct ls_device *ls,
+					  const struct lu_fid *fid)
+{
+	struct dt_device *osd = ls->ls_osd;
+
+	return dt_locate_at(env, osd, fid, &ls->ls_top_dev.dd_lu_dev);
+}
+
+/* get the ls_device for a dt_device; callers check the result with IS_ERR() */
+struct ls_device *ls_device_get(struct dt_device *dev);
+/* release what ls_device_get() returned */
+void ls_device_put(const struct lu_env *env, struct ls_device *ls);
+/* find the storage for sequence "seq" on ls_los_list and take a reference */
+struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq);
+/* drop a reference taken by dt_los_find(); must not be the last one */
+void dt_los_put(struct local_oid_storage *los);
+
+/* Lustre 2.3 on-disk structure describing local object OIDs storage
+ * the structure to be used with any sequence managed by
+ * local object library.
+ * Obsoleted since 2.4 but is kept for compatibility reasons,
+ * see lastid_compat_check() in obdclass/local_storage.c */
+struct los_ondisk {
+ /* little-endian on disk; must equal LOS_MAGIC */
+ __u32 lso_magic;
+ /* little-endian on disk; lastid_compat_check() returns it as first_oid */
+ __u32 lso_next_oid;
+};
+
+/* expected value of los_ondisk::lso_magic */
+#define LOS_MAGIC 0xdecafbee
diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c
new file mode 100644
index 000000000000..e2d57fef0da3
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c
@@ -0,0 +1,562 @@
+/* GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: Niu Yawei <niu@whamcloud.com>
+ */
+/*
+ * lustre/obdclass/lprocfs_jobstats.c
+ */
+
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <linux/uaccess.h>
+
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+
+#if defined(LPROCFS)
+
+/*
+ * JobID formats & JobID environment variable names for supported
+ * job schedulers:
+ *
+ * SLURM:
+ * JobID format: 32 bit integer.
+ * JobID env var: SLURM_JOB_ID.
+ * SGE:
+ * JobID format: Decimal integer range to 99999.
+ * JobID env var: JOB_ID.
+ * LSF:
+ * JobID format: 6 digit integer by default (up to 999999), can be
+ * increased to 10 digit (up to 2147483646).
+ * JobID env var: LSB_JOBID.
+ * Loadleveler:
+ * JobID format: String of machine_name.cluster_id.process_id, for
+ * example: fr2n02.32.0
+ * JobID env var: LOADL_STEP_ID.
+ * PBS:
+ * JobID format: String of sequence_number[.server_name][@server].
+ * JobID env var: PBS_JOBID.
+ * Maui/MOAB:
+ * JobID format: Same as PBS.
+ * JobID env var: Same as PBS.
+ */
+
+/* Statistics collected for one job ID on one obd device. */
+struct job_stat {
+ /* linkage in obd_job_stats::ojs_hash */
+ struct hlist_node js_hash;
+ /* linkage in obd_job_stats::ojs_list, protected by ojs_lock */
+ struct list_head js_list;
+ /* job_free() runs when this drops to zero */
+ atomic_t js_refcount;
+ /* NUL-terminated job ID, used as the hash key */
+ char js_jobid[JOBSTATS_JOBID_SIZE];
+ /* refreshed every time an event is logged for the job */
+ time_t js_timestamp; /* seconds */
+ /* per-job counters */
+ struct lprocfs_stats *js_stats;
+ /* owning obd_job_stats */
+ struct obd_job_stats *js_jobstats;
+};
+
+/* Hash callback: djb2 hash over the NUL-terminated job ID string. */
+static unsigned job_stat_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	const char *jobid = key;
+
+	return cfs_hash_djb2_hash(jobid, strlen(jobid), mask);
+}
+
+/* Hash callback: the key of an entry is its job ID string. */
+static void *job_stat_key(struct hlist_node *hnode)
+{
+	return hlist_entry(hnode, struct job_stat, js_hash)->js_jobid;
+}
+
+/* Hash callback: returns 1 when "key" equals the job ID of the entry. */
+static int job_stat_keycmp(const void *key, struct hlist_node *hnode)
+{
+	struct job_stat *entry = hlist_entry(hnode, struct job_stat, js_hash);
+
+	/* same length and same bytes is exactly string equality */
+	return strcmp(entry->js_jobid, key) == 0;
+}
+
+/* Hash callback: turn a hash node into the job_stat that embeds it. */
+static void *job_stat_object(struct hlist_node *hnode)
+{
+	struct job_stat *entry = hlist_entry(hnode, struct job_stat, js_hash);
+
+	return entry;
+}
+
+/* Hash callback: take a reference on the entry behind "hnode". */
+static void job_stat_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	atomic_inc(&hlist_entry(hnode, struct job_stat, js_hash)->js_refcount);
+}
+
+/*
+ * Release a job whose reference count has reached zero: unlink it from the
+ * ojs_list of its owner under ojs_lock, then free its counters and itself.
+ */
+static void job_free(struct job_stat *job)
+{
+	struct obd_job_stats *owner;
+
+	LASSERT(atomic_read(&job->js_refcount) == 0);
+	LASSERT(job->js_jobstats);
+
+	owner = job->js_jobstats;
+	write_lock(&owner->ojs_lock);
+	list_del_init(&job->js_list);
+	write_unlock(&owner->ojs_lock);
+
+	lprocfs_free_stats(&job->js_stats);
+	OBD_FREE_PTR(job);
+}
+
+/* Drop one reference on "job" and free it when that was the last one. */
+static void job_putref(struct job_stat *job)
+{
+	LASSERT(atomic_read(&job->js_refcount) > 0);
+	if (!atomic_dec_and_test(&job->js_refcount))
+		return;
+
+	job_free(job);
+}
+
+/* Hash callback: drop the reference held on the entry behind "hnode". */
+static void job_stat_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	job_putref(hlist_entry(hnode, struct job_stat, js_hash));
+}
+
+/*
+ * Hash callback registered as hs_exit.  The table is emptied by
+ * lprocfs_job_stats_fini() before its last reference is dropped, so being
+ * called for an entry is reported as an error.
+ */
+static void job_stat_exit(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	/* terminate the message so it does not run into the next log line */
+	CERROR("Should not have any items!\n");
+}
+
+/* Callbacks for the per-device job stats hash, keyed by job ID string. */
+static cfs_hash_ops_t job_stats_hash_ops = {
+ .hs_hash = job_stat_hash,
+ .hs_key = job_stat_key,
+ .hs_keycmp = job_stat_keycmp,
+ .hs_object = job_stat_object,
+ .hs_get = job_stat_get,
+ .hs_put_locked = job_stat_put_locked,
+ .hs_exit = job_stat_exit,
+};
+
+/*
+ * Iterator callback used to expire jobs.  "data" points to a time_t cutoff:
+ * zero removes every entry, any other value removes entries whose timestamp
+ * is older than the cutoff.  Always returns 0.
+ */
+static int job_iter_callback(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			     struct hlist_node *hnode, void *data)
+{
+	struct job_stat *entry = hlist_entry(hnode, struct job_stat, js_hash);
+	time_t cutoff = *((time_t *)data);
+	bool expired;
+
+	expired = (cutoff == 0) || (entry->js_timestamp < cutoff);
+	if (expired)
+		cfs_hash_bd_del_locked(hs, bd, hnode);
+
+	return 0;
+}
+
+/*
+ * Expire jobs that were idle for longer than ojs_cleanup_interval seconds.
+ * Nothing is done when the interval is 0.  Unless "force" is set, the scan
+ * only runs once a full interval has passed since the previous one.
+ */
+static void lprocfs_job_cleanup(struct obd_job_stats *stats, bool force)
+{
+	time_t now;
+	time_t cutoff;
+
+	if (stats->ojs_cleanup_interval == 0)
+		return;
+
+	now = cfs_time_current_sec();
+	if (force ||
+	    now >= stats->ojs_last_cleanup + stats->ojs_cleanup_interval) {
+		cutoff = now - stats->ojs_cleanup_interval;
+		cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback,
+				       &cutoff);
+		stats->ojs_last_cleanup = cfs_time_current_sec();
+	}
+}
+
+/*
+ * Allocate a job_stat for "jobid" owned by "jobs".  The counters are
+ * allocated and initialised through ojs_cntr_init_fn, and the entry starts
+ * with a reference count of 1 and is not linked anywhere.
+ *
+ * Only the string itself is copied from "jobid" (truncated to fit), so the
+ * caller may pass a buffer shorter than JOBSTATS_JOBID_SIZE.
+ *
+ * Returns the new entry, or NULL when an allocation fails.
+ */
+static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs)
+{
+	struct job_stat *job;
+	size_t len;
+
+	LASSERT(jobs->ojs_cntr_num && jobs->ojs_cntr_init_fn);
+
+	OBD_ALLOC_PTR(job);
+	if (job == NULL)
+		return NULL;
+
+	job->js_stats = lprocfs_alloc_stats(jobs->ojs_cntr_num, 0);
+	if (job->js_stats == NULL) {
+		OBD_FREE_PTR(job);
+		return NULL;
+	}
+
+	jobs->ojs_cntr_init_fn(job->js_stats);
+
+	/* never read past the terminating NUL of the source string, and
+	 * always leave the stored ID NUL-terminated and zero padded */
+	len = strlen(jobid);
+	if (len >= sizeof(job->js_jobid))
+		len = sizeof(job->js_jobid) - 1;
+	memcpy(job->js_jobid, jobid, len);
+	memset(job->js_jobid + len, 0, sizeof(job->js_jobid) - len);
+
+	job->js_timestamp = cfs_time_current_sec();
+	job->js_jobstats = jobs;
+	INIT_HLIST_NODE(&job->js_hash);
+	INIT_LIST_HEAD(&job->js_list);
+	atomic_set(&job->js_refcount, 1);
+
+	return job;
+}
+
+/*
+ * Add "amount" to counter "event" of the job identified by "jobid" on
+ * device "obd", creating the job entry on first use.
+ *
+ * Returns 0 on success, -EINVAL for a missing, empty or too long job ID and
+ * -ENOMEM when a new entry cannot be allocated.
+ */
+int lprocfs_job_stats_log(struct obd_device *obd, char *jobid,
+ int event, long amount)
+{
+ struct obd_job_stats *stats = &obd->u.obt.obt_jobstats;
+ struct job_stat *job, *job2;
+ ENTRY;
+
+ LASSERT(stats && stats->ojs_hash);
+
+ /* expire idle jobs first; a no-op until the cleanup interval elapsed */
+ lprocfs_job_cleanup(stats, false);
+
+ if (!jobid || !strlen(jobid))
+ RETURN(-EINVAL);
+
+ /* the ID has to fit into js_jobid including the terminating NUL */
+ if (strlen(jobid) >= JOBSTATS_JOBID_SIZE) {
+ CERROR("Invalid jobid size (%lu), expect(%d)\n",
+ (unsigned long)strlen(jobid) + 1, JOBSTATS_JOBID_SIZE);
+ RETURN(-EINVAL);
+ }
+
+ job = cfs_hash_lookup(stats->ojs_hash, jobid);
+ if (job)
+ goto found;
+
+ job = job_alloc(jobid, stats);
+ if (job == NULL)
+ RETURN(-ENOMEM);
+
+ job2 = cfs_hash_findadd_unique(stats->ojs_hash, job->js_jobid,
+ &job->js_hash);
+ if (job2 != job) {
+ /* another thread inserted the same ID first: drop our copy */
+ job_putref(job);
+ job = job2;
+ /* We cannot LASSERT(!list_empty(&job->js_list)) here,
+ * since we just lost the race for inserting "job" into the
+ * ojs_list, and some other thread is doing it _right_now_.
+ * Instead, be content the other thread is doing this, since
+ * "job2" was initialized in job_alloc() already. LU-2163 */
+ } else {
+ LASSERT(list_empty(&job->js_list));
+ write_lock(&stats->ojs_lock);
+ list_add_tail(&job->js_list, &stats->ojs_list);
+ write_unlock(&stats->ojs_lock);
+ }
+
+found:
+ LASSERT(stats == job->js_jobstats);
+ LASSERT(stats->ojs_cntr_num > event);
+ job->js_timestamp = cfs_time_current_sec();
+ lprocfs_counter_add(job->js_stats, event, amount);
+
+ /* presumably pairs with the reference returned by the hash lookup or
+ * insertion above - verify against the cfs_hash implementation */
+ job_putref(job);
+ RETURN(0);
+}
+EXPORT_SYMBOL(lprocfs_job_stats_log);
+
+/*
+ * Tear down the job stats of "obd": remove every job from the hash, drop
+ * the hash reference and clear ojs_hash.  Does nothing when the hash was
+ * never set up or has already been released.
+ */
+void lprocfs_job_stats_fini(struct obd_device *obd)
+{
+	struct obd_job_stats *stats = &obd->u.obt.obt_jobstats;
+	/* a zero cutoff makes job_iter_callback() delete everything */
+	time_t oldest = 0;
+
+	if (stats->ojs_hash != NULL) {
+		cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback,
+				       &oldest);
+		cfs_hash_putref(stats->ojs_hash);
+		stats->ojs_hash = NULL;
+		LASSERT(list_empty(&stats->ojs_list));
+	}
+}
+EXPORT_SYMBOL(lprocfs_job_stats_fini);
+
+/*
+ * seq_file start: takes ojs_lock for reading (released by the stop
+ * callback).  Position 0 yields SEQ_START_TOKEN, position N the N-th job on
+ * ojs_list, and NULL is returned past the end of the list.
+ */
+static void *lprocfs_jobstats_seq_start(struct seq_file *p, loff_t *pos)
+{
+	struct obd_job_stats *stats = p->private;
+	struct job_stat *job;
+	loff_t skip = *pos;
+
+	read_lock(&stats->ojs_lock);
+	if (skip == 0)
+		return SEQ_START_TOKEN;
+
+	/* position 1 is the first job of the list */
+	skip--;
+	list_for_each_entry(job, &stats->ojs_list, js_list) {
+		if (skip == 0)
+			return job;
+		skip--;
+	}
+
+	return NULL;
+}
+
+/* seq_file stop: release the read lock taken by the start callback. */
+static void lprocfs_jobstats_seq_stop(struct seq_file *p, void *v)
+{
+	read_unlock(&((struct obd_job_stats *)p->private)->ojs_lock);
+}
+
+/*
+ * seq_file next: advance *pos and return the job following "v" on ojs_list,
+ * or NULL when "v" was the last one.
+ */
+static void *lprocfs_jobstats_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	struct obd_job_stats *stats = p->private;
+	struct list_head *head = &stats->ojs_list;
+	struct list_head *cursor;
+
+	++*pos;
+
+	/* the start token stands in front of the first list element */
+	if (v == SEQ_START_TOKEN)
+		cursor = head;
+	else
+		cursor = &((struct job_stat *)v)->js_list;
+
+	if (cursor->next == head)
+		return NULL;
+
+	return list_entry(cursor->next, struct job_stat, js_list);
+}
+
+/*
+ * Example of output on MDT:
+ *
+ * job_stats:
+ * - job_id: test_id.222.25844
+ * snapshot_time: 1322494486
+ * open: { samples: 3, unit: reqs }
+ * close: { samples: 3, unit: reqs }
+ * mknod: { samples: 0, unit: reqs }
+ * link: { samples: 0, unit: reqs }
+ * unlink: { samples: 0, unit: reqs }
+ * mkdir: { samples: 0, unit: reqs }
+ * rmdir: { samples: 0, unit: reqs }
+ * rename: { samples: 1, unit: reqs }
+ * getattr: { samples: 7, unit: reqs }
+ * setattr: { samples: 0, unit: reqs }
+ * getxattr: { samples: 0, unit: reqs }
+ * setxattr: { samples: 0, unit: reqs }
+ * statfs: { samples: 0, unit: reqs }
+ * sync: { samples: 0, unit: reqs }
+ *
+ * Example of output on OST:
+ *
+ * job_stats:
+ * - job_id: 4854
+ * snapshot_time: 1322494602
+ * read: { samples: 0, unit: bytes, min: 0, max: 0, sum: 0 }
+ * write: { samples: 1, unit: bytes, min: 10, max: 10, sum: 10 }
+ * setattr: { samples: 0, unit: reqs }
+ * punch: { samples: 0, unit: reqs }
+ * sync: { samples: 0, unit: reqs }
+ */
+
+/* padding printed with "%.*s" after each counter name in the show callback */
+static const char spaces[] = " ";
+
+/*
+ * Number of padding characters needed after "str" to fill a column of
+ * "len" characters; 0 when the string is at least as long as the column.
+ */
+static inline int width(const char *str, int len)
+{
+	int used = (int)strlen(str);
+
+	/* clamp against the requested column width, not a fixed 15 */
+	if (used > len)
+		used = len;
+
+	return len - used;
+}
+
+/*
+ * seq_file show: print the "job_stats:" header for the start token, or one
+ * job entry: its ID, its timestamp and one line per counter in the format
+ * shown in the example output above.  Always returns 0.
+ */
+static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v)
+{
+ struct job_stat *job = v;
+ struct lprocfs_stats *s;
+ struct lprocfs_counter ret;
+ struct lprocfs_counter *cntr;
+ struct lprocfs_counter_header *cntr_header;
+ int i;
+
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(p, "job_stats:\n");
+ return 0;
+ }
+
+ seq_printf(p, "- %-16s %s\n", "job_id:", job->js_jobid);
+ seq_printf(p, " %-16s %ld\n", "snapshot_time:", job->js_timestamp);
+
+ s = job->js_stats;
+ for (i = 0; i < s->ls_num; i++) {
+ /* NOTE(review): "cntr" is assigned but never read in this function */
+ cntr = lprocfs_stats_counter_get(s, 0, i);
+ cntr_header = &s->ls_cnt_header[i];
+ /* "ret" receives the values of counter i that are printed below */
+ lprocfs_stats_collect(s, i, &ret);
+
+ /* pad the counter name so that the columns line up */
+ seq_printf(p, " %s:%.*s { samples: %11"LPF64"u",
+ cntr_header->lc_name,
+ width(cntr_header->lc_name, 15), spaces,
+ ret.lc_count);
+ if (cntr_header->lc_units[0] != '\0')
+ seq_printf(p, ", unit: %5s", cntr_header->lc_units);
+
+ /* min/max/sum are printed as 0 while no sample was recorded */
+ if (cntr_header->lc_config & LPROCFS_CNTR_AVGMINMAX) {
+ seq_printf(p, ", min:%8"LPF64"u, max:%8"LPF64"u,"
+ " sum:%16"LPF64"u",
+ ret.lc_count ? ret.lc_min : 0,
+ ret.lc_count ? ret.lc_max : 0,
+ ret.lc_count ? ret.lc_sum : 0);
+ }
+ if (cntr_header->lc_config & LPROCFS_CNTR_STDDEV) {
+ seq_printf(p, ", sumsq: %18"LPF64"u",
+ ret.lc_count ? ret.lc_sumsquare : 0);
+ }
+
+ seq_printf(p, " }\n");
+
+ }
+ return 0;
+}
+
+/*
+ * seq_file iterator over the job stats of one device.  Written with C99
+ * designated initializers like the other operation tables in this file; the
+ * "field:" form is an obsolete GNU extension.
+ */
+struct seq_operations lprocfs_jobstats_seq_sops = {
+	.start	= lprocfs_jobstats_seq_start,
+	.stop	= lprocfs_jobstats_seq_stop,
+	.next	= lprocfs_jobstats_seq_next,
+	.show	= lprocfs_jobstats_seq_show,
+};
+
+/*
+ * Open the job_stats proc file: set up the seq_file iterator and hand it
+ * the data registered with the proc entry.
+ */
+static int lprocfs_jobstats_seq_open(struct inode *inode, struct file *file)
+{
+	int rc = seq_open(file, &lprocfs_jobstats_seq_sops);
+
+	if (rc == 0) {
+		struct seq_file *seq = file->private_data;
+
+		seq->private = PDE_DATA(inode);
+	}
+
+	return rc;
+}
+
+/*
+ * Write handler of the job_stats proc file.
+ *
+ * Input starting with "clear" removes the statistics of all jobs; any other
+ * input is taken as a job ID (one trailing '\n' is trimmed) whose
+ * statistics are removed.
+ *
+ * The data comes from user space, so it is copied in with copy_from_user()
+ * before it is looked at.  At most sizeof(jobid) - 1 bytes are copied, which
+ * keeps the local buffer NUL-terminated.
+ *
+ * Returns "len" on success, -EINVAL for empty or too long input or an
+ * unknown job ID, and -EFAULT when the user buffer cannot be read.
+ */
+static ssize_t lprocfs_jobstats_seq_write(struct file *file,
+					  const char __user *buf,
+					  size_t len, loff_t *off)
+{
+	struct seq_file *seq = file->private_data;
+	struct obd_job_stats *stats = seq->private;
+	char jobid[JOBSTATS_JOBID_SIZE];
+	size_t copied;
+	int all = 0;
+	struct job_stat *job;
+
+	/* nothing to parse; also keeps jobid[len - 1] below in bounds */
+	if (len == 0)
+		return -EINVAL;
+
+	copied = len < sizeof(jobid) ? len : sizeof(jobid) - 1;
+	memset(jobid, 0, sizeof(jobid));
+	if (copy_from_user(jobid, buf, copied))
+		return -EFAULT;
+
+	/* assumes JOBSTATS_JOBID_SIZE is larger than strlen("clear") -
+	 * TODO confirm against the definition */
+	if (!strncmp(jobid, "clear", strlen("clear"))) {
+		all = 1;
+	} else if (len < JOBSTATS_JOBID_SIZE) {
+		/* Trim '\n' if any */
+		if (jobid[len - 1] == '\n')
+			jobid[len - 1] = '\0';
+	} else {
+		return -EINVAL;
+	}
+
+	LASSERT(stats->ojs_hash);
+	if (all) {
+		/* a zero cutoff makes job_iter_callback() delete everything */
+		time_t oldest = 0;
+
+		cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback,
+				       &oldest);
+		return len;
+	}
+
+	if (!strlen(jobid))
+		return -EINVAL;
+
+	job = cfs_hash_lookup(stats->ojs_hash, jobid);
+	if (!job)
+		return -EINVAL;
+
+	cfs_hash_del_key(stats->ojs_hash, jobid);
+
+	job_putref(job);
+	return len;
+}
+
+/* File operations of the "job_stats" proc entry: seq_file based reads plus
+ * a write handler that clears statistics. */
+struct file_operations lprocfs_jobstats_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = lprocfs_jobstats_seq_open,
+ .read = seq_read,
+ .write = lprocfs_jobstats_seq_write,
+ .llseek = seq_lseek,
+ .release = lprocfs_seq_release,
+};
+
+/*
+ * Set up job statistics for "obd": create the job hash, initialise the list,
+ * lock and cleanup settings (interval 600 seconds) and register the
+ * "job_stats" proc entry.  "cntr_num" and "init_fn" describe the counters
+ * each job gets.
+ *
+ * Only MDT and OST devices are accepted.  Returns 0 on success, -EINVAL for
+ * another device type and -ENOMEM when the hash or the proc entry cannot be
+ * created; in the latter case the hash is released again, so nothing is
+ * left allocated.
+ */
+int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num,
+			   cntr_init_callback init_fn)
+{
+	struct proc_dir_entry *entry;
+	struct obd_job_stats *stats;
+	ENTRY;
+
+	LASSERT(obd->obd_proc_entry != NULL);
+	LASSERT(obd->obd_type->typ_name);
+
+	if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) &&
+	    strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME)) {
+		CERROR("Invalid obd device type.\n");
+		RETURN(-EINVAL);
+	}
+	stats = &obd->u.obt.obt_jobstats;
+
+	LASSERT(stats->ojs_hash == NULL);
+	stats->ojs_hash = cfs_hash_create("JOB_STATS",
+					  HASH_JOB_STATS_CUR_BITS,
+					  HASH_JOB_STATS_MAX_BITS,
+					  HASH_JOB_STATS_BKT_BITS, 0,
+					  CFS_HASH_MIN_THETA,
+					  CFS_HASH_MAX_THETA,
+					  &job_stats_hash_ops,
+					  CFS_HASH_DEFAULT);
+	if (stats->ojs_hash == NULL)
+		RETURN(-ENOMEM);
+
+	INIT_LIST_HEAD(&stats->ojs_list);
+	rwlock_init(&stats->ojs_lock);
+	stats->ojs_cntr_num = cntr_num;
+	stats->ojs_cntr_init_fn = init_fn;
+	stats->ojs_cleanup_interval = 600; /* 10 mins by default */
+	stats->ojs_last_cleanup = cfs_time_current_sec();
+
+	entry = proc_create_data("job_stats", 0644, obd->obd_proc_entry,
+				 &lprocfs_jobstats_seq_fops, stats);
+	if (entry == NULL) {
+		/* do not leave the hash behind; this also resets ojs_hash
+		 * to NULL so a later init attempt passes the LASSERT above */
+		lprocfs_job_stats_fini(obd);
+		RETURN(-ENOMEM);
+	}
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(lprocfs_job_stats_init);
+
+/* Print the job cleanup interval of the obd device passed as "data". */
+int lprocfs_rd_job_interval(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+
+	LASSERT(obd != NULL);
+
+	return seq_printf(m, "%d\n",
+			  obd->u.obt.obt_jobstats.ojs_cleanup_interval);
+}
+EXPORT_SYMBOL(lprocfs_rd_job_interval);
+
+/*
+ * Parse a new job cleanup interval from "buffer", store it for the obd
+ * device passed as "data" and run a forced cleanup right away.
+ *
+ * Returns "count" on success or the error from lprocfs_write_helper().
+ */
+int lprocfs_wr_job_interval(struct file *file, const char *buffer,
+			    unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_job_stats *stats;
+	int val;
+	int rc;
+
+	LASSERT(obd != NULL);
+
+	rc = lprocfs_write_helper(buffer, count, &val);
+	if (rc)
+		return rc;
+
+	stats = &obd->u.obt.obt_jobstats;
+	stats->ojs_cleanup_interval = val;
+	lprocfs_job_cleanup(stats, true);
+
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_job_interval);
+
+#endif /* LPROCFS*/
diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c
new file mode 100644
index 000000000000..f7af3d6a4efc
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c
@@ -0,0 +1,1985 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lprocfs_status.c
+ *
+ * Author: Hariharan Thantry <thantry@users.sourceforge.net>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <obd_class.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+#include <linux/seq_file.h>
+
+#if defined(LPROCFS)
+
+/*
+ * Module parameter.  When non-zero, lprocfs_alloc_stats() adds
+ * LPROCFS_STATS_FLAG_NOPERCPU to the flags of every stats object it
+ * creates, so a single counter set is used instead of one per CPU.
+ */
+static int lprocfs_no_percpu_stats = 0;
+CFS_MODULE_PARM(lprocfs_no_percpu_stats, "i", int, 0644,
+ "Do not alloc percpu data for lprocfs stats");
+
+#define MAX_STRING_SIZE 128
+
+/* Release callback that simply forwards to single_release(). */
+int lprocfs_single_release(struct inode *inode, struct file *file)
+{
+	int rc;
+
+	rc = single_release(inode, file);
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_single_release);
+
+/* Release callback that simply forwards to seq_release(). */
+int lprocfs_seq_release(struct inode *inode, struct file *file)
+{
+	int rc;
+
+	rc = seq_release(inode, file);
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_seq_release);
+
+/* lprocfs API calls */
+
+/*
+ * Create one /proc file called @name under @root.
+ *
+ * The file mode is derived from @fops: readable (0444) when a read
+ * method is present, owner-writable (0200) when a write method is
+ * present.  @data is attached to the new entry.
+ *
+ * Returns the new entry, ERR_PTR(-EINVAL) when @root, @name or @fops is
+ * NULL, or ERR_PTR(-ENOMEM) when the entry cannot be created.
+ */
+proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root,
+				     char *name, void *data,
+				     struct file_operations *fops)
+{
+	proc_dir_entry_t *proc;
+	umode_t mode = 0;
+
+	if (root == NULL || name == NULL || fops == NULL)
+		return ERR_PTR(-EINVAL);
+
+	if (fops->read)
+		mode = 0444;
+	if (fops->write)
+		mode |= 0200;
+	proc = proc_create_data(name, mode, root, fops, data);
+	if (!proc) {
+		/* terminate the message so it does not run into the next one */
+		CERROR("LprocFS: No memory to create /proc entry %s\n", name);
+		return ERR_PTR(-ENOMEM);
+	}
+	return proc;
+}
+EXPORT_SYMBOL(lprocfs_add_simple);
+
+/*
+ * Create a /proc symlink called @name under @parent.  The link target is
+ * built from the printf-style @format and its arguments and is limited
+ * to MAX_STRING_SIZE bytes.
+ *
+ * Returns the new entry, or NULL when @parent or @format is NULL, when
+ * the temporary buffer cannot be allocated, or when proc_symlink() fails.
+ */
+struct proc_dir_entry *lprocfs_add_symlink(const char *name,
+			struct proc_dir_entry *parent, const char *format, ...)
+{
+	struct proc_dir_entry *entry;
+	char *dest;
+	va_list ap;
+
+	if (parent == NULL || format == NULL)
+		return NULL;
+
+	OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1);
+	if (dest == NULL)
+		return NULL;
+
+	va_start(ap, format);
+	vsnprintf(dest, MAX_STRING_SIZE, format, ap);
+	va_end(ap);
+
+	entry = proc_symlink(name, parent, dest);
+	if (entry == NULL)
+		/* terminate the message so it does not run into the next one */
+		CERROR("LprocFS: Could not create symbolic link from %s to %s\n",
+		       name, dest);
+
+	/* the formatted target is only needed for the proc_symlink() call */
+	OBD_FREE(dest, MAX_STRING_SIZE + 1);
+	return entry;
+}
+EXPORT_SYMBOL(lprocfs_add_symlink);
+
+/* Empty fops that lprocfs_add_vars() falls back to for entries without fops. */
+static struct file_operations lprocfs_generic_fops = { };
+
+/**
+ * Create one /proc file per element of a NULL-name-terminated array.
+ *
+ * \param root [in]	Directory entry the new files are created under.
+ * \param list [in]	Array of variable descriptions; the element whose
+ *			name is NULL ends the array.
+ * \param data [in]	Private data attached to every entry that does not
+ *			carry its own data pointer.
+ *
+ * The mode of each file is the descriptor's proc_mode when that is
+ * non-zero; otherwise it is derived from the presence of read/write
+ * methods in the descriptor's fops.
+ *
+ * \retval 0		on success
+ * \retval -EINVAL	if \a root or \a list is NULL
+ * \retval -ENOMEM	if an entry cannot be created; entries created
+ *			before the failure are left in place
+ */
+int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list,
+		     void *data)
+{
+	struct lprocfs_vars *var;
+
+	if (root == NULL || list == NULL)
+		return -EINVAL;
+
+	for (var = list; var->name != NULL; var++) {
+		umode_t mode = 0;
+
+		if (var->proc_mode != 0000) {
+			mode = var->proc_mode;
+		} else if (var->fops) {
+			if (var->fops->read)
+				mode = 0444;
+			if (var->fops->write)
+				mode |= 0200;
+		}
+
+		if (proc_create_data(var->name, mode, root,
+				     var->fops ?: &lprocfs_generic_fops,
+				     var->data ?: data) == NULL)
+			return -ENOMEM;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_add_vars);
+
+/*
+ * Remove the proc entry *@rooth via proc_remove() and reset the caller's
+ * pointer to NULL.
+ */
+void lprocfs_remove(struct proc_dir_entry **rooth)
+{
+	struct proc_dir_entry *victim = *rooth;
+
+	proc_remove(victim);
+	*rooth = NULL;
+}
+EXPORT_SYMBOL(lprocfs_remove);
+
+/* Remove the entry called @name from @parent; @parent must not be NULL. */
+void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent)
+{
+	LASSERT(parent != NULL);
+
+	remove_proc_entry(name, parent);
+}
+EXPORT_SYMBOL(lprocfs_remove_proc_entry);
+
+/*
+ * Create the directory @name under @parent and, when @list is given,
+ * populate it through lprocfs_add_vars() with @data as default private
+ * data.
+ *
+ * Returns the new directory entry.  Note the two distinct failure forms:
+ * NULL when proc_mkdir() fails, and ERR_PTR(rc) when populating the
+ * directory fails (the directory is removed again in that case).
+ */
+struct proc_dir_entry *lprocfs_register(const char *name,
+					struct proc_dir_entry *parent,
+					struct lprocfs_vars *list, void *data)
+{
+	struct proc_dir_entry *dir;
+	int rc;
+
+	dir = proc_mkdir(name, parent);
+	if (dir == NULL || list == NULL)
+		return dir;
+
+	rc = lprocfs_add_vars(dir, list, data);
+	if (rc == 0)
+		return dir;
+
+	lprocfs_remove(&dir);
+	return ERR_PTR(rc);
+}
+EXPORT_SYMBOL(lprocfs_register);
+
+/* Generic callbacks */
+/* Print the unsigned int that @data points to, followed by a newline. */
+int lprocfs_rd_uint(struct seq_file *m, void *data)
+{
+	unsigned int *valp = data;
+
+	return seq_printf(m, "%u\n", *valp);
+}
+EXPORT_SYMBOL(lprocfs_rd_uint);
+
+/*
+ * Write handler: parse an unsigned integer (base auto-detected by
+ * simple_strtoul()) from the user buffer and store it in the unsigned
+ * int that @data points to.
+ *
+ * At most MAX_STRING_SIZE bytes of the user buffer are examined.
+ *
+ * Returns @count on success, -EFAULT if the user buffer cannot be read,
+ * or -EINVAL if the input does not start with a number.
+ */
+int lprocfs_wr_uint(struct file *file, const char __user *buffer,
+		    unsigned long count, void *data)
+{
+	unsigned *p = data;
+	char dummy[MAX_STRING_SIZE + 1], *end;
+	unsigned long tmp;
+	unsigned long len;
+
+	/*
+	 * Copy only what the caller actually supplied: reading a fixed
+	 * MAX_STRING_SIZE bytes would run past a shorter user buffer and
+	 * either fault spuriously or parse unrelated bytes.
+	 */
+	len = min_t(unsigned long, count, MAX_STRING_SIZE);
+	if (copy_from_user(dummy, buffer, len))
+		return -EFAULT;
+	dummy[len] = '\0';
+
+	tmp = simple_strtoul(dummy, &end, 0);
+	if (dummy == end)
+		return -EINVAL;
+
+	*p = (unsigned int)tmp;
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_uint);
+
+/* Print the __u64 that @data points to, followed by a newline. */
+int lprocfs_rd_u64(struct seq_file *m, void *data)
+{
+	__u64 *valp = data;
+
+	return seq_printf(m, LPU64"\n", *valp);
+}
+EXPORT_SYMBOL(lprocfs_rd_u64);
+
+/* Print the current value of the atomic_t that @data points to. */
+int lprocfs_rd_atomic(struct seq_file *m, void *data)
+{
+	atomic_t *counter = data;
+
+	LASSERT(counter != NULL);
+
+	return seq_printf(m, "%d\n", atomic_read(counter));
+}
+EXPORT_SYMBOL(lprocfs_rd_atomic);
+
+/*
+ * Write handler: parse an integer from the user buffer and store it in
+ * the atomic_t that @data points to.  Only strictly positive values are
+ * accepted.
+ *
+ * Returns @count on success, a negative value from
+ * lprocfs_write_helper() on parse failure, or -ERANGE for values <= 0.
+ */
+int lprocfs_wr_atomic(struct file *file, const char __user *buffer,
+		      unsigned long count, void *data)
+{
+	atomic_t *target = data;
+	int parsed = 0;
+	int err;
+
+	err = lprocfs_write_helper(buffer, count, &parsed);
+	if (err < 0)
+		return err;
+	if (parsed <= 0)
+		return -ERANGE;
+
+	atomic_set(target, parsed);
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_wr_atomic);
+
+/* Print the UUID string of the OBD device passed in @data. */
+int lprocfs_rd_uuid(struct seq_file *m, void *data)
+{
+	struct obd_device *dev = data;
+
+	LASSERT(dev != NULL);
+
+	return seq_printf(m, "%s\n", dev->obd_uuid.uuid);
+}
+EXPORT_SYMBOL(lprocfs_rd_uuid);
+
+/* Print the name of the OBD device passed in @data. */
+int lprocfs_rd_name(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+
+	LASSERT(obd != NULL);
+
+	return seq_printf(m, "%s\n", obd->obd_name);
+}
+EXPORT_SYMBOL(lprocfs_rd_name);
+
+/*
+ * Print the block size (os_bsize) reported by obd_statfs() for the OBD
+ * device in @data.  Returns the obd_statfs() error when that call fails,
+ * otherwise the seq_printf() result.
+ */
+int lprocfs_rd_blksize(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_statfs osfs;
+	int rc;
+
+	rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+			cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			OBD_STATFS_NODELAY);
+	if (rc != 0)
+		return rc;
+
+	return seq_printf(m, "%u\n", osfs.os_bsize);
+}
+EXPORT_SYMBOL(lprocfs_rd_blksize);
+
+/*
+ * Print the total number of blocks (os_blocks) reported by obd_statfs(),
+ * scaled from os_bsize-sized blocks to kilobytes.  Returns the
+ * obd_statfs() error when that call fails, otherwise the seq_printf()
+ * result.
+ */
+int lprocfs_rd_kbytestotal(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_statfs osfs;
+	__u32 kb_per_blk;
+	__u64 kbytes;
+	int rc;
+
+	rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+			cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			OBD_STATFS_NODELAY);
+	if (rc != 0)
+		return rc;
+
+	/* double the count once per power of two in (block size / 1024) */
+	kb_per_blk = osfs.os_bsize >> 10;
+	kbytes = osfs.os_blocks;
+	while ((kb_per_blk >>= 1) != 0)
+		kbytes <<= 1;
+
+	return seq_printf(m, LPU64"\n", kbytes);
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytestotal);
+
+/*
+ * Print the number of free blocks (os_bfree) reported by obd_statfs(),
+ * scaled from os_bsize-sized blocks to kilobytes.  Returns the
+ * obd_statfs() error when that call fails, otherwise the seq_printf()
+ * result.
+ */
+int lprocfs_rd_kbytesfree(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_statfs osfs;
+	__u32 kb_per_blk;
+	__u64 kbytes;
+	int rc;
+
+	rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+			cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			OBD_STATFS_NODELAY);
+	if (rc != 0)
+		return rc;
+
+	/* double the count once per power of two in (block size / 1024) */
+	kb_per_blk = osfs.os_bsize >> 10;
+	kbytes = osfs.os_bfree;
+	while ((kb_per_blk >>= 1) != 0)
+		kbytes <<= 1;
+
+	return seq_printf(m, LPU64"\n", kbytes);
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytesfree);
+
+/*
+ * Print the number of available blocks (os_bavail) reported by
+ * obd_statfs(), scaled from os_bsize-sized blocks to kilobytes.  Returns
+ * the obd_statfs() error when that call fails, otherwise the
+ * seq_printf() result.
+ */
+int lprocfs_rd_kbytesavail(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_statfs osfs;
+	__u32 kb_per_blk;
+	__u64 kbytes;
+	int rc;
+
+	rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+			cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			OBD_STATFS_NODELAY);
+	if (rc != 0)
+		return rc;
+
+	/* double the count once per power of two in (block size / 1024) */
+	kb_per_blk = osfs.os_bsize >> 10;
+	kbytes = osfs.os_bavail;
+	while ((kb_per_blk >>= 1) != 0)
+		kbytes <<= 1;
+
+	return seq_printf(m, LPU64"\n", kbytes);
+}
+EXPORT_SYMBOL(lprocfs_rd_kbytesavail);
+
+/*
+ * Print os_files as reported by obd_statfs() for the OBD device in
+ * @data.  Returns the obd_statfs() error when that call fails, otherwise
+ * the seq_printf() result.
+ */
+int lprocfs_rd_filestotal(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_statfs osfs;
+	int rc;
+
+	rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+			cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			OBD_STATFS_NODELAY);
+	if (rc != 0)
+		return rc;
+
+	return seq_printf(m, LPU64"\n", osfs.os_files);
+}
+EXPORT_SYMBOL(lprocfs_rd_filestotal);
+
+/*
+ * Print os_ffree as reported by obd_statfs() for the OBD device in
+ * @data.  Returns the obd_statfs() error when that call fails, otherwise
+ * the seq_printf() result.
+ */
+int lprocfs_rd_filesfree(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_statfs osfs;
+	int rc;
+
+	rc = obd_statfs(NULL, obd->obd_self_export, &osfs,
+			cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+			OBD_STATFS_NODELAY);
+	if (rc != 0)
+		return rc;
+
+	return seq_printf(m, LPU64"\n", osfs.os_ffree);
+}
+EXPORT_SYMBOL(lprocfs_rd_filesfree);
+
+/*
+ * Print one line with the client's target (obd2cli_tgt()), the name of
+ * the current import state and, when the import is deactivated, a
+ * trailing "DEACTIVATED" marker.  The import is accessed between
+ * LPROCFS_CLIMP_CHECK() and LPROCFS_CLIMP_EXIT().
+ */
+int lprocfs_rd_server_uuid(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_import *imp;
+	const char *deactivated;
+	char *state_name;
+	int rc;
+
+	LASSERT(obd != NULL);
+	LPROCFS_CLIMP_CHECK(obd);
+
+	imp = obd->u.cli.cl_import;
+	state_name = ptlrpc_import_state_name(imp->imp_state);
+	if (imp->imp_deactive)
+		deactivated = "\tDEACTIVATED";
+	else
+		deactivated = "";
+	rc = seq_printf(m, "%s\t%s%s\n", obd2cli_tgt(obd), state_name,
+			deactivated);
+
+	LPROCFS_CLIMP_EXIT(obd);
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_server_uuid);
+
+/*
+ * Print the remote UUID of the import's current connection, or "<none>"
+ * when there is no import or no connection.  The import is accessed
+ * between LPROCFS_CLIMP_CHECK() and LPROCFS_CLIMP_EXIT().
+ *
+ * Returns the seq_printf() result.
+ */
+int lprocfs_rd_conn_uuid(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_import *imp;
+	struct ptlrpc_connection *conn = NULL;
+	int rc = 0;
+
+	LASSERT(obd != NULL);
+
+	LPROCFS_CLIMP_CHECK(obd);
+	/* test the import before looking through it, not afterwards */
+	imp = obd->u.cli.cl_import;
+	if (imp != NULL)
+		conn = imp->imp_connection;
+
+	if (conn != NULL)
+		rc = seq_printf(m, "%s\n", conn->c_remote_uuid.uuid);
+	else
+		rc = seq_printf(m, "%s\n", "<none>");
+
+	LPROCFS_CLIMP_EXIT(obd);
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_rd_conn_uuid);
+
+/**
+ * Add up the per-cpu instances of counter \a idx into \a cnt.
+ *
+ * \a cnt is zeroed first.  lc_count, lc_sum and lc_sumsquare are summed
+ * over all allocated per-cpu slots; lc_min and lc_max are the minimum
+ * and maximum over those slots (lc_min starts at LC_MIN_INIT).
+ *
+ * When \a stats is NULL, \a cnt is left zeroed except for lc_count,
+ * which is set to 1 so that callers dividing by it do not fault.
+ */
+void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx,
+			   struct lprocfs_counter *cnt)
+{
+	unsigned int num_entry;
+	struct lprocfs_counter *percpu_cntr;
+	unsigned int i;
+	unsigned long flags = 0;
+
+	memset(cnt, 0, sizeof(*cnt));
+
+	if (stats == NULL) {
+		/* set count to 1 to avoid divide-by-zero errs in callers */
+		cnt->lc_count = 1;
+		return;
+	}
+
+	cnt->lc_min = LC_MIN_INIT;
+
+	num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+
+	for (i = 0; i < num_entry; i++) {
+		/* slots that were never allocated hold no samples */
+		if (stats->ls_percpu[i] == NULL)
+			continue;
+		percpu_cntr = lprocfs_stats_counter_get(stats, i, idx);
+
+		cnt->lc_count += percpu_cntr->lc_count;
+		cnt->lc_sum += percpu_cntr->lc_sum;
+		if (percpu_cntr->lc_min < cnt->lc_min)
+			cnt->lc_min = percpu_cntr->lc_min;
+		if (percpu_cntr->lc_max > cnt->lc_max)
+			cnt->lc_max = percpu_cntr->lc_max;
+		cnt->lc_sumsquare += percpu_cntr->lc_sumsquare;
+	}
+
+	lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_stats_collect);
+
+/**
+ * Print a comma separated list of the import flags that are currently
+ * set ("no_recov", "invalid", "deactive", "replayable", "pingable") to
+ * the seq_file.
+ *
+ * flag2str() relies on the local names \a imp and \a m.  It prints the
+ * flag name when imp->imp_<flag> is set, preceded by ", " unless it is
+ * the first item, and clears \a first only when something was printed,
+ * so the list never starts with a separator.
+ */
+#define flag2str(flag, first)						\
+	do {								\
+		if (imp->imp_##flag) {					\
+			seq_printf(m, "%s" #flag, first ? "" : ", ");	\
+			first = false;					\
+		}							\
+	} while (0)
+static int obd_import_flags2str(struct obd_import *imp, struct seq_file *m)
+{
+	bool first = true;
+
+	if (imp->imp_obd->obd_no_recov) {
+		seq_printf(m, "no_recov");
+		first = false;
+	}
+
+	flag2str(invalid, first);
+	flag2str(deactive, first);
+	flag2str(replayable, first);
+	flag2str(pingable, first);
+	return 0;
+}
+#undef flag2str
+
+/*
+ * Names of the connect flag bits, indexed by bit position (entry 0 names
+ * bit 0).  The array is NULL-terminated; obd_connect_seq_flags2str() and
+ * obd_connect_flags2str() walk it while shifting a one-bit mask left once
+ * per entry, so the order of the entries must follow the bit order.
+ */
+static const char *obd_connect_names[] = {
+ "read_only",
+ "lov_index",
+ "unused",
+ "write_grant",
+ "server_lock",
+ "version",
+ "request_portal",
+ "acl",
+ "xattr",
+ "create_on_write",
+ "truncate_lock",
+ "initial_transno",
+ "inode_bit_locks",
+ "join_file(obsolete)",
+ "getattr_by_fid",
+ "no_oh_for_devices",
+ "remote_client",
+ "remote_client_by_force",
+ "max_byte_per_rpc",
+ "64bit_qdata",
+ "mds_capability",
+ "oss_capability",
+ "early_lock_cancel",
+ "som",
+ "adaptive_timeouts",
+ "lru_resize",
+ "mds_mds_connection",
+ "real_conn",
+ "change_qunit_size",
+ "alt_checksum_algorithm",
+ "fid_is_enabled",
+ "version_recovery",
+ "pools",
+ "grant_shrink",
+ "skip_orphan",
+ "large_ea",
+ "full20",
+ "layout_lock",
+ "64bithash",
+ "object_max_bytes",
+ "imp_recov",
+ "jobstats",
+ "umask",
+ "einprogress",
+ "grant_param",
+ "flock_owner",
+ "lvb_type",
+ "nanoseconds_times",
+ "lightweight_conn",
+ "short_io",
+ "pingless",
+ "unknown",
+ NULL
+};
+
+/*
+ * Print the names of all bits set in @flags to the seq_file, with @sep
+ * between consecutive items (nothing before the first one).  Bits beyond
+ * the last named entry of obd_connect_names[] are reported together as
+ * "unknown flags <hex>".
+ */
+static void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, char *sep)
+{
+	__u64 mask = 1;
+	int i;
+	bool first = true;
+
+	for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) {
+		if (flags & mask) {
+			/* the separator goes between items, not before the first */
+			seq_printf(m, "%s%s",
+				   first ? "" : sep, obd_connect_names[i]);
+			first = false;
+		}
+	}
+	/* mask now selects the first bit that has no name */
+	if (flags & ~(mask - 1))
+		seq_printf(m, "%sunknown flags "LPX64,
+			   first ? "" : sep, flags & ~(mask - 1));
+}
+
+/*
+ * Format the names of all bits set in @flags into @page (capacity
+ * @count bytes), with @sep between consecutive items.  Bits beyond the
+ * last named entry of obd_connect_names[] are reported together as
+ * "unknown flags <hex>".
+ *
+ * Formatting stops as soon as the buffer is exhausted, so snprintf() is
+ * never called with a negative remaining size.
+ *
+ * Returns the accumulated snprintf() length; a value >= @count means the
+ * output was truncated.
+ */
+int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep)
+{
+	__u64 mask = 1;
+	int i, ret = 0;
+
+	for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) {
+		if (!(flags & mask))
+			continue;
+		/* no room left: stop instead of passing a negative size */
+		if (ret >= count)
+			return ret;
+		ret += snprintf(page + ret, count - ret, "%s%s",
+				ret ? sep : "", obd_connect_names[i]);
+	}
+	/* mask now selects the first bit that has no name */
+	if ((flags & ~(mask - 1)) && ret < count)
+		ret += snprintf(page + ret, count - ret,
+				"%sunknown flags "LPX64,
+				ret ? sep : "", flags & ~(mask - 1));
+	return ret;
+}
+EXPORT_SYMBOL(obd_connect_flags2str);
+
+/*
+ * seq_file read handler: dump the state of the client import of the OBD
+ * device in @data as an indented "import:" report.  The sections are
+ * printed in this order: name/target/state/instance and connect flags,
+ * import flags, connection info (failover NIDs, current connection,
+ * attempt/generation/invalidation counters), and -- only when the device
+ * has obd_svc_stats -- rpc counters, service estimates, transaction
+ * numbers and per-direction data averages.
+ *
+ * The import is accessed between LPROCFS_CLIMP_CHECK() and
+ * LPROCFS_CLIMP_EXIT().  Always returns 0 once the import check passed.
+ */
+int lprocfs_rd_import(struct seq_file *m, void *data)
+{
+ struct lprocfs_counter ret;
+ struct lprocfs_counter_header *header;
+ struct obd_device *obd = (struct obd_device *)data;
+ struct obd_import *imp;
+ struct obd_import_conn *conn;
+ int j;
+ int k;
+ int rw = 0;
+
+ LASSERT(obd != NULL);
+ LPROCFS_CLIMP_CHECK(obd);
+ imp = obd->u.cli.cl_import;
+
+ seq_printf(m,
+ "import:\n"
+ " name: %s\n"
+ " target: %s\n"
+ " state: %s\n"
+ " instance: %u\n"
+ " connect_flags: [",
+ obd->obd_name,
+ obd2cli_tgt(obd),
+ ptlrpc_import_state_name(imp->imp_state),
+ imp->imp_connect_data.ocd_instance);
+ obd_connect_seq_flags2str(m, imp->imp_connect_data.ocd_connect_flags, ", ");
+ seq_printf(m,
+ "]\n"
+ " import_flags: [");
+ obd_import_flags2str(imp, m);
+
+ seq_printf(m,
+ "]\n"
+ " connection:\n"
+ " failover_nids: [");
+ /* imp_lock is held while the connection list and counters are printed */
+ spin_lock(&imp->imp_lock);
+ j = 0;
+ list_for_each_entry(conn, &imp->imp_conn_list, oic_item) {
+ seq_printf(m, "%s%s", j ? ", " : "",
+ libcfs_nid2str(conn->oic_conn->c_peer.nid));
+ j++;
+ }
+ seq_printf(m,
+ "]\n"
+ " current_connection: %s\n"
+ " connection_attempts: %u\n"
+ " generation: %u\n"
+ " in-progress_invalidations: %u\n",
+ imp->imp_connection == NULL ? "<none>" :
+ libcfs_nid2str(imp->imp_connection->c_peer.nid),
+ imp->imp_conn_cnt,
+ imp->imp_generation,
+ atomic_read(&imp->imp_inval_count));
+ spin_unlock(&imp->imp_lock);
+
+ /* everything below is derived from the service stats; skip if absent */
+ if (obd->obd_svc_stats == NULL)
+ goto out_climp;
+
+ header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR];
+ lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret);
+ /* turn the collected sum into an average; ret.lc_sum is reused for it */
+ if (ret.lc_count != 0) {
+ /* first argument to do_div MUST be __u64 */
+ __u64 sum = ret.lc_sum;
+ do_div(sum, ret.lc_count);
+ ret.lc_sum = sum;
+ } else
+ ret.lc_sum = 0;
+ seq_printf(m,
+ " rpcs:\n"
+ " inflight: %u\n"
+ " unregistering: %u\n"
+ " timeouts: %u\n"
+ " avg_waittime: "LPU64" %s\n",
+ atomic_read(&imp->imp_inflight),
+ atomic_read(&imp->imp_unregistering),
+ atomic_read(&imp->imp_timeouts),
+ ret.lc_sum, header->lc_units);
+
+ /* k becomes the largest service estimate over the portals in use */
+ k = 0;
+ for(j = 0; j < IMP_AT_MAX_PORTALS; j++) {
+ if (imp->imp_at.iat_portal[j] == 0)
+ break;
+ k = max_t(unsigned int, k,
+ at_get(&imp->imp_at.iat_service_estimate[j]));
+ }
+ seq_printf(m,
+ " service_estimates:\n"
+ " services: %u sec\n"
+ " network: %u sec\n",
+ k,
+ at_get(&imp->imp_at.iat_net_latency));
+
+ seq_printf(m,
+ " transactions:\n"
+ " last_replay: "LPU64"\n"
+ " peer_committed: "LPU64"\n"
+ " last_checked: "LPU64"\n",
+ imp->imp_last_replay_transno,
+ imp->imp_peer_committed_transno,
+ imp->imp_last_transno_checked);
+
+ /* avg data rates */
+ /* rw == 0 reports the read counters, rw == 1 the write counters */
+ for (rw = 0; rw <= 1; rw++) {
+ lprocfs_stats_collect(obd->obd_svc_stats,
+ PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw,
+ &ret);
+ if (ret.lc_sum > 0 && ret.lc_count > 0) {
+ /* first argument to do_div MUST be __u64 */
+ __u64 sum = ret.lc_sum;
+ do_div(sum, ret.lc_count);
+ ret.lc_sum = sum;
+ seq_printf(m,
+ " %s_data_averages:\n"
+ " bytes_per_rpc: "LPU64"\n",
+ rw ? "write" : "read",
+ ret.lc_sum);
+ }
+ /*
+ * k keeps the value of ret.lc_sum at this point: the per-rpc byte
+ * average when the branch above ran, the raw collected sum
+ * otherwise.
+ */
+ k = (int)ret.lc_sum;
+ j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES;
+ header = &obd->obd_svc_stats->ls_cnt_header[j];
+ lprocfs_stats_collect(obd->obd_svc_stats, j, &ret);
+ if (ret.lc_sum > 0 && ret.lc_count != 0) {
+ /* first argument to do_div MUST be __u64 */
+ __u64 sum = ret.lc_sum;
+ do_div(sum, ret.lc_count);
+ ret.lc_sum = sum;
+ seq_printf(m,
+ " %s_per_rpc: "LPU64"\n",
+ header->lc_units, ret.lc_sum);
+ /* j is reused as the divisor for the rate printed below */
+ j = (int)ret.lc_sum;
+ if (j > 0)
+ seq_printf(m,
+ " MB_per_sec: %u.%.02u\n",
+ k / j, (100 * k / j) % 100);
+ }
+ }
+
+out_climp:
+ LPROCFS_CLIMP_EXIT(obd);
+ return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_import);
+
+/*
+ * Print the current import state followed by the recorded state history.
+ * The history ring is walked starting at imp_state_hist_idx and entries
+ * whose state is 0 are skipped.  The import is accessed between
+ * LPROCFS_CLIMP_CHECK() and LPROCFS_CLIMP_EXIT().  Always returns 0 once
+ * the import check passed.
+ */
+int lprocfs_rd_state(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_import *imp;
+	int first, n;
+
+	LASSERT(obd != NULL);
+	LPROCFS_CLIMP_CHECK(obd);
+	imp = obd->u.cli.cl_import;
+
+	seq_printf(m, "current_state: %s\n",
+		   ptlrpc_import_state_name(imp->imp_state));
+	seq_printf(m, "state_history:\n");
+
+	first = imp->imp_state_hist_idx;
+	for (n = 0; n < IMP_STATE_HIST_LEN; n++) {
+		struct import_state_hist *ish;
+
+		ish = &imp->imp_state_hist[(first + n) % IMP_STATE_HIST_LEN];
+		if (ish->ish_state != 0)
+			seq_printf(m, " - ["CFS_TIME_T", %s]\n",
+				   ish->ish_time,
+				   ptlrpc_import_state_name(ish->ish_state));
+	}
+
+	LPROCFS_CLIMP_EXIT(obd);
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_state);
+
+/*
+ * Print the AT_BINS history values of @at on one line, each in a
+ * three-character column, and terminate the line.  Always returns 0.
+ */
+int lprocfs_at_hist_helper(struct seq_file *m, struct adaptive_timeout *at)
+{
+	int bin = 0;
+
+	while (bin < AT_BINS) {
+		seq_printf(m, "%3u ", at->at_hist[bin]);
+		bin++;
+	}
+	seq_printf(m, "\n");
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_at_hist_helper);
+
+/*
+ * Print adaptive-timeout information for the client import: time of the
+ * last reply, then current/worst values and history for the network
+ * latency and for each portal in use (the portal list ends at the first
+ * entry that is 0).  The import is accessed between
+ * LPROCFS_CLIMP_CHECK() and LPROCFS_CLIMP_EXIT().  Always returns 0 once
+ * the import check passed.
+ *
+ * See also ptlrpc_lprocfs_rd_timeouts
+ */
+int lprocfs_rd_timeouts(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct obd_import *imp;
+	struct adaptive_timeout *at;
+	unsigned int cur, worst;
+	time_t now, worstt;
+	struct dhms ts;
+	int portal;
+
+	LASSERT(obd != NULL);
+	LPROCFS_CLIMP_CHECK(obd);
+	imp = obd->u.cli.cl_import;
+
+	now = cfs_time_current_sec();
+
+	/* Some network health info for kicks */
+	s2dhms(&ts, now - imp->imp_last_reply_time);
+	seq_printf(m, "%-10s : %ld, "DHMS_FMT" ago\n",
+		   "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts));
+
+	at = &imp->imp_at.iat_net_latency;
+	cur = at_get(at);
+	worst = at->at_worst_ever;
+	worstt = at->at_worst_time;
+	s2dhms(&ts, now - worstt);
+	seq_printf(m, "%-10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ",
+		   "network", cur, worst, worstt, DHMS_VARS(&ts));
+	lprocfs_at_hist_helper(m, at);
+
+	for (portal = 0; portal < IMP_AT_MAX_PORTALS; portal++) {
+		if (imp->imp_at.iat_portal[portal] == 0)
+			break;
+
+		at = &imp->imp_at.iat_service_estimate[portal];
+		cur = at_get(at);
+		worst = at->at_worst_ever;
+		worstt = at->at_worst_time;
+		s2dhms(&ts, now - worstt);
+		seq_printf(m, "portal %-2d : cur %3u worst %3u (at %ld, "
+			   DHMS_FMT" ago) ", imp->imp_at.iat_portal[portal],
+			   cur, worst, worstt, DHMS_VARS(&ts));
+		lprocfs_at_hist_helper(m, at);
+	}
+
+	LPROCFS_CLIMP_EXIT(obd);
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_timeouts);
+
+/*
+ * Print the import's connect flags, first as a "flags=<hex>" line and
+ * then by name through obd_connect_seq_flags2str() with "\n" as the
+ * separator.  The import is accessed between LPROCFS_CLIMP_CHECK() and
+ * LPROCFS_CLIMP_EXIT().  Always returns 0 once the import check passed.
+ */
+int lprocfs_rd_connect_flags(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	__u64 connect_flags;
+
+	LPROCFS_CLIMP_CHECK(obd);
+
+	connect_flags =
+		obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags;
+	seq_printf(m, "flags="LPX64"\n", connect_flags);
+	obd_connect_seq_flags2str(m, connect_flags, "\n");
+	seq_printf(m, "\n");
+
+	LPROCFS_CLIMP_EXIT(obd);
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_rd_connect_flags);
+
+/* Print obd_num_exports of the OBD device passed in @data. */
+int lprocfs_rd_num_exports(struct seq_file *m, void *data)
+{
+	struct obd_device *dev = data;
+
+	LASSERT(dev != NULL);
+
+	return seq_printf(m, "%u\n", dev->obd_num_exports);
+}
+EXPORT_SYMBOL(lprocfs_rd_num_exports);
+
+/* Print typ_refcnt of the obd_type passed in @data. */
+int lprocfs_rd_numrefs(struct seq_file *m, void *data)
+{
+	struct obd_type *type = data;
+
+	LASSERT(type != NULL);
+
+	return seq_printf(m, "%d\n", type->typ_refcnt);
+}
+EXPORT_SYMBOL(lprocfs_rd_numrefs);
+
+/*
+ * Register the /proc directory of @obd (named after the device, placed
+ * under its type's typ_procroot) and populate it from @list with the
+ * device itself as private data.
+ *
+ * Returns 0 and stores the directory in obd->obd_proc_entry unless
+ * lprocfs_register() returns an ERR_PTR; in that case the error is
+ * logged, obd_proc_entry is reset to NULL and the error is returned.
+ * A NULL result from lprocfs_register() is stored as is and reported as
+ * success.
+ */
+int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list)
+{
+	struct proc_dir_entry *dir;
+	int rc;
+
+	LASSERT(obd != NULL);
+	LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC);
+	LASSERT(obd->obd_type->typ_procroot != NULL);
+
+	dir = lprocfs_register(obd->obd_name, obd->obd_type->typ_procroot,
+			       list, obd);
+	if (!IS_ERR(dir)) {
+		obd->obd_proc_entry = dir;
+		return 0;
+	}
+
+	rc = PTR_ERR(dir);
+	CERROR("error %d setting up lprocfs for %s\n",rc,obd->obd_name);
+	obd->obd_proc_entry = NULL;
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_obd_setup);
+
+/*
+ * Remove the exports directory and the main /proc directory of @obd, if
+ * present, and reset both pointers to NULL.
+ *
+ * Returns 0, or -EINVAL when @obd is NULL.
+ */
+int lprocfs_obd_cleanup(struct obd_device *obd)
+{
+	if (obd == NULL)
+		return -EINVAL;
+
+	/* Should be no exports left */
+	if (obd->obd_proc_exports_entry != NULL) {
+		lprocfs_remove(&obd->obd_proc_exports_entry);
+		obd->obd_proc_exports_entry = NULL;
+	}
+
+	if (obd->obd_proc_entry != NULL) {
+		lprocfs_remove(&obd->obd_proc_entry);
+		obd->obd_proc_entry = NULL;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_obd_cleanup);
+
+/*
+ * Release one per-NID statistics object: its proc entry, its stats and
+ * ldlm stats (each only if present) and finally the object itself.  The
+ * export reference count of the object must already be zero.
+ */
+static void lprocfs_free_client_stats(struct nid_stat *client_stat)
+{
+	CDEBUG(D_CONFIG, "stat %p - data %p/%p\n", client_stat,
+	       client_stat->nid_proc, client_stat->nid_stats);
+
+	LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0,
+		 "nid %s:count %d\n", libcfs_nid2str(client_stat->nid),
+		 atomic_read(&client_stat->nid_exp_ref_count));
+
+	if (client_stat->nid_proc != NULL)
+		lprocfs_remove(&client_stat->nid_proc);
+	if (client_stat->nid_stats != NULL)
+		lprocfs_free_stats(&client_stat->nid_stats);
+	if (client_stat->nid_ldlm_stats != NULL)
+		lprocfs_free_stats(&client_stat->nid_ldlm_stats);
+
+	OBD_FREE_PTR(client_stat);
+}
+
+/*
+ * Drain obd->obd_nid_stats: each entry is unlinked from the list,
+ * deleted from the device's NID stats hash and freed.
+ */
+void lprocfs_free_per_client_stats(struct obd_device *obd)
+{
+	cfs_hash_t *nid_hash = obd->obd_nid_stats_hash;
+	struct nid_stat *victim;
+	ENTRY;
+
+	/* A separate list is kept because hash_exit is called too early. */
+	/* No locking is needed because all clients are gone by now. */
+	while (!list_empty(&obd->obd_nid_stats)) {
+		victim = list_entry(obd->obd_nid_stats.next,
+				    struct nid_stat, nid_list);
+		list_del_init(&victim->nid_list);
+		cfs_hash_del(nid_hash, &victim->nid, &victim->nid_hash);
+		lprocfs_free_client_stats(victim);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(lprocfs_free_per_client_stats);
+
+/*
+ * Allocate a stats object with @num counters.
+ *
+ * With LPROCFS_STATS_FLAG_NOPERCPU (forced when the module parameter
+ * lprocfs_no_percpu_stats is non-zero) a single counter set is allocated
+ * up front.  Otherwise one pointer slot per possible CPU is reserved; the
+ * per-cpu counter sets themselves are allocated here only when
+ * LPROCFS_STATS_FLAG_IRQ_SAFE is set.
+ *
+ * Returns the new object, or NULL when @num is 0 or an allocation fails
+ * (partially built objects are released through lprocfs_free_stats()).
+ */
+struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num,
+ enum lprocfs_stats_flags flags)
+{
+ struct lprocfs_stats *stats;
+ unsigned int num_entry;
+ unsigned int percpusize = 0;
+ int i;
+
+ if (num == 0)
+ return NULL;
+
+ if (lprocfs_no_percpu_stats != 0)
+ flags |= LPROCFS_STATS_FLAG_NOPERCPU;
+
+ if (flags & LPROCFS_STATS_FLAG_NOPERCPU)
+ num_entry = 1;
+ else
+ num_entry = num_possible_cpus();
+
+ /* alloc percpu pointers for all possible cpu slots */
+ LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry]));
+ if (stats == NULL)
+ return NULL;
+
+ /* ls_num and ls_flags are read by lprocfs_free_stats() on the fail path */
+ stats->ls_num = num;
+ stats->ls_flags = flags;
+ spin_lock_init(&stats->ls_lock);
+
+ /* alloc num of counter headers */
+ LIBCFS_ALLOC(stats->ls_cnt_header,
+ stats->ls_num * sizeof(struct lprocfs_counter_header));
+ if (stats->ls_cnt_header == NULL)
+ goto fail;
+
+ if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) {
+ /* contains only one set counters */
+ percpusize = lprocfs_stats_counter_size(stats);
+ LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize);
+ if (stats->ls_percpu[0] == NULL)
+ goto fail;
+ stats->ls_biggest_alloc_num = 1;
+ } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) {
+ /* alloc all percpu data, currently only obd_memory use this */
+ for (i = 0; i < num_entry; ++i)
+ if (lprocfs_stats_alloc_one(stats, i) < 0)
+ goto fail;
+ }
+
+ return stats;
+
+fail:
+ lprocfs_free_stats(&stats);
+ return NULL;
+}
+EXPORT_SYMBOL(lprocfs_alloc_stats);
+
+/*
+ * Free the stats object *@statsh together with its counter headers and
+ * every allocated per-cpu counter set, and reset the caller's pointer to
+ * NULL.  Nothing is done when the pointer is already NULL or the object
+ * has no counters.
+ */
+void lprocfs_free_stats(struct lprocfs_stats **statsh)
+{
+	struct lprocfs_stats *stats = *statsh;
+	unsigned int nslots;
+	unsigned int slot_size;
+	unsigned int slot;
+
+	if (stats == NULL || stats->ls_num == 0)
+		return;
+	*statsh = NULL;
+
+	if (stats->ls_flags & LPROCFS_STATS_FLAG_NOPERCPU)
+		nslots = 1;
+	else
+		nslots = num_possible_cpus();
+
+	slot_size = lprocfs_stats_counter_size(stats);
+	for (slot = 0; slot < nslots; slot++) {
+		if (stats->ls_percpu[slot] == NULL)
+			continue;
+		LIBCFS_FREE(stats->ls_percpu[slot], slot_size);
+	}
+
+	if (stats->ls_cnt_header != NULL)
+		LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num *
+			    sizeof(struct lprocfs_counter_header));
+
+	LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[nslots]));
+}
+EXPORT_SYMBOL(lprocfs_free_stats);
+
+/*
+ * Reset every counter of @stats in every allocated per-cpu slot: count,
+ * max, sum and sum of squares go to 0 and min goes back to LC_MIN_INIT.
+ * lc_sum_irq is cleared as well when the stats are IRQ safe.  The work
+ * is done between lprocfs_stats_lock() and lprocfs_stats_unlock().
+ */
+void lprocfs_clear_stats(struct lprocfs_stats *stats)
+{
+	struct lprocfs_counter *percpu_cntr;
+	unsigned int i;
+	unsigned int j;
+	unsigned int num_entry;
+	unsigned long flags = 0;
+
+	num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+
+	for (i = 0; i < num_entry; i++) {
+		/* slots that were never allocated have nothing to clear */
+		if (stats->ls_percpu[i] == NULL)
+			continue;
+		for (j = 0; j < stats->ls_num; j++) {
+			percpu_cntr = lprocfs_stats_counter_get(stats, i, j);
+			percpu_cntr->lc_count = 0;
+			percpu_cntr->lc_min = LC_MIN_INIT;
+			percpu_cntr->lc_max = 0;
+			percpu_cntr->lc_sumsquare = 0;
+			percpu_cntr->lc_sum = 0;
+			if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE)
+				percpu_cntr->lc_sum_irq = 0;
+		}
+	}
+
+	lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_clear_stats);
+
+/*
+ * Write handler of a stats file: whatever is written, all counters of
+ * the stats object attached to the seq_file are cleared.  The written
+ * data is not examined; @len is returned.
+ */
+static ssize_t lprocfs_stats_seq_write(struct file *file,
+				       const char __user *buf,
+				       size_t len, loff_t *off)
+{
+	struct seq_file *sf = file->private_data;
+
+	lprocfs_clear_stats(sf->private);
+	return len;
+}
+
+/*
+ * seq_file start: the iterator is the position itself, valid while it is
+ * below the number of counters; NULL ends the sequence.
+ */
+static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos)
+{
+	struct lprocfs_stats *stats = p->private;
+
+	if (*pos < stats->ls_num)
+		return pos;
+	return NULL;
+}
+
+/* seq_file stop: intentionally empty, there is nothing to undo here. */
+static void lprocfs_stats_seq_stop(struct seq_file *p, void *v)
+{
+	return;
+}
+
+/* seq_file next: advance the position and validate it like start does. */
+static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos)
+{
+	++*pos;
+
+	return lprocfs_stats_seq_start(p, pos);
+}
+
+/*
+ * seq file export of one lprocfs counter.
+ *
+ * @v points to the counter index.  For index 0 a "snapshot_time" line is
+ * printed first.  A counter with no samples prints nothing.  Otherwise
+ * the line holds name, sample count and units, followed by min/max/sum
+ * when LPROCFS_CNTR_AVGMINMAX is configured and additionally the sum of
+ * squares when LPROCFS_CNTR_STDDEV is configured.
+ *
+ * Returns 0, or the negative seq_printf() result on failure.
+ */
+static int lprocfs_stats_seq_show(struct seq_file *p, void *v)
+{
+	struct lprocfs_stats *stats = p->private;
+	struct lprocfs_counter_header *hdr;
+	struct lprocfs_counter ctr;
+	int idx = *(loff_t *)v;
+	int rc = 0;
+
+	if (idx == 0) {
+		struct timeval now;
+
+		do_gettimeofday(&now);
+		/*
+		 * Zero-pad the microseconds to six digits so the value
+		 * reads as a proper decimal fraction (12s + 34us must be
+		 * 12.000034, not 12.34).
+		 */
+		rc = seq_printf(p, "%-25s %lu.%06lu secs.usecs\n",
+				"snapshot_time", now.tv_sec, now.tv_usec);
+		if (rc < 0)
+			return rc;
+	}
+	hdr = &stats->ls_cnt_header[idx];
+	lprocfs_stats_collect(stats, idx, &ctr);
+
+	if (ctr.lc_count == 0)
+		goto out;
+
+	rc = seq_printf(p, "%-25s "LPD64" samples [%s]", hdr->lc_name,
+			ctr.lc_count, hdr->lc_units);
+
+	if (rc < 0)
+		goto out;
+
+	if ((hdr->lc_config & LPROCFS_CNTR_AVGMINMAX) && (ctr.lc_count > 0)) {
+		rc = seq_printf(p, " "LPD64" "LPD64" "LPD64,
+				ctr.lc_min, ctr.lc_max, ctr.lc_sum);
+		if (rc < 0)
+			goto out;
+		if (hdr->lc_config & LPROCFS_CNTR_STDDEV)
+			rc = seq_printf(p, " "LPD64, ctr.lc_sumsquare);
+		if (rc < 0)
+			goto out;
+	}
+	rc = seq_printf(p, "\n");
+out:
+	return (rc < 0) ? rc : 0;
+}
+
+/* seq_file iterator over the counters of one lprocfs_stats object. */
+struct seq_operations lprocfs_stats_seq_sops = {
+ .start = lprocfs_stats_seq_start,
+ .stop = lprocfs_stats_seq_stop,
+ .next = lprocfs_stats_seq_next,
+ .show = lprocfs_stats_seq_show,
+};
+
+/*
+ * Open a stats file: set up the seq_file with lprocfs_stats_seq_sops and
+ * attach the proc entry's private data (PDE_DATA) to it.  Returns 0 or
+ * the seq_open() error.
+ */
+static int lprocfs_stats_seq_open(struct inode *inode, struct file *file)
+{
+	int rc = seq_open(file, &lprocfs_stats_seq_sops);
+
+	if (rc == 0) {
+		struct seq_file *sf = file->private_data;
+
+		sf->private = PDE_DATA(inode);
+	}
+	return rc;
+}
+
+/*
+ * File operations of a stats file: reading goes through the seq_file
+ * iterator, any write clears the counters (lprocfs_stats_seq_write()).
+ */
+struct file_operations lprocfs_stats_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = lprocfs_stats_seq_open,
+ .read = seq_read,
+ .write = lprocfs_stats_seq_write,
+ .llseek = seq_lseek,
+ .release = lprocfs_seq_release,
+};
+
+/*
+ * Create the stats file @name (mode 0644) under @root, backed by
+ * lprocfs_stats_seq_fops with @stats as private data.  @root must not be
+ * NULL.  Returns 0, or -ENOMEM when the entry cannot be created.
+ */
+int lprocfs_register_stats(struct proc_dir_entry *root, const char *name,
+			   struct lprocfs_stats *stats)
+{
+	LASSERT(root != NULL);
+
+	if (proc_create_data(name, 0644, root,
+			     &lprocfs_stats_seq_fops, stats) == NULL)
+		return -ENOMEM;
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_register_stats);
+
+/*
+ * Describe counter @index of @stats (configuration bits, name and units
+ * are stored in its header; the strings are referenced, not copied) and
+ * reset its values in every allocated per-cpu slot.  The reset is done
+ * between lprocfs_stats_lock() and lprocfs_stats_unlock().
+ */
+void lprocfs_counter_init(struct lprocfs_stats *stats, int index,
+			  unsigned conf, const char *name, const char *units)
+{
+	struct lprocfs_counter_header *hdr;
+	struct lprocfs_counter *cntr;
+	unsigned long flags = 0;
+	unsigned int nslots;
+	unsigned int slot;
+
+	LASSERT(stats != NULL);
+
+	hdr = &stats->ls_cnt_header[index];
+	LASSERTF(hdr != NULL, "Failed to allocate stats header:[%d]%s/%s\n",
+		 index, name, units);
+
+	hdr->lc_config = conf;
+	hdr->lc_name = name;
+	hdr->lc_units = units;
+
+	nslots = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags);
+	for (slot = 0; slot < nslots; ++slot) {
+		if (stats->ls_percpu[slot] == NULL)
+			continue;
+
+		cntr = lprocfs_stats_counter_get(stats, slot, index);
+		cntr->lc_count = 0;
+		cntr->lc_min = LC_MIN_INIT;
+		cntr->lc_max = 0;
+		cntr->lc_sumsquare = 0;
+		cntr->lc_sum = 0;
+		if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0)
+			cntr->lc_sum_irq = 0;
+	}
+	lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags);
+}
+EXPORT_SYMBOL(lprocfs_counter_init);
+
+/*
+ * Initialise the counter of OBD operation "op" in "stats".  The counter
+ * index is base + OBD_COUNTER_OFFSET(op) and must lie below
+ * stats->ls_num; the counter is named after the operation and uses
+ * "reqs" as its units.
+ */
+#define LPROCFS_OBD_OP_INIT(base, stats, op) \
+do { \
+ unsigned int coffset = base + OBD_COUNTER_OFFSET(op); \
+ LASSERT(coffset < stats->ls_num); \
+ lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \
+} while (0)
+
+/**
+ * Initialize one "reqs" counter in \a stats for every obd operation listed
+ * below. Each counter is placed at index
+ * \a num_private_stats + OBD_COUNTER_OFFSET(op) and is named after the
+ * operation (see LPROCFS_OBD_OP_INIT).
+ *
+ * \param num_private_stats index of the first obd-operation counter
+ * \param stats stats structure that receives the counters
+ */
+void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats)
+{
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, attach);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, detach);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_alloc);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, precreate);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, create);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, create_async);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr_async);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr_async);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, merge_lvb);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, punch);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, sync);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, migrate);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, copy);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, iterate);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, enqueue);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, change_cbdata);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, find_cbdata);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel_unused);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, extent_calc);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_init);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_connect);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_finish);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, pin);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpin);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, ping);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref);
+ LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref);
+}
+EXPORT_SYMBOL(lprocfs_init_ops_stats);
+
+/**
+ * Allocate the per-device obd statistics, initialize one counter per obd
+ * operation and register them as the "stats" file under the device's proc
+ * entry.
+ *
+ * The first \a num_private_stats slots are left for the caller; the obd
+ * operation counters follow them. On success the stats are stored in
+ * obd->obd_stats and obd->obd_cntr_base is set to \a num_private_stats.
+ *
+ * \param obd device; must have a proc entry and no stats yet
+ * \param num_private_stats number of caller-private counters to reserve
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM if the stats could not be allocated
+ * \retval negative value returned by lprocfs_register_stats() on failure
+ */
+int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats)
+{
+	struct lprocfs_stats *stats;
+	unsigned int num_stats;
+	unsigned int i;	/* unsigned: compared against num_stats */
+	int rc;
+
+	LASSERT(obd->obd_stats == NULL);
+	LASSERT(obd->obd_proc_entry != NULL);
+	LASSERT(obd->obd_cntr_base == 0);
+
+	/* one counter per pointer-sized member of the ops table, minus owner */
+	num_stats = ((int)sizeof(*obd->obd_type->typ_dt_ops) / sizeof(void *)) +
+		num_private_stats - 1 /* o_owner */;
+	stats = lprocfs_alloc_stats(num_stats, 0);
+	if (stats == NULL)
+		return -ENOMEM;
+
+	lprocfs_init_ops_stats(num_private_stats, stats);
+
+	for (i = num_private_stats; i < num_stats; i++) {
+		/* If this LBUGs, it is likely that an obd
+		 * operation was added to struct obd_ops in
+		 * <obd.h>, and that the corresponding line item
+		 * LPROCFS_OBD_OP_INIT(.., .., opname)
+		 * is missing from the list above. */
+		LASSERTF(stats->ls_cnt_header[i].lc_name != NULL,
+			 "Missing obd_stat initializer obd_op "
+			 "operation at offset %u.\n", i - num_private_stats);
+	}
+	rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats);
+	if (rc < 0) {
+		lprocfs_free_stats(&stats);
+	} else {
+		obd->obd_stats = stats;
+		obd->obd_cntr_base = num_private_stats;
+	}
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_alloc_obd_stats);
+
+/**
+ * Release the obd statistics of \a obd, if any were allocated.
+ *
+ * The counter base is reset together with the stats, mirroring
+ * lprocfs_free_md_stats(), so that a later lprocfs_alloc_obd_stats() does not
+ * trip over its LASSERT(obd->obd_cntr_base == 0).
+ */
+void lprocfs_free_obd_stats(struct obd_device *obd)
+{
+	if (obd->obd_stats) {
+		lprocfs_free_stats(&obd->obd_stats);
+		obd->obd_cntr_base = 0;
+	}
+}
+EXPORT_SYMBOL(lprocfs_free_obd_stats);
+
+/*
+ * Register the "reqs" counter of metadata operation \a op in \a stats. The
+ * counter lives at index \a base + MD_COUNTER_OFFSET(op) and is named after
+ * the operation. Arguments are parenthesized so that expression arguments
+ * expand with the intended precedence.
+ */
+#define LPROCFS_MD_OP_INIT(base, stats, op)				\
+do {									\
+	unsigned int coffset = (base) + MD_COUNTER_OFFSET(op);		\
+	LASSERT(coffset < (stats)->ls_num);				\
+	lprocfs_counter_init((stats), coffset, 0, #op, "reqs");		\
+} while (0)
+
+/**
+ * Initialize one "reqs" counter in \a stats for every metadata operation
+ * listed below. Each counter is placed at index
+ * \a num_private_stats + MD_COUNTER_OFFSET(op) and is named after the
+ * operation (see LPROCFS_MD_OP_INIT).
+ *
+ * \param num_private_stats index of the first md-operation counter
+ * \param stats stats structure that receives the counters
+ */
+void lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats)
+{
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, getstatus);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, null_inode);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, find_cbdata);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, close);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, create);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, done_writing);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, enqueue);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr_name);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_lock);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, link);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, rename);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, is_subdir);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, setattr);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, sync);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, readpage);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, unlink);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, setxattr);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, getxattr);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, init_ea_size);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, get_lustre_md);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, free_lustre_md);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, set_open_replay_data);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, clear_open_replay_data);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, set_lock_data);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, renew_capa);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, unpack_capa);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, get_remote_perm);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async);
+ LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock);
+}
+EXPORT_SYMBOL(lprocfs_init_mps_stats);
+
+/**
+ * Allocate the per-device metadata statistics, initialize one counter per
+ * metadata operation and register them as the "md_stats" file under the
+ * device's proc entry.
+ *
+ * The first \a num_private_stats slots are left for the caller; the md
+ * operation counters follow them. On success the stats are stored in
+ * obd->md_stats and obd->md_cntr_base is set to \a num_private_stats.
+ *
+ * \param obd device; must have a proc entry and no md stats yet
+ * \param num_private_stats number of caller-private counters to reserve
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM if the stats could not be allocated
+ * \retval negative value returned by lprocfs_register_stats() on failure
+ */
+int lprocfs_alloc_md_stats(struct obd_device *obd,
+			   unsigned num_private_stats)
+{
+	struct lprocfs_stats *stats;
+	unsigned int num_stats;
+	unsigned int i;	/* unsigned: compared against num_stats */
+	int rc;
+
+	LASSERT(obd->md_stats == NULL);
+	LASSERT(obd->obd_proc_entry != NULL);
+	LASSERT(obd->md_cntr_base == 0);
+
+	/* revalidate_lock is the last operation initialized below */
+	num_stats = 1 + MD_COUNTER_OFFSET(revalidate_lock) +
+		    num_private_stats;
+	stats = lprocfs_alloc_stats(num_stats, 0);
+	if (stats == NULL)
+		return -ENOMEM;
+
+	lprocfs_init_mps_stats(num_private_stats, stats);
+
+	/* every md operation slot must have been named by the init above */
+	for (i = num_private_stats; i < num_stats; i++) {
+		if (stats->ls_cnt_header[i].lc_name == NULL) {
+			CERROR("Missing md_stat initializer md_op "
+			       "operation at offset %u. Aborting.\n",
+			       i - num_private_stats);
+			LBUG();
+		}
+	}
+	rc = lprocfs_register_stats(obd->obd_proc_entry, "md_stats", stats);
+	if (rc < 0) {
+		lprocfs_free_stats(&stats);
+	} else {
+		obd->md_stats = stats;
+		obd->md_cntr_base = num_private_stats;
+	}
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_alloc_md_stats);
+
+/**
+ * Detach the metadata statistics from \a obd and free them. The device
+ * fields are cleared before the stats themselves are released. Does nothing
+ * when no md stats are attached.
+ */
+void lprocfs_free_md_stats(struct obd_device *obd)
+{
+	struct lprocfs_stats *md_stats = obd->md_stats;
+
+	if (md_stats == NULL)
+		return;
+
+	obd->md_stats = NULL;
+	obd->md_cntr_base = 0;
+	lprocfs_free_stats(&md_stats);
+}
+EXPORT_SYMBOL(lprocfs_free_md_stats);
+
+void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats)
+{
+ lprocfs_counter_init(ldlm_stats,
+ LDLM_ENQUEUE - LDLM_FIRST_OPC,
+ 0, "ldlm_enqueue", "reqs");
+ lprocfs_counter_init(ldlm_stats,
+ LDLM_CONVERT - LDLM_FIRST_OPC,
+ 0, "ldlm_convert", "reqs");
+ lprocfs_counter_init(ldlm_stats,
+ LDLM_CANCEL - LDLM_FIRST_OPC,
+ 0, "ldlm_cancel", "reqs");
+ lprocfs_counter_init(ldlm_stats,
+ LDLM_BL_CALLBACK - LDLM_FIRST_OPC,
+ 0, "ldlm_bl_callback", "reqs");
+ lprocfs_counter_init(ldlm_stats,
+ LDLM_CP_CALLBACK - LDLM_FIRST_OPC,
+ 0, "ldlm_cp_callback", "reqs");
+ lprocfs_counter_init(ldlm_stats,
+ LDLM_GL_CALLBACK - LDLM_FIRST_OPC,
+ 0, "ldlm_gl_callback", "reqs");
+}
+EXPORT_SYMBOL(lprocfs_init_ldlm_stats);
+
+/**
+ * Hash iteration callback: print the client UUID of the export behind
+ * \a hnode into the seq_file passed as \a data, one UUID per line. Exports
+ * without NID stats are skipped. Always returns 0.
+ */
+int lprocfs_exp_print_uuid(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			   struct hlist_node *hnode, void *data)
+{
+	struct seq_file *seq = data;
+	struct obd_export *export = cfs_hash_object(hs, hnode);
+
+	if (!export->exp_nid_stats)
+		return 0;
+
+	seq_printf(seq, "%s\n", obd_uuid2str(&export->exp_client_uuid));
+	return 0;
+}
+
+/*
+ * seq_file show method of the per-NID "uuid" file: walk the exports hashed
+ * under this NID in the device's NID hash and print each client UUID.
+ */
+static int
+lproc_exp_uuid_seq_show(struct seq_file *m, void *unused)
+{
+	struct nid_stat *nidstat = m->private;
+
+	cfs_hash_for_each_key(nidstat->nid_obd->obd_nid_hash, &nidstat->nid,
+			      lprocfs_exp_print_uuid, m);
+	return 0;
+}
+
+LPROC_SEQ_FOPS_RO(lproc_exp_uuid);
+
+/* State handed from lproc_exp_hash_seq_show() to lprocfs_exp_print_hash(). */
+struct exp_hash_cb_data {
+ /* seq_file the hash description is written to */
+ struct seq_file *m;
+ /* true until the debug header has been printed once */
+ bool first;
+};
+
+/**
+ * Hash iteration callback: for an export that has a lock hash, emit the hash
+ * debug header (first time only) followed by a hash debug line into the
+ * seq_file carried by \a cb_data. Always returns 0.
+ *
+ * NOTE(review): the line printed describes \a hs, the hash being iterated,
+ * not exp->exp_lock_hash -- confirm this is intended.
+ */
+int lprocfs_exp_print_hash(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+			   struct hlist_node *hnode, void *cb_data)
+{
+	struct obd_export *export = cfs_hash_object(hs, hnode);
+	struct exp_hash_cb_data *cb = cb_data;
+
+	if (export->exp_lock_hash == NULL)
+		return 0;
+
+	if (cb->first) {
+		cfs_hash_debug_header(cb->m);
+		cb->first = false;
+	}
+	cfs_hash_debug_str(hs, cb->m);
+
+	return 0;
+}
+
+/*
+ * seq_file show method of the per-NID "hash" file: walk the exports hashed
+ * under this NID and let lprocfs_exp_print_hash() describe them.
+ */
+static int
+lproc_exp_hash_seq_show(struct seq_file *m, void *unused)
+{
+	struct nid_stat *nidstat = m->private;
+	struct exp_hash_cb_data cb_data = {
+		.m	= m,
+		.first	= true,
+	};
+
+	cfs_hash_for_each_key(nidstat->nid_obd->obd_nid_hash, &nidstat->nid,
+			      lprocfs_exp_print_hash, &cb_data);
+	return 0;
+}
+
+LPROC_SEQ_FOPS_RO(lproc_exp_hash);
+
+/**
+ * Read side of the nid stats "clear" file: print a one-line usage hint.
+ * Returns whatever seq_printf() returns.
+ */
+int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data)
+{
+	const char *usage = "Write into this file to clear all nid stats and "
+			    "stale nid entries";
+
+	return seq_printf(m, "%s\n", usage);
+}
+EXPORT_SYMBOL(lprocfs_nid_stats_clear_read);
+
+/*
+ * Conditional-delete callback for the nid stats hash.
+ *
+ * An entry whose reference count is exactly 1 is moved onto the list passed
+ * in \a data (under obd_nid_lock) and 1 is returned. Any other entry only
+ * has its counters cleared and 0 is returned.
+ */
+static int lprocfs_nid_stats_clear_write_cb(void *obj, void *data)
+{
+	struct nid_stat *nidstat = obj;
+	ENTRY;
+
+	CDEBUG(D_INFO,"refcnt %d\n", atomic_read(&nidstat->nid_exp_ref_count));
+	if (atomic_read(&nidstat->nid_exp_ref_count) != 1) {
+		/* somebody still references the object: only clear its data */
+		if (nidstat->nid_stats)
+			lprocfs_clear_stats(nidstat->nid_stats);
+
+		RETURN(0);
+	}
+
+	/* the object has only hash references: queue it for destruction */
+	spin_lock(&nidstat->nid_obd->obd_nid_lock);
+	list_move(&nidstat->nid_list, data);
+	spin_unlock(&nidstat->nid_obd->obd_nid_lock);
+	RETURN(1);
+}
+
+/**
+ * Write side of the nid stats "clear" file. The written data is ignored.
+ *
+ * Every entry of the device's nid stats hash is passed to
+ * lprocfs_nid_stats_clear_write_cb(); the entries it collects on the local
+ * list are then freed one by one.
+ *
+ * \retval count, i.e. the whole write is always consumed
+ */
+int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer,
+				  unsigned long count, void *data)
+{
+	struct obd_device *obd = data;
+	struct nid_stat *stale;
+	LIST_HEAD(free_list);
+
+	cfs_hash_cond_del(obd->obd_nid_stats_hash,
+			  lprocfs_nid_stats_clear_write_cb, &free_list);
+
+	for (;;) {
+		if (list_empty(&free_list))
+			break;
+
+		stale = list_entry(free_list.next, struct nid_stat, nid_list);
+		list_del_init(&stale->nid_list);
+		lprocfs_free_client_stats(stale);
+	}
+
+	return count;
+}
+EXPORT_SYMBOL(lprocfs_nid_stats_clear_write);
+
+/**
+ * Attach per-NID statistics to export \a exp.
+ *
+ * A new nid_stat is allocated and inserted into the device's nid stats hash
+ * unless an entry for \a nid already exists. For a new entry a proc
+ * directory named after the NID is created under obd_proc_exports_entry,
+ * with "uuid" and "hash" files in it, and the entry is added to the
+ * obd_nid_stats list.
+ *
+ * \param exp export to set up
+ * \param nid NID of the client; NULL or LNET_NID_ANY means nothing to do
+ * \param newnid set to 1 when a new entry was created, 0 otherwise
+ *
+ * \retval 0 on success, or when \a nid is NULL / LNET_NID_ANY
+ * \retval -EINVAL if the export, its device, the exports proc entry or the
+ * nid stats hash is missing
+ * \retval -EALREADY if an entry for \a nid already existed; the export then
+ * points at the existing entry
+ * \retval -ENOMEM or another negative errno on allocation/proc failure
+ *
+ * NOTE(review): \a newnid is dereferenced unconditionally, before any
+ * argument is validated.
+ */
+int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid)
+{
+ struct nid_stat *new_stat, *old_stat;
+ struct obd_device *obd = NULL;
+ proc_dir_entry_t *entry;
+ char *buffer = NULL;
+ int rc = 0;
+ ENTRY;
+
+ *newnid = 0;
+
+ if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry ||
+ !exp->exp_obd->obd_nid_stats_hash)
+ RETURN(-EINVAL);
+
+ /* Do not test against zero: a nid may only be compared with another
+ * nid or with LNET_NID_ANY. Anything else is nonsense. */
+ if (!nid || *nid == LNET_NID_ANY)
+ RETURN(0);
+
+ obd = exp->exp_obd;
+
+ CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash);
+
+ OBD_ALLOC_PTR(new_stat);
+ if (new_stat == NULL)
+ RETURN(-ENOMEM);
+
+ new_stat->nid = *nid;
+ new_stat->nid_obd = exp->exp_obd;
+ /* we need set default refcount to 1 to balance obd_disconnect */
+ atomic_set(&new_stat->nid_exp_ref_count, 1);
+
+ /* old_stat == new_stat means no entry for this nid existed before */
+ old_stat = cfs_hash_findadd_unique(obd->obd_nid_stats_hash,
+ nid, &new_stat->nid_hash);
+ /* NOTE(review): the message talks about the found stats but prints
+ * the reference count of new_stat -- confirm which one is wanted. */
+ CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n",
+ old_stat, libcfs_nid2str(*nid),
+ atomic_read(&new_stat->nid_exp_ref_count));
+
+ /* We need to release old stats because lprocfs_exp_cleanup() hasn't
+ * been and will never be called. */
+ if (exp->exp_nid_stats) {
+ nidstat_putref(exp->exp_nid_stats);
+ exp->exp_nid_stats = NULL;
+ }
+
+ /* Return -EALREADY here so that we know that the /proc
+ * entry already has been created */
+ if (old_stat != new_stat) {
+ exp->exp_nid_stats = old_stat;
+ GOTO(destroy_new, rc = -EALREADY);
+ }
+ /* not found - create */
+ OBD_ALLOC(buffer, LNET_NIDSTR_SIZE);
+ if (buffer == NULL)
+ GOTO(destroy_new, rc = -ENOMEM);
+
+ /* the NID string is the name of the per-NID proc directory */
+ memcpy(buffer, libcfs_nid2str(*nid), LNET_NIDSTR_SIZE);
+ new_stat->nid_proc = lprocfs_register(buffer,
+ obd->obd_proc_exports_entry,
+ NULL, NULL);
+ OBD_FREE(buffer, LNET_NIDSTR_SIZE);
+
+ if (new_stat->nid_proc == NULL) {
+ CERROR("Error making export directory for nid %s\n",
+ libcfs_nid2str(*nid));
+ GOTO(destroy_new_ns, rc = -ENOMEM);
+ }
+
+ entry = lprocfs_add_simple(new_stat->nid_proc, "uuid",
+ new_stat, &lproc_exp_uuid_fops);
+ if (IS_ERR(entry)) {
+ CWARN("Error adding the NID stats file\n");
+ rc = PTR_ERR(entry);
+ GOTO(destroy_new_ns, rc);
+ }
+
+ entry = lprocfs_add_simple(new_stat->nid_proc, "hash",
+ new_stat, &lproc_exp_hash_fops);
+ if (IS_ERR(entry)) {
+ CWARN("Error adding the hash file\n");
+ rc = PTR_ERR(entry);
+ GOTO(destroy_new_ns, rc);
+ }
+
+ exp->exp_nid_stats = new_stat;
+ *newnid = 1;
+ /* protect competitive add to list, not need locking on destroy */
+ spin_lock(&obd->obd_nid_lock);
+ list_add(&new_stat->nid_list, &obd->obd_nid_stats);
+ spin_unlock(&obd->obd_nid_lock);
+
+ RETURN(rc);
+
+ /* undo the proc directory and the hash insertion of a new entry */
+destroy_new_ns:
+ if (new_stat->nid_proc != NULL)
+ lprocfs_remove(&new_stat->nid_proc);
+ cfs_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash);
+
+ /* drop the initial reference and free the unused new entry */
+destroy_new:
+ nidstat_putref(new_stat);
+ OBD_FREE_PTR(new_stat);
+ RETURN(rc);
+}
+EXPORT_SYMBOL(lprocfs_exp_setup);
+
+/**
+ * Drop the export's reference on its per-NID statistics and detach them.
+ * Does nothing when the export has no NID stats or no device.
+ *
+ * The function has no ENTRY, so it leaves through plain return statements
+ * on every path instead of mixing in RETURN().
+ *
+ * \retval 0 always
+ */
+int lprocfs_exp_cleanup(struct obd_export *exp)
+{
+	struct nid_stat *stat = exp->exp_nid_stats;
+
+	if (!stat || !exp->exp_obd)
+		return 0;
+
+	nidstat_putref(exp->exp_nid_stats);
+	exp->exp_nid_stats = NULL;
+
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_exp_cleanup);
+
+/**
+ * Parse an integer written by the user into \a val. Thin wrapper around
+ * lprocfs_write_frac_helper() with a multiplier of 1; returns its result.
+ */
+int lprocfs_write_helper(const char *buffer, unsigned long count,
+			 int *val)
+{
+	const int no_scaling = 1;
+
+	return lprocfs_write_frac_helper(buffer, count, val, no_scaling);
+}
+EXPORT_SYMBOL(lprocfs_write_helper);
+
+/**
+ * Parse a decimal number with an optional fraction ("[-]N[.F]") from the
+ * user buffer and store N.F * \a mult, truncated to an int, in \a val.
+ * At most five fractional digits are taken into account.
+ *
+ * \a val is only written on success, so a rejected write leaves the caller's
+ * value untouched.
+ *
+ * \param buffer user-space buffer holding the text
+ * \param count number of bytes in \a buffer; at most 19 are accepted
+ * \param val where to store the result
+ * \param mult multiplier applied to the parsed value
+ *
+ * \retval 0 on success
+ * \retval -EINVAL if the input is too long or does not start with a digit
+ * \retval -EFAULT if the user buffer cannot be read
+ */
+int lprocfs_write_frac_helper(const char *buffer, unsigned long count,
+			      int *val, int mult)
+{
+	char kernbuf[20], *end, *pbuf;
+	int result;
+
+	if (count > (sizeof(kernbuf) - 1))
+		return -EINVAL;
+
+	if (copy_from_user(kernbuf, buffer, count))
+		return -EFAULT;
+
+	kernbuf[count] = '\0';
+	pbuf = kernbuf;
+	if (*pbuf == '-') {
+		mult = -mult;
+		pbuf++;
+	}
+
+	result = (int)simple_strtoul(pbuf, &end, 10) * mult;
+	if (pbuf == end)
+		return -EINVAL;
+
+	if (*end == '.') {
+		int temp_val, pow = 1;
+		int i;
+
+		pbuf = end + 1;
+		if (strlen(pbuf) > 5)
+			pbuf[5] = '\0'; /* only allow 5 fractional digits */
+
+		temp_val = (int)simple_strtoul(pbuf, &end, 10) * mult;
+
+		if (pbuf < end) {
+			/* pow = 10^(number of fractional digits parsed) */
+			for (i = 0; i < (end - pbuf); i++)
+				pow *= 10;
+
+			result += temp_val / pow;
+		}
+	}
+
+	*val = result;
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_write_frac_helper);
+
+/**
+ * Format \a val / \a mult as a decimal number into \a buffer, followed by a
+ * newline. The integer part is always printed; a fractional part is appended
+ * only when val % mult is positive and the buffer has room, and trailing
+ * zeros are stripped again afterwards.
+ *
+ * \param buffer output buffer
+ * \param count size of \a buffer; must be at least 10
+ * \param val value to print
+ * \param mult divisor that scales \a val
+ *
+ * \retval number of bytes produced, including the trailing newline
+ * \retval -EINVAL if \a count is smaller than 10
+ *
+ * NOTE(review): nothing is written after the final newline, so the result
+ * does not look NUL-terminated -- confirm callers rely on the length only.
+ */
+int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val,
+ int mult)
+{
+ long decimal_val, frac_val;
+ int prtn;
+
+ if (count < 10)
+ return -EINVAL;
+
+ decimal_val = val / mult;
+ prtn = snprintf(buffer, count, "%ld", decimal_val);
+ frac_val = val % mult;
+
+ if (prtn < (count - 4) && frac_val > 0) {
+ long temp_frac;
+ int i, temp_mult = 1, frac_bits = 0;
+
+ temp_frac = frac_val * 10;
+ buffer[prtn++] = '.';
+ /* emit the leading zeros of the fraction, at most two */
+ while (frac_bits < 2 && (temp_frac / mult) < 1 ) {
+ /* only two fractional digits are reserved */
+ buffer[prtn++] ='0';
+ temp_frac *= 10;
+ frac_bits++;
+ }
+ /*
+ * Cases to consider:
+ * 1. #echo x.00 > /proc/xxx output result : x
+ * 2. #echo x.0x > /proc/xxx output result : x.0x
+ * 3. #echo x.x0 > /proc/xxx output result : x.x
+ * 4. #echo x.xx > /proc/xxx output result : x.xx
+ * Only two fractional digits are reserved.
+ */
+ for (i = 0; i < (5 - prtn); i++)
+ temp_mult *= 10;
+
+ frac_bits = min((int)count - prtn, 3 - frac_bits);
+ prtn += snprintf(buffer + prtn, frac_bits, "%ld",
+ frac_val * temp_mult / mult);
+
+ /* walk back over everything that is not a digit 1-9; stop at the
+ * decimal point, which is dropped as well */
+ prtn--;
+ while(buffer[prtn] < '1' || buffer[prtn] > '9') {
+ prtn--;
+ if (buffer[prtn] == '.') {
+ prtn--;
+ break;
+ }
+ }
+ prtn++;
+ }
+ buffer[prtn++] ='\n';
+ return prtn;
+}
+EXPORT_SYMBOL(lprocfs_read_frac_helper);
+
+/**
+ * Print \a val / \a mult into \a m as a decimal number with at most two
+ * fractional digits, followed by a newline. A trailing zero of the fraction
+ * is dropped and a zero fraction is not printed at all.
+ *
+ * Fractions below one tenth keep their leading zero: 105 with a multiplier
+ * of 100 prints "1.05", not "1.5".
+ *
+ * \retval 0 always
+ */
+int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult)
+{
+	long decimal_val, frac_val;
+
+	decimal_val = val / mult;
+	seq_printf(m, "%ld", decimal_val);
+	frac_val = val % mult;
+
+	/* scale the remainder to hundredths */
+	if (frac_val > 0) {
+		frac_val *= 100;
+		frac_val /= mult;
+	}
+	if (frac_val > 0) {
+		/* Three cases: x0, xx, 0x */
+		if ((frac_val % 10) != 0)
+			/* "xx" and "0x": always two digits */
+			seq_printf(m, ".%02ld", frac_val);
+		else
+			/* "x0": drop the trailing zero */
+			seq_printf(m, ".%ld", frac_val / 10);
+	}
+
+	seq_printf(m, "\n");
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_seq_read_frac_helper);
+
+/**
+ * Parse a 64-bit value written by the user into \a val. Thin wrapper around
+ * lprocfs_write_frac_u64_helper() with a multiplier of 1; returns its result.
+ */
+int lprocfs_write_u64_helper(const char *buffer, unsigned long count,__u64 *val)
+{
+	const int no_scaling = 1;
+
+	return lprocfs_write_frac_u64_helper(buffer, count, val, no_scaling);
+}
+EXPORT_SYMBOL(lprocfs_write_u64_helper);
+
+/**
+ * Parse "[-]N[.F][unit]" from the user buffer and store the scaled value in
+ * \a val. The unit is one of k, m, g, t, p (either case) and stands for
+ * 2^10 ... 2^50.
+ *
+ * When a unit suffix is given it replaces \a mult; otherwise \a mult is the
+ * scale factor. At most nine fractional digits are used so that the fraction
+ * divisor fits in 32 bits; further digits are skipped. A negative input is
+ * stored as the two's complement of its magnitude.
+ *
+ * NOTE(review): overflow of the 64-bit products is not detected.
+ *
+ * \param buffer user-space buffer holding the text
+ * \param count number of bytes in \a buffer; at most 21 are accepted
+ * \param val where to store the result; written on success only
+ * \param mult scale factor used when no unit suffix is present
+ *
+ * \retval 0 on success
+ * \retval -EINVAL if the input is too long or does not start with a digit
+ * \retval -EFAULT if the user buffer cannot be read
+ */
+int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count,
+				  __u64 *val, int mult)
+{
+	char kernbuf[22], *end, *pbuf;
+	__u64 whole, frac = 0, units, umult, result;
+	unsigned frac_d = 1;
+	bool negative = false;
+
+	if (count > (sizeof(kernbuf) - 1))
+		return -EINVAL;
+
+	if (copy_from_user(kernbuf, buffer, count))
+		return -EFAULT;
+
+	kernbuf[count] = '\0';
+	pbuf = kernbuf;
+	if (*pbuf == '-') {
+		negative = true;
+		pbuf++;
+	}
+
+	/* keep the sign apart so that all arithmetic is on magnitudes */
+	if (mult < 0) {
+		negative = !negative;
+		umult = -(__s64)mult;
+	} else {
+		umult = mult;
+	}
+
+	whole = simple_strtoull(pbuf, &end, 10);
+	if (pbuf == end)
+		return -EINVAL;
+
+	if (*end == '.') {
+		int digits = 0;
+
+		/* 10^9 is the largest power of ten that fits frac_d */
+		for (pbuf = end + 1; *pbuf >= '0' && *pbuf <= '9'; pbuf++) {
+			if (digits < 9) {
+				frac = frac * 10 + (*pbuf - '0');
+				frac_d *= 10;
+				digits++;
+			}
+		}
+		/* a unit suffix, if any, follows the last fractional digit */
+		end = pbuf;
+	}
+
+	units = 1;
+	switch (*end) {
+	case 'p': case 'P':
+		units <<= 10;
+		/* fallthrough */
+	case 't': case 'T':
+		units <<= 10;
+		/* fallthrough */
+	case 'g': case 'G':
+		units <<= 10;
+		/* fallthrough */
+	case 'm': case 'M':
+		units <<= 10;
+		/* fallthrough */
+	case 'k': case 'K':
+		units <<= 10;
+		break;
+	default:
+		break;
+	}
+	/* Specified units override the multiplier */
+	if (units > 1)
+		umult = units;
+
+	frac *= umult;
+	do_div(frac, frac_d);
+	result = whole * umult + frac;
+	*val = negative ? -result : result;
+	return 0;
+}
+EXPORT_SYMBOL(lprocfs_write_frac_u64_helper);
+
+/*
+ * Find the first occurrence of the string s2 within the first len bytes of
+ * s1. Returns a pointer to the match, s1 itself when s2 is empty, or NULL
+ * when there is no match.
+ */
+static char *lprocfs_strnstr(const char *s1, const char *s2, size_t len)
+{
+	const size_t needle_len = strlen(s2);
+	const char *pos;
+
+	if (needle_len == 0)
+		return (char *)s1;
+
+	/* only try positions that leave room for the whole needle */
+	for (pos = s1; len >= needle_len; pos++, len--) {
+		if (memcmp(pos, s2, needle_len) == 0)
+			return (char *)pos;
+	}
+	return NULL;
+}
+
+/**
+ * Look for \a name in the first \a *count bytes of \a buffer. When found,
+ * skip the white space after it and return a pointer to the alphanumeric
+ * value that follows, with \a *count set to the length of that value.
+ * When \a name is not found, \a buffer is returned and \a *count is left
+ * unchanged.
+ */
+char *lprocfs_find_named_value(const char *buffer, const char *name,
+			       unsigned long *count)
+{
+	const size_t buflen = *count;
+	const char *limit = buffer + buflen;
+	unsigned long value_len = 0;
+	char *cursor;
+
+	/* there is no strnstr() in rhel5 and ubuntu kernels */
+	cursor = lprocfs_strnstr(buffer, name, buflen);
+	if (cursor == NULL)
+		return (char *)buffer;
+
+	/* step over the name, then over the separator */
+	cursor += strlen(name);
+	while (cursor < limit && isspace(*cursor))
+		cursor++;
+
+	/* measure the value without moving the cursor */
+	while (cursor + value_len < limit && isalnum(cursor[value_len]))
+		value_len++;
+
+	*count = value_len;
+	return cursor;
+}
+EXPORT_SYMBOL(lprocfs_find_named_value);
+
+/**
+ * Create the proc file \a name under \a parent with the given mode, file
+ * operations and private data. The mode must carry a write bit exactly when
+ * \a seq_fops has a write method.
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM if the proc entry could not be created
+ */
+int lprocfs_seq_create(proc_dir_entry_t *parent,
+		       const char *name,
+		       umode_t mode,
+		       const struct file_operations *seq_fops,
+		       void *data)
+{
+	struct proc_dir_entry *pde;
+	ENTRY;
+
+	/* Disallow secretly (un)writable entries. */
+	LASSERT((seq_fops->write == NULL) == ((mode & 0222) == 0));
+
+	pde = proc_create_data(name, mode, parent, seq_fops, data);
+	if (pde != NULL)
+		RETURN(0);
+
+	RETURN(-ENOMEM);
+}
+EXPORT_SYMBOL(lprocfs_seq_create);
+
+/**
+ * Create the proc file \a name under the proc entry of device \a dev.
+ * Returns the result of lprocfs_seq_create().
+ */
+int lprocfs_obd_seq_create(struct obd_device *dev,
+			   const char *name,
+			   umode_t mode,
+			   const struct file_operations *seq_fops,
+			   void *data)
+{
+	int rc;
+
+	rc = lprocfs_seq_create(dev->obd_proc_entry, name, mode, seq_fops,
+				data);
+	return rc;
+}
+EXPORT_SYMBOL(lprocfs_obd_seq_create);
+
+/**
+ * Count one sample in bucket \a value of histogram \a oh. Values at or
+ * beyond OBD_HIST_MAX are counted in the last bucket. The increment is done
+ * under oh_lock.
+ */
+void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value)
+{
+	unsigned int bucket;
+
+	bucket = (value >= OBD_HIST_MAX) ? OBD_HIST_MAX - 1 : value;
+
+	spin_lock(&oh->oh_lock);
+	oh->oh_buckets[bucket]++;
+	spin_unlock(&oh->oh_lock);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally);
+
+/**
+ * Count one sample in the bucket whose index is the smallest n with
+ * 2^n >= \a value (capped by lprocfs_oh_tally() at the last bucket).
+ *
+ * The bound is checked before the shift and the shift is done on an
+ * unsigned operand, so large values never shift by the type width or
+ * overflow a signed int.
+ */
+void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value)
+{
+	const unsigned int max_shift = sizeof(value) * 8;
+	unsigned int val = 0;
+
+	while (val < OBD_HIST_MAX && val < max_shift && (1U << val) < value)
+		val++;
+
+	lprocfs_oh_tally(oh, val);
+}
+EXPORT_SYMBOL(lprocfs_oh_tally_log2);
+
+/**
+ * Return the sum of all OBD_HIST_MAX buckets of histogram \a oh.
+ */
+unsigned long lprocfs_oh_sum(struct obd_histogram *oh)
+{
+	unsigned long total = 0;
+	int bucket = 0;
+
+	while (bucket < OBD_HIST_MAX) {
+		total += oh->oh_buckets[bucket];
+		bucket++;
+	}
+	return total;
+}
+EXPORT_SYMBOL(lprocfs_oh_sum);
+
+/**
+ * Reset every bucket of histogram \a oh to zero, under oh_lock.
+ */
+void lprocfs_oh_clear(struct obd_histogram *oh)
+{
+	const size_t nbytes = sizeof(oh->oh_buckets);
+
+	spin_lock(&oh->oh_lock);
+	memset(oh->oh_buckets, 0, nbytes);
+	spin_unlock(&oh->oh_lock);
+}
+EXPORT_SYMBOL(lprocfs_oh_clear);
+
+/**
+ * Print the client's cl_max_pages_per_rpc setting into \a m. The value is
+ * read and printed while cl_loi_list_lock is held. Returns the result of
+ * seq_printf().
+ */
+int lprocfs_obd_rd_max_pages_per_rpc(struct seq_file *m, void *data)
+{
+	struct obd_device *obd = data;
+	struct client_obd *client = &obd->u.cli;
+	int ret;
+
+	client_obd_list_lock(&client->cl_loi_list_lock);
+	ret = seq_printf(m, "%d\n", client->cl_max_pages_per_rpc);
+	client_obd_list_unlock(&client->cl_loi_list_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(lprocfs_obd_rd_max_pages_per_rpc);
+
+#endif /* LPROCFS*/
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c
new file mode 100644
index 000000000000..fdf0ed367693
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c
@@ -0,0 +1,2185 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_object.c
+ *
+ * Lustre Object.
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+
+# include <linux/module.h>
+
+/* hash_long() */
+#include <linux/libcfs/libcfs_hash.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_disk.h>
+#include <lustre_fid.h>
+#include <lu_object.h>
+#include <lu_ref.h>
+#include <linux/list.h>
+
+static void lu_object_free(const struct lu_env *env, struct lu_object *o);
+
+/**
+ * Decrease reference counter on object. If last reference is freed, return
+ * object to the cache, unless lu_object_is_dying(o) holds. In the latter
+ * case, free object immediately.
+ *
+ * Objects with a zero fid are not in the site hash or LRU; for them the last
+ * put notifies the layers and frees the object directly.
+ *
+ * \param env execution environment passed on to the layer methods
+ * \param o any slice of the object to release
+ */
+void lu_object_put(const struct lu_env *env, struct lu_object *o)
+{
+ struct lu_site_bkt_data *bkt;
+ struct lu_object_header *top;
+ struct lu_site *site;
+ struct lu_object *orig;
+ cfs_hash_bd_t bd;
+ const struct lu_fid *fid;
+
+ top = o->lo_header;
+ site = o->lo_dev->ld_site;
+ /* remember the caller's slice: "o" is reused as a loop cursor below */
+ orig = o;
+
+ /*
+ * till we have full fids-on-OST implemented anonymous objects
+ * are possible in OSP. such an object isn't listed in the site
+ * so we should not remove it from the site.
+ */
+ fid = lu_object_fid(o);
+ if (fid_is_zero(fid)) {
+ LASSERT(top->loh_hash.next == NULL
+ && top->loh_hash.pprev == NULL);
+ LASSERT(list_empty(&top->loh_lru));
+ if (!atomic_dec_and_test(&top->loh_ref))
+ return;
+ list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
+ if (o->lo_ops->loo_object_release != NULL)
+ o->lo_ops->loo_object_release(env, o);
+ }
+ lu_object_free(env, orig);
+ return;
+ }
+
+ cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd);
+ bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd);
+
+ /* not the last reference: nothing more to do except wake waiters */
+ if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) {
+ if (lu_object_is_dying(top)) {
+
+ /*
+ * somebody may be waiting for this, currently only
+ * used for cl_object, see cl_object_put_last().
+ */
+ wake_up_all(&bkt->lsb_marche_funebre);
+ }
+ return;
+ }
+
+ /* last reference dropped; the bucket is unlocked on each path below */
+ LASSERT(bkt->lsb_busy > 0);
+ bkt->lsb_busy--;
+ /*
+ * When last reference is released, iterate over object
+ * layers, and notify them that object is no longer busy.
+ */
+ list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) {
+ if (o->lo_ops->loo_object_release != NULL)
+ o->lo_ops->loo_object_release(env, o);
+ }
+
+ /* a live object goes to the tail of the bucket LRU and stays cached */
+ if (!lu_object_is_dying(top)) {
+ LASSERT(list_empty(&top->loh_lru));
+ list_add_tail(&top->loh_lru, &bkt->lsb_lru);
+ cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
+ return;
+ }
+
+ /*
+ * If object is dying (will not be cached), removed it
+ * from hash table and LRU.
+ *
+ * This is done with hash table and LRU lists locked. As the only
+ * way to acquire first reference to previously unreferenced
+ * object is through hash-table lookup (lu_object_find()),
+ * or LRU scanning (lu_site_purge()), that are done under hash-table
+ * and LRU lock, no race with concurrent object lookup is possible
+ * and we can safely destroy object below.
+ */
+ if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags))
+ cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash);
+ cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1);
+ /*
+ * Object was already removed from hash and lru above, can
+ * kill it.
+ */
+ lu_object_free(env, orig);
+}
+EXPORT_SYMBOL(lu_object_put);
+
+/**
+ * Put object and don't keep in cache. This is temporary solution for
+ * multi-site objects when its layering is not constant.
+ *
+ * The object is flagged LU_OBJECT_HEARD_BANSHEE before the reference is
+ * dropped through lu_object_put().
+ */
+void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o)
+{
+	set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags);
+	/* a void function must not return an expression */
+	lu_object_put(env, o);
+}
+EXPORT_SYMBOL(lu_object_put_nocache);
+
+/**
+ * Kill the object and take it out of LRU cache.
+ * Currently used by client code for layout change.
+ *
+ * The object is flagged LU_OBJECT_HEARD_BANSHEE. Unless it was already
+ * flagged LU_OBJECT_UNHASHED, it is then removed from the LRU and from the
+ * site hash with the hash bucket locked.
+ */
+void lu_object_unhash(const struct lu_env *env, struct lu_object *o)
+{
+	struct lu_object_header *hdr = o->lo_header;
+	cfs_hash_t *obj_hash;
+	cfs_hash_bd_t bd;
+
+	set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->loh_flags);
+
+	/* somebody else already took the object out of the hash */
+	if (test_and_set_bit(LU_OBJECT_UNHASHED, &hdr->loh_flags))
+		return;
+
+	obj_hash = o->lo_dev->ld_site->ls_obj_hash;
+
+	cfs_hash_bd_get_and_lock(obj_hash, &hdr->loh_fid, &bd, 1);
+	list_del_init(&hdr->loh_lru);
+	cfs_hash_bd_del_locked(obj_hash, &bd, &hdr->loh_hash);
+	cfs_hash_bd_unlock(obj_hash, &bd, 1);
+}
+EXPORT_SYMBOL(lu_object_unhash);
+
+/**
+ * Allocate new object.
+ *
+ * This follows object creation protocol, described in the comment within
+ * struct lu_device_operations definition.
+ *
+ * The top-level slice is allocated by the device, the fid is assigned, the
+ * layers are initialized until no uninitialized slice remains, and finally
+ * every layer that has a start method is started, bottom layer first.
+ *
+ * \retval the top-level slice on success
+ * \retval ERR_PTR(-ENOMEM) if the device returned no object
+ * \retval ERR_PTR(errno) from the device or from a failed layer init/start;
+ * on a layer failure the partly built object is freed
+ */
+static struct lu_object *lu_object_alloc(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_fid *f,
+ const struct lu_object_conf *conf)
+{
+ struct lu_object *scan;
+ struct lu_object *top;
+ struct list_head *layers;
+ int clean;
+ int result;
+ ENTRY;
+
+ /*
+ * Create top-level object slice. This will also create
+ * lu_object_header.
+ */
+ top = dev->ld_ops->ldo_object_alloc(env, NULL, dev);
+ if (top == NULL)
+ RETURN(ERR_PTR(-ENOMEM));
+ if (IS_ERR(top))
+ RETURN(top);
+ /*
+ * This is the only place where object fid is assigned. It's constant
+ * after this point.
+ */
+ top->lo_header->loh_fid = *f;
+ layers = &top->lo_header->loh_layers;
+ do {
+ /*
+ * Call ->loo_object_init() repeatedly, until no more new
+ * object slices are created.
+ */
+ clean = 1;
+ list_for_each_entry(scan, layers, lo_linkage) {
+ /* this slice was initialized in an earlier pass */
+ if (scan->lo_flags & LU_OBJECT_ALLOCATED)
+ continue;
+ clean = 0;
+ scan->lo_header = top->lo_header;
+ result = scan->lo_ops->loo_object_init(env, scan, conf);
+ if (result != 0) {
+ lu_object_free(env, top);
+ RETURN(ERR_PTR(result));
+ }
+ scan->lo_flags |= LU_OBJECT_ALLOCATED;
+ }
+ } while (!clean);
+
+ /* start the layers in reverse list order */
+ list_for_each_entry_reverse(scan, layers, lo_linkage) {
+ if (scan->lo_ops->loo_object_start != NULL) {
+ result = scan->lo_ops->loo_object_start(env, scan);
+ if (result != 0) {
+ lu_object_free(env, top);
+ RETURN(ERR_PTR(result));
+ }
+ }
+ }
+
+ lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED);
+ RETURN(top);
+}
+
+/**
+ * Free an object.
+ *
+ * Every layer's delete method is called first, then each layer's free method,
+ * and finally anybody waiting on the object's hash bucket is woken up.
+ *
+ * \param env execution environment passed on to the layer methods
+ * \param o a slice of the object to free
+ */
+static void lu_object_free(const struct lu_env *env, struct lu_object *o)
+{
+ struct lu_site_bkt_data *bkt;
+ struct lu_site *site;
+ struct lu_object *scan;
+ struct list_head *layers;
+ struct list_head splice;
+
+ /* everything needed from the header is looked up before it is freed */
+ site = o->lo_dev->ld_site;
+ layers = &o->lo_header->loh_layers;
+ bkt = lu_site_bkt_from_fid(site, &o->lo_header->loh_fid);
+ /*
+ * First call ->loo_object_delete() method to release all resources.
+ */
+ list_for_each_entry_reverse(scan, layers, lo_linkage) {
+ if (scan->lo_ops->loo_object_delete != NULL)
+ scan->lo_ops->loo_object_delete(env, scan);
+ }
+
+ /*
+ * Then, splice object layers into stand-alone list, and call
+ * ->loo_object_free() on all layers to free memory. Splice is
+ * necessary, because lu_object_header is freed together with the
+ * top-level slice.
+ */
+ INIT_LIST_HEAD(&splice);
+ list_splice_init(layers, &splice);
+ while (!list_empty(&splice)) {
+ /*
+ * Free layers in bottom-to-top order, so that object header
+ * lives as long as possible and ->loo_object_free() methods
+ * can look at its contents.
+ */
+ o = container_of0(splice.prev, struct lu_object, lo_linkage);
+ list_del_init(&o->lo_linkage);
+ LASSERT(o->lo_ops->loo_object_free != NULL);
+ o->lo_ops->loo_object_free(env, o);
+ }
+
+ /* wake up anybody waiting on this bucket */
+ if (waitqueue_active(&bkt->lsb_marche_funebre))
+ wake_up_all(&bkt->lsb_marche_funebre);
+}
+
+/**
+ * Free \a nr objects from the cold end of the site LRU list.
+ *
+ * The hash buckets are visited starting at s->ls_purge_start. In each bucket
+ * objects are moved from the bucket LRU to a private dispose list with the
+ * bucket locked, and freed after the bucket is unlocked.
+ *
+ * \param env execution environment passed on to lu_object_free()
+ * \param s site to purge
+ * \param nr number of objects to free; ~0 means no limit
+ *
+ * \retval the part of \a nr that was not used up
+ * \retval 0 at once if OBD_FAIL_OBD_NO_LRU is set
+ *
+ * NOTE(review): the early exit uses RETURN() although the function has no
+ * ENTRY, and the normal exit is a plain return.
+ */
+int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr)
+{
+ struct lu_object_header *h;
+ struct lu_object_header *temp;
+ struct lu_site_bkt_data *bkt;
+ cfs_hash_bd_t bd;
+ cfs_hash_bd_t bd2;
+ struct list_head dispose;
+ int did_sth;
+ int start;
+ int count;
+ int bnr;
+ int i;
+
+ if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU))
+ RETURN(0);
+
+ INIT_LIST_HEAD(&dispose);
+ /*
+ * Under LRU list lock, scan LRU list and move unreferenced objects to
+ * the dispose list, removing them from LRU and hash table.
+ */
+ start = s->ls_purge_start;
+ /* per-bucket quota; -1 disables it when everything is to be purged */
+ bnr = (nr == ~0) ? -1 : nr / CFS_HASH_NBKT(s->ls_obj_hash) + 1;
+ again:
+ did_sth = 0;
+ cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) {
+ /* skip the buckets before the starting point */
+ if (i < start)
+ continue;
+ count = bnr;
+ cfs_hash_bd_lock(s->ls_obj_hash, &bd, 1);
+ bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd);
+
+ list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) {
+ LASSERT(atomic_read(&h->loh_ref) == 0);
+
+ cfs_hash_bd_get(s->ls_obj_hash, &h->loh_fid, &bd2);
+ LASSERT(bd.bd_bucket == bd2.bd_bucket);
+
+ cfs_hash_bd_del_locked(s->ls_obj_hash,
+ &bd2, &h->loh_hash);
+ list_move(&h->loh_lru, &dispose);
+ if (did_sth == 0)
+ did_sth = 1;
+
+ /* overall budget used up */
+ if (nr != ~0 && --nr == 0)
+ break;
+
+ /* quota of this bucket used up */
+ if (count > 0 && --count == 0)
+ break;
+
+ }
+ cfs_hash_bd_unlock(s->ls_obj_hash, &bd, 1);
+ cond_resched();
+ /*
+ * Free everything on the dispose list. This is safe against
+ * races due to the reasons described in lu_object_put().
+ */
+ while (!list_empty(&dispose)) {
+ h = container_of0(dispose.next,
+ struct lu_object_header, loh_lru);
+ list_del_init(&h->loh_lru);
+ lu_object_free(env, lu_object_top(h));
+ lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED);
+ }
+
+ if (nr == 0)
+ break;
+ }
+
+ /* budget left and progress made after a late start: wrap around once */
+ if (nr != 0 && did_sth && start != 0) {
+ start = 0; /* restart from the first bucket */
+ goto again;
+ }
+ /* race on s->ls_purge_start, but nobody cares */
+ s->ls_purge_start = i % CFS_HASH_NBKT(s->ls_obj_hash);
+
+ return nr;
+}
+EXPORT_SYMBOL(lu_site_purge);
+
+/*
+ * Object printing.
+ *
+ * Code below has to jump through certain hoops to output object description
+ * into libcfs_debug_msg-based log. The problem is that lu_object_print()
+ * composes object description from strings that are parts of _lines_ of
+ * output (i.e., strings that are not terminated by newline). This doesn't fit
+ * very well into libcfs_debug_msg() interface that assumes that each message
+ * supplied to it is a self-contained output line.
+ *
+ * To work around this, strings are collected in a temporary buffer
+ * (implemented as a value of lu_cdebug_key key), until terminating newline
+ * character is detected.
+ *
+ */
+
+enum {
+ /**
+ * Maximal line size: the size in bytes of lu_cdebug_data::lck_area,
+ * terminating NUL included.
+ *
+ * XXX overflow is not handled correctly.
+ */
+ LU_CDEBUG_LINE = 512
+};
+
+struct lu_cdebug_data {
+ /**
+ * Temporary buffer. lu_cdebug_printer() appends message chunks here
+ * until a chunk whose format ends in a newline arrives, then emits
+ * the whole line and empties the buffer.
+ */
+ char lck_area[LU_CDEBUG_LINE];
+};
+
+/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */
+LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data);
+
+/**
+ * Key, holding temporary buffer. This key is registered very early by
+ * lu_global_init(). Its value (a struct lu_cdebug_data) is looked up by
+ * lu_cdebug_printer().
+ */
+struct lu_context_key lu_global_key = {
+ .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD |
+ LCT_MG_THREAD | LCT_CL_THREAD,
+ .lct_init = lu_global_key_init,
+ .lct_fini = lu_global_key_fini
+};
+
+/**
+ * Printer function emitting messages through libcfs_debug_msg().
+ *
+ * Chunks are accumulated in the per-context struct lu_cdebug_data buffer
+ * (found through lu_global_key). Once a chunk whose \a format ends in a
+ * newline has been appended, the collected line is passed to
+ * libcfs_debug_msg() (if cfs_cdebug_show() enables it) and the buffer is
+ * emptied. Text that does not fit into the buffer is truncated by
+ * vsnprintf().
+ *
+ * \param env     environment whose context holds the line buffer
+ * \param cookie  struct libcfs_debug_msg_data describing the message
+ * \param format  printf-style format of this chunk; may be empty
+ *
+ * \retval 0 always
+ */
+int lu_cdebug_printer(const struct lu_env *env,
+		      void *cookie, const char *format, ...)
+{
+	struct libcfs_debug_msg_data *msgdata = cookie;
+	struct lu_cdebug_data *key;
+	size_t fmt_len;
+	int used;
+	int complete;
+	va_list args;
+
+	va_start(args, format);
+
+	key = lu_context_key_get(&env->le_ctx, &lu_global_key);
+	LASSERT(key != NULL);
+
+	used = strlen(key->lck_area);
+	/*
+	 * An empty format cannot terminate a line, and must not be indexed
+	 * at [strlen - 1].
+	 */
+	fmt_len = strlen(format);
+	complete = fmt_len > 0 && format[fmt_len - 1] == '\n';
+	/*
+	 * Append new chunk to the buffer.
+	 */
+	vsnprintf(key->lck_area + used,
+		  ARRAY_SIZE(key->lck_area) - used, format, args);
+	if (complete) {
+		if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys))
+			libcfs_debug_msg(msgdata, "%s", key->lck_area);
+		key->lck_area[0] = 0;
+	}
+	va_end(args);
+	return 0;
+}
+EXPORT_SYMBOL(lu_cdebug_printer);
+
+/**
+ * Emit a summary of object header \a hdr through \a printer: its address,
+ * flags, reference count and fid, followed by markers telling whether the
+ * header is hashed, linked on an LRU list and has LOHA_EXISTS set.
+ */
+void lu_object_header_print(const struct lu_env *env, void *cookie,
+			    lu_printer_t printer,
+			    const struct lu_object_header *hdr)
+{
+	const char *hashed;
+	const char *on_lru;
+	const char *exists;
+
+	hashed = hlist_unhashed(&hdr->loh_hash) ? "" : " hash";
+	on_lru = list_empty((struct list_head *)&hdr->loh_lru) ? "" : " lru";
+	exists = hdr->loh_attr & LOHA_EXISTS ? " exist" : "";
+
+	(*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]",
+		   hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref),
+		   PFID(&hdr->loh_fid), hashed, on_lru, exists);
+}
+EXPORT_SYMBOL(lu_object_header_print);
+
+/**
+ * Print human readable representation of the \a o to the \a printer.
+ *
+ * The header of the compound object is printed first, then every layer on
+ * its loh_layers list, each on its own line: a run of dots (lo_depth + 4
+ * of them), the device type name, the layer address and whatever the
+ * layer's optional ->loo_object_print() method adds.
+ */
+void lu_object_print(const struct lu_env *env, void *cookie,
+		     lu_printer_t printer, const struct lu_object *o)
+{
+	static const char ruler[] = "........................................";
+	struct lu_object_header *top = o->lo_header;
+	const struct lu_object *layer;
+
+	lu_object_header_print(env, cookie, printer, top);
+	(*printer)(env, cookie, "{ \n");
+	list_for_each_entry(layer, &top->loh_layers, lo_linkage) {
+		int depth = layer->lo_depth + 4;
+
+		/*
+		 * print `.' \a depth times followed by type name and address
+		 */
+		(*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler,
+			   layer->lo_dev->ld_type->ldt_name, layer);
+		if (layer->lo_ops->loo_object_print != NULL)
+			layer->lo_ops->loo_object_print(env, cookie,
+							printer, layer);
+		(*printer)(env, cookie, "\n");
+	}
+	(*printer)(env, cookie, "} header@%p\n", top);
+}
+EXPORT_SYMBOL(lu_object_print);
+
+/**
+ * Check object consistency.
+ *
+ * Runs the optional ->loo_object_invariant() method of every layer of the
+ * compound object \a o belongs to.
+ *
+ * \retval 1 all layers that have the method report success
+ * \retval 0 some layer's invariant check failed
+ */
+int lu_object_invariant(const struct lu_object *o)
+{
+	struct lu_object_header *top = o->lo_header;
+	const struct lu_object *layer;
+
+	list_for_each_entry(layer, &top->loh_layers, lo_linkage) {
+		if (layer->lo_ops->loo_object_invariant == NULL)
+			continue;
+		if (!layer->lo_ops->loo_object_invariant(layer))
+			return 0;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(lu_object_invariant);
+
+/**
+ * Look up fid \a f in bucket \a bd of the object hash of site \a s.
+ * lu_object_find_try() calls this with the bucket locked.
+ *
+ * The search is skipped when the bucket version still equals \a *version;
+ * otherwise \a *version is updated to the current bucket version.
+ *
+ * \retval NULL             bucket unchanged since \a *version, or no entry
+ * \retval object           top layer of a live cached object; a reference
+ *                          was taken with cfs_hash_get() and the header was
+ *                          unlinked from its LRU list
+ * \retval ERR_PTR(-EAGAIN) a dying object was found; \a waiter was queued on
+ *                          the bucket's lsb_marche_funebre wait queue and
+ *                          the task state set to TASK_UNINTERRUPTIBLE
+ */
+static struct lu_object *htable_lookup(struct lu_site *s,
+				       cfs_hash_bd_t *bd,
+				       const struct lu_fid *f,
+				       wait_queue_t *waiter,
+				       __u64 *version)
+{
+	struct lu_site_bkt_data *bkt;
+	struct lu_object_header *h;
+	struct hlist_node *hnode;
+	__u64 ver;
+
+	ver = cfs_hash_bd_version_get(bd);
+	if (*version == ver)
+		return NULL;
+	*version = ver;
+
+	bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd);
+	/*
+	 * cfs_hash_bd_peek_locked() is a somewhat "internal" function of
+	 * cfs_hash: it does not take a reference on the entry it returns.
+	 */
+	hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f);
+	if (hnode == NULL) {
+		lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS);
+		return NULL;
+	}
+
+	h = container_of0(hnode, struct lu_object_header, loh_hash);
+	if (unlikely(lu_object_is_dying(h))) {
+		/*
+		 * Lookup found an object that is being destroyed. It cannot
+		 * be returned (so that references to dying objects are
+		 * eventually drained); moreover, the lookup has to wait
+		 * until the object is freed.
+		 */
+		init_waitqueue_entry_current(waiter);
+		add_wait_queue(&bkt->lsb_marche_funebre, waiter);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_DEATH_RACE);
+		return ERR_PTR(-EAGAIN);
+	}
+
+	cfs_hash_get(s->ls_obj_hash, hnode);
+	lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT);
+	list_del_init(&h->loh_lru);
+	return lu_object_top(h);
+}
+
+/**
+ * Search cache for an object with the fid \a f. If such object is found,
+ * return it. Otherwise, create new object, insert it into cache and return
+ * it. In any case, additional reference is acquired on the returned object.
+ *
+ * The search is delegated to lu_object_find_at() using the top level device
+ * of the site \a dev belongs to.
+ */
+struct lu_object *lu_object_find(const struct lu_env *env,
+				 struct lu_device *dev, const struct lu_fid *f,
+				 const struct lu_object_conf *conf)
+{
+	struct lu_device *top_dev = dev->ld_site->ls_top_dev;
+
+	return lu_object_find_at(env, top_dev, f, conf);
+}
+EXPORT_SYMBOL(lu_object_find);
+
+/**
+ * Allocate an object for fid \a f and insert it into the site hash without
+ * looking the fid up first. lu_object_find_try() uses this for the
+ * LOC_F_NEW case.
+ *
+ * \retval object  newly allocated and hashed object
+ * \retval ERR_PTR error returned by lu_object_alloc()
+ */
+static struct lu_object *lu_object_new(const struct lu_env *env,
+				       struct lu_device *dev,
+				       const struct lu_fid *f,
+				       const struct lu_object_conf *conf)
+{
+	struct lu_site_bkt_data *bkt;
+	struct lu_object *obj;
+	cfs_hash_bd_t bd;
+	cfs_hash_t *hash;
+
+	obj = lu_object_alloc(env, dev, f, conf);
+	if (unlikely(IS_ERR(obj)))
+		return obj;
+
+	hash = dev->ld_site->ls_obj_hash;
+	cfs_hash_bd_get_and_lock(hash, (void *)f, &bd, 1);
+	bkt = cfs_hash_bd_extra_get(hash, &bd);
+	cfs_hash_bd_add_locked(hash, &bd, &obj->lo_header->loh_hash);
+	/* the new object counts as busy in its bucket */
+	bkt->lsb_busy++;
+	cfs_hash_bd_unlock(hash, &bd, 1);
+	return obj;
+}
+
+/**
+ * Core logic of lu_object_find*() functions.
+ *
+ * \retval object           cached object found by htable_lookup(), or a
+ *                          newly allocated object inserted into the hash
+ * \retval ERR_PTR(-EAGAIN) htable_lookup() met a dying object and queued
+ *                          \a waiter; the caller is expected to wait and
+ *                          retry
+ * \retval ERR_PTR          any error returned by lu_object_alloc()
+ */
+static struct lu_object *lu_object_find_try(const struct lu_env *env,
+ struct lu_device *dev,
+ const struct lu_fid *f,
+ const struct lu_object_conf *conf,
+ wait_queue_t *waiter)
+{
+ struct lu_object *o;
+ struct lu_object *shadow;
+ struct lu_site *s;
+ cfs_hash_t *hs;
+ cfs_hash_bd_t bd;
+ __u64 version = 0;
+
+ /*
+ * This uses standard index maintenance protocol:
+ *
+ * - search index under lock, and return object if found;
+ * - otherwise, unlock index, allocate new object;
+ * - lock index and search again;
+ * - if nothing is found (usual case), insert newly created
+ * object into index;
+ * - otherwise (race: other thread inserted object), free
+ * object just allocated.
+ * - unlock index;
+ * - return object.
+ *
+ * For "LOC_F_NEW" case, we are sure the object is new established.
+ * It is unnecessary to perform lookup-alloc-lookup-insert, instead,
+ * just alloc and insert directly.
+ *
+ * If dying object is found during index search, add @waiter to the
+ * site wait-queue and return ERR_PTR(-EAGAIN).
+ */
+ if (conf != NULL && conf->loc_flags & LOC_F_NEW)
+ return lu_object_new(env, dev, f, conf);
+
+ s = dev->ld_site;
+ hs = s->ls_obj_hash;
+ cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1);
+ o = htable_lookup(s, &bd, f, waiter, &version);
+ cfs_hash_bd_unlock(hs, &bd, 1);
+ if (o != NULL) /* a cached object or ERR_PTR(-EAGAIN) */
+ return o;
+
+ /*
+ * Allocate new object. This may result in rather complicated
+ * operations, including fld queries, inode loading, etc.
+ */
+ o = lu_object_alloc(env, dev, f, conf);
+ if (unlikely(IS_ERR(o)))
+ return o;
+
+ LASSERT(lu_fid_eq(lu_object_fid(o), f));
+
+ cfs_hash_bd_lock(hs, &bd, 1);
+
+ shadow = htable_lookup(s, &bd, f, waiter, &version); /* returns NULL if the bucket version did not change */
+ if (likely(shadow == NULL)) {
+ struct lu_site_bkt_data *bkt;
+
+ bkt = cfs_hash_bd_extra_get(hs, &bd);
+ cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
+ bkt->lsb_busy++;
+ cfs_hash_bd_unlock(hs, &bd, 1);
+ return o;
+ }
+
+ lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE);
+ cfs_hash_bd_unlock(hs, &bd, 1);
+ lu_object_free(env, o);
+ return shadow; /* the other thread's object, or ERR_PTR(-EAGAIN) */
+}
+
+/**
+ * Much like lu_object_find(), but top level device of object is specifically
+ * \a dev rather than top level device of the site. This interface allows
+ * objects of different "stacking" to be created within the same site.
+ *
+ * lu_object_find_try() is repeated for as long as it returns
+ * ERR_PTR(-EAGAIN); between attempts the caller sleeps on the wait queue of
+ * the bucket \a f maps to.
+ */
+struct lu_object *lu_object_find_at(const struct lu_env *env,
+				    struct lu_device *dev,
+				    const struct lu_fid *f,
+				    const struct lu_object_conf *conf)
+{
+	struct lu_site_bkt_data *bkt;
+	struct lu_object *obj;
+	wait_queue_t wait;
+
+	for (;;) {
+		obj = lu_object_find_try(env, dev, f, conf, &wait);
+		if (obj != ERR_PTR(-EAGAIN))
+			break;
+		/*
+		 * lu_object_find_try() has already put the waiter on the
+		 * wait queue.
+		 */
+		waitq_wait(&wait, TASK_UNINTERRUPTIBLE);
+		bkt = lu_site_bkt_from_fid(dev->ld_site, (void *)f);
+		remove_wait_queue(&bkt->lsb_marche_funebre, &wait);
+	}
+	return obj;
+}
+EXPORT_SYMBOL(lu_object_find_at);
+
+/**
+ * Find object with given fid, and return its slice belonging to given device.
+ *
+ * \retval slice   layer of the object whose device type is \a dev->ld_type
+ * \retval NULL    the object has no such layer; the reference obtained by
+ *                 lu_object_find() has been dropped again
+ * \retval ERR_PTR error returned by lu_object_find()
+ */
+struct lu_object *lu_object_find_slice(const struct lu_env *env,
+				       struct lu_device *dev,
+				       const struct lu_fid *f,
+				       const struct lu_object_conf *conf)
+{
+	struct lu_object *top;
+	struct lu_object *slice;
+
+	top = lu_object_find(env, dev, f, conf);
+	if (IS_ERR(top))
+		return top;
+
+	slice = lu_object_locate(top->lo_header, dev->ld_type);
+	if (slice == NULL)
+		lu_object_put(env, top);
+	return slice;
+}
+EXPORT_SYMBOL(lu_object_find_slice);
+
+/**
+ * Global list of all device types.
+ *
+ * Types are linked through lu_device_type::ldt_linkage by
+ * lu_device_type_init(), unlinked by lu_device_type_fini() and walked by
+ * lu_types_stop().
+ */
+static LIST_HEAD(lu_device_types);
+
+/**
+ * Initialize device type \a ldt: run its optional ->ldto_init() method and,
+ * if that succeeds, add the type to the global lu_device_types list.
+ *
+ * \retval 0     success
+ * \retval other value returned by ->ldto_init(); the type is not listed
+ */
+int lu_device_type_init(struct lu_device_type *ldt)
+{
+	int rc;
+
+	INIT_LIST_HEAD(&ldt->ldt_linkage);
+	rc = ldt->ldt_ops->ldto_init ? ldt->ldt_ops->ldto_init(ldt) : 0;
+	if (rc != 0)
+		return rc;
+
+	list_add(&ldt->ldt_linkage, &lu_device_types);
+	return 0;
+}
+EXPORT_SYMBOL(lu_device_type_init);
+
+/**
+ * Finalize device type \a ldt: take it off the list it is linked on and run
+ * its optional ->ldto_fini() method.
+ */
+void lu_device_type_fini(struct lu_device_type *ldt)
+{
+	list_del_init(&ldt->ldt_linkage);
+
+	if (ldt->ldt_ops->ldto_fini != NULL)
+		ldt->ldt_ops->ldto_fini(ldt);
+}
+EXPORT_SYMBOL(lu_device_type_fini);
+
+/**
+ * Call the optional ->ldto_stop() method of every registered device type
+ * that currently has no devices (ldt_device_nr == 0).
+ */
+void lu_types_stop(void)
+{
+	struct lu_device_type *ldt;
+
+	list_for_each_entry(ldt, &lu_device_types, ldt_linkage) {
+		if (ldt->ldt_device_nr != 0)
+			continue;
+		if (ldt->ldt_ops->ldto_stop)
+			ldt->ldt_ops->ldto_stop(ldt);
+	}
+}
+EXPORT_SYMBOL(lu_types_stop);
+
+/**
+ * Global list of all sites on this node. Sites are added by
+ * lu_site_init_finish() and removed by lu_site_fini(); both do so while
+ * holding lu_sites_guard.
+ */
+static LIST_HEAD(lu_sites);
+static DEFINE_MUTEX(lu_sites_guard);
+
+/**
+ * Global environment used by site shrinker. Its context is refilled by
+ * lu_site_init_finish() and has key values dropped from it by
+ * lu_context_key_degister().
+ */
+static struct lu_env lu_shrink_env;
+
+struct lu_site_print_arg { /* argument bundle handed to lu_site_obj_print() by lu_site_print() */
+ struct lu_env *lsp_env; /* environment passed on to the printer */
+ void *lsp_cookie; /* opaque cookie passed on to the printer */
+ lu_printer_t lsp_printer; /* printer callback */
+};
+
+/**
+ * cfs_hash_for_each() callback used by lu_site_print(): print the object
+ * whose header is hashed at \a hnode. A header without any layers gets only
+ * its header line printed. \a data is a struct lu_site_print_arg.
+ *
+ * \retval 0 always
+ */
+static int
+lu_site_obj_print(cfs_hash_t *hs, cfs_hash_bd_t *bd,
+		  struct hlist_node *hnode, void *data)
+{
+	struct lu_site_print_arg *arg = (struct lu_site_print_arg *)data;
+	struct lu_object_header *h;
+
+	h = hlist_entry(hnode, struct lu_object_header, loh_hash);
+	if (list_empty(&h->loh_layers)) {
+		lu_object_header_print(arg->lsp_env, arg->lsp_cookie,
+				       arg->lsp_printer, h);
+		return 0;
+	}
+
+	lu_object_print(arg->lsp_env, arg->lsp_cookie, arg->lsp_printer,
+			lu_object_top(h));
+	return 0;
+}
+
+/**
+ * Print all objects in \a s.
+ *
+ * Walks the object hash of the site with cfs_hash_for_each() and prints
+ * every entry through \a printer, passing \a env and \a cookie along.
+ */
+void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie,
+		   lu_printer_t printer)
+{
+	struct lu_site_print_arg arg;
+
+	arg.lsp_env = (struct lu_env *)env;
+	arg.lsp_cookie = cookie;
+	arg.lsp_printer = printer;
+
+	cfs_hash_for_each(s->ls_obj_hash, lu_site_obj_print, &arg);
+}
+EXPORT_SYMBOL(lu_site_print);
+
+enum {
+ LU_CACHE_PERCENT_MAX = 50, /* largest lu_cache_percent accepted by lu_htable_order() */
+ LU_CACHE_PERCENT_DEFAULT = 20 /* initial value, and fallback for an invalid setting */
+};
+
+static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; /* read and validated by lu_htable_order() */
+CFS_MODULE_PARM(lu_cache_percent, "i", int, 0644,
+ "Percentage of memory to be used as lu_object cache");
+
+/**
+ * Return desired hash table order.
+ *
+ * The order is the smallest \a bits >= 1 such that 2^bits is not less than
+ * the number of 1K objects that fit into lu_cache_percent percent of the
+ * memory taken into account. An out-of-range lu_cache_percent is reset to
+ * LU_CACHE_PERCENT_DEFAULT with a warning.
+ */
+static int lu_htable_order(void)
+{
+	unsigned long cache_size;
+	int bits;
+
+	/*
+	 * Calculate hash table size, assuming that we want reasonable
+	 * performance when lu_cache_percent (20% by default) of total
+	 * memory is occupied by cache of lu_objects.
+	 *
+	 * Size of lu_object is (arbitrary) taken as 1K (together with inode).
+	 */
+	cache_size = num_physpages;
+
+#if BITS_PER_LONG == 32
+	/*
+	 * limit hashtable size for lowmem systems to low RAM: above 1GB
+	 * worth of pages, account for 3/4 of 1GB only. The shift must be
+	 * parenthesized, as `*' and `/' bind tighter than `<<'.
+	 */
+	if (cache_size > 1UL << (30 - PAGE_CACHE_SHIFT))
+		cache_size = (1UL << (30 - PAGE_CACHE_SHIFT)) * 3 / 4;
+#endif
+
+	/* clear off unreasonable cache setting. */
+	if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) {
+		CWARN("obdclass: invalid lu_cache_percent: %u, it must be in"
+		      " the range of (0, %u]. Will use default value: %u.\n",
+		      lu_cache_percent, LU_CACHE_PERCENT_MAX,
+		      LU_CACHE_PERCENT_DEFAULT);
+
+		lu_cache_percent = LU_CACHE_PERCENT_DEFAULT;
+	}
+	/* pages -> number of 1K objects in the configured share of memory */
+	cache_size = cache_size / 100 * lu_cache_percent *
+		(PAGE_CACHE_SIZE / 1024);
+
+	/* shift an unsigned long so that large values cannot overflow int */
+	for (bits = 1; (1UL << bits) < cache_size; ++bits) {
+		;
+	}
+	return bits;
+}
+
+/**
+ * Hash method of lu_site_hash_ops (->hs_hash): compute the hash of the fid
+ * passed as \a key for table \a hs; the result is limited by \a mask.
+ */
+static unsigned lu_obj_hop_hash(cfs_hash_t *hs,
+				const void *key, unsigned mask)
+{
+	struct lu_fid *fid = (struct lu_fid *)key;
+	__u32 val;
+
+	val = fid_flatten32(fid);
+	/* mixing oid and seq */
+	val += (val >> 4) + (val << 12);
+	val = cfs_hash_long(val, hs->hs_bkt_bits);
+
+	/* give me another random factor */
+	val -= cfs_hash_long((unsigned long)hs, fid_oid(fid) % 11 + 3);
+
+	/* make room in the low bits for an index derived from seq + oid */
+	val <<= hs->hs_cur_bits - hs->hs_bkt_bits;
+	val |= (fid_seq(fid) + fid_oid(fid)) & (CFS_HASH_NBKT(hs) - 1);
+
+	return val & mask;
+}
+
+/**
+ * ->hs_object method of lu_site_hash_ops: return the lu_object_header that
+ * embeds hash node \a hnode.
+ */
+static void *lu_obj_hop_object(struct hlist_node *hnode)
+{
+	struct lu_object_header *h;
+
+	h = hlist_entry(hnode, struct lu_object_header, loh_hash);
+	return h;
+}
+
+/**
+ * ->hs_key method of lu_site_hash_ops: return the key (the fid) of the
+ * object header that embeds hash node \a hnode.
+ */
+static void *lu_obj_hop_key(struct hlist_node *hnode)
+{
+	struct lu_object_header *hdr =
+		hlist_entry(hnode, struct lu_object_header, loh_hash);
+
+	return &hdr->loh_fid;
+}
+
+/**
+ * ->hs_keycmp method of lu_site_hash_ops: compare the fid passed as \a key
+ * with the fid of the object header embedding \a hnode, using lu_fid_eq().
+ */
+static int lu_obj_hop_keycmp(const void *key, struct hlist_node *hnode)
+{
+	struct lu_object_header *hdr =
+		hlist_entry(hnode, struct lu_object_header, loh_hash);
+
+	return lu_fid_eq(&hdr->loh_fid, (struct lu_fid *)key);
+}
+
+/**
+ * ->hs_get method of lu_site_hash_ops: take a reference on the object header
+ * embedding \a hnode. When this is the first reference (the count goes from
+ * 0 to 1) the busy counter of the object's bucket is incremented as well.
+ */
+static void lu_obj_hop_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct lu_object_header *h;
+	struct lu_site_bkt_data *bkt;
+	cfs_hash_bd_t bd;
+
+	h = hlist_entry(hnode, struct lu_object_header, loh_hash);
+	if (atomic_add_return(1, &h->loh_ref) != 1)
+		return;
+
+	cfs_hash_bd_get(hs, &h->loh_fid, &bd);
+	bkt = cfs_hash_bd_extra_get(hs, &bd);
+	bkt->lsb_busy++;
+}
+
+/**
+ * ->hs_put_locked method of lu_site_hash_ops. Reaching it is treated as a
+ * bug: the function does nothing but LBUG().
+ */
+static void lu_obj_hop_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	/* this must never be called */
+	LBUG();
+}
+
+cfs_hash_ops_t lu_site_hash_ops = { /* passed to cfs_hash_create() by lu_site_init() */
+ .hs_hash = lu_obj_hop_hash, /* fid -> hash value */
+ .hs_key = lu_obj_hop_key, /* hash node -> fid of its header */
+ .hs_keycmp = lu_obj_hop_keycmp, /* fid equality via lu_fid_eq() */
+ .hs_object = lu_obj_hop_object, /* hash node -> lu_object_header */
+ .hs_get = lu_obj_hop_get, /* takes a reference on the header */
+ .hs_put_locked = lu_obj_hop_put_locked, /* LBUG()s if ever called */
+};
+
+/**
+ * Link device \a d onto the device list of site \a s (ls_ld_linkage), unless
+ * \a d->ld_linkage is already on some list. Done under \a s->ls_ld_lock.
+ */
+void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d)
+{
+	struct list_head *link = &d->ld_linkage;
+
+	spin_lock(&s->ls_ld_lock);
+	if (list_empty(link))
+		list_add(link, &s->ls_ld_linkage);
+	spin_unlock(&s->ls_ld_lock);
+}
+EXPORT_SYMBOL(lu_dev_add_linkage);
+
+/**
+ * Unlink device \a d from the device list it is on and re-initialize its
+ * ld_linkage. Done under \a s->ls_ld_lock.
+ */
+void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d)
+{
+	struct list_head *link = &d->ld_linkage;
+
+	spin_lock(&s->ls_ld_lock);
+	list_del_init(link);
+	spin_unlock(&s->ls_ld_lock);
+}
+EXPORT_SYMBOL(lu_dev_del_linkage);
+
+/**
+ * Bounds for the order (number of bits) of the site object hash table.
+ */
+#define LU_SITE_BITS_MIN 12
+#define LU_SITE_BITS_MAX 24
+/**
+ * total 256 buckets, we don't want too many buckets because:
+ * - consume too much memory
+ * - avoid unbalanced LRU list
+ */
+#define LU_SITE_BKT_BITS 8
+
+/**
+ * Initialize site \a s, with \a top as the top level device.
+ *
+ * Creates the object hash (trying smaller orders, down to LU_SITE_BITS_MIN,
+ * when creation fails), initializes the per-bucket LRU list and wait queue,
+ * allocates the statistics counters, and binds \a top to the site, taking a
+ * device reference on it.
+ *
+ * \retval 0       success
+ * \retval -ENOMEM the hash table or the statistics could not be allocated
+ */
+int lu_site_init(struct lu_site *s, struct lu_device *top)
+{
+	struct lu_site_bkt_data *bkt;
+	cfs_hash_bd_t bd;
+	char name[16];
+	int bits;
+	int i;
+	ENTRY;
+
+	memset(s, 0, sizeof *s);
+	bits = lu_htable_order();
+	snprintf(name, sizeof(name), "lu_site_%s", top->ld_type->ldt_name);
+	for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX);
+	     bits >= LU_SITE_BITS_MIN; bits--) {
+		s->ls_obj_hash = cfs_hash_create(name, bits, bits,
+						 bits - LU_SITE_BKT_BITS,
+						 sizeof(*bkt), 0, 0,
+						 &lu_site_hash_ops,
+						 CFS_HASH_SPIN_BKTLOCK |
+						 CFS_HASH_NO_ITEMREF |
+						 CFS_HASH_DEPTH |
+						 CFS_HASH_ASSERT_EMPTY);
+		if (s->ls_obj_hash != NULL)
+			break;
+	}
+
+	if (s->ls_obj_hash == NULL) {
+		CERROR("failed to create lu_site hash with bits: %d\n", bits);
+		/* ENTRY was used above, so leave through RETURN() */
+		RETURN(-ENOMEM);
+	}
+
+	cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) {
+		bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd);
+		INIT_LIST_HEAD(&bkt->lsb_lru);
+		init_waitqueue_head(&bkt->lsb_marche_funebre);
+	}
+
+	s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0);
+	if (s->ls_stats == NULL) {
+		cfs_hash_putref(s->ls_obj_hash);
+		s->ls_obj_hash = NULL;
+		RETURN(-ENOMEM);
+	}
+
+	lprocfs_counter_init(s->ls_stats, LU_SS_CREATED,
+			     0, "created", "created");
+	lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT,
+			     0, "cache_hit", "cache_hit");
+	lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS,
+			     0, "cache_miss", "cache_miss");
+	lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE,
+			     0, "cache_race", "cache_race");
+	lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE,
+			     0, "cache_death_race", "cache_death_race");
+	lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED,
+			     0, "lru_purged", "lru_purged");
+
+	INIT_LIST_HEAD(&s->ls_linkage);
+	s->ls_top_dev = top;
+	top->ld_site = s;
+	lu_device_get(top);
+	lu_ref_add(&top->ld_reference, "site-top", s);
+
+	INIT_LIST_HEAD(&s->ls_ld_linkage);
+	spin_lock_init(&s->ls_ld_lock);
+
+	lu_dev_add_linkage(s, top);
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(lu_site_init);
+
+/**
+ * Finalize \a s and release its resources.
+ *
+ * The site is unlinked from its list under lu_sites_guard; then the object
+ * hash reference, the binding to (and reference on) the top level device
+ * and the statistics are released, each only if present.
+ */
+void lu_site_fini(struct lu_site *s)
+{
+	struct lu_device *top;
+
+	mutex_lock(&lu_sites_guard);
+	list_del_init(&s->ls_linkage);
+	mutex_unlock(&lu_sites_guard);
+
+	if (s->ls_obj_hash != NULL) {
+		cfs_hash_putref(s->ls_obj_hash);
+		s->ls_obj_hash = NULL;
+	}
+
+	top = s->ls_top_dev;
+	if (top != NULL) {
+		top->ld_site = NULL;
+		lu_ref_del(&top->ld_reference, "site-top", s);
+		lu_device_put(top);
+		s->ls_top_dev = NULL;
+	}
+
+	if (s->ls_stats != NULL)
+		lprocfs_free_stats(&s->ls_stats);
+}
+EXPORT_SYMBOL(lu_site_fini);
+
+/**
+ * Called when initialization of stack for this site is completed.
+ *
+ * Under lu_sites_guard, refills the context of the shrinker environment
+ * and, if that succeeds, adds \a s to the global list of sites.
+ *
+ * \retval 0     success
+ * \retval other error returned by lu_context_refill(); \a s is not listed
+ */
+int lu_site_init_finish(struct lu_site *s)
+{
+	int rc;
+
+	mutex_lock(&lu_sites_guard);
+	rc = lu_context_refill(&lu_shrink_env.le_ctx);
+	if (rc == 0)
+		list_add(&s->ls_linkage, &lu_sites);
+	mutex_unlock(&lu_sites_guard);
+
+	return rc;
+}
+EXPORT_SYMBOL(lu_site_init_finish);
+
+/**
+ * Acquire additional reference on device \a d
+ *
+ * Atomically increments \a d->ld_ref; the counterpart is lu_device_put().
+ */
+void lu_device_get(struct lu_device *d)
+{
+ atomic_inc(&d->ld_ref);
+}
+EXPORT_SYMBOL(lu_device_get);
+
+/**
+ * Release reference on device \a d.
+ *
+ * Asserts that the reference count is positive and atomically decrements
+ * it. Nothing is freed here when the count reaches zero.
+ */
+void lu_device_put(struct lu_device *d)
+{
+ LASSERT(atomic_read(&d->ld_ref) > 0);
+ atomic_dec(&d->ld_ref);
+}
+EXPORT_SYMBOL(lu_device_put);
+
+/**
+ * Initialize device \a d of type \a t.
+ *
+ * Counts the device in \a t->ldt_device_nr; for the first device of the
+ * type the optional ->ldto_start() method is called. \a d is then zeroed
+ * and set up with a zero reference count.
+ *
+ * \retval 0 always
+ */
+int lu_device_init(struct lu_device *d, struct lu_device_type *t)
+{
+	int first_of_type = (t->ldt_device_nr++ == 0);
+
+	if (first_of_type && t->ldt_ops->ldto_start != NULL)
+		t->ldt_ops->ldto_start(t);
+
+	memset(d, 0, sizeof(*d));
+	atomic_set(&d->ld_ref, 0);
+	d->ld_type = t;
+	lu_ref_init(&d->ld_reference);
+	INIT_LIST_HEAD(&d->ld_linkage);
+	return 0;
+}
+EXPORT_SYMBOL(lu_device_init);
+
+/**
+ * Finalize device \a d.
+ *
+ * Breaks the link with the obd device (if any), asserts that no references
+ * to \a d remain and uncounts the device from its type; when the last device
+ * of the type goes away the optional ->ldto_stop() method is called.
+ */
+void lu_device_fini(struct lu_device *d)
+{
+	struct lu_device_type *t = d->ld_type;
+
+	if (d->ld_obd != NULL) {
+		d->ld_obd->obd_lu_dev = NULL;
+		d->ld_obd = NULL;
+	}
+
+	lu_ref_fini(&d->ld_reference);
+	LASSERTF(atomic_read(&d->ld_ref) == 0,
+		 "Refcount is %u\n", atomic_read(&d->ld_ref));
+	LASSERT(t->ldt_device_nr > 0);
+
+	t->ldt_device_nr--;
+	if (t->ldt_device_nr == 0 && t->ldt_ops->ldto_stop != NULL)
+		t->ldt_ops->ldto_stop(t);
+}
+EXPORT_SYMBOL(lu_device_fini);
+
+/**
+ * Initialize object \a o that is part of compound object \a h and was created
+ * by device \a d.
+ *
+ * \a o is zeroed, bound to \a h and \a d, and a reference on \a d is taken
+ * on its behalf (released again by lu_object_fini()).
+ *
+ * \retval 0 always
+ */
+int lu_object_init(struct lu_object *o,
+		   struct lu_object_header *h, struct lu_device *d)
+{
+	memset(o, 0, sizeof(*o));
+	INIT_LIST_HEAD(&o->lo_linkage);
+	o->lo_header = h;
+	o->lo_dev = d;
+
+	lu_device_get(d);
+	o->lo_dev_ref = lu_ref_add(&d->ld_reference, "lu_object", o);
+	return 0;
+}
+EXPORT_SYMBOL(lu_object_init);
+
+/**
+ * Finalize object and release its resources.
+ *
+ * \a o must already be unlinked from its layer list. The device reference
+ * taken by lu_object_init() is dropped and \a o->lo_dev cleared; nothing is
+ * done if \a o has no device.
+ */
+void lu_object_fini(struct lu_object *o)
+{
+	struct lu_device *dev = o->lo_dev;
+
+	LASSERT(list_empty(&o->lo_linkage));
+
+	if (dev == NULL)
+		return;
+
+	lu_ref_del_at(&dev->ld_reference, o->lo_dev_ref, "lu_object", o);
+	lu_device_put(dev);
+	o->lo_dev = NULL;
+}
+EXPORT_SYMBOL(lu_object_fini);
+
+/**
+ * Add object \a o as first layer of compound object \a h
+ *
+ * This is typically called by the ->ldo_object_alloc() method of top-level
+ * device.
+ *
+ * Implemented with list_move(): \a o->lo_linkage is taken off whatever list
+ * it is on and put at the head of \a h->loh_layers.
+ */
+void lu_object_add_top(struct lu_object_header *h, struct lu_object *o)
+{
+ list_move(&o->lo_linkage, &h->loh_layers);
+}
+EXPORT_SYMBOL(lu_object_add_top);
+
+/**
+ * Add object \a o as a layer of compound object, going after \a before.
+ *
+ * This is typically called by the ->ldo_object_alloc() method of \a
+ * before->lo_dev.
+ *
+ * Implemented with list_move(): \a o->lo_linkage is taken off whatever list
+ * it is on and inserted right after \a before->lo_linkage.
+ */
+void lu_object_add(struct lu_object *before, struct lu_object *o)
+{
+ list_move(&o->lo_linkage, &before->lo_linkage);
+}
+EXPORT_SYMBOL(lu_object_add);
+
+/**
+ * Initialize compound object.
+ *
+ * The header \a h is zeroed and starts out with a reference count of 1,
+ * unhashed, off any LRU list and without layers.
+ *
+ * \retval 0 always
+ */
+int lu_object_header_init(struct lu_object_header *h)
+{
+	memset(h, 0, sizeof(*h));
+
+	INIT_LIST_HEAD(&h->loh_layers);
+	INIT_LIST_HEAD(&h->loh_lru);
+	INIT_HLIST_NODE(&h->loh_hash);
+	atomic_set(&h->loh_ref, 1);
+	lu_ref_init(&h->loh_reference);
+	return 0;
+}
+EXPORT_SYMBOL(lu_object_header_init);
+
+/**
+ * Finalize compound object.
+ *
+ * Asserts that the header \a h has no layers left, is not on an LRU list
+ * and is not hashed, then finalizes its lu_ref tracking.
+ */
+void lu_object_header_fini(struct lu_object_header *h)
+{
+ LASSERT(list_empty(&h->loh_layers));
+ LASSERT(list_empty(&h->loh_lru));
+ LASSERT(hlist_unhashed(&h->loh_hash));
+ lu_ref_fini(&h->loh_reference);
+}
+EXPORT_SYMBOL(lu_object_header_fini);
+
+/**
+ * Given a compound object, find its slice, corresponding to the device type
+ * \a dtype.
+ *
+ * \retval slice first layer of \a h whose device is of type \a dtype
+ * \retval NULL  no layer of \a h belongs to a device of that type
+ */
+struct lu_object *lu_object_locate(struct lu_object_header *h,
+				   const struct lu_device_type *dtype)
+{
+	struct lu_object *layer;
+
+	list_for_each_entry(layer, &h->loh_layers, lo_linkage) {
+		if (layer->lo_dev->ld_type != dtype)
+			continue;
+		return layer;
+	}
+	return NULL;
+}
+EXPORT_SYMBOL(lu_object_locate);
+
+
+
+/**
+ * Finalize and free devices in the device stack.
+ *
+ * Finalize device stack by purging object cache, and calling
+ * lu_device_type_operations::ldto_device_fini() and
+ * lu_device_type_operations::ldto_device_free() on all devices in the stack.
+ *
+ * This happens in two passes that both start at \a top and follow the
+ * device returned by the method just called: the first pass finalizes each
+ * device and drops its "lu-stack" reference, the second one frees each
+ * device and releases the obd type of its device type, if there is one. The
+ * object cache of the site is purged completely before each pass.
+ */
+void lu_stack_fini(const struct lu_env *env, struct lu_device *top)
+{
+ struct lu_site *site = top->ld_site;
+ struct lu_device *scan;
+ struct lu_device *next;
+
+ lu_site_purge(env, site, ~0);
+ for (scan = top; scan != NULL; scan = next) {
+ next = scan->ld_type->ldt_ops->ldto_device_fini(env, scan); /* returns the next device to finalize */
+ lu_ref_del(&scan->ld_reference, "lu-stack", &lu_site_init);
+ lu_device_put(scan);
+ }
+
+ /* purge again. */
+ lu_site_purge(env, site, ~0);
+
+ for (scan = top; scan != NULL; scan = next) {
+ const struct lu_device_type *ldt = scan->ld_type; /* saved before scan is freed */
+ struct obd_type *type;
+
+ next = ldt->ldt_ops->ldto_device_free(env, scan);
+ type = ldt->ldt_obd_type;
+ if (type != NULL) {
+ type->typ_refcnt--;
+ class_put_type(type);
+ }
+ }
+}
+EXPORT_SYMBOL(lu_stack_fini);
+
+enum {
+ /**
+ * Maximal number of tld slots: the size of the lu_keys[] array and
+ * of every per-context lc_value[] array.
+ */
+ LU_CONTEXT_KEY_NR = 40
+};
+
+static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, }; /* registered keys, indexed by lct_index */
+
+static DEFINE_SPINLOCK(lu_keys_guard);
+
+/**
+ * Global counter incremented whenever key is registered, unregistered,
+ * revived or quiesced. This is used to avoid unnecessary calls to
+ * lu_context_refill(). No locking is provided, as initialization and shutdown
+ * are supposed to be externally serialized.
+ */
+static unsigned key_set_version = 0;
+
+/**
+ * Register new key.
+ *
+ * \a key must have its init and fini methods, tags and owner set. It is put
+ * into the first free slot of lu_keys[] under lu_keys_guard; its usage
+ * count starts at 1 and key_set_version is bumped.
+ *
+ * \retval 0       success, \a key->lct_index holds the slot
+ * \retval -ENFILE no free slot is left
+ */
+int lu_context_key_register(struct lu_context_key *key)
+{
+	int rc = -ENFILE;
+	int slot;
+
+	LASSERT(key->lct_init != NULL);
+	LASSERT(key->lct_fini != NULL);
+	LASSERT(key->lct_tags != 0);
+	LASSERT(key->lct_owner != NULL);
+
+	spin_lock(&lu_keys_guard);
+	for (slot = 0; slot < ARRAY_SIZE(lu_keys); ++slot) {
+		if (lu_keys[slot] != NULL)
+			continue;
+
+		key->lct_index = slot;
+		atomic_set(&key->lct_used, 1);
+		lu_keys[slot] = key;
+		lu_ref_init(&key->lct_reference);
+		++key_set_version;
+		rc = 0;
+		break;
+	}
+	spin_unlock(&lu_keys_guard);
+	return rc;
+}
+EXPORT_SYMBOL(lu_context_key_register);
+
+/**
+ * Destroy the value that context \a ctx holds for the key in slot \a index,
+ * if there is one: call the key's ->lct_fini() method, drop the usage count
+ * taken by keys_fill(), release the owner module (unless the context is
+ * tagged LCT_NOREF) and clear the slot in \a ctx.
+ */
+static void key_fini(struct lu_context *ctx, int index)
+{
+	struct lu_context_key *key;
+
+	if (ctx->lc_value == NULL || ctx->lc_value[index] == NULL)
+		return;
+
+	key = lu_keys[index];
+	LASSERT(key != NULL);
+	LASSERT(key->lct_fini != NULL);
+	LASSERT(atomic_read(&key->lct_used) > 1);
+
+	key->lct_fini(ctx, key, ctx->lc_value[index]);
+	lu_ref_del(&key->lct_reference, "ctx", ctx);
+	atomic_dec(&key->lct_used);
+
+	LASSERT(key->lct_owner != NULL);
+	if (!(ctx->lc_tags & LCT_NOREF)) {
+#ifdef CONFIG_MODULE_UNLOAD
+		LINVRNT(module_refcount(key->lct_owner) > 0);
+#endif
+		module_put(key->lct_owner);
+	}
+	ctx->lc_value[index] = NULL;
+}
+
+/**
+ * Deregister key.
+ *
+ * The key is quiesced first, which destroys its values in all remembered
+ * contexts. Then, under lu_keys_guard, the value held by the shrinker
+ * context is destroyed and the key's slot in lu_keys[] is cleared. Finally
+ * it is asserted that the usage count is back to 1, the value set by
+ * lu_context_key_register(), i.e. that no context holds a value any more.
+ */
+void lu_context_key_degister(struct lu_context_key *key)
+{
+ LASSERT(atomic_read(&key->lct_used) >= 1);
+ LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
+
+ lu_context_key_quiesce(key);
+
+ ++key_set_version;
+ spin_lock(&lu_keys_guard);
+ key_fini(&lu_shrink_env.le_ctx, key->lct_index);
+ if (lu_keys[key->lct_index]) {
+ lu_keys[key->lct_index] = NULL;
+ lu_ref_fini(&key->lct_reference);
+ }
+ spin_unlock(&lu_keys_guard);
+
+ LASSERTF(atomic_read(&key->lct_used) == 1,
+ "key has instances: %d\n",
+ atomic_read(&key->lct_used));
+}
+EXPORT_SYMBOL(lu_context_key_degister);
+
+/**
+ * Register a number of keys. This has to be called after all keys have been
+ * initialized by a call to LU_CONTEXT_KEY_INIT().
+ *
+ * The argument list must be terminated by a NULL pointer. If registration
+ * of some key fails, the keys registered by this call before it are
+ * deregistered again.
+ *
+ * \retval 0     all keys registered
+ * \retval other error returned by lu_context_key_register()
+ */
+int lu_context_key_register_many(struct lu_context_key *k, ...)
+{
+ struct lu_context_key *key = k;
+ va_list args;
+ int result;
+
+ va_start(args, k);
+ do {
+ result = lu_context_key_register(key);
+ if (result)
+ break;
+ key = va_arg(args, struct lu_context_key *);
+ } while (key != NULL);
+ va_end(args);
+
+ if (result != 0) {
+ va_start(args, k); /* walk the list again, up to the key that failed */
+ while (k != key) {
+ lu_context_key_degister(k);
+ k = va_arg(args, struct lu_context_key *);
+ }
+ va_end(args);
+ }
+
+ return result;
+}
+EXPORT_SYMBOL(lu_context_key_register_many);
+
+/**
+ * De-register a number of keys. This is a dual to
+ * lu_context_key_register_many().
+ *
+ * \a k is deregistered first, then every following argument up to the
+ * terminating NULL pointer.
+ */
+void lu_context_key_degister_many(struct lu_context_key *k, ...)
+{
+	va_list args;
+
+	va_start(args, k);
+	for (;;) {
+		lu_context_key_degister(k);
+		k = va_arg(args, struct lu_context_key*);
+		if (k == NULL)
+			break;
+	}
+	va_end(args);
+}
+EXPORT_SYMBOL(lu_context_key_degister_many);
+
+/**
+ * Revive a number of keys.
+ *
+ * \a k is revived first, then every following argument up to the
+ * terminating NULL pointer.
+ */
+void lu_context_key_revive_many(struct lu_context_key *k, ...)
+{
+	va_list args;
+
+	va_start(args, k);
+	for (;;) {
+		lu_context_key_revive(k);
+		k = va_arg(args, struct lu_context_key*);
+		if (k == NULL)
+			break;
+	}
+	va_end(args);
+}
+EXPORT_SYMBOL(lu_context_key_revive_many);
+
+/**
+ * Quiesce a number of keys.
+ *
+ * \a k is quiesced first, then every following argument up to the
+ * terminating NULL pointer.
+ */
+void lu_context_key_quiesce_many(struct lu_context_key *k, ...)
+{
+	va_list args;
+
+	va_start(args, k);
+	for (;;) {
+		lu_context_key_quiesce(k);
+		k = va_arg(args, struct lu_context_key*);
+		if (k == NULL)
+			break;
+	}
+	va_end(args);
+}
+EXPORT_SYMBOL(lu_context_key_quiesce_many);
+
+/**
+ * Return value associated with key \a key in context \a ctx.
+ *
+ * The context has to be in the entered state and \a key has to be the key
+ * registered in slot \a key->lct_index. The returned pointer is whatever
+ * the context stores in that slot, possibly NULL.
+ */
+void *lu_context_key_get(const struct lu_context *ctx,
+			 const struct lu_context_key *key)
+{
+	void *value;
+
+	LINVRNT(ctx->lc_state == LCS_ENTERED);
+	LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys));
+	LASSERT(lu_keys[key->lct_index] == key);
+
+	value = ctx->lc_value[key->lct_index];
+	return value;
+}
+EXPORT_SYMBOL(lu_context_key_get);
+
+/**
+ * List of remembered contexts: contexts initialized with the LCT_REMEMBER
+ * tag are linked here through lu_context::lc_remember by lu_context_init()
+ * and unlinked by lu_context_fini(), both under lu_keys_guard.
+ * lu_context_key_quiesce() walks the list to destroy the values of a key in
+ * all such contexts.
+ */
+static LIST_HEAD(lu_context_remembered);
+
+/**
+ * Destroy \a key in all remembered contexts. This is used to destroy key
+ * values in "shared" contexts (like service threads), when a module owning
+ * the key is about to be unloaded.
+ *
+ * The key is tagged LCT_QUIESCENT, which stops keys_fill() from creating
+ * new values for it, and key_set_version is bumped. Nothing is done when
+ * the key is already quiescent.
+ */
+void lu_context_key_quiesce(struct lu_context_key *key)
+{
+ struct lu_context *ctx;
+
+ if (!(key->lct_tags & LCT_QUIESCENT)) {
+ /*
+ * XXX layering violation.
+ */
+ key->lct_tags |= LCT_QUIESCENT;
+ /*
+ * XXX memory barrier has to go here.
+ */
+ spin_lock(&lu_keys_guard);
+ list_for_each_entry(ctx, &lu_context_remembered,
+ lc_remember)
+ key_fini(ctx, key->lct_index);
+ spin_unlock(&lu_keys_guard);
+ ++key_set_version;
+ }
+}
+EXPORT_SYMBOL(lu_context_key_quiesce);
+
+/**
+ * Undo lu_context_key_quiesce(): clear the LCT_QUIESCENT tag of \a key, so
+ * that keys_fill() creates values for it again, and bump key_set_version.
+ */
+void lu_context_key_revive(struct lu_context_key *key)
+{
+	key->lct_tags &= ~LCT_QUIESCENT;
+
+	/* make lu_context_refill() notice the change */
+	++key_set_version;
+}
+EXPORT_SYMBOL(lu_context_key_revive);
+
+/**
+ * Destroy all key values of context \a ctx and free its value array. Does
+ * nothing if the context has no value array.
+ */
+static void keys_fini(struct lu_context *ctx)
+{
+	int slot;
+
+	if (ctx->lc_value == NULL)
+		return;
+
+	for (slot = 0; slot < ARRAY_SIZE(lu_keys); ++slot)
+		key_fini(ctx, slot);
+
+	OBD_FREE(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]);
+	ctx->lc_value = NULL;
+}
+
+/**
+ * Create values in context \a ctx for all registered keys that match the
+ * tags of the context, are not quiescent and have no value in \a ctx yet.
+ *
+ * For every value created the usage count of the key is incremented and,
+ * unless the context is tagged LCT_NOREF, a reference on the module owning
+ * the key is requested. The context is tagged LCT_HAS_EXIT when a key with
+ * an ->lct_exit() method gets a value.
+ *
+ * \a ctx->lc_version is updated only when the whole fill succeeded, so that
+ * after a failure lu_context_refill() tries again instead of treating the
+ * partially filled context as up to date.
+ *
+ * \retval 0     success
+ * \retval other error returned (as an error pointer) by some ->lct_init()
+ */
+static int keys_fill(struct lu_context *ctx)
+{
+	unsigned version;
+	int i;
+
+	LINVRNT(ctx->lc_value != NULL);
+	/*
+	 * Sample the version before looking at the keys: a change made while
+	 * the loop runs is then picked up by the next refill.
+	 */
+	version = key_set_version;
+	for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) {
+		struct lu_context_key *key;
+
+		key = lu_keys[i];
+		if (ctx->lc_value[i] == NULL && key != NULL &&
+		    (key->lct_tags & ctx->lc_tags) &&
+		    /*
+		     * Don't create values for a LCT_QUIESCENT key, as this
+		     * will pin module owning a key.
+		     */
+		    !(key->lct_tags & LCT_QUIESCENT)) {
+			void *value;
+
+			LINVRNT(key->lct_init != NULL);
+			LINVRNT(key->lct_index == i);
+
+			value = key->lct_init(ctx, key);
+			if (unlikely(IS_ERR(value)))
+				return PTR_ERR(value);
+
+			LASSERT(key->lct_owner != NULL);
+			if (!(ctx->lc_tags & LCT_NOREF))
+				try_module_get(key->lct_owner);
+			lu_ref_add_atomic(&key->lct_reference, "ctx", ctx);
+			atomic_inc(&key->lct_used);
+			/*
+			 * This is the only place in the code, where an
+			 * element of ctx->lc_value[] array is set to non-NULL
+			 * value.
+			 */
+			ctx->lc_value[i] = value;
+			if (key->lct_exit != NULL)
+				ctx->lc_tags |= LCT_HAS_EXIT;
+		}
+	}
+	ctx->lc_version = version;
+	return 0;
+}
+
+/**
+ * Allocate the value array of context \a ctx and fill it with keys_fill().
+ *
+ * \retval -ENOMEM the array could not be allocated
+ * \retval other   result of keys_fill()
+ */
+static int keys_init(struct lu_context *ctx)
+{
+	OBD_ALLOC(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]);
+	if (unlikely(ctx->lc_value == NULL))
+		return -ENOMEM;
+
+	return keys_fill(ctx);
+}
+
+/**
+ * Initialize context data-structure. Create values for all keys.
+ *
+ * A context tagged LCT_REMEMBER is also put on the list of remembered
+ * contexts. If creating the key values fails, the context is finalized
+ * again before the error is returned.
+ *
+ * \retval 0     success
+ * \retval other error returned by keys_init()
+ */
+int lu_context_init(struct lu_context *ctx, __u32 tags)
+{
+	int rc;
+
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->lc_state = LCS_INITIALIZED;
+	ctx->lc_tags = tags;
+
+	if (!(tags & LCT_REMEMBER)) {
+		INIT_LIST_HEAD(&ctx->lc_remember);
+	} else {
+		spin_lock(&lu_keys_guard);
+		list_add(&ctx->lc_remember, &lu_context_remembered);
+		spin_unlock(&lu_keys_guard);
+	}
+
+	rc = keys_init(ctx);
+	if (rc != 0)
+		lu_context_fini(ctx);
+
+	return rc;
+}
+EXPORT_SYMBOL(lu_context_init);
+
+/**
+ * Finalize context data-structure. Destroy key values.
+ *
+ * For a context tagged LCT_REMEMBER the values are destroyed and the
+ * context is taken off the list of remembered contexts under
+ * lu_keys_guard; other contexts are finalized without taking the lock.
+ */
+void lu_context_fini(struct lu_context *ctx)
+{
+	LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT);
+	ctx->lc_state = LCS_FINALIZED;
+
+	if (ctx->lc_tags & LCT_REMEMBER) {
+		/* could race with key degister */
+		spin_lock(&lu_keys_guard);
+		keys_fini(ctx);
+		list_del_init(&ctx->lc_remember);
+		spin_unlock(&lu_keys_guard);
+		return;
+	}
+
+	LASSERT(list_empty(&ctx->lc_remember));
+	keys_fini(ctx);
+}
+EXPORT_SYMBOL(lu_context_fini);
+
+/**
+ * Called before entering context.
+ *
+ * Moves \a ctx to LCS_ENTERED; the context must currently be in the
+ * LCS_INITIALIZED or LCS_LEFT state.
+ */
+void lu_context_enter(struct lu_context *ctx)
+{
+ LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT);
+ ctx->lc_state = LCS_ENTERED;
+}
+EXPORT_SYMBOL(lu_context_enter);
+
+/**
+ * Called after leaving \a ctx.
+ *
+ * Marks the context LCS_LEFT and, when at least one key registered an exit
+ * hook (LCT_HAS_EXIT), invokes lct_exit() for every key that has a value in
+ * this context.
+ */
+void lu_context_exit(struct lu_context *ctx)
+{
+	int idx;
+
+	LINVRNT(ctx->lc_state == LCS_ENTERED);
+	ctx->lc_state = LCS_LEFT;
+
+	if (!(ctx->lc_tags & LCT_HAS_EXIT) || ctx->lc_value == NULL)
+		return;
+
+	for (idx = 0; idx < ARRAY_SIZE(lu_keys); ++idx) {
+		struct lu_context_key *key;
+
+		if (ctx->lc_value[idx] == NULL)
+			continue;
+
+		key = lu_keys[idx];
+		LASSERT(key != NULL);
+		if (key->lct_exit != NULL)
+			key->lct_exit(ctx, key, ctx->lc_value[idx]);
+	}
+}
+EXPORT_SYMBOL(lu_context_exit);
+
+/**
+ * Create values for keys that were registered after \a ctx was created.
+ *
+ * The context version is compared with key_set_version first, so the common
+ * case (no key set change) returns without touching the keys.
+ *
+ * \retval 0 nothing to do or all missing values created
+ * \retval error returned by keys_fill()
+ */
+int lu_context_refill(struct lu_context *ctx)
+{
+	if (likely(ctx->lc_version == key_set_version))
+		return 0;
+
+	return keys_fill(ctx);
+}
+EXPORT_SYMBOL(lu_context_refill);
+
+/**
+ * lu_ctx_tags/lu_ses_tags will be updated if new types of obd are added.
+ * Currently, this is only used on the client side, specifically by the echo
+ * device client; for other stacks (like ptlrpc threads), contexts are
+ * predefined when the lu_device type is registered, during the module probe
+ * phase.
+ */
+__u32 lu_context_tags_default = 0;
+__u32 lu_session_tags_default = 0;
+
+/**
+ * Add \a tags to lu_context_tags_default and bump key_set_version, both
+ * under lu_keys_guard.
+ */
+void lu_context_tags_update(__u32 tags)
+{
+ spin_lock(&lu_keys_guard);
+ lu_context_tags_default |= tags;
+ key_set_version++;
+ spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_context_tags_update);
+
+/**
+ * Remove \a tags from lu_context_tags_default and bump key_set_version,
+ * both under lu_keys_guard.
+ */
+void lu_context_tags_clear(__u32 tags)
+{
+ spin_lock(&lu_keys_guard);
+ lu_context_tags_default &= ~tags;
+ key_set_version++;
+ spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_context_tags_clear);
+
+/**
+ * Add \a tags to lu_session_tags_default and bump key_set_version, both
+ * under lu_keys_guard.
+ */
+void lu_session_tags_update(__u32 tags)
+{
+ spin_lock(&lu_keys_guard);
+ lu_session_tags_default |= tags;
+ key_set_version++;
+ spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_session_tags_update);
+
+/**
+ * Remove \a tags from lu_session_tags_default and bump key_set_version,
+ * both under lu_keys_guard.
+ */
+void lu_session_tags_clear(__u32 tags)
+{
+ spin_lock(&lu_keys_guard);
+ lu_session_tags_default &= ~tags;
+ key_set_version++;
+ spin_unlock(&lu_keys_guard);
+}
+EXPORT_SYMBOL(lu_session_tags_clear);
+
+/**
+ * Initialize an environment: no session, and a context created with
+ * \a tags that is entered immediately when initialization succeeds.
+ *
+ * \retval 0 on success, otherwise the error from lu_context_init()
+ */
+int lu_env_init(struct lu_env *env, __u32 tags)
+{
+	int rc;
+
+	env->le_ses = NULL;
+
+	rc = lu_context_init(&env->le_ctx, tags);
+	if (unlikely(rc != 0))
+		return rc;
+
+	lu_context_enter(&env->le_ctx);
+	return rc;
+}
+EXPORT_SYMBOL(lu_env_init);
+
+/**
+ * Dual to lu_env_init(): exit and finalize the environment context, then
+ * clear the session pointer.
+ */
+void lu_env_fini(struct lu_env *env)
+{
+ lu_context_exit(&env->le_ctx);
+ lu_context_fini(&env->le_ctx);
+ env->le_ses = NULL;
+}
+EXPORT_SYMBOL(lu_env_fini);
+
+/**
+ * Refill the environment context and, if that succeeded and a session is
+ * attached, the session context as well.
+ *
+ * \retval 0 on success, otherwise the first error from lu_context_refill()
+ */
+int lu_env_refill(struct lu_env *env)
+{
+	int rc;
+
+	rc = lu_context_refill(&env->le_ctx);
+	if (rc != 0)
+		return rc;
+
+	if (env->le_ses == NULL)
+		return 0;
+
+	return lu_context_refill(env->le_ses);
+}
+EXPORT_SYMBOL(lu_env_refill);
+
+/**
+ * Currently, this API will only be used by echo client.
+ * Because echo client and normal lustre client will share
+ * same cl_env cache. So echo client needs to refresh
+ * the env context after it get one from the cache, especially
+ * when normal client and echo client co-exist in the same client.
+ *
+ * For each of the context and (if present) the session, missing tags are
+ * added and the version is reset to 0 so that the following refill is not
+ * short-circuited by the version check.
+ */
+int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags,
+			  __u32 stags)
+{
+	struct lu_context *ctx = &env->le_ctx;
+	struct lu_context *ses = env->le_ses;
+
+	if ((ctx->lc_tags & ctags) != ctags) {
+		ctx->lc_version = 0;
+		ctx->lc_tags |= ctags;
+	}
+
+	if (ses && (ses->lc_tags & stags) != stags) {
+		ses->lc_version = 0;
+		ses->lc_tags |= stags;
+	}
+
+	return lu_env_refill(env);
+}
+EXPORT_SYMBOL(lu_env_refill_by_tags);
+
+static struct shrinker *lu_site_shrinker = NULL;
+
+/* Accumulator filled by lu_site_stats_get() while walking a site hash. */
+typedef struct lu_site_stats{
+ /* number of non-empty hash lists (only counted when requested) */
+ unsigned lss_populated;
+ /* largest cfs_hash_bd_depmax_get() value seen over all buckets */
+ unsigned lss_max_search;
+ /* sum of cfs_hash_bd_count_get() over all buckets */
+ unsigned lss_total;
+ /* sum of the per-bucket lsb_busy counters */
+ unsigned lss_busy;
+} lu_site_stats_t;
+
+/**
+ * Accumulate statistics of hash \a hs into \a stats, bucket by bucket, with
+ * each bucket locked while it is inspected.
+ *
+ * \a stats is only added to, never reset; callers zero it beforehand.
+ * Non-empty hash lists are counted only when \a populated is non-zero.
+ */
+static void lu_site_stats_get(cfs_hash_t *hs,
+			      lu_site_stats_t *stats, int populated)
+{
+	cfs_hash_bd_t bd;
+	int i;
+
+	cfs_hash_for_each_bucket(hs, &bd, i) {
+		struct lu_site_bkt_data *bkt = cfs_hash_bd_extra_get(hs, &bd);
+		struct hlist_head *hhead;
+
+		cfs_hash_bd_lock(hs, &bd, 1);
+
+		stats->lss_busy += bkt->lsb_busy;
+		stats->lss_total += cfs_hash_bd_count_get(&bd);
+		stats->lss_max_search = max((int)stats->lss_max_search,
+					    cfs_hash_bd_depmax_get(&bd));
+
+		if (populated) {
+			cfs_hash_bd_for_each_hlist(hs, &bd, hhead) {
+				if (!hlist_empty(hhead))
+					stats->lss_populated++;
+			}
+		}
+
+		cfs_hash_bd_unlock(hs, &bd, 1);
+	}
+}
+
+
+/*
+ * There exists a potential lock inversion deadlock scenario when using
+ * Lustre on top of ZFS. This occurs between one of ZFS's
+ * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially,
+ * thread A will take the lu_sites_guard lock and sleep on the ht_lock,
+ * while thread B will take the ht_lock and sleep on the lu_sites_guard
+ * lock. Obviously neither thread will wake and drop their respective hold
+ * on their lock.
+ *
+ * To prevent this from happening we must ensure the lu_sites_guard lock is
+ * not taken while down this code path. ZFS reliably does not set the
+ * __GFP_FS bit in its code paths, so this can be used to determine if it
+ * is safe to take the lu_sites_guard lock.
+ *
+ * Ideally we should accurately return the remaining number of cached
+ * objects without taking the lu_sites_guard lock, but this is not
+ * possible in the current implementation.
+ */
+/*
+ * Shrinker callback for the lu_object caches of all sites.
+ *
+ * Returns -1 when asked to scan without __GFP_FS, 0 when merely queried
+ * without __GFP_FS, and otherwise the number of cached non-busy objects
+ * scaled by sysctl_vfs_cache_pressure.
+ */
+static int lu_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
+{
+ lu_site_stats_t stats;
+ struct lu_site *s;
+ struct lu_site *tmp;
+ int cached = 0;
+ int remain = shrink_param(sc, nr_to_scan);
+ LIST_HEAD(splice);
+
+ if (!(shrink_param(sc, gfp_mask) & __GFP_FS)) {
+ if (remain != 0)
+ return -1;
+ else
+ /* We must not take the lu_sites_guard lock when
+ * __GFP_FS is *not* set because of the deadlock
+ * possibility detailed above. Additionally,
+ * since we cannot determine the number of
+ * objects in the cache without taking this
+ * lock, we're in a particularly tough spot. As
+ * a result, we'll just lie and say our cache is
+ * empty. This _should_ be ok, as we can't
+ * reclaim objects when __GFP_FS is *not* set
+ * anyways.
+ */
+ return 0;
+ }
+
+ CDEBUG(D_INODE, "Shrink %d objects\n", remain);
+
+ mutex_lock(&lu_sites_guard);
+ list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) {
+ /* purge only when a scan was requested, not on a pure query */
+ if (shrink_param(sc, nr_to_scan) != 0) {
+ remain = lu_site_purge(&lu_shrink_env, s, remain);
+ /*
+ * Move just shrunk site to the tail of site list to
+ * assure shrinking fairness.
+ */
+ list_move_tail(&s->ls_linkage, &splice);
+ }
+
+ /* count what is still cached and not busy on this site */
+ memset(&stats, 0, sizeof(stats));
+ lu_site_stats_get(s->ls_obj_hash, &stats, 0);
+ cached += stats.lss_total - stats.lss_busy;
+ /* scan budget exhausted: stop visiting further sites */
+ if (shrink_param(sc, nr_to_scan) && remain <= 0)
+ break;
+ }
+ /* re-attach the sites that were shrunk at the tail of lu_sites */
+ list_splice(&splice, lu_sites.prev);
+ mutex_unlock(&lu_sites_guard);
+
+ cached = (cached / 100) * sysctl_vfs_cache_pressure;
+ if (shrink_param(sc, nr_to_scan) == 0)
+ CDEBUG(D_INODE, "%d objects cached\n", cached);
+ return cached;
+}
+
+/*
+ * Debugging stuff.
+ */
+
+/**
+ * Environment to be used in debugger, contains all tags.
+ */
+struct lu_env lu_debugging_env;
+
+/**
+ * Debugging printer function using printk().
+ *
+ * \a env and \a unused are not used; \a format and the variadic arguments
+ * are forwarded to vprintk(). Always returns 0.
+ */
+int lu_printk_printer(const struct lu_env *env,
+ void *unused, const char *format, ...)
+{
+ va_list args;
+
+ va_start(args, format);
+ vprintk(format, args);
+ va_end(args);
+ return 0;
+}
+
+/**
+ * Initialization of global lu_* data.
+ *
+ * Sets up, in order: lu_ref global state, the global context key, the
+ * shrinker environment and the site shrinker. If a step fails, the steps
+ * that already succeeded are undone before returning, so the caller must
+ * not call lu_global_fini() after a failure.
+ *
+ * \retval 0 on success
+ * \retval -ENOMEM the shrinker could not be registered
+ * \retval error code of the failing initialization step otherwise
+ */
+int lu_global_init(void)
+{
+	int result;
+
+	CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys);
+
+	result = lu_ref_global_init();
+	if (result != 0)
+		return result;
+
+	LU_CONTEXT_KEY_INIT(&lu_global_key);
+	result = lu_context_key_register(&lu_global_key);
+	if (result != 0)
+		goto out_ref;
+
+	/*
+	 * At this level, we don't know what tags are needed, so allocate them
+	 * conservatively. This should not be too bad, because this
+	 * environment is global.
+	 */
+	mutex_lock(&lu_sites_guard);
+	result = lu_env_init(&lu_shrink_env, LCT_SHRINKER);
+	mutex_unlock(&lu_sites_guard);
+	if (result != 0)
+		goto out_key;
+
+	/*
+	 * seeks estimation: 3 seeks to read a record from oi, one to read
+	 * inode, one for ea. Unfortunately setting this high value results in
+	 * lu_object/inode cache consuming all the memory.
+	 */
+	lu_site_shrinker = set_shrinker(DEFAULT_SEEKS, lu_cache_shrink);
+	if (lu_site_shrinker != NULL)
+		return 0;
+
+	result = -ENOMEM;
+
+	/*
+	 * Same order as lu_global_fini(): de-register the key before tearing
+	 * the shrinker environment down, because the key has a value in it.
+	 */
+	lu_context_key_degister(&lu_global_key);
+	mutex_lock(&lu_sites_guard);
+	lu_env_fini(&lu_shrink_env);
+	mutex_unlock(&lu_sites_guard);
+	goto out_ref;
+
+out_key:
+	/* lu_env_init() failed, so only the key registration is undone */
+	lu_context_key_degister(&lu_global_key);
+out_ref:
+	lu_ref_global_fini();
+	return result;
+}
+
+/**
+ * Dual to lu_global_init().
+ *
+ * Removes the shrinker (if registered), de-registers the global key, tears
+ * down the shrinker environment under lu_sites_guard and finalizes lu_ref.
+ */
+void lu_global_fini(void)
+{
+ if (lu_site_shrinker != NULL) {
+ remove_shrinker(lu_site_shrinker);
+ lu_site_shrinker = NULL;
+ }
+
+ lu_context_key_degister(&lu_global_key);
+
+ /*
+ * Tear shrinker environment down _after_ de-registering
+ * lu_global_key, because the latter has a value in the former.
+ */
+ mutex_lock(&lu_sites_guard);
+ lu_env_fini(&lu_shrink_env);
+ mutex_unlock(&lu_sites_guard);
+
+ lu_ref_global_fini();
+}
+
+/**
+ * Read counter \a idx of \a stats and return its lc_count truncated to
+ * 32 bits. Returns 0 when built without LPROCFS.
+ */
+static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx)
+{
+#ifdef LPROCFS
+ struct lprocfs_counter ret;
+
+ lprocfs_stats_collect(stats, idx, &ret);
+ return (__u32)ret.lc_count;
+#else
+ return 0;
+#endif
+}
+
+/**
+ * Output site statistical counters into a buffer. Suitable for
+ * lprocfs_rd_*()-style functions.
+ *
+ * Line layout: busy/total populated/nhlist max_search created cache_hit
+ * cache_miss cache_race cache_death_race lru_purged.
+ *
+ * \retval value returned by seq_printf()
+ */
+int lu_site_stats_print(const struct lu_site *s, struct seq_file *m)
+{
+	lu_site_stats_t stats;
+
+	memset(&stats, 0, sizeof(stats));
+	lu_site_stats_get(s->ls_obj_hash, &stats, 1);
+
+	/*
+	 * The lu_site_stats_t fields are unsigned and ls_stats_read() returns
+	 * __u32, so they are printed with %u. The CFS_HASH_NHLIST() specifier
+	 * is left as %d because its type is not visible here.
+	 */
+	return seq_printf(m, "%u/%u %u/%d %u %u %u %u %u %u %u\n",
+			  stats.lss_busy,
+			  stats.lss_total,
+			  stats.lss_populated,
+			  CFS_HASH_NHLIST(s->ls_obj_hash),
+			  stats.lss_max_search,
+			  ls_stats_read(s->ls_stats, LU_SS_CREATED),
+			  ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT),
+			  ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS),
+			  ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE),
+			  ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE),
+			  ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED));
+}
+EXPORT_SYMBOL(lu_site_stats_print);
+
+/**
+ * Create every slab cache described in \a caches; the array is terminated
+ * by an entry whose ckd_cache pointer is NULL.
+ *
+ * If one creation fails, lu_kmem_fini() is run over the whole array to
+ * release what was created so far.
+ *
+ * \retval 0 all caches created
+ * \retval -ENOMEM a cache could not be created
+ */
+int lu_kmem_init(struct lu_kmem_descr *caches)
+{
+	struct lu_kmem_descr *descr;
+
+	for (descr = caches; descr->ckd_cache != NULL; ++descr) {
+		*descr->ckd_cache = kmem_cache_create(descr->ckd_name,
+						      descr->ckd_size,
+						      0, 0, NULL);
+		if (*descr->ckd_cache != NULL)
+			continue;
+
+		/* free all previously allocated caches */
+		lu_kmem_fini(caches);
+		return -ENOMEM;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(lu_kmem_init);
+
+/**
+ * Helper function to finalize a number of kmem slab caches at once. Dual to
+ * lu_kmem_init().
+ *
+ * Entries whose cache pointer is already NULL are skipped; destroyed caches
+ * have their pointer reset to NULL.
+ */
+void lu_kmem_fini(struct lu_kmem_descr *caches)
+{
+	while (caches->ckd_cache != NULL) {
+		if (*caches->ckd_cache != NULL) {
+			kmem_cache_destroy(*caches->ckd_cache);
+			*caches->ckd_cache = NULL;
+		}
+		++caches;
+	}
+}
+EXPORT_SYMBOL(lu_kmem_fini);
+
+/**
+ * Temporary solution to be able to assign fid in ->do_create()
+ * till we have fully-functional OST fids
+ *
+ * Stores \a fid in the header of \a o (whose fid must still be zero) and
+ * inserts the header into the site hash, all under the bucket lock. The
+ * fid must not be present in the hash yet.
+ */
+void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o,
+ const struct lu_fid *fid)
+{
+ struct lu_site *s = o->lo_dev->ld_site;
+ struct lu_fid *old = &o->lo_header->loh_fid;
+ struct lu_site_bkt_data *bkt;
+ struct lu_object *shadow;
+ wait_queue_t waiter;
+ cfs_hash_t *hs;
+ cfs_hash_bd_t bd;
+ __u64 version = 0;
+
+ LASSERT(fid_is_zero(old));
+
+ hs = s->ls_obj_hash;
+ /* lock the bucket that the new fid hashes to */
+ cfs_hash_bd_get_and_lock(hs, (void *)fid, &bd, 1);
+ shadow = htable_lookup(s, &bd, fid, &waiter, &version);
+ /* supposed to be unique */
+ LASSERT(shadow == NULL);
+ *old = *fid;
+ bkt = cfs_hash_bd_extra_get(hs, &bd);
+ cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash);
+ /* the newly hashed object is accounted as busy in its bucket */
+ bkt->lsb_busy++;
+ cfs_hash_bd_unlock(hs, &bd, 1);
+}
+EXPORT_SYMBOL(lu_object_assign_fid);
+
+/**
+ * Allocate an object with a zero (not yet assigned) fid.
+ * XXX: temporary solution to be able to assign fid in ->do_create()
+ * till we have fully-functional OST fids
+ *
+ * \retval whatever lu_object_alloc() returns for the zero fid
+ */
+struct lu_object *lu_object_anon(const struct lu_env *env,
+				 struct lu_device *dev,
+				 const struct lu_object_conf *conf)
+{
+	struct lu_fid zero_fid;
+
+	fid_zero(&zero_fid);
+	return lu_object_alloc(env, dev, &zero_fid, conf);
+}
+EXPORT_SYMBOL(lu_object_anon);
+
+/* Exported empty buffer descriptor: NULL data pointer, zero length. */
+struct lu_buf LU_BUF_NULL = {
+ .lb_buf = NULL,
+ .lb_len = 0
+};
+EXPORT_SYMBOL(LU_BUF_NULL);
+
+/**
+ * Release the memory held by \a buf, if any, and reset the descriptor to
+ * the empty state (NULL pointer, zero length).
+ */
+void lu_buf_free(struct lu_buf *buf)
+{
+	LASSERT(buf);
+
+	if (buf->lb_buf == NULL)
+		return;
+
+	LASSERT(buf->lb_len > 0);
+	OBD_FREE_LARGE(buf->lb_buf, buf->lb_len);
+	buf->lb_buf = NULL;
+	buf->lb_len = 0;
+}
+EXPORT_SYMBOL(lu_buf_free);
+
+/**
+ * Allocate \a size bytes for an empty \a buf.
+ *
+ * The descriptor must be empty on entry. On allocation failure lb_buf stays
+ * NULL and lb_len stays 0; no error is returned, so callers check lb_buf.
+ */
+void lu_buf_alloc(struct lu_buf *buf, int size)
+{
+	LASSERT(buf);
+	LASSERT(buf->lb_buf == NULL);
+	LASSERT(buf->lb_len == 0);
+
+	OBD_ALLOC_LARGE(buf->lb_buf, size);
+	if (unlikely(!buf->lb_buf))
+		return;
+
+	buf->lb_len = size;
+}
+EXPORT_SYMBOL(lu_buf_alloc);
+
+/**
+ * Replace the storage of \a buf with a fresh allocation of \a size bytes.
+ * The old storage is freed first, so previous contents are not preserved.
+ */
+void lu_buf_realloc(struct lu_buf *buf, int size)
+{
+ lu_buf_free(buf);
+ lu_buf_alloc(buf, size);
+}
+EXPORT_SYMBOL(lu_buf_realloc);
+
+/**
+ * Make sure \a buf has at least \a len bytes of storage: allocate when the
+ * buffer is empty, reallocate (discarding old contents) when it is too
+ * small.
+ *
+ * \retval \a buf, always; the allocation helpers report no error, so the
+ * caller has to inspect buf->lb_buf.
+ */
+struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len)
+{
+ if (buf->lb_buf == NULL && buf->lb_len == 0)
+ lu_buf_alloc(buf, len);
+
+ if ((len > buf->lb_len) && (buf->lb_buf != NULL))
+ lu_buf_realloc(buf, len);
+
+ return buf;
+}
+EXPORT_SYMBOL(lu_buf_check_and_alloc);
+
+/**
+ * Grow \a buf to \a len bytes, keeping the data it already holds.
+ *
+ * Nothing happens when the buffer is already at least \a len bytes long.
+ * On allocation failure the old buffer is left untouched.
+ *
+ * \retval 0 or -ENOMEM
+ */
+int lu_buf_check_and_grow(struct lu_buf *buf, int len)
+{
+	char *grown;
+
+	if (buf->lb_len >= len)
+		return 0;
+
+	OBD_ALLOC_LARGE(grown, len);
+	if (grown == NULL)
+		return -ENOMEM;
+
+	if (buf->lb_buf != NULL) {
+		/* carry the old contents over, then drop the old storage */
+		memcpy(grown, buf->lb_buf, buf->lb_len);
+		OBD_FREE_LARGE(buf->lb_buf, buf->lb_len);
+	}
+
+	buf->lb_buf = grown;
+	buf->lb_len = len;
+	return 0;
+}
+EXPORT_SYMBOL(lu_buf_check_and_grow);
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ref.c b/drivers/staging/lustre/lustre/obdclass/lu_ref.c
new file mode 100644
index 000000000000..23a76f158356
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lu_ref.c
@@ -0,0 +1,50 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_ref.c
+ *
+ * Lustre reference.
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd.h>
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lu_ref.h>
diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ucred.c b/drivers/staging/lustre/lustre/obdclass/lu_ucred.c
new file mode 100644
index 000000000000..229db6c39b78
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lu_ucred.c
@@ -0,0 +1,107 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lu_ucred.c
+ *
+ * Lustre Object.
+ * These are the only exported functions, they provide some generic
+ * infrastructure for managing object devices
+ *
+ * Author: Nikita Danilov <nikita.danilov@sun.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <linux/libcfs/libcfs.h>
+#include <obd_support.h>
+#include <lu_object.h>
+#include <md_object.h>
+
+/* context key constructor/destructor: lu_ucred_key_init, lu_ucred_key_fini */
+LU_KEY_INIT_FINI(lu_ucred, struct lu_ucred);
+
+/* Context key for struct lu_ucred values; tagged LCT_SESSION. */
+static struct lu_context_key lu_ucred_key = {
+ .lct_tags = LCT_SESSION,
+ .lct_init = lu_ucred_key_init,
+ .lct_fini = lu_ucred_key_fini
+};
+
+/**
+ * Look up the ucred value stored in the session of \a env.
+ *
+ * \retval NULL when \a env has no session
+ * \retval result of lu_context_key_get() on the session otherwise
+ */
+struct lu_ucred *lu_ucred(const struct lu_env *env)
+{
+	return env->le_ses ?
+	       lu_context_key_get(env->le_ses, &lu_ucred_key) : NULL;
+}
+EXPORT_SYMBOL(lu_ucred);
+
+/**
+ * Get the ucred of \a env and verify that it is initialized, i.e. that
+ * uc_valid is UCRED_OLD or UCRED_NEW.
+ *
+ * \retval NULL no ucred available, or uc_valid has another value
+ */
+struct lu_ucred *lu_ucred_check(const struct lu_env *env)
+{
+	struct lu_ucred *uc = lu_ucred(env);
+
+	if (uc == NULL)
+		return NULL;
+
+	if (uc->uc_valid == UCRED_OLD || uc->uc_valid == UCRED_NEW)
+		return uc;
+
+	return NULL;
+}
+EXPORT_SYMBOL(lu_ucred_check);
+
+/**
+ * Get ucred key, which must exist and must be properly initialized.
+ * Assert otherwise.
+ *
+ * \retval the non-NULL result of lu_ucred_check()
+ */
+struct lu_ucred *lu_ucred_assert(const struct lu_env *env)
+{
+ struct lu_ucred *uc = lu_ucred_check(env);
+ LASSERT(uc != NULL);
+ return uc;
+}
+EXPORT_SYMBOL(lu_ucred_assert);
+
+/**
+ * Initialize and register lu_ucred_key.
+ *
+ * \retval result of lu_context_key_register()
+ */
+int lu_ucred_global_init(void)
+{
+ LU_CONTEXT_KEY_INIT(&lu_ucred_key);
+ return lu_context_key_register(&lu_ucred_key);
+}
+
+/** Dual to lu_ucred_global_init(): de-register lu_ucred_key. */
+void lu_ucred_global_fini(void)
+{
+ lu_context_key_degister(&lu_ucred_key);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_handles.c b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c
new file mode 100644
index 000000000000..69d6499ef731
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c
@@ -0,0 +1,263 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/lustre_handles.c
+ *
+ * Author: Phil Schwan <phil@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_support.h>
+#include <lustre_handles.h>
+#include <lustre_lib.h>
+
+
+static __u64 handle_base;
+#define HANDLE_INCR 7
+static spinlock_t handle_base_lock;
+
+/*
+ * One bucket of the global handle hash; handle_hash points to an array of
+ * HANDLE_HASH_SIZE of them, indexed by (cookie & HANDLE_HASH_MASK).
+ */
+static struct handle_bucket {
+ /* taken around insertions into and removals from \a head */
+ spinlock_t lock;
+ /* handles linked through portals_handle::h_link */
+ struct list_head head;
+} *handle_hash;
+
+#define HANDLE_HASH_SIZE (1 << 16)
+#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1)
+
+/*
+ * Generate a unique 64bit cookie (hash) for a handle and insert it into
+ * global (per-node) hash-table.
+ *
+ * The handle must not be linked into the hash yet. \a ops is stored in the
+ * handle and its h_lock is (re)initialized here.
+ */
+void class_handle_hash(struct portals_handle *h,
+ struct portals_handle_ops *ops)
+{
+ struct handle_bucket *bucket;
+ ENTRY;
+
+ LASSERT(h != NULL);
+ LASSERT(list_empty(&h->h_link));
+
+ /*
+ * This is fast, but simplistic cookie generation algorithm, it will
+ * need a re-do at some point in the future for security.
+ */
+ spin_lock(&handle_base_lock);
+ handle_base += HANDLE_INCR;
+
+ if (unlikely(handle_base == 0)) {
+ /*
+ * Cookie of zero is "dangerous", because in many places it's
+ * assumed that 0 means "unassigned" handle, not bound to any
+ * object.
+ */
+ CWARN("The universe has been exhausted: cookie wrap-around.\n");
+ handle_base += HANDLE_INCR;
+ }
+ h->h_cookie = handle_base;
+ spin_unlock(&handle_base_lock);
+
+ h->h_ops = ops;
+ spin_lock_init(&h->h_lock);
+
+ /* the low bits of the cookie select the bucket */
+ bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK];
+ spin_lock(&bucket->lock);
+ list_add_rcu(&h->h_link, &bucket->head);
+ h->h_in = 1;
+ spin_unlock(&bucket->lock);
+
+ CDEBUG(D_INFO, "added object %p with handle "LPX64" to hash\n",
+ h, h->h_cookie);
+ EXIT;
+}
+EXPORT_SYMBOL(class_handle_hash);
+
+/*
+ * Remove \a h from its hash bucket. The visible callers hold the bucket
+ * lock around this call.
+ *
+ * A handle whose link is empty is reported and left alone. Otherwise h_in
+ * is cleared under h_lock and the handle is unlinked with list_del_rcu();
+ * if h_in was already 0 nothing is unlinked.
+ */
+static void class_handle_unhash_nolock(struct portals_handle *h)
+{
+ if (list_empty(&h->h_link)) {
+ CERROR("removing an already-removed handle ("LPX64")\n",
+ h->h_cookie);
+ return;
+ }
+
+ CDEBUG(D_INFO, "removing object %p with handle "LPX64" from hash\n",
+ h, h->h_cookie);
+
+ spin_lock(&h->h_lock);
+ if (h->h_in == 0) {
+ spin_unlock(&h->h_lock);
+ return;
+ }
+ h->h_in = 0;
+ spin_unlock(&h->h_lock);
+ list_del_rcu(&h->h_link);
+}
+
+/*
+ * Remove \a h from the global handle hash, taking the lock of the bucket
+ * selected by its cookie.
+ */
+void class_handle_unhash(struct portals_handle *h)
+{
+ struct handle_bucket *bucket;
+ bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK);
+
+ spin_lock(&bucket->lock);
+ class_handle_unhash_nolock(h);
+ spin_unlock(&bucket->lock);
+}
+EXPORT_SYMBOL(class_handle_unhash);
+
+/*
+ * Re-insert \a h into the hash bucket selected by its existing cookie and
+ * mark it as hashed (h_in = 1). No new cookie is generated.
+ */
+void class_handle_hash_back(struct portals_handle *h)
+{
+ struct handle_bucket *bucket;
+ ENTRY;
+
+ bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK);
+
+ spin_lock(&bucket->lock);
+ list_add_rcu(&h->h_link, &bucket->head);
+ h->h_in = 1;
+ spin_unlock(&bucket->lock);
+
+ EXIT;
+}
+EXPORT_SYMBOL(class_handle_hash_back);
+
+/*
+ * Look up the handle with the given \a cookie.
+ *
+ * The bucket is walked under rcu_read_lock(). When a matching handle is
+ * found and is still hashed (h_in != 0), its hop_addref() callback is
+ * called under h_lock and the handle is returned; otherwise NULL is
+ * returned.
+ */
+void *class_handle2object(__u64 cookie)
+{
+ struct handle_bucket *bucket;
+ struct portals_handle *h;
+ void *retval = NULL;
+ ENTRY;
+
+ LASSERT(handle_hash != NULL);
+
+ /* Be careful when you want to change this code. See the
+ * rcu_read_lock() definition on top this file. - jxiong */
+ bucket = handle_hash + (cookie & HANDLE_HASH_MASK);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(h, &bucket->head, h_link) {
+ if (h->h_cookie != cookie)
+ continue;
+
+ spin_lock(&h->h_lock);
+ if (likely(h->h_in != 0)) {
+ h->h_ops->hop_addref(h);
+ retval = h;
+ }
+ spin_unlock(&h->h_lock);
+ /* only the first handle with this cookie is considered */
+ break;
+ }
+ rcu_read_unlock();
+
+ RETURN(retval);
+}
+EXPORT_SYMBOL(class_handle2object);
+
+/*
+ * RCU callback that frees the object owning a handle, through the handle's
+ * hop_free() hook when one is set and OBD_FREE() otherwise.
+ *
+ * NOTE(review): the address to free is taken from h_cookie, so presumably
+ * the caller stores the object address there before scheduling this
+ * callback; confirm against the call sites.
+ */
+void class_handle_free_cb(cfs_rcu_head_t *rcu)
+{
+ struct portals_handle *h = RCU2HANDLE(rcu);
+ void *ptr = (void *)(unsigned long)h->h_cookie;
+
+ if (h->h_ops->hop_free != NULL)
+ h->h_ops->hop_free(ptr, h->h_size);
+ else
+ OBD_FREE(ptr, h->h_size);
+}
+EXPORT_SYMBOL(class_handle_free_cb);
+
+/*
+ * Allocate and initialize the global handle hash and seed the cookie base.
+ *
+ * Must be called while handle_hash is still NULL.
+ *
+ * Returns 0 on success, -ENOMEM if the bucket array cannot be allocated.
+ */
+int class_handle_init(void)
+{
+	struct timeval tv;
+	int seed[2];
+	int i;
+
+	LASSERT(handle_hash == NULL);
+
+	OBD_ALLOC_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE);
+	if (handle_hash == NULL)
+		return -ENOMEM;
+
+	spin_lock_init(&handle_base_lock);
+	/*
+	 * Walk the buckets by index: a descending pointer loop would have to
+	 * step below the start of the array to terminate.
+	 */
+	for (i = 0; i < HANDLE_HASH_SIZE; i++) {
+		INIT_LIST_HEAD(&handle_hash[i].head);
+		spin_lock_init(&handle_hash[i].lock);
+	}
+
+	/** bug 21430: add randomness to the initial base */
+	cfs_get_random_bytes(seed, sizeof(seed));
+	do_gettimeofday(&tv);
+	cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]);
+
+	cfs_get_random_bytes(&handle_base, sizeof(handle_base));
+	LASSERT(handle_base != 0ULL);
+
+	return 0;
+}
+
+/*
+ * Unhash every handle still present in the hash, logging each one.
+ *
+ * Returns the number of handles that were found.
+ */
+static int cleanup_all_handles(void)
+{
+ int rc;
+ int i;
+
+ for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) {
+ struct portals_handle *h;
+
+ spin_lock(&handle_hash[i].lock);
+ list_for_each_entry_rcu(h, &(handle_hash[i].head), h_link) {
+ CERROR("force clean handle "LPX64" addr %p ops %p\n",
+ h->h_cookie, h, h->h_ops);
+
+ class_handle_unhash_nolock(h);
+ rc++;
+ }
+ spin_unlock(&handle_hash[i].lock);
+ }
+
+ return rc;
+}
+
+/*
+ * Dual to class_handle_init(): unhash any leftover handles, free the bucket
+ * array and report how many handles had to be force-cleaned.
+ */
+void class_handle_cleanup(void)
+{
+ int count;
+ LASSERT(handle_hash != NULL);
+
+ count = cleanup_all_handles();
+
+ OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE);
+ handle_hash = NULL;
+
+ if (count != 0)
+ CERROR("handle_count at cleanup: %d\n", count);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_peer.c b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c
new file mode 100644
index 000000000000..2fa2589dc8eb
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c
@@ -0,0 +1,218 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lprocfs_status.h>
+
+#define NIDS_MAX 32
+
+/* One uuid together with the NIDs registered for it. */
+struct uuid_nid_data {
+ /* linkage into g_uuid_list */
+ struct list_head un_list;
+ struct obd_uuid un_uuid;
+ /* number of valid entries in un_nids[] */
+ int un_nid_count;
+ lnet_nid_t un_nids[NIDS_MAX];
+};
+
+/* FIXME: This should probably become more elegant than a global linked list */
+static struct list_head g_uuid_list;
+static spinlock_t g_uuid_lock;
+
+/* Initialize the global uuid list and the lock protecting it. */
+void class_init_uuidlist(void)
+{
+ INIT_LIST_HEAD(&g_uuid_list);
+ spin_lock_init(&g_uuid_lock);
+}
+
+/* Drop every entry of the global uuid list. */
+void class_exit_uuidlist(void)
+{
+ /* delete all */
+ class_del_uuid(NULL);
+}
+
+/*
+ * Look up the \a index-th NID registered for \a uuid and store it in
+ * \a peer_nid.
+ *
+ * Returns 0 on success, -ENOENT when the uuid is unknown or \a index is
+ * outside [0, number of NIDs of that uuid).
+ */
+int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index)
+{
+	struct uuid_nid_data *data;
+	struct obd_uuid tmp;
+	int rc = -ENOENT;
+
+	obd_str2uuid(&tmp, uuid);
+	spin_lock(&g_uuid_lock);
+	list_for_each_entry(data, &g_uuid_list, un_list) {
+		if (!obd_uuid_equals(&data->un_uuid, &tmp))
+			continue;
+
+		/* a negative index must not reach un_nids[] either */
+		if (index >= 0 && index < data->un_nid_count) {
+			rc = 0;
+			*peer_nid = data->un_nids[index];
+		}
+		break;
+	}
+	spin_unlock(&g_uuid_lock);
+	return rc;
+}
+EXPORT_SYMBOL(lustre_uuid_to_peer);
+
+/*
+ * Add a nid to a niduuid. Multiple nids can be added to a single uuid;
+ * LNET will choose the best one.
+ *
+ * If the uuid is already known, \a nid is appended to its NID list unless
+ * it is present already; otherwise a new entry is created.
+ *
+ * Returns 0 on success, -EOVERFLOW if \a uuid is too long, -ENOMEM if a new
+ * entry cannot be allocated.
+ */
+int class_add_uuid(const char *uuid, __u64 nid)
+{
+	struct uuid_nid_data *data, *entry;
+	int found = 0;
+	int nid_count = 0;
+
+	LASSERT(nid != 0);  /* valid newconfig NID is never zero */
+
+	if (strlen(uuid) > UUID_MAX - 1)
+		return -EOVERFLOW;
+
+	OBD_ALLOC_PTR(data);
+	if (data == NULL)
+		return -ENOMEM;
+
+	obd_str2uuid(&data->un_uuid, uuid);
+	data->un_nids[0] = nid;
+	data->un_nid_count = 1;
+
+	spin_lock(&g_uuid_lock);
+	list_for_each_entry(entry, &g_uuid_list, un_list) {
+		if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) {
+			int i;
+
+			found = 1;
+			for (i = 0; i < entry->un_nid_count; i++)
+				if (nid == entry->un_nids[i])
+					break;
+
+			if (i == entry->un_nid_count) {
+				LASSERT(entry->un_nid_count < NIDS_MAX);
+				entry->un_nids[entry->un_nid_count++] = nid;
+			}
+			/*
+			 * Snapshot the count while the lock is held: once it
+			 * is dropped, the entry may be freed by
+			 * class_del_uuid().
+			 */
+			nid_count = entry->un_nid_count;
+			break;
+		}
+	}
+	if (!found)
+		list_add(&data->un_list, &g_uuid_list);
+	spin_unlock(&g_uuid_lock);
+
+	if (found) {
+		CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid,
+		       libcfs_nid2str(nid), nid_count);
+		/* the preallocated entry was not needed */
+		OBD_FREE(data, sizeof(*data));
+	} else {
+		CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid));
+	}
+	return 0;
+}
+EXPORT_SYMBOL(class_add_uuid);
+
+/*
+ * Delete the nids for one uuid if specified, otherwise delete all.
+ *
+ * Matching entries are moved to a private list under g_uuid_lock and freed
+ * after the lock has been dropped.
+ *
+ * Returns 0 on success, -EINVAL when \a uuid is given but not found.
+ */
+int class_del_uuid(const char *uuid)
+{
+ LIST_HEAD(deathrow);
+ struct uuid_nid_data *data;
+
+ spin_lock(&g_uuid_lock);
+ if (uuid != NULL) {
+ struct obd_uuid tmp;
+
+ obd_str2uuid(&tmp, uuid);
+ list_for_each_entry(data, &g_uuid_list, un_list) {
+ if (obd_uuid_equals(&data->un_uuid, &tmp)) {
+ list_move(&data->un_list, &deathrow);
+ break;
+ }
+ }
+ } else
+ list_splice_init(&g_uuid_list, &deathrow);
+ spin_unlock(&g_uuid_lock);
+
+ if (uuid != NULL && list_empty(&deathrow)) {
+ CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid);
+ return -EINVAL;
+ }
+
+ /* entries on deathrow are no longer reachable through g_uuid_list */
+ while (!list_empty(&deathrow)) {
+ data = list_entry(deathrow.next, struct uuid_nid_data,
+ un_list);
+ list_del(&data->un_list);
+
+ CDEBUG(D_INFO, "del uuid %s %s/%d\n",
+ obd_uuid2str(&data->un_uuid),
+ libcfs_nid2str(data->un_nids[0]),
+ data->un_nid_count);
+
+ OBD_FREE(data, sizeof(*data));
+ }
+
+ return 0;
+}
+
+/*
+ * check if @nid exists in nid list of @uuid
+ *
+ * Returns 1 when the uuid is known and has @nid registered, 0 otherwise.
+ */
+int class_check_uuid(struct obd_uuid *uuid, __u64 nid)
+{
+ struct uuid_nid_data *entry;
+ int found = 0;
+ ENTRY;
+
+ CDEBUG(D_INFO, "check if uuid %s has %s.\n",
+ obd_uuid2str(uuid), libcfs_nid2str(nid));
+
+ spin_lock(&g_uuid_lock);
+ list_for_each_entry(entry, &g_uuid_list, un_list) {
+ int i;
+
+ if (!obd_uuid_equals(&entry->un_uuid, uuid))
+ continue;
+
+ /* found the uuid, check if it has @nid */
+ for (i = 0; i < entry->un_nid_count; i++) {
+ if (entry->un_nids[i] == nid) {
+ found = 1;
+ break;
+ }
+ }
+ /* only the first entry matching the uuid is inspected */
+ break;
+ }
+ spin_unlock(&g_uuid_lock);
+ RETURN(found);
+}
+EXPORT_SYMBOL(class_check_uuid);
diff --git a/drivers/staging/lustre/lustre/obdclass/md_attrs.c b/drivers/staging/lustre/lustre/obdclass/md_attrs.c
new file mode 100644
index 000000000000..b71344a04c7e
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/md_attrs.c
@@ -0,0 +1,202 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ * Use is subject to license terms.
+ *
+ * Author: Johann Lombardi <johann.lombardi@intel.com>
+ */
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+#include <md_object.h>
+
+/**
+ * Fill in a freshly created \a lma.
+ *
+ * The compat mask is cleared, the incompat mask is taken from \a incompat
+ * and the self FID is copied from \a fid.
+ *
+ * \param lma      - LMA structure to initialize
+ * \param fid      - FID stored in \a lma as its self FID
+ * \param incompat - value stored in the incompat feature mask
+ */
+void lustre_lma_init(struct lustre_mdt_attrs *lma, const struct lu_fid *fid,
+		     __u32 incompat)
+{
+	lma->lma_self_fid = *fid;
+	lma->lma_incompat = incompat;
+	lma->lma_compat = 0;
+
+	/* The check below asserts that the structure ends right after
+	 * lma_self_fid. When a field is appended to struct lustre_mdt_attrs,
+	 * zero it explicitly above and adjust the check. */
+	LASSERT(sizeof(*lma) ==
+		(offsetof(struct lustre_mdt_attrs, lma_self_fid) +
+		 sizeof(lma->lma_self_fid)));
+}
+EXPORT_SYMBOL(lustre_lma_init);
+
+/**
+ * Byte-swap an LMA structure in place when the host is not little-endian
+ * (the on-disk format is little-endian). On a little-endian host this is a
+ * no-op.
+ *
+ * \param lma - is a pointer to the LMA structure to be swabbed.
+ */
+void lustre_lma_swab(struct lustre_mdt_attrs *lma)
+{
+	/* LUSTRE_MSG_MAGIC is used as a probe for the local endianness:
+	 * the conversion leaves it untouched on little-endian hosts. */
+	if (LUSTRE_MSG_MAGIC == cpu_to_le32(LUSTRE_MSG_MAGIC))
+		return;
+
+	__swab32s(&lma->lma_compat);
+	__swab32s(&lma->lma_incompat);
+	lustre_swab_lu_fid(&lma->lma_self_fid);
+}
+EXPORT_SYMBOL(lustre_lma_swab);
+
+/**
+ * Byte-swap a SOM structure in place when the host is not little-endian
+ * (the on-disk format is little-endian). On a little-endian host this is a
+ * no-op.
+ *
+ * \param attrs - is a pointer to the SOM structure to be swabbed.
+ */
+void lustre_som_swab(struct som_attrs *attrs)
+{
+	/* LUSTRE_MSG_MAGIC is used as a probe for the local endianness:
+	 * the conversion leaves it untouched on little-endian hosts. */
+	if (LUSTRE_MSG_MAGIC == cpu_to_le32(LUSTRE_MSG_MAGIC))
+		return;
+
+	__swab32s(&attrs->som_compat);
+	__swab32s(&attrs->som_incompat);
+	__swab64s(&attrs->som_ioepoch);
+	__swab64s(&attrs->som_size);
+	__swab64s(&attrs->som_blocks);
+	__swab64s(&attrs->som_mountid);
+}
+EXPORT_SYMBOL(lustre_som_swab);
+
+/*
+ * Swab and extract SOM attributes from on-disk xattr.
+ *
+ * \param buf - is a buffer containing the on-disk SOM extended attribute;
+ *              it is swabbed in place.
+ * \param rc  - is the result of fetching the xattr into \a buf: 0 or
+ *              -ENODATA mean there is no SOM xattr, another negative value
+ *              is a fetch error, a positive value means \a buf holds data
+ *              (presumably the xattr length - confirm with callers).
+ * \param msd - is the md_som_data structure where to extract SOM attributes.
+ *
+ * \retval 0		\a msd has been filled in
+ * \retval -ENODATA	no SOM xattr, or it carries unsupported incompat flags
+ * \retval negative	the fetch error passed in through \a rc
+ */
+int lustre_buf2som(void *buf, int rc, struct md_som_data *msd)
+{
+	struct som_attrs *som = buf;
+	ENTRY;
+
+	/* nothing stored (0 / -ENODATA) or the fetch itself failed (< 0) */
+	if (rc <= 0)
+		RETURN(rc == 0 ? -ENODATA : rc);
+
+	/* the flags are still in on-disk byte order here, hence the
+	 * little-endian form of the supported mask */
+	if (som->som_incompat & ~cpu_to_le32(SOM_INCOMPAT_SUPP))
+		RETURN(-ENODATA);
+
+	/* convert to host byte order */
+	lustre_som_swab(som);
+
+	/* hand the values over to the in-memory structure */
+	msd->msd_compat   = som->som_compat;
+	msd->msd_incompat = som->som_incompat;
+	msd->msd_ioepoch  = som->som_ioepoch;
+	msd->msd_size     = som->som_size;
+	msd->msd_blocks   = som->som_blocks;
+	msd->msd_mountid  = som->som_mountid;
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(lustre_buf2som);
+
+/**
+ * Byte-swap an HSM structure in place when the host is not little-endian
+ * (the on-disk format is little-endian). On a little-endian host this is a
+ * no-op.
+ *
+ * \param attrs - is a pointer to the HSM structure to be swabbed.
+ */
+void lustre_hsm_swab(struct hsm_attrs *attrs)
+{
+	/* LUSTRE_MSG_MAGIC is used as a probe for the local endianness:
+	 * the conversion leaves it untouched on little-endian hosts. */
+	if (LUSTRE_MSG_MAGIC == cpu_to_le32(LUSTRE_MSG_MAGIC))
+		return;
+
+	__swab32s(&attrs->hsm_compat);
+	__swab32s(&attrs->hsm_flags);
+	__swab64s(&attrs->hsm_arch_id);
+	__swab64s(&attrs->hsm_arch_ver);
+}
+EXPORT_SYMBOL(lustre_hsm_swab);
+
+/*
+ * Swab and extract HSM attributes from on-disk xattr.
+ *
+ * \param buf - is a buffer containing the on-disk HSM extended attribute;
+ *              it is swabbed in place.
+ * \param rc  - is the result of fetching the xattr into \a buf: 0 or
+ *              -ENODATA mean there is no HSM xattr, another negative value
+ *              is a fetch error, a positive value means \a buf holds data
+ *              (presumably the xattr length - confirm with callers).
+ * \param mh  - is the md_hsm structure where to extract HSM attributes.
+ *
+ * \retval 0		\a mh has been filled in
+ * \retval -ENODATA	there is no HSM xattr
+ * \retval negative	the fetch error passed in through \a rc
+ */
+int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh)
+{
+	struct hsm_attrs *hsm = buf;
+	ENTRY;
+
+	/* nothing stored (0 / -ENODATA) or the fetch itself failed (< 0) */
+	if (rc <= 0)
+		RETURN(rc == 0 ? -ENODATA : rc);
+
+	/* convert to host byte order */
+	lustre_hsm_swab(hsm);
+
+	/* hand the values over to the in-memory structure */
+	mh->mh_compat   = hsm->hsm_compat;
+	mh->mh_flags    = hsm->hsm_flags;
+	mh->mh_arch_id  = hsm->hsm_arch_id;
+	mh->mh_arch_ver = hsm->hsm_arch_ver;
+
+	RETURN(0);
+}
+EXPORT_SYMBOL(lustre_buf2hsm);
+
+/*
+ * Pack HSM attributes.
+ *
+ * The fields of \a mh are copied into \a buf and then converted to the
+ * on-disk (little-endian) byte order by lustre_hsm_swab().
+ *
+ * \param buf - is the output buffer where to pack the on-disk HSM xattr;
+ *              it is written as a struct hsm_attrs.
+ * \param mh  - is the md_hsm structure to pack.
+ */
+void lustre_hsm2buf(void *buf, struct md_hsm *mh)
+{
+	struct hsm_attrs *attrs = (struct hsm_attrs *)buf;
+	ENTRY;
+
+	/* copy HSM attributes */
+	attrs->hsm_compat   = mh->mh_compat;
+	attrs->hsm_flags    = mh->mh_flags;
+	attrs->hsm_arch_id  = mh->mh_arch_id;
+	attrs->hsm_arch_ver = mh->mh_arch_ver;
+
+	/* pack xattr */
+	lustre_hsm_swab(attrs);
+
+	/* pair the ENTRY above so that the debug trace stays balanced */
+	EXIT;
+}
+EXPORT_SYMBOL(lustre_hsm2buf);
diff --git a/drivers/staging/lustre/lustre/obdclass/mea.c b/drivers/staging/lustre/lustre/obdclass/mea.c
new file mode 100644
index 000000000000..c4f0dbc23611
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/mea.c
@@ -0,0 +1,112 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_class.h>
+#include <linux/kmod.h> /* for request_module() */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+
+/*
+ * Hash a name by its last character: the character value modulo \a count.
+ * A warning is logged when that character is NUL, which suggests that
+ * \a namelen is wrong.
+ */
+static int mea_last_char_hash(int count, char *name, int namelen)
+{
+	unsigned int last = name[namelen - 1];
+
+	if (last == 0)
+		CWARN("looks like wrong len is passed\n");
+
+	return last % count;
+}
+
+/*
+ * Hash a name by the sum of its first \a namelen characters, taken modulo
+ * \a count.
+ */
+static int mea_all_chars_hash(int count, char *name, int namelen)
+{
+	unsigned int sum = 0;
+	int i;
+
+	for (i = 0; i < namelen; i++)
+		sum += name[i];
+
+	return sum % count;
+}
+
+/*
+ * Map a file name to a stripe index in the range [0, count).
+ *
+ * A name recognised by filename_is_volatile() uses the index that helper
+ * reports when it lies inside the range; otherwise it is hashed like any
+ * other name. Non-volatile names map to 0 when \a count is 0 or 1. For
+ * MEA_MAGIC_HASH_SEGMENT and unknown hash types an error is logged and
+ * index 0 is returned.
+ */
+int raw_name2idx(int hashtype, int count, const char *name, int namelen)
+{
+	unsigned int c = 0;
+	int idx;
+
+	LASSERT(namelen > 0);
+
+	if (filename_is_volatile(name, namelen, &idx)) {
+		if (idx >= 0 && idx < count)
+			return idx;
+		/* out-of-range index: fall back to hashing the name */
+	} else if (count <= 1) {
+		return 0;
+	}
+
+	switch (hashtype) {
+	case MEA_MAGIC_LAST_CHAR:
+		c = mea_last_char_hash(count, (char *)name, namelen);
+		break;
+	case MEA_MAGIC_ALL_CHARS:
+		c = mea_all_chars_hash(count, (char *)name, namelen);
+		break;
+	case MEA_MAGIC_HASH_SEGMENT:
+		CERROR("Unsupported hash type MEA_MAGIC_HASH_SEGMENT\n");
+		break;
+	default:
+		CERROR("Unknown hash type 0x%x\n", hashtype);
+		break;
+	}
+
+	LASSERT(c < count);
+	return c;
+}
+EXPORT_SYMBOL(raw_name2idx);
+
+/*
+ * Map \a name to a stripe index of \a mea, using the hash type stored in
+ * mea_magic and the stripe count stored in mea_count. \a mea must be
+ * non-NULL with a non-zero stripe count.
+ */
+int mea_name2idx(struct lmv_stripe_md *mea, const char *name, int namelen)
+{
+	unsigned int c;
+
+	LASSERT(mea && mea->mea_count);
+
+	c = raw_name2idx(mea->mea_magic, mea->mea_count, name, namelen);
+	LASSERT(c < mea->mea_count);
+
+	return c;
+}
+EXPORT_SYMBOL(mea_name2idx);
diff --git a/drivers/staging/lustre/lustre/obdclass/obd_config.c b/drivers/staging/lustre/lustre/obdclass/obd_config.c
new file mode 100644
index 000000000000..bbf06d009fd0
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/obd_config.c
@@ -0,0 +1,1904 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_config.c
+ *
+ * Config API
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_class.h>
+#include <linux/string.h>
+#include <lustre_log.h>
+#include <lprocfs_status.h>
+#include <lustre_param.h>
+
+#include "llog_internal.h"
+
+static cfs_hash_ops_t uuid_hash_ops;
+static cfs_hash_ops_t nid_hash_ops;
+static cfs_hash_ops_t nid_stat_hash_ops;
+
+/*********** string parsing utils *********/
+
+/*
+ * Look for \a key anywhere in \a buf.
+ *
+ * Returns 0 when the key is found, in which case *valp (if \a valp is not
+ * NULL) is set to the first character after the key. Returns 1 when \a buf
+ * is NULL or does not contain the key.
+ */
+int class_find_param(char *buf, char *key, char **valp)
+{
+	char *hit;
+
+	if (buf == NULL)
+		return 1;
+
+	hit = strstr(buf, key);
+	if (hit == NULL)
+		return 1;
+
+	if (valp != NULL)
+		*valp = hit + strlen(key);
+
+	return 0;
+}
+EXPORT_SYMBOL(class_find_param);
+
+/**
+ * Look up the proc parameter \a param in the table \a ptr, which maps old
+ * parameter names to new ones and is terminated by an entry whose old_param
+ * is NULL. Only the name part of \a param (everything before the first '=',
+ * or the whole string when there is no '=') takes part in the comparison,
+ * and it has to match an old name exactly.
+ *
+ * \param param			proc parameter
+ * \param ptr			an array which contains the mapping from
+ *				old parameters to new ones
+ *
+ * \retval valid-pointer	pointer to the cfg_interop_param structure
+ *				whose old name matches \a param
+ * \retval NULL			\a param or \a ptr is NULL,
+ *				or \a param is not an old parameter
+ */
+struct cfg_interop_param *class_find_old_param(const char *param,
+					       struct cfg_interop_param *ptr)
+{
+	const char *eq;
+	int name_len;
+
+	if (param == NULL || ptr == NULL)
+		RETURN(NULL);
+
+	/* the name stops at '=' when a value is attached */
+	eq = strchr(param, '=');
+	if (eq != NULL)
+		name_len = eq - param;
+	else
+		name_len = strlen(param);
+
+	for (; ptr->old_param != NULL; ptr++) {
+		if (name_len == strlen(ptr->old_param) &&
+		    strncmp(param, ptr->old_param, name_len) == 0)
+			RETURN(ptr);
+	}
+
+	RETURN(NULL);
+}
+EXPORT_SYMBOL(class_find_old_param);
+
+/**
+ * Extracts the next parameter from \a params into \a copy.
+ *
+ * Leading spaces are skipped. The parameter ends at the next space or at
+ * the end of the string, except that spaces enclosed in single or double
+ * quotes belong to the parameter. Quotes are stripped in \a copy, which is
+ * NUL-terminated and has to be large enough (the caller's responsibility).
+ *
+ * On return \a *params points at the space that terminated the parameter,
+ * or is NULL when the end of the string was reached. It is left untouched
+ * when -EINVAL is returned.
+ *
+ * \retval 0		if parameter is returned in \a copy
+ * \retval 1		if nothing but spaces was left in \a *params
+ * \retval -EINVAL	if an unbalanced quote is found
+ */
+int class_get_next_param(char **params, char *copy)
+{
+	char *cur = *params;
+	char *stop, *closing;
+	int n;
+
+	while (*cur == ' ')
+		cur++;
+
+	if (*cur == '\0') {
+		*params = NULL;
+		return 1;
+	}
+
+	for (;;) {
+		stop = strpbrk(cur, " '\"");
+		if (stop == NULL) {
+			/* last parameter: it runs to the end of the string */
+			n = strlen(cur);
+			memcpy(copy, cur, n);
+			copy[n] = '\0';
+			*params = NULL;
+			return 0;
+		}
+
+		/* unquoted text in front of the space or quote */
+		n = stop - cur;
+		memcpy(copy, cur, n);
+		if (*stop == ' ') {
+			copy[n] = '\0';
+			*params = stop;
+			return 0;
+		}
+		copy += n;
+
+		/* a quote was opened: copy up to the matching one */
+		cur = stop + 1;
+		closing = strchr(cur, *stop);
+		if (closing == NULL) {
+			CERROR("Unbalanced quota in parameters: \"%s\"\n",
+			       *params);
+			return -EINVAL;
+		}
+		n = closing - cur;
+		memcpy(copy, cur, n);
+		copy += n;
+		cur = closing + 1;
+	}
+}
+EXPORT_SYMBOL(class_get_next_param);
+
+/*
+ * Check whether \a buf starts with \a key.
+ *
+ * Returns 0 when it does, in which case *valp (if \a valp is not NULL) is
+ * set to the first character after the key. Returns 1 when \a buf is NULL
+ * or does not start with the key.
+ */
+int class_match_param(char *buf, char *key, char **valp)
+{
+	size_t keylen;
+
+	if (!buf)
+		return 1;
+
+	/* strncmp() stops at the end of \a buf, so a buffer shorter than
+	 * the key is never read beyond its terminating NUL */
+	keylen = strlen(key);
+	if (strncmp(buf, key, keylen) != 0)
+		return 1;
+
+	if (valp)
+		*valp = buf + keylen;
+
+	return 0;
+}
+EXPORT_SYMBOL(class_match_param);
+
+/*
+ * Convert the string \a buf to a NID stored through \a value, which must
+ * point to an lnet_nid_t. Returns 0 on success; when the conversion yields
+ * LNET_NID_ANY, -EINVAL is returned and, unless \a quiet is set, an error
+ * is printed on the console.
+ */
+static int parse_nid(char *buf, void *value, int quiet)
+{
+	lnet_nid_t *nid = value;
+
+	*nid = libcfs_str2nid(buf);
+	if (*nid == LNET_NID_ANY) {
+		if (!quiet)
+			LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n",
+					   buf);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * Convert the string \a buf to a network number stored through \a value,
+ * which must point to a __u32. The result of libcfs_str2net() is stored
+ * without being checked; the function always returns 0.
+ */
+static int parse_net(char *buf, void *value)
+{
+	__u32 *netp = value;
+
+	*netp = libcfs_str2net(buf);
+	CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*netp));
+
+	return 0;
+}
+
+/* Selects what class_parse_value() extracts from the current token. */
+enum {
+	CLASS_PARSE_NID = 1,
+	CLASS_PARSE_NET,
+};
+
+/*
+ * Parse the next token of \a buf as a NID or as a network, depending on
+ * \a opc, and store the result through \a value.
+ *
+ * Leading ',' and ':' separators are skipped; the token ends at the next
+ * ',', ':', ' ' or '/' or at the end of the string. The buffer is
+ * NUL-terminated at the token end while parsing and restored afterwards.
+ *
+ * Returns:
+ *   0   the token was parsed; *endh (if \a endh is not NULL) is set to the
+ *       separator that ended it
+ *   1   \a buf is NULL or there is no token at this position
+ *   < 0 the token could not be parsed
+ */
+static int class_parse_value(char *buf, int opc, void *value, char **endh,
+			     int quiet)
+{
+	char *end;
+	char saved;
+	int rc = 0;
+
+	if (buf == NULL)
+		return 1;
+
+	/* step over separators left in front of the token */
+	for (; *buf == ',' || *buf == ':'; buf++)
+		;
+
+	if (*buf == ' ' || *buf == '/' || *buf == '\0')
+		return 1;
+
+	/* find where the token stops */
+	end = strpbrk(buf, ",: /");
+	if (end == NULL)
+		end = buf + strlen(buf);
+
+	/* terminate the token for the parser, then put the byte back */
+	saved = *end;
+	*end = '\0';
+	switch (opc) {
+	case CLASS_PARSE_NET:
+		rc = parse_net(buf, value);
+		break;
+	default:
+		LBUG();
+		/* the unknown-opcode label sits directly above the NID case */
+	case CLASS_PARSE_NID:
+		rc = parse_nid(buf, value, quiet);
+		break;
+	}
+	*end = saved;
+
+	if (rc == 0 && endh != NULL)
+		*endh = end;
+
+	return rc;
+}
+
+/*
+ * Parse the next token of \a buf as a NID into \a nid, reporting
+ * unparsable NIDs on the console. See class_parse_value() for the return
+ * values and the meaning of \a endh.
+ */
+int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh)
+{
+	const int quiet = 0;
+
+	return class_parse_value(buf, CLASS_PARSE_NID, nid, endh, quiet);
+}
+EXPORT_SYMBOL(class_parse_nid);
+
+/*
+ * Same as class_parse_nid(), except that nothing is printed on the console
+ * when the NID cannot be parsed.
+ */
+int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh)
+{
+	const int quiet = 1;
+
+	return class_parse_value(buf, CLASS_PARSE_NID, nid, endh, quiet);
+}
+EXPORT_SYMBOL(class_parse_nid_quiet);
+
+/*
+ * Parse the next token of \a buf as a network number into \a net. See
+ * class_parse_value() for the return values and the meaning of \a endh.
+ */
+int class_parse_net(char *buf, __u32 *net, char **endh)
+{
+	const int quiet = 0;
+
+	return class_parse_value(buf, CLASS_PARSE_NET, net, endh, quiet);
+}
+EXPORT_SYMBOL(class_parse_net);
+
+/*
+ * Search every occurrence of \a key in \a buf for the NID \a nid.
+ *
+ *  1  \a key is present and one of the NIDs following it equals \a nid
+ *  0  \a key is present but none of the NIDs following it matches
+ * -1  \a buf does not contain \a key
+ */
+int class_match_nid(char *buf, char *key, lnet_nid_t nid)
+{
+	lnet_nid_t cur;
+	int rc = -1;
+
+	while (class_find_param(buf, key, &buf) == 0) {
+		/* the key has been seen at least once */
+		rc = 0;
+
+		/* walk the NIDs listed right after this occurrence */
+		while (class_parse_nid(buf, &cur, &buf) == 0) {
+			if (cur == nid)
+				return 1;
+		}
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL(class_match_nid);
+
+/*
+ * Search every occurrence of \a key in \a buf for the network \a net.
+ *
+ *  1  \a key is present and one of the networks following it equals \a net
+ *  0  \a key is present but none of the networks following it matches
+ * -1  \a buf does not contain \a key
+ */
+int class_match_net(char *buf, char *key, __u32 net)
+{
+	__u32 cur;
+	int rc = -1;
+
+	while (class_find_param(buf, key, &buf) == 0) {
+		/* the key has been seen at least once */
+		rc = 0;
+
+		/* walk the networks listed right after this occurrence */
+		while (class_parse_net(buf, &cur, &buf) == 0) {
+			if (cur == net)
+				return 1;
+		}
+	}
+
+	return rc;
+}
+EXPORT_SYMBOL(class_match_net);
+
+/********************** class fns **********************/
+
+/**
+ * Create a new obd device and set the type, name and uuid. If successful,
+ * the new device can be accessed by either name or uuid.
+ *
+ * The strings are taken from \a lcfg: buffer 0 is the device name, buffer 1
+ * the type name and buffer 2 the uuid. All three must be present.
+ *
+ * On success the device holds one reference (scope "attach") which is
+ * dropped by class_detach(), and obd_attached is set.
+ *
+ * \retval 0		the device is attached
+ * \retval -EINVAL	a buffer is missing, the uuid does not fit into
+ *			obd_uuid, or the type's attach method failed
+ * \retval negative	error returned by class_newdev()
+ */
+int class_attach(struct lustre_cfg *lcfg)
+{
+ struct obd_device *obd = NULL;
+ char *typename, *name, *uuid;
+ int rc, len;
+ ENTRY;
+
+ if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) {
+ CERROR("No type passed!\n");
+ RETURN(-EINVAL);
+ }
+ typename = lustre_cfg_string(lcfg, 1);
+
+ if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) {
+ CERROR("No name passed!\n");
+ RETURN(-EINVAL);
+ }
+ name = lustre_cfg_string(lcfg, 0);
+
+ if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) {
+ CERROR("No UUID passed!\n");
+ RETURN(-EINVAL);
+ }
+ uuid = lustre_cfg_string(lcfg, 2);
+
+ CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n",
+ MKSTR(typename), MKSTR(name), MKSTR(uuid));
+
+ obd = class_newdev(typename, name);
+ if (IS_ERR(obd)) {
+ /* Already exists or out of obds */
+ rc = PTR_ERR(obd);
+ /* nothing to release at the out label */
+ obd = NULL;
+ CERROR("Cannot create device %s of type %s : %d\n",
+ name, typename, rc);
+ GOTO(out, rc);
+ }
+ LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n",
+ name, typename);
+ LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+ "obd %p obd_magic %08X != %08X\n",
+ obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+ LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0,
+ "%p obd_name %s != %s\n", obd, obd->obd_name, name);
+
+ rwlock_init(&obd->obd_pool_lock);
+ obd->obd_pool_limit = 0;
+ obd->obd_pool_slv = 0;
+
+ INIT_LIST_HEAD(&obd->obd_exports);
+ INIT_LIST_HEAD(&obd->obd_unlinked_exports);
+ INIT_LIST_HEAD(&obd->obd_delayed_exports);
+ INIT_LIST_HEAD(&obd->obd_exports_timed);
+ INIT_LIST_HEAD(&obd->obd_nid_stats);
+ spin_lock_init(&obd->obd_nid_lock);
+ spin_lock_init(&obd->obd_dev_lock);
+ mutex_init(&obd->obd_dev_mutex);
+ spin_lock_init(&obd->obd_osfs_lock);
+ /* obd->obd_osfs_age must be set to a value in the distant
+ * past to guarantee a fresh statfs is fetched on mount. */
+ obd->obd_osfs_age = cfs_time_shift_64(-1000);
+
+ /* XXX belongs in setup not attach */
+ init_rwsem(&obd->obd_observer_link_sem);
+ /* recovery data */
+ cfs_init_timer(&obd->obd_recovery_timer);
+ spin_lock_init(&obd->obd_recovery_task_lock);
+ init_waitqueue_head(&obd->obd_next_transno_waitq);
+ init_waitqueue_head(&obd->obd_evict_inprogress_waitq);
+ INIT_LIST_HEAD(&obd->obd_req_replay_queue);
+ INIT_LIST_HEAD(&obd->obd_lock_replay_queue);
+ INIT_LIST_HEAD(&obd->obd_final_req_queue);
+ INIT_LIST_HEAD(&obd->obd_evict_list);
+
+ llog_group_init(&obd->obd_olg, FID_SEQ_LLOG);
+
+ obd->obd_conn_inprogress = 0;
+
+ len = strlen(uuid);
+ if (len >= sizeof(obd->obd_uuid)) {
+ CERROR("uuid must be < %d bytes long\n",
+ (int)sizeof(obd->obd_uuid));
+ GOTO(out, rc = -EINVAL);
+ }
+ /* Only the len characters are copied, not the terminating NUL.
+ * NOTE(review): presumably obd_uuid is already zero-filled by
+ * class_newdev() - confirm. */
+ memcpy(obd->obd_uuid.uuid, uuid, len);
+
+ /* do the attach */
+ if (OBP(obd, attach)) {
+ rc = OBP(obd,attach)(obd, sizeof *lcfg, lcfg);
+ /* the method's own error code is replaced by -EINVAL */
+ if (rc)
+ GOTO(out, rc = -EINVAL);
+ }
+
+ /* Detach drops this */
+ spin_lock(&obd->obd_dev_lock);
+ atomic_set(&obd->obd_refcount, 1);
+ spin_unlock(&obd->obd_dev_lock);
+ lu_ref_init(&obd->obd_reference);
+ lu_ref_add(&obd->obd_reference, "attach", obd);
+
+ obd->obd_attached = 1;
+ CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n",
+ obd->obd_minor, typename, atomic_read(&obd->obd_refcount));
+ RETURN(0);
+ out:
+ /* error exit: give the device back if one was created */
+ if (obd != NULL) {
+ class_release_dev(obd);
+ }
+ return rc;
+}
+EXPORT_SYMBOL(class_attach);
+
+/** Create hashes, self-export, and call type-specific setup.
+ * Setup is effectively the "start this obd" call.
+ *
+ * The steps are: mark the device as starting (under obd_dev_lock), create
+ * the uuid, nid and nid-stats hashes, create the self export, call
+ * obd_setup() and finally take the "setup" reference that class_cleanup()
+ * drops. On failure everything created here is released again and
+ * obd_starting is cleared.
+ *
+ * \retval 0		the device is set up
+ * \retval -ENODEV	the device is not attached
+ * \retval -EEXIST	the device is already set up or is being set up
+ * \retval -ENOMEM	a hash could not be created
+ * \retval negative	error from class_new_export() or obd_setup()
+ */
+int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ int err = 0;
+ struct obd_export *exp;
+ ENTRY;
+
+ LASSERT(obd != NULL);
+ LASSERTF(obd == class_num2obd(obd->obd_minor),
+ "obd %p != obd_devs[%d] %p\n",
+ obd, obd->obd_minor, class_num2obd(obd->obd_minor));
+ LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC,
+ "obd %p obd_magic %08x != %08x\n",
+ obd, obd->obd_magic, OBD_DEVICE_MAGIC);
+
+ /* have we attached a type to this device? */
+ if (!obd->obd_attached) {
+ CERROR("Device %d not attached\n", obd->obd_minor);
+ RETURN(-ENODEV);
+ }
+
+ if (obd->obd_set_up) {
+ CERROR("Device %d already setup (type %s)\n",
+ obd->obd_minor, obd->obd_type->typ_name);
+ RETURN(-EEXIST);
+ }
+
+ /* is someone else setting us up right now? (attach inits spinlock) */
+ spin_lock(&obd->obd_dev_lock);
+ if (obd->obd_starting) {
+ spin_unlock(&obd->obd_dev_lock);
+ CERROR("Device %d setup in progress (type %s)\n",
+ obd->obd_minor, obd->obd_type->typ_name);
+ RETURN(-EEXIST);
+ }
+ /* just leave this on forever. I can't use obd_set_up here because
+ other fns check that status, and we're not actually set up yet.
+ (The flag is cleared again only on the error path below.) */
+ obd->obd_starting = 1;
+ obd->obd_uuid_hash = NULL;
+ obd->obd_nid_hash = NULL;
+ obd->obd_nid_stats_hash = NULL;
+ spin_unlock(&obd->obd_dev_lock);
+
+ /* create an uuid-export lustre hash */
+ obd->obd_uuid_hash = cfs_hash_create("UUID_HASH",
+ HASH_UUID_CUR_BITS,
+ HASH_UUID_MAX_BITS,
+ HASH_UUID_BKT_BITS, 0,
+ CFS_HASH_MIN_THETA,
+ CFS_HASH_MAX_THETA,
+ &uuid_hash_ops, CFS_HASH_DEFAULT);
+ if (!obd->obd_uuid_hash)
+ GOTO(err_hash, err = -ENOMEM);
+
+ /* create a nid-export lustre hash */
+ obd->obd_nid_hash = cfs_hash_create("NID_HASH",
+ HASH_NID_CUR_BITS,
+ HASH_NID_MAX_BITS,
+ HASH_NID_BKT_BITS, 0,
+ CFS_HASH_MIN_THETA,
+ CFS_HASH_MAX_THETA,
+ &nid_hash_ops, CFS_HASH_DEFAULT);
+ if (!obd->obd_nid_hash)
+ GOTO(err_hash, err = -ENOMEM);
+
+ /* create a nid-stats lustre hash */
+ obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS",
+ HASH_NID_STATS_CUR_BITS,
+ HASH_NID_STATS_MAX_BITS,
+ HASH_NID_STATS_BKT_BITS, 0,
+ CFS_HASH_MIN_THETA,
+ CFS_HASH_MAX_THETA,
+ &nid_stat_hash_ops, CFS_HASH_DEFAULT);
+ if (!obd->obd_nid_stats_hash)
+ GOTO(err_hash, err = -ENOMEM);
+
+ /* the self export is created with the device's own uuid */
+ exp = class_new_export(obd, &obd->obd_uuid);
+ if (IS_ERR(exp))
+ GOTO(err_hash, err = PTR_ERR(exp));
+
+ obd->obd_self_export = exp;
+ list_del_init(&exp->exp_obd_chain_timed);
+ class_export_put(exp);
+
+ err = obd_setup(obd, lcfg);
+ if (err)
+ GOTO(err_exp, err);
+
+ obd->obd_set_up = 1;
+
+ spin_lock(&obd->obd_dev_lock);
+ /* cleanup drops this */
+ class_incref(obd, "setup", obd);
+ spin_unlock(&obd->obd_dev_lock);
+
+ CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n",
+ obd->obd_name, obd->obd_uuid.uuid);
+
+ RETURN(0);
+err_exp:
+ /* obd_setup() failed: undo the self export, then fall through to
+ * release the hashes */
+ if (obd->obd_self_export) {
+ class_unlink_export(obd->obd_self_export);
+ obd->obd_self_export = NULL;
+ }
+err_hash:
+ /* release whichever hashes were created before the failure */
+ if (obd->obd_uuid_hash) {
+ cfs_hash_putref(obd->obd_uuid_hash);
+ obd->obd_uuid_hash = NULL;
+ }
+ if (obd->obd_nid_hash) {
+ cfs_hash_putref(obd->obd_nid_hash);
+ obd->obd_nid_hash = NULL;
+ }
+ if (obd->obd_nid_stats_hash) {
+ cfs_hash_putref(obd->obd_nid_stats_hash);
+ obd->obd_nid_stats_hash = NULL;
+ }
+ /* allow a later setup attempt */
+ obd->obd_starting = 0;
+ CERROR("setup %s failed (%d)\n", obd->obd_name, err);
+ return err;
+}
+EXPORT_SYMBOL(class_setup);
+
+/** Undo class_attach(): clear obd_attached and drop the "attach" reference.
+ * The device must not be set up any more.
+ *
+ * \a lcfg is not used.
+ *
+ * \retval 0		the attach reference has been dropped
+ * \retval -EBUSY	the device is still set up
+ * \retval -ENODEV	the device is not attached
+ */
+int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	int was_attached;
+	ENTRY;
+
+	if (obd->obd_set_up) {
+		CERROR("OBD device %d still set up\n", obd->obd_minor);
+		RETURN(-EBUSY);
+	}
+
+	/* test and clear the flag in one locked section */
+	spin_lock(&obd->obd_dev_lock);
+	was_attached = obd->obd_attached;
+	if (was_attached)
+		obd->obd_attached = 0;
+	spin_unlock(&obd->obd_dev_lock);
+
+	if (!was_attached) {
+		CERROR("OBD device %d not attached\n", obd->obd_minor);
+		RETURN(-ENODEV);
+	}
+
+	CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n",
+	       obd->obd_name, obd->obd_uuid.uuid);
+
+	class_decref(obd, "attach", obd);
+	RETURN(0);
+}
+EXPORT_SYMBOL(class_detach);
+
+/** Start shutting down the obd. There may be in-progress ops when
+ * this is called. We tell them to start shutting down with a call
+ * to class_disconnect_exports().
+ *
+ * Buffer 1 of \a lcfg, when present, is a string of flag characters:
+ * 'F' sets obd_force, 'A' marks the device as failing over (obd_fail,
+ * obd_no_transno, obd_no_recov) and issues OBD_IOC_SYNC on the self export
+ * if the type has an iocontrol method. Other characters are reported and
+ * ignored.
+ *
+ * \retval 0		cleanup was started; a precleanup error is only logged
+ * \retval -ENODEV	the device is not set up or is already stopping
+ */
+int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+ int err = 0;
+ char *flag;
+ ENTRY;
+
+ OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS);
+
+ if (!obd->obd_set_up) {
+ CERROR("Device %d not setup\n", obd->obd_minor);
+ RETURN(-ENODEV);
+ }
+
+ spin_lock(&obd->obd_dev_lock);
+ if (obd->obd_stopping) {
+ spin_unlock(&obd->obd_dev_lock);
+ CERROR("OBD %d already stopping\n", obd->obd_minor);
+ RETURN(-ENODEV);
+ }
+ /* Leave this on forever */
+ obd->obd_stopping = 1;
+
+ /* wait for already-arrived-connections to finish. */
+ while (obd->obd_conn_inprogress > 0) {
+ /* drop the lock while yielding, retake it to re-check */
+ spin_unlock(&obd->obd_dev_lock);
+
+ cond_resched();
+
+ spin_lock(&obd->obd_dev_lock);
+ }
+ spin_unlock(&obd->obd_dev_lock);
+
+ if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) {
+ for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++)
+ switch (*flag) {
+ case 'F':
+ obd->obd_force = 1;
+ break;
+ case 'A':
+ LCONSOLE_WARN("Failing over %s\n",
+ obd->obd_name);
+ obd->obd_fail = 1;
+ obd->obd_no_transno = 1;
+ obd->obd_no_recov = 1;
+ if (OBP(obd, iocontrol)) {
+ obd_iocontrol(OBD_IOC_SYNC,
+ obd->obd_self_export,
+ 0, NULL, NULL);
+ }
+ break;
+ default:
+ CERROR("Unrecognised flag '%c'\n", *flag);
+ }
+ }
+
+ LASSERT(obd->obd_self_export);
+
+ /* The three references that should be remaining are the
+ * obd_self_export and the attach and setup references. */
+ if (atomic_read(&obd->obd_refcount) > 3) {
+ /* refcount - 3 might be the number of real exports
+ (excluding self export). But class_incref is called
+ by other things as well, so don't count on it. */
+ CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n",
+ obd->obd_name, atomic_read(&obd->obd_refcount) - 3);
+ dump_exports(obd, 0);
+ class_disconnect_exports(obd);
+ }
+
+ /* Precleanup, we must make sure all exports get destroyed. */
+ err = obd_precleanup(obd, OBD_CLEANUP_EXPORTS);
+ if (err)
+ CERROR("Precleanup %s returned %d\n",
+ obd->obd_name, err);
+
+ /* destroy an uuid-export hash body */
+ if (obd->obd_uuid_hash) {
+ cfs_hash_putref(obd->obd_uuid_hash);
+ obd->obd_uuid_hash = NULL;
+ }
+
+ /* destroy a nid-export hash body */
+ if (obd->obd_nid_hash) {
+ cfs_hash_putref(obd->obd_nid_hash);
+ obd->obd_nid_hash = NULL;
+ }
+
+ /* destroy a nid-stats hash body */
+ if (obd->obd_nid_stats_hash) {
+ cfs_hash_putref(obd->obd_nid_stats_hash);
+ obd->obd_nid_stats_hash = NULL;
+ }
+
+ /* drop the reference taken by class_setup() */
+ class_decref(obd, "setup", obd);
+ obd->obd_set_up = 0;
+
+ RETURN(0);
+}
+EXPORT_SYMBOL(class_cleanup);
+
+/*
+ * Take a reference on \a obd.
+ *
+ * The reference is recorded in obd_reference under \a scope / \a source
+ * and obd_refcount is incremented. Returns \a obd.
+ */
+struct obd_device *class_incref(struct obd_device *obd,
+				const char *scope, const void *source)
+{
+	int refs;
+
+	lu_ref_add_atomic(&obd->obd_reference, scope, source);
+	atomic_inc(&obd->obd_refcount);
+
+	refs = atomic_read(&obd->obd_refcount);
+	CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd,
+	       refs);
+
+	return obd;
+}
+EXPORT_SYMBOL(class_incref);
+
+/**
+ * Drop a reference on \a obd taken under \a scope / \a source.
+ *
+ * When the count falls to 1 on a stopping device, the self export is
+ * unlinked, which re-enters this function. When the count falls to 0 the
+ * device is torn down: obd_cleanup() if it was stopping, then the type's
+ * detach method if there is one, then class_release_dev(). Errors from
+ * those calls are only logged.
+ */
+void class_decref(struct obd_device *obd, const char *scope, const void *source)
+{
+ int err;
+ int refs;
+
+ /* decrement and sample the counter inside one locked section */
+ spin_lock(&obd->obd_dev_lock);
+ atomic_dec(&obd->obd_refcount);
+ refs = atomic_read(&obd->obd_refcount);
+ spin_unlock(&obd->obd_dev_lock);
+ lu_ref_del(&obd->obd_reference, scope, source);
+
+ CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs);
+
+ if ((refs == 1) && obd->obd_stopping) {
+ /* All exports have been destroyed; there should
+ be no more in-progress ops by this point.*/
+
+ spin_lock(&obd->obd_self_export->exp_lock);
+ obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd);
+ spin_unlock(&obd->obd_self_export->exp_lock);
+
+ /* note that we'll recurse into class_decref again */
+ class_unlink_export(obd->obd_self_export);
+ return;
+ }
+
+ if (refs == 0) {
+ CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n",
+ obd->obd_name, obd->obd_uuid.uuid);
+ LASSERT(!obd->obd_attached);
+ if (obd->obd_stopping) {
+ /* If we're not stopping, we were never set up */
+ err = obd_cleanup(obd);
+ if (err)
+ CERROR("Cleanup %s returned %d\n",
+ obd->obd_name, err);
+ }
+ if (OBP(obd, detach)) {
+ err = OBP(obd, detach)(obd);
+ if (err)
+ CERROR("Detach returned %d\n", err);
+ }
+ class_release_dev(obd);
+ }
+}
+EXPORT_SYMBOL(class_decref);
+
+/** Add a failover nid location.
+ * Client obd types contact server obd types using this nid list.
+ *
+ * Buffer 1 of \a lcfg holds the connection uuid and lcfg_num is passed on
+ * to obd_add_conn(). Only MDC, OSC, OSP, LWP and MGC devices are accepted.
+ *
+ * \retval -EINVAL	bad uuid buffer, non-client device, or no import yet
+ * \retval other	the result of obd_add_conn()
+ */
+int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct obd_import *imp;
+	struct obd_uuid uuid;
+	const char *type;
+	int rc;
+	ENTRY;
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 ||
+	    LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) {
+		CERROR("invalid conn_uuid\n");
+		RETURN(-EINVAL);
+	}
+
+	type = obd->obd_type->typ_name;
+	if (strcmp(type, LUSTRE_MDC_NAME) != 0 &&
+	    strcmp(type, LUSTRE_OSC_NAME) != 0 &&
+	    strcmp(type, LUSTRE_OSP_NAME) != 0 &&
+	    strcmp(type, LUSTRE_LWP_NAME) != 0 &&
+	    strcmp(type, LUSTRE_MGC_NAME) != 0) {
+		CERROR("can't add connection on non-client dev\n");
+		RETURN(-EINVAL);
+	}
+
+	imp = obd->u.cli.cl_import;
+	if (imp == NULL) {
+		CERROR("try to add conn on immature client dev\n");
+		RETURN(-EINVAL);
+	}
+
+	obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1));
+	rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_add_conn);
+
+/** Remove a failover nid location.
+ *
+ * Buffer 1 of \a lcfg holds the connection uuid. Only MDC and OSC devices
+ * are accepted.
+ *
+ * \retval -EINVAL	bad uuid buffer, non-client device, or no import yet
+ * \retval other	the result of obd_del_conn()
+ */
+int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg)
+{
+	struct obd_import *imp;
+	struct obd_uuid uuid;
+	const char *type;
+	int rc;
+	ENTRY;
+
+	if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 ||
+	    LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) {
+		CERROR("invalid conn_uuid\n");
+		RETURN(-EINVAL);
+	}
+
+	type = obd->obd_type->typ_name;
+	if (strcmp(type, LUSTRE_MDC_NAME) != 0 &&
+	    strcmp(type, LUSTRE_OSC_NAME) != 0) {
+		CERROR("can't del connection on non-client dev\n");
+		RETURN(-EINVAL);
+	}
+
+	imp = obd->u.cli.cl_import;
+	if (imp == NULL) {
+		CERROR("try to del conn on immature client dev\n");
+		RETURN(-EINVAL);
+	}
+
+	obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1));
+	rc = obd_del_conn(imp, &uuid);
+
+	RETURN(rc);
+}
+
+/* All profiles registered through class_add_profile(). */
+LIST_HEAD(lustre_profile_list);
+
+/*
+ * Find the profile named \a prof in lustre_profile_list.
+ * Returns the profile, or NULL when no profile has that name.
+ */
+struct lustre_profile *class_get_profile(const char * prof)
+{
+	struct lustre_profile *cur;
+
+	ENTRY;
+	list_for_each_entry(cur, &lustre_profile_list, lp_list) {
+		if (strcmp(cur->lp_profile, prof) == 0)
+			RETURN(cur);
+	}
+	RETURN(NULL);
+}
+EXPORT_SYMBOL(class_get_profile);
+
+/** Create a named "profile".
+ * This defines the mdc and osc names to use for a client.
+ * This also is used to define the lov to be used by a mdt.
+ *
+ * Each length must be the string length plus one (the terminating NUL is
+ * copied as well); this is asserted. \a mdc is optional: it is ignored when
+ * \a mdclen is not positive. The strings are duplicated, so the caller
+ * keeps ownership of its buffers.
+ *
+ * \retval 0		the profile was added to lustre_profile_list
+ * \retval -ENOMEM	an allocation failed; nothing is added
+ */
+int class_add_profile(int proflen, char *prof, int osclen, char *osc,
+		      int mdclen, char *mdc)
+{
+	struct lustre_profile *lprof;
+	int err = 0;
+	ENTRY;
+
+	CDEBUG(D_CONFIG, "Add profile %s\n", prof);
+
+	OBD_ALLOC(lprof, sizeof(*lprof));
+	if (!lprof)
+		RETURN(-ENOMEM);
+	INIT_LIST_HEAD(&lprof->lp_list);
+
+	/* profile name */
+	LASSERT(proflen == (strlen(prof) + 1));
+	OBD_ALLOC(lprof->lp_profile, proflen);
+	if (!lprof->lp_profile)
+		GOTO(out, err = -ENOMEM);
+	memcpy(lprof->lp_profile, prof, proflen);
+
+	/* data target name */
+	LASSERT(osclen == (strlen(osc) + 1));
+	OBD_ALLOC(lprof->lp_dt, osclen);
+	if (!lprof->lp_dt)
+		GOTO(out, err = -ENOMEM);
+	memcpy(lprof->lp_dt, osc, osclen);
+
+	/* metadata target name, optional */
+	if (mdclen > 0) {
+		LASSERT(mdclen == (strlen(mdc) + 1));
+		OBD_ALLOC(lprof->lp_md, mdclen);
+		if (!lprof->lp_md)
+			GOTO(out, err = -ENOMEM);
+		memcpy(lprof->lp_md, mdc, mdclen);
+	}
+
+	list_add(&lprof->lp_list, &lustre_profile_list);
+	RETURN(err);
+
+out:
+	/* Free whatever was allocated before the failure.
+	 * NOTE(review): the NULL tests presumably rely on OBD_ALLOC()
+	 * returning zeroed memory for lprof - confirm. */
+	if (lprof->lp_md)
+		OBD_FREE(lprof->lp_md, mdclen);
+	if (lprof->lp_dt)
+		OBD_FREE(lprof->lp_dt, osclen);
+	if (lprof->lp_profile)
+		OBD_FREE(lprof->lp_profile, proflen);
+	OBD_FREE(lprof, sizeof(*lprof));
+	RETURN(err);
+}
+
+/** Unlink and free the profile called \a prof; a no-op if it is unknown. */
+void class_del_profile(const char *prof)
+{
+	struct lustre_profile *lprof;
+	ENTRY;
+
+	CDEBUG(D_CONFIG, "Del profile %s\n", prof);
+
+	lprof = class_get_profile(prof);
+	if (lprof == NULL) {
+		EXIT;
+		return;
+	}
+
+	list_del(&lprof->lp_list);
+	/* the strings were allocated with their NUL, hence strlen() + 1 */
+	OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1);
+	OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1);
+	if (lprof->lp_md != NULL)
+		OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1);
+	OBD_FREE(lprof, sizeof *lprof);
+	EXIT;
+}
+EXPORT_SYMBOL(class_del_profile);
+
+/* COMPAT_146 */
+/** Unlink and free every profile on lustre_profile_list. */
+void class_del_profiles(void)
+{
+	struct lustre_profile *lprof;
+	ENTRY;
+
+	/* keep popping the head until the list is drained */
+	while (!list_empty(&lustre_profile_list)) {
+		lprof = list_entry(lustre_profile_list.next,
+				   struct lustre_profile, lp_list);
+		list_del(&lprof->lp_list);
+		OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1);
+		OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1);
+		if (lprof->lp_md != NULL)
+			OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1);
+		OBD_FREE(lprof, sizeof *lprof);
+	}
+	EXIT;
+}
+EXPORT_SYMBOL(class_del_profiles);
+
+/** Apply one global ("sys") parameter.
+ *
+ * \a ptr is matched against the known parameter names in turn.  The
+ * adaptive-timeout tunables take \a val; the jobid variable name is copied
+ * from lcfg buffer 2 instead.  Returns 0 on a match, -EINVAL otherwise.
+ */
+static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg)
+{
+	ENTRY;
+	if (!class_match_param(ptr, PARAM_AT_MIN, NULL)) {
+		at_min = val;
+	} else if (!class_match_param(ptr, PARAM_AT_MAX, NULL)) {
+		at_max = val;
+	} else if (!class_match_param(ptr, PARAM_AT_EXTRA, NULL)) {
+		at_extra = val;
+	} else if (!class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL)) {
+		at_early_margin = val;
+	} else if (!class_match_param(ptr, PARAM_AT_HISTORY, NULL)) {
+		at_history = val;
+	} else if (!class_match_param(ptr, PARAM_JOBID_VAR, NULL)) {
+		/* string-valued: val is not used for this parameter */
+		strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2),
+			JOBSTATS_JOBID_VAR_MAX_LEN + 1);
+	} else {
+		RETURN(-EINVAL);
+	}
+
+	CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val);
+	RETURN(0);
+}
+
+
+/* We can't call ll_process_config or lquota_process_config directly because
+ * it lives in a module that must be loaded after this one.  The hooks stay
+ * NULL (static storage) until the matching register function is called. */
+static int (*client_process_config)(struct lustre_cfg *lcfg);
+static int (*quota_process_config)(struct lustre_cfg *lcfg);
+
+/** Install \a cpc as the handler class_process_config() uses for llite
+ * parameters. */
+void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg))
+{
+	client_process_config = cpc;
+}
+EXPORT_SYMBOL(lustre_register_client_process_config);
+
+/**
+ * Rename the proc parameter in \a cfg with a new name \a new_name.
+ *
+ * Buffer 1 of \a cfg holds "name" or "name=value"; the result is a copy of
+ * \a cfg whose buffer 1 is "new_name" followed by the original "=value"
+ * part, if any.  The num, flags, nid and nal fields are carried over.
+ *
+ * \param cfg	   config structure which contains the proc parameter
+ * \param new_name new name of the proc parameter
+ *
+ * \retval valid-pointer    pointer to the newly-allocated config structure
+ *			    which contains the renamed proc parameter
+ * \retval ERR_PTR(-EINVAL) if \a cfg or \a new_name is NULL, or \a cfg does
+ *			    not contain a proc parameter
+ * \retval ERR_PTR(-ENOMEM) if memory allocation failure occurs
+ */
+struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg,
+				     const char *new_name)
+{
+	struct lustre_cfg_bufs *bufs = NULL;
+	struct lustre_cfg *new_cfg = NULL;
+	char *param = NULL;
+	char *new_param = NULL;
+	char *value = NULL;
+	int name_len = 0;
+	int new_len = 0;
+	ENTRY;
+
+	if (cfg == NULL || new_name == NULL)
+		RETURN(ERR_PTR(-EINVAL));
+
+	param = lustre_cfg_string(cfg, 1);
+	if (param == NULL)
+		RETURN(ERR_PTR(-EINVAL));
+
+	/* the old name runs up to '=' or, without a value, to the end */
+	value = strchr(param, '=');
+	if (value == NULL)
+		name_len = strlen(param);
+	else
+		name_len = value - param;
+
+	/* old buffer length with the old name swapped for the new one */
+	new_len = LUSTRE_CFG_BUFLEN(cfg, 1) + strlen(new_name) - name_len;
+
+	OBD_ALLOC(new_param, new_len);
+	if (new_param == NULL)
+		RETURN(ERR_PTR(-ENOMEM));
+
+	strcpy(new_param, new_name);
+	if (value != NULL)
+		strcat(new_param, value);
+
+	OBD_ALLOC_PTR(bufs);
+	if (bufs == NULL) {
+		OBD_FREE(new_param, new_len);
+		RETURN(ERR_PTR(-ENOMEM));
+	}
+
+	lustre_cfg_bufs_reset(bufs, NULL);
+	lustre_cfg_bufs_init(bufs, cfg);
+	lustre_cfg_bufs_set_string(bufs, 1, new_param);
+
+	new_cfg = lustre_cfg_new(cfg->lcfg_command, bufs);
+
+	/* the scratch buffers are no longer needed once new_cfg is built */
+	OBD_FREE(new_param, new_len);
+	OBD_FREE_PTR(bufs);
+	/* Accept either failure convention from lustre_cfg_new() so that an
+	 * error pointer is never dereferenced below. */
+	if (new_cfg == NULL || IS_ERR(new_cfg))
+		RETURN(ERR_PTR(-ENOMEM));
+
+	new_cfg->lcfg_num = cfg->lcfg_num;
+	new_cfg->lcfg_flags = cfg->lcfg_flags;
+	new_cfg->lcfg_nid = cfg->lcfg_nid;
+	new_cfg->lcfg_nal = cfg->lcfg_nal;
+
+	RETURN(new_cfg);
+}
+EXPORT_SYMBOL(lustre_cfg_rename);
+
+/* Install qpc as the handler class_process_config() calls for LCFG_PARAM
+ * records that match PARAM_QUOTA. */
+void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg))
+{
+ quota_process_config = qpc;
+}
+EXPORT_SYMBOL(lustre_register_quota_process_config);
+
+/** Process configuration commands given in lustre_cfg form.
+ * These may come from direct calls (e.g. class_manual_cleanup)
+ * or processing the config llog, or ioctl from lctl.
+ *
+ * The first switch handles commands that need no device and jumps to
+ * "out".  Anything it does not consume is resolved to a device through
+ * the name in buffer 0 and dispatched by the second switch.
+ *
+ * Returns the selected handler's result, or -EINVAL when no device matches
+ * buffer 0.  A negative result for a command that lacks the LCFG_REQUIRED
+ * bit is logged and replaced by 0.
+ */
+int class_process_config(struct lustre_cfg *lcfg)
+{
+ struct obd_device *obd;
+ int err;
+
+ LASSERT(lcfg && !IS_ERR(lcfg));
+ CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command);
+
+ /* Commands that don't need a device */
+ switch(lcfg->lcfg_command) {
+ case LCFG_ATTACH: {
+ err = class_attach(lcfg);
+ GOTO(out, err);
+ }
+ case LCFG_ADD_UUID: {
+ CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid "LPX64
+ " (%s)\n", lustre_cfg_string(lcfg, 1),
+ lcfg->lcfg_nid, libcfs_nid2str(lcfg->lcfg_nid));
+
+ err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid);
+ GOTO(out, err);
+ }
+ case LCFG_DEL_UUID: {
+ CDEBUG(D_IOCTL, "removing mappings for uuid %s\n",
+ (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0)
+ ? "<all uuids>" : lustre_cfg_string(lcfg, 1));
+
+ err = class_del_uuid(lustre_cfg_string(lcfg, 1));
+ GOTO(out, err);
+ }
+ case LCFG_MOUNTOPT: {
+ CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n",
+ lustre_cfg_string(lcfg, 1),
+ lustre_cfg_string(lcfg, 2),
+ lustre_cfg_string(lcfg, 3));
+ /* set these mount options somewhere, so ll_fill_super
+ * can find them. */
+ err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1),
+ lustre_cfg_string(lcfg, 1),
+ LUSTRE_CFG_BUFLEN(lcfg, 2),
+ lustre_cfg_string(lcfg, 2),
+ LUSTRE_CFG_BUFLEN(lcfg, 3),
+ lustre_cfg_string(lcfg, 3));
+ GOTO(out, err);
+ }
+ case LCFG_DEL_MOUNTOPT: {
+ CDEBUG(D_IOCTL, "mountopt: profile %s\n",
+ lustre_cfg_string(lcfg, 1));
+ class_del_profile(lustre_cfg_string(lcfg, 1));
+ GOTO(out, err = 0);
+ }
+ case LCFG_SET_TIMEOUT: {
+ CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n",
+ obd_timeout, lcfg->lcfg_num);
+ /* a requested timeout of 0 is raised to 1 */
+ obd_timeout = max(lcfg->lcfg_num, 1U);
+ obd_timeout_set = 1;
+ GOTO(out, err = 0);
+ }
+ case LCFG_SET_LDLM_TIMEOUT: {
+ CDEBUG(D_IOCTL, "changing lustre ldlm_timeout from %d to %d\n",
+ ldlm_timeout, lcfg->lcfg_num);
+ ldlm_timeout = max(lcfg->lcfg_num, 1U);
+ /* keep ldlm_timeout below obd_timeout: fall back to a third of it */
+ if (ldlm_timeout >= obd_timeout)
+ ldlm_timeout = max(obd_timeout / 3, 1U);
+ ldlm_timeout_set = 1;
+ GOTO(out, err = 0);
+ }
+ case LCFG_SET_UPCALL: {
+ LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n");
+ /* COMPAT_146 Don't fail on old configs */
+ GOTO(out, err = 0);
+ }
+ case LCFG_MARKER: {
+ /* markers are only logged here */
+ struct cfg_marker *marker;
+ marker = lustre_cfg_buf(lcfg, 1);
+ CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step,
+ marker->cm_flags, marker->cm_tgtname, marker->cm_comment);
+ GOTO(out, err = 0);
+ }
+ case LCFG_PARAM: {
+ char *tmp;
+ /* llite has no obd */
+ if ((class_match_param(lustre_cfg_string(lcfg, 1),
+ PARAM_LLITE, 0) == 0) &&
+ client_process_config) {
+ err = (*client_process_config)(lcfg);
+ GOTO(out, err);
+ } else if ((class_match_param(lustre_cfg_string(lcfg, 1),
+ PARAM_SYS, &tmp) == 0)) {
+ /* Global param settings */
+ err = class_set_global(tmp, lcfg->lcfg_num, lcfg);
+ /*
+ * Client or server should not fail to mount if
+ * it hits an unknown configuration parameter.
+ */
+ if (err != 0)
+ CWARN("Ignoring unknown param %s\n", tmp);
+
+ GOTO(out, err = 0);
+ } else if ((class_match_param(lustre_cfg_string(lcfg, 1),
+ PARAM_QUOTA, &tmp) == 0) &&
+ quota_process_config) {
+ err = (*quota_process_config)(lcfg);
+ GOTO(out, err);
+ }
+ /* Not handled above: leave this switch and continue with the
+ * device-based dispatch below. */
+ break;
+ }
+ }
+
+ /* Commands that require a device */
+ obd = class_name2obd(lustre_cfg_string(lcfg, 0));
+ if (obd == NULL) {
+ if (!LUSTRE_CFG_BUFLEN(lcfg, 0))
+ CERROR("this lcfg command requires a device name\n");
+ else
+ CERROR("no device for: %s\n",
+ lustre_cfg_string(lcfg, 0));
+
+ GOTO(out, err = -EINVAL);
+ }
+
+ /* NOTE(review): every case below except LCFG_SETUP and the default
+ * stores the handler result in err and then passes "err = 0" to GOTO,
+ * which appears to discard that result -- presumably deliberate
+ * leniency; confirm before changing. */
+ switch(lcfg->lcfg_command) {
+ case LCFG_SETUP: {
+ err = class_setup(obd, lcfg);
+ GOTO(out, err);
+ }
+ case LCFG_DETACH: {
+ err = class_detach(obd, lcfg);
+ GOTO(out, err = 0);
+ }
+ case LCFG_CLEANUP: {
+ err = class_cleanup(obd, lcfg);
+ GOTO(out, err = 0);
+ }
+ case LCFG_ADD_CONN: {
+ err = class_add_conn(obd, lcfg);
+ GOTO(out, err = 0);
+ }
+ case LCFG_DEL_CONN: {
+ err = class_del_conn(obd, lcfg);
+ GOTO(out, err = 0);
+ }
+ case LCFG_POOL_NEW: {
+ err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2));
+ GOTO(out, err = 0);
+ break;
+ }
+ case LCFG_POOL_ADD: {
+ err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2),
+ lustre_cfg_string(lcfg, 3));
+ GOTO(out, err = 0);
+ break;
+ }
+ case LCFG_POOL_REM: {
+ err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2),
+ lustre_cfg_string(lcfg, 3));
+ GOTO(out, err = 0);
+ break;
+ }
+ case LCFG_POOL_DEL: {
+ err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2));
+ GOTO(out, err = 0);
+ break;
+ }
+ default: {
+ /* anything else is handed to the device's own config handler */
+ err = obd_process_config(obd, sizeof(*lcfg), lcfg);
+ GOTO(out, err);
+
+ }
+ }
+out:
+ /* failures of commands without LCFG_REQUIRED are downgraded to a warning */
+ if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) {
+ CWARN("Ignoring error %d on optional command %#x\n", err,
+ lcfg->lcfg_command);
+ err = 0;
+ }
+ return err;
+}
+EXPORT_SYMBOL(class_process_config);
+
+/* Apply the "key=value" parameters in buffers 1..n of an LCFG_PARAM record
+ * by calling the write handler of the matching entry in lvars.
+ *
+ * prefix is stripped from each key when present.  data is stored as the
+ * private pointer of a stack-built seq_file, which a stack-built struct
+ * file refers to; that file is what the write handlers receive.
+ *
+ * Returns -EINVAL if lcfg is not LCFG_PARAM, and -ENOSYS as soon as an
+ * unmatched key contains a '.'.  Otherwise returns the number of unmatched
+ * keys when the final rc is 0, else 0.  Write-handler failures and entries
+ * that lack "=value" are logged but not reported to the caller. */
+int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
+ struct lustre_cfg *lcfg, void *data)
+{
+ struct lprocfs_vars *var;
+ struct file fakefile;
+ struct seq_file fake_seqfile;
+ char *key, *sval;
+ int i, keylen, vallen;
+ int matched = 0, j = 0;
+ int rc = 0;
+ int skip = 0;
+ ENTRY;
+
+ if (lcfg->lcfg_command != LCFG_PARAM) {
+ CERROR("Unknown command: %d\n", lcfg->lcfg_command);
+ RETURN(-EINVAL);
+ }
+
+ /* fake a seq file so that var->fops->write can work... */
+ /* only these two fields are set; the rest of both structs is left as is */
+ fakefile.private_data = &fake_seqfile;
+ fake_seqfile.private = data;
+ /* e.g. tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt
+ or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar
+ or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 */
+ for (i = 1; i < lcfg->lcfg_bufcount; i++) {
+ key = lustre_cfg_buf(lcfg, i);
+ /* Strip off prefix */
+ class_match_param(key, prefix, &key);
+ sval = strchr(key, '=');
+ /* reject both "key" and "key=" (empty value) */
+ if (!sval || (*(sval + 1) == 0)) {
+ CERROR("Can't parse param %s (missing '=')\n", key);
+ /* rc = -EINVAL; continue parsing other params */
+ continue;
+ }
+ keylen = sval - key;
+ sval++;
+ vallen = strlen(sval);
+ matched = 0;
+ j = 0;
+ /* Search proc entries */
+ while (lvars[j].name) {
+ var = &lvars[j];
+ /* the length test rejects entries that only match a prefix of key */
+ if (class_match_param(key, (char *)var->name, 0) == 0 &&
+ keylen == strlen(var->name)) {
+ matched++;
+ /* stays -EROFS when the entry has no write handler */
+ rc = -EROFS;
+ if (var->fops && var->fops->write) {
+ /* sval is a kernel buffer, so widen the address limit for
+ * the duration of the write call */
+ mm_segment_t oldfs;
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ rc = (var->fops->write)(&fakefile, sval,
+ vallen, NULL);
+ set_fs(oldfs);
+ }
+ break;
+ }
+ j++;
+ }
+ if (!matched) {
+ /* If the prefix doesn't match, return error so we
+ can pass it down the stack */
+ if (strnchr(key, keylen, '.'))
+ RETURN(-ENOSYS);
+ CERROR("%s: unknown param %s\n",
+ (char *)lustre_cfg_string(lcfg, 0), key);
+ /* rc = -EINVAL; continue parsing other params */
+ skip++;
+ } else if (rc < 0) {
+ /* var still points at the matched entry here */
+ CERROR("writing proc entry %s err %d\n",
+ var->name, rc);
+ rc = 0;
+ } else {
+ CDEBUG(D_CONFIG, "%s.%.*s: Set parameter %.*s=%s\n",
+ lustre_cfg_string(lcfg, 0),
+ (int)strlen(prefix) - 1, prefix,
+ (int)(sval - key - 1), key, sval);
+ }
+ }
+
+ /* a positive rc left by the last write is not reported */
+ if (rc > 0)
+ rc = 0;
+ /* report how many keys matched no entry */
+ if (!rc && skip)
+ rc = skip;
+ RETURN(rc);
+}
+EXPORT_SYMBOL(class_process_proc_param);
+
+extern int lustre_check_exclusion(struct super_block *sb, char *svname);
+
+/** Parse a configuration llog, doing various manipulations on them
+ * for various reasons, (modifications for compatibility, skip obsolete
+ * records, change uuids, etc), then class_process_config() resulting
+ * net records.
+ *
+ * \a data is the config_llog_instance whose cfg_flags carry the marker
+ * state (skip / exclude) from one record to the next.  For each
+ * OBD_CFG_REC a rewritten copy of the record is built, processed and
+ * freed.  Unknown record types are logged and ignored.
+ *
+ * Returns 0, the sanity-check or class_process_config() result, or
+ * -ENOMEM when a scratch allocation fails.
+ */
+int class_config_llog_handler(const struct lu_env *env,
+			      struct llog_handle *handle,
+			      struct llog_rec_hdr *rec, void *data)
+{
+	struct config_llog_instance *clli = data;
+	int cfg_len = rec->lrh_len;
+	char *cfg_buf = (char*) (rec + 1);
+	int rc = 0;
+	ENTRY;
+
+	switch (rec->lrh_type) {
+	case OBD_CFG_REC: {
+		struct lustre_cfg *lcfg, *lcfg_new;
+		struct lustre_cfg_bufs bufs;
+		char *inst_name = NULL;
+		int inst_len = 0;
+		int inst = 0, swab = 0;
+
+		/* byte-swap in place when the record has the other endianness */
+		lcfg = (struct lustre_cfg *)cfg_buf;
+		if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) {
+			lustre_swab_lustre_cfg(lcfg);
+			swab = 1;
+		}
+
+		rc = lustre_cfg_sanity_check(cfg_buf, cfg_len);
+		if (rc)
+			GOTO(out, rc);
+
+		/* Figure out config state info */
+		if (lcfg->lcfg_command == LCFG_MARKER) {
+			struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+			lustre_swab_cfg_marker(marker, swab,
+					       LUSTRE_CFG_BUFLEN(lcfg, 1));
+			CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n",
+			       clli->cfg_flags, marker->cm_flags);
+			if (marker->cm_flags & CM_START) {
+				/* all previous flags off */
+				clli->cfg_flags = CFG_F_MARKER;
+				if (marker->cm_flags & CM_SKIP) {
+					clli->cfg_flags |= CFG_F_SKIP;
+					CDEBUG(D_CONFIG, "SKIP #%d\n",
+					       marker->cm_step);
+				} else if ((marker->cm_flags & CM_EXCLUDE) ||
+					   (clli->cfg_sb &&
+					    lustre_check_exclusion(clli->cfg_sb,
+							 marker->cm_tgtname))) {
+					clli->cfg_flags |= CFG_F_EXCLUDE;
+					CDEBUG(D_CONFIG, "EXCLUDE %d\n",
+					       marker->cm_step);
+				}
+			} else if (marker->cm_flags & CM_END) {
+				clli->cfg_flags = 0;
+			}
+		}
+		/* A config command without a start marker before it is
+		   illegal (post 146) */
+		if (!(clli->cfg_flags & CFG_F_COMPAT146) &&
+		    !(clli->cfg_flags & CFG_F_MARKER) &&
+		    (lcfg->lcfg_command != LCFG_MARKER)) {
+			CWARN("Config not inside markers, ignoring! "
+			      "(inst: %p, uuid: %s, flags: %#x)\n",
+			      clli->cfg_instance,
+			      clli->cfg_uuid.uuid, clli->cfg_flags);
+			clli->cfg_flags |= CFG_F_SKIP;
+		}
+		if (clli->cfg_flags & CFG_F_SKIP) {
+			CDEBUG(D_CONFIG, "skipping %#x\n",
+			       clli->cfg_flags);
+			rc = 0;
+			/* No processing! */
+			break;
+		}
+
+		/*
+		 * For interoperability between 1.8 and 2.0,
+		 * rename "mds" obd device type to "mdt".
+		 */
+		{
+			char *typename = lustre_cfg_string(lcfg, 1);
+			char *index = lustre_cfg_string(lcfg, 2);
+
+			if ((lcfg->lcfg_command == LCFG_ATTACH && typename &&
+			     strcmp(typename, "mds") == 0)) {
+				CWARN("For 1.8 interoperability, rename obd "
+				      "type from mds to mdt\n");
+				typename[2] = 't';
+			}
+			if ((lcfg->lcfg_command == LCFG_SETUP && index &&
+			     strcmp(index, "type") == 0)) {
+				CDEBUG(D_INFO, "For 1.8 interoperability, "
+				       "set this index to '0'\n");
+				index[0] = '0';
+				index[1] = 0;
+			}
+		}
+
+
+		if ((clli->cfg_flags & CFG_F_EXCLUDE) &&
+		    (lcfg->lcfg_command == LCFG_LOV_ADD_OBD))
+			/* Add inactive instead */
+			lcfg->lcfg_command = LCFG_LOV_ADD_INA;
+
+		lustre_cfg_bufs_init(&bufs, lcfg);
+
+		/* NOTE(review): clli is dereferenced unconditionally above, so
+		 * the "clli &&" tests below do not protect against NULL. */
+		if (clli && clli->cfg_instance &&
+		    LUSTRE_CFG_BUFLEN(lcfg, 0) > 0){
+			/* make the device name unique: "<name>-<instance ptr>" */
+			inst = 1;
+			inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) +
+				   sizeof(clli->cfg_instance) * 2 + 4;
+			OBD_ALLOC(inst_name, inst_len);
+			if (inst_name == NULL)
+				GOTO(out, rc = -ENOMEM);
+			/* bounded by the allocation size computed above */
+			snprintf(inst_name, inst_len, "%s-%p",
+				 lustre_cfg_string(lcfg, 0),
+				 clli->cfg_instance);
+			lustre_cfg_bufs_set_string(&bufs, 0, inst_name);
+			CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n",
+			       lcfg->lcfg_command, inst_name);
+		}
+
+		/* we override the llog's uuid for clients, to insure they
+		   are unique */
+		if (clli && clli->cfg_instance != NULL &&
+		    lcfg->lcfg_command == LCFG_ATTACH) {
+			lustre_cfg_bufs_set_string(&bufs, 2,
+						   clli->cfg_uuid.uuid);
+		}
+		/*
+		 * sptlrpc config record, we expect 2 data segments:
+		 *  [0]: fs_name/target_name,
+		 *  [1]: rule string
+		 * moving them to index [1] and [2], and insert MGC's
+		 * obdname at index [0].
+		 */
+		if (clli && clli->cfg_instance == NULL &&
+		    lcfg->lcfg_command == LCFG_SPTLRPC_CONF) {
+			lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1],
+					    bufs.lcfg_buflen[1]);
+			lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0],
+					    bufs.lcfg_buflen[0]);
+			lustre_cfg_bufs_set_string(&bufs, 0,
+						   clli->cfg_obdname);
+		}
+
+		lcfg_new = lustre_cfg_new(lcfg->lcfg_command, &bufs);
+		/* Never touch a failed allocation; both NULL and an error
+		 * pointer are treated as failure.  Release the instance name
+		 * before bailing out so it is not leaked. */
+		if (lcfg_new == NULL || IS_ERR(lcfg_new)) {
+			if (inst)
+				OBD_FREE(inst_name, inst_len);
+			GOTO(out, rc = -ENOMEM);
+		}
+
+		lcfg_new->lcfg_num   = lcfg->lcfg_num;
+		lcfg_new->lcfg_flags = lcfg->lcfg_flags;
+
+		/* XXX Hack to try to remain binary compatible with
+		 * pre-newconfig logs */
+		if (lcfg->lcfg_nal != 0 &&      /* pre-newconfig log? */
+		    (lcfg->lcfg_nid >> 32) == 0) {
+			__u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff);
+
+			lcfg_new->lcfg_nid =
+				LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr);
+			CWARN("Converted pre-newconfig NAL %d NID %x to %s\n",
+			      lcfg->lcfg_nal, addr,
+			      libcfs_nid2str(lcfg_new->lcfg_nid));
+		} else {
+			lcfg_new->lcfg_nid = lcfg->lcfg_nid;
+		}
+
+		lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */
+
+		rc = class_process_config(lcfg_new);
+		lustre_cfg_free(lcfg_new);
+
+		if (inst)
+			OBD_FREE(inst_name, inst_len);
+		break;
+	}
+	default:
+		CERROR("Unknown llog record type %#x encountered\n",
+		       rec->lrh_type);
+		break;
+	}
+out:
+	if (rc) {
+		CERROR("%s: cfg command failed: rc = %d\n",
+		       handle->lgh_ctxt->loc_obd->obd_name, rc);
+		class_config_dump_handler(NULL, handle, rec, data);
+	}
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_config_llog_handler);
+
+/** Open the plain llog \a name and feed its records to a config callback.
+ *
+ * With \a cfg set, processing resumes after cfg->cfg_last_idx, uses
+ * cfg->cfg_callback, and cfg->cfg_last_idx is updated afterwards.  Without
+ * it, class_config_llog_handler() is used.  Returns the first failing
+ * llog_open()/llog_init_handle() result or the llog_process() result.
+ */
+int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+			    char *name, struct config_llog_instance *cfg)
+{
+	struct llog_process_cat_data range = {0, 0};
+	struct llog_handle *log;
+	llog_cb_t callback = class_config_llog_handler;
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_INFO, "looking up llog %s\n", name);
+	rc = llog_open(env, ctxt, &log, NULL, name, LLOG_OPEN_EXISTS);
+	if (rc != 0)
+		RETURN(rc);
+
+	rc = llog_init_handle(env, log, LLOG_F_IS_PLAIN, NULL);
+	if (rc != 0)
+		GOTO(parse_out, rc);
+
+	/* continue processing from where we last stopped to end-of-log */
+	if (cfg != NULL) {
+		range.lpcd_first_idx = cfg->cfg_last_idx;
+		callback = cfg->cfg_callback;
+		LASSERT(callback != NULL);
+	}
+	range.lpcd_last_idx = 0;
+
+	rc = llog_process(env, log, callback, cfg, &range);
+
+	CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name,
+	       range.lpcd_first_idx + 1, range.lpcd_last_idx, rc);
+	if (cfg != NULL)
+		cfg->cfg_last_idx = range.lpcd_last_idx;
+
+parse_out:
+	llog_close(env, log);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_config_parse_llog);
+
+/**
+ * parse config record and output dump in supplied buffer.
+ * This is separated from class_config_dump_handler() to use
+ * for ioctl needs as well
+ *
+ * \param rec  llog record; must be of type OBD_CFG_REC (asserted)
+ * \param buf  destination for the text dump
+ * \param size capacity of \a buf in bytes
+ *
+ * \retval negative  \a size is negative or the record fails the sanity check
+ * \retval otherwise number of bytes the dump consumed; a value >= \a size
+ *		     means the output was truncated and formatting stopped
+ */
+int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size)
+{
+	struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1);
+	char *ptr = buf;
+	char *end = buf + size;
+	int rc = 0;
+
+	ENTRY;
+
+	LASSERT(rec->lrh_type == OBD_CFG_REC);
+	if (size < 0)
+		RETURN(-EINVAL);
+
+	rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len);
+	if (rc < 0)
+		RETURN(rc);
+
+	/*
+	 * snprintf() returns the length it wanted to write, so after a
+	 * truncated write ptr can sit at or beyond end.  Stop as soon as
+	 * that happens: another call would be given a negative remaining
+	 * size, which becomes a huge value once converted to size_t.
+	 */
+	ptr += snprintf(ptr, end-ptr, "cmd=%05x ", lcfg->lcfg_command);
+	if (ptr >= end)
+		GOTO(out, rc);
+
+	if (lcfg->lcfg_flags) {
+		ptr += snprintf(ptr, end-ptr, "flags=%#08x ",
+				lcfg->lcfg_flags);
+		if (ptr >= end)
+			GOTO(out, rc);
+	}
+
+	if (lcfg->lcfg_num) {
+		ptr += snprintf(ptr, end-ptr, "num=%#08x ", lcfg->lcfg_num);
+		if (ptr >= end)
+			GOTO(out, rc);
+	}
+
+	if (lcfg->lcfg_nid) {
+		ptr += snprintf(ptr, end-ptr, "nid=%s("LPX64")\n     ",
+				libcfs_nid2str(lcfg->lcfg_nid),
+				lcfg->lcfg_nid);
+		if (ptr >= end)
+			GOTO(out, rc);
+	}
+
+	if (lcfg->lcfg_command == LCFG_MARKER) {
+		struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
+
+		ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'",
+				marker->cm_step, marker->cm_flags,
+				marker->cm_tgtname, marker->cm_comment);
+	} else {
+		int i;
+
+		/* one "index:string " item per buffer, while room remains */
+		for (i = 0; i < lcfg->lcfg_bufcount && ptr < end; i++) {
+			ptr += snprintf(ptr, end-ptr, "%d:%s  ", i,
+					lustre_cfg_string(lcfg, i));
+		}
+	}
+out:
+	/* return consumed bytes */
+	rc = ptr - buf;
+	RETURN(rc);
+}
+
+/** llog callback that prints one record to the console.
+ *
+ * OBD_CFG_REC records are formatted by class_config_parse_rec() into a
+ * 256-byte scratch buffer; any other type is reported and yields -EINVAL.
+ * \a env, \a handle and \a data are not used here.
+ */
+int class_config_dump_handler(const struct lu_env *env,
+			      struct llog_handle *handle,
+			      struct llog_rec_hdr *rec, void *data)
+{
+	char *outstr;
+	int rc;
+
+	ENTRY;
+
+	OBD_ALLOC(outstr, 256);
+	if (!outstr)
+		RETURN(-ENOMEM);
+
+	if (rec->lrh_type != OBD_CFG_REC) {
+		LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type);
+		rc = -EINVAL;
+	} else {
+		/* the formatter's return value is deliberately not examined */
+		class_config_parse_rec(rec, outstr, 256);
+		LCONSOLE(D_WARNING, "   %s\n", outstr);
+		rc = 0;
+	}
+
+	OBD_FREE(outstr, 256);
+	RETURN(rc);
+}
+
+/** Print every record of the plain llog \a name to the console.
+ *
+ * Returns the first failing llog_open()/llog_init_handle() result or the
+ * llog_process() result.  The "End config log" line is printed whenever the
+ * log could be opened.
+ */
+int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt,
+			   char *name, struct config_llog_instance *cfg)
+{
+	struct llog_handle *log;
+	int rc;
+
+	ENTRY;
+
+	LCONSOLE_INFO("Dumping config log %s\n", name);
+
+	rc = llog_open(env, ctxt, &log, NULL, name, LLOG_OPEN_EXISTS);
+	if (rc != 0)
+		RETURN(rc);
+
+	rc = llog_init_handle(env, log, LLOG_F_IS_PLAIN, NULL);
+	if (rc != 0)
+		GOTO(parse_out, rc);
+
+	rc = llog_process(env, log, class_config_dump_handler, cfg, NULL);
+parse_out:
+	llog_close(env, log);
+
+	LCONSOLE_INFO("End config log %s\n", name);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_config_dump_llog);
+
+/** Call class_cleanup and class_detach.
+ * "Manual" only in the sense that we're faking lcfg commands.
+ *
+ * One lustre_cfg is built for LCFG_CLEANUP, with "F" and/or "A" in buffer 1
+ * when obd_force / obd_fail are set, and then reused for LCFG_DETACH.
+ *
+ * Returns -EALREADY for a NULL \a obd, -ENOMEM if the config cannot be
+ * allocated, otherwise the class_process_config() result of the step that
+ * failed, or 0.
+ */
+int class_manual_cleanup(struct obd_device *obd)
+{
+	char flags[3] = "";
+	struct lustre_cfg *lcfg;
+	struct lustre_cfg_bufs bufs;
+	int rc;
+	ENTRY;
+
+	if (!obd) {
+		CERROR("empty cleanup\n");
+		RETURN(-EALREADY);
+	}
+
+	/* at most two flag characters plus the NUL fit in flags[3] */
+	if (obd->obd_force)
+		strcat(flags, "F");
+	if (obd->obd_fail)
+		strcat(flags, "A");
+
+	CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n",
+	       obd->obd_name, flags);
+
+	lustre_cfg_bufs_reset(&bufs, obd->obd_name);
+	lustre_cfg_bufs_set_string(&bufs, 1, flags);
+	lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs);
+	/* Accept either failure convention from lustre_cfg_new(): passing
+	 * NULL or an error pointer on would trip the LASSERT in
+	 * class_process_config(). */
+	if (lcfg == NULL || IS_ERR(lcfg))
+		RETURN(-ENOMEM);
+
+	rc = class_process_config(lcfg);
+	if (rc) {
+		CERROR("cleanup failed %d: %s\n", rc, obd->obd_name);
+		GOTO(out, rc);
+	}
+
+	/* the lcfg is almost the same for both ops */
+	lcfg->lcfg_command = LCFG_DETACH;
+	rc = class_process_config(lcfg);
+	if (rc)
+		CERROR("detach failed %d: %s\n", rc, obd->obd_name);
+out:
+	lustre_cfg_free(lcfg);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(class_manual_cleanup);
+
+/*
+ * uuid<->export lustre hash operations
+ *
+ * Callbacks for a hash of obd_export objects linked through exp_uuid_hash
+ * and keyed by exp_client_uuid.
+ */
+
+/* Hash the whole fixed-size uuid array of the key. */
+static unsigned
+uuid_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	struct obd_uuid *uuid = (struct obd_uuid *)key;
+
+	return cfs_hash_djb2_hash(uuid->uuid, sizeof(uuid->uuid), mask);
+}
+
+/* The key of a node is the client uuid of its export. */
+static void *
+uuid_key(struct hlist_node *hnode)
+{
+	struct obd_export *exp =
+		hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+
+	return &exp->exp_client_uuid;
+}
+
+/*
+ * NOTE: It is impossible to find an export that is in failed
+ * state with this function
+ */
+static int
+uuid_keycmp(const void *key, struct hlist_node *hnode)
+{
+	struct obd_export *exp;
+	int same_uuid;
+
+	LASSERT(key);
+	exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+
+	same_uuid = obd_uuid_equals(key, &exp->exp_client_uuid);
+	return same_uuid && !exp->exp_failed;
+}
+
+/* The object stored in the hash is the export itself. */
+static void *
+uuid_export_object(struct hlist_node *hnode)
+{
+	return hlist_entry(hnode, struct obd_export, exp_uuid_hash);
+}
+
+/* Take an export reference on behalf of the hash. */
+static void
+uuid_export_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	class_export_get(hlist_entry(hnode, struct obd_export,
+				     exp_uuid_hash));
+}
+
+/* Drop the export reference taken by uuid_export_get(). */
+static void
+uuid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	class_export_put(hlist_entry(hnode, struct obd_export,
+				     exp_uuid_hash));
+}
+
+static cfs_hash_ops_t uuid_hash_ops = {
+	.hs_hash	= uuid_hash,
+	.hs_key		= uuid_key,
+	.hs_keycmp	= uuid_keycmp,
+	.hs_object	= uuid_export_object,
+	.hs_get		= uuid_export_get,
+	.hs_put_locked	= uuid_export_put_locked,
+};
+
+
+/*
+ * nid<->export hash operations
+ *
+ * Callbacks for a hash of obd_export objects linked through exp_nid_hash
+ * and keyed by the peer nid of the export's connection.
+ */
+
+/* Hash the raw bytes of an lnet_nid_t key. */
+static unsigned
+nid_hash(cfs_hash_t *hs, const void *key, unsigned mask)
+{
+	return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask);
+}
+
+/* The key of a node is the peer nid of its export's connection. */
+static void *
+nid_key(struct hlist_node *hnode)
+{
+	struct obd_export *exp;
+
+	exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+	/* plain return: this function has no ENTRY to pair RETURN() with */
+	return &exp->exp_connection->c_peer.nid;
+}
+
+/*
+ * NOTE: It is impossible to find an export that is in failed
+ * state with this function
+ */
+static int
+nid_kepcmp(const void *key, struct hlist_node *hnode)
+{
+	struct obd_export *exp;
+
+	LASSERT(key);
+	exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+
+	/* plain return: this function has no ENTRY to pair RETURN() with */
+	return exp->exp_connection->c_peer.nid == *(lnet_nid_t *)key &&
+	       !exp->exp_failed;
+}
+
+/* The object stored in the hash is the export itself. */
+static void *
+nid_export_object(struct hlist_node *hnode)
+{
+	return hlist_entry(hnode, struct obd_export, exp_nid_hash);
+}
+
+/* Take an export reference on behalf of the hash. */
+static void
+nid_export_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct obd_export *exp;
+
+	exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+	class_export_get(exp);
+}
+
+/* Drop the export reference taken by nid_export_get(). */
+static void
+nid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	struct obd_export *exp;
+
+	exp = hlist_entry(hnode, struct obd_export, exp_nid_hash);
+	class_export_put(exp);
+}
+
+static cfs_hash_ops_t nid_hash_ops = {
+	.hs_hash	= nid_hash,
+	.hs_key		= nid_key,
+	.hs_keycmp	= nid_kepcmp,
+	.hs_object	= nid_export_object,
+	.hs_get		= nid_export_get,
+	.hs_put_locked	= nid_export_put_locked,
+};
+
+
+/*
+ * nid<->nidstats hash operations
+ *
+ * Callbacks for a hash of nid_stat objects linked through their nid_hash
+ * member and keyed by their nid.  Hashing reuses nid_hash().
+ */
+
+/* The key of a node is the nid stored in its nid_stat. */
+static void *
+nidstats_key(struct hlist_node *hnode)
+{
+	struct nid_stat *stat = hlist_entry(hnode, struct nid_stat, nid_hash);
+
+	return &stat->nid;
+}
+
+/* Non-zero when the node's nid equals the nid that key points at. */
+static int
+nidstats_keycmp(const void *key, struct hlist_node *hnode)
+{
+	lnet_nid_t *node_nid = nidstats_key(hnode);
+
+	return *node_nid == *(lnet_nid_t *)key;
+}
+
+/* The object stored in the hash is the nid_stat itself. */
+static void *
+nidstats_object(struct hlist_node *hnode)
+{
+	return hlist_entry(hnode, struct nid_stat, nid_hash);
+}
+
+/* Take a nid_stat reference on behalf of the hash. */
+static void
+nidstats_get(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	nidstat_getref(hlist_entry(hnode, struct nid_stat, nid_hash));
+}
+
+/* Drop the nid_stat reference taken by nidstats_get(). */
+static void
+nidstats_put_locked(cfs_hash_t *hs, struct hlist_node *hnode)
+{
+	nidstat_putref(hlist_entry(hnode, struct nid_stat, nid_hash));
+}
+
+static cfs_hash_ops_t nid_stat_hash_ops = {
+	.hs_hash	= nid_hash,
+	.hs_key		= nidstats_key,
+	.hs_keycmp	= nidstats_keycmp,
+	.hs_object	= nidstats_object,
+	.hs_get		= nidstats_get,
+	.hs_put_locked	= nidstats_put_locked,
+};
diff --git a/drivers/staging/lustre/lustre/obdclass/obd_mount.c b/drivers/staging/lustre/lustre/obdclass/obd_mount.c
new file mode 100644
index 000000000000..99adad9793c5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/obd_mount.c
@@ -0,0 +1,1321 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obd_mount.c
+ *
+ * Client mount routines
+ *
+ * Author: Nathan Rutman <nathan@clusterfs.com>
+ */
+
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */)
+#define PRINT_CMD CDEBUG
+
+#include <obd.h>
+#include <lvfs.h>
+#include <lustre_fsfilt.h>
+#include <obd_class.h>
+#include <lustre/lustre_user.h>
+#include <linux/version.h>
+#include <lustre_log.h>
+#include <lustre_disk.h>
+#include <lustre_param.h>
+
+/* Client mount entry point; installed by lustre_register_client_fill_super()
+ * and invoked from lustre_fill_super().  NULL until something registers. */
+static int (*client_fill_super)(struct super_block *sb,
+ struct vfsmount *mnt);
+
+/* Optional superblock teardown hook; installed by
+ * lustre_register_kill_super_cb() and invoked from lustre_kill_super(). */
+static void (*kill_super_cb)(struct super_block *sb);
+
+/**************** config llog ********************/
+
+/** Get a config log from the MGS and process it.
+ * This func is called for both clients and servers.
+ * Continue to process new statements appended to the logs
+ * (whenever the config lock is revoked) until lustre_end_log
+ * is called.
+ * @param sb The superblock is used by the MGC to write to the local copy of
+ * the config log
+ * @param logname The name of the llog to replicate from the MGS
+ * @param cfg Since the same mgc may be used to follow multiple config logs
+ * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for
+ * this log, and is added to the mgc's list of logs to follow.
+ * @return 0 on success, -ENOMEM if the buffer descriptor cannot be
+ * allocated, otherwise the value returned by obd_process_config().
+ */
+int lustre_process_log(struct super_block *sb, char *logname,
+		       struct config_llog_instance *cfg)
+{
+	struct lustre_cfg *lcfg;
+	struct lustre_cfg_bufs *bufs;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct obd_device *mgc = lsi->lsi_mgc;
+	int rc;
+	ENTRY;
+
+	LASSERT(mgc);
+	LASSERT(cfg);
+
+	/* The descriptor is heap allocated here (lustre_end_log() keeps
+	 * its own on the stack). */
+	OBD_ALLOC_PTR(bufs);
+	if (bufs == NULL)
+		RETURN(-ENOMEM);
+
+	/* mgc_process_config */
+	lustre_cfg_bufs_reset(bufs, mgc->obd_name);
+	lustre_cfg_bufs_set_string(bufs, 1, logname);
+	lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg));
+	/* buffer 3 carries the superblock pointer itself, not the sb */
+	lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb));
+	/* NOTE(review): lcfg is used unchecked; confirm how
+	 * lustre_cfg_new() reports an allocation failure. */
+	lcfg = lustre_cfg_new(LCFG_LOG_START, bufs);
+	rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
+	lustre_cfg_free(lcfg);
+
+	OBD_FREE_PTR(bufs);
+
+	/* -EINVAL gets a version-mismatch hint in addition to the generic
+	 * failure message printed below for every non-zero rc. */
+	if (rc == -EINVAL)
+		LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s' "
+				   "failed from the MGS (%d). Make sure this "
+				   "client and the MGS are running compatible "
+				   "versions of Lustre.\n",
+				   mgc->obd_name, logname, rc);
+
+	if (rc)
+		LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' "
+				   "failed (%d). This may be the result of "
+				   "communication errors between this node and "
+				   "the MGS, a bad configuration, or other "
+				   "errors. See the syslog for more "
+				   "information.\n", mgc->obd_name, logname,
+				   rc);
+
+	/* class_obd_list(); */
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_process_log);
+
+/* Stop watching this config log for updates.
+ * Sends LCFG_LOG_END for @logname (plus @cfg when one is given) to the MGC
+ * recorded in the superblock.  Returns -ENOENT when the superblock has no
+ * MGC, otherwise whatever obd_process_config() returns. */
+int lustre_end_log(struct super_block *sb, char *logname,
+		   struct config_llog_instance *cfg)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct obd_device *mgc = lsi->lsi_mgc;
+	struct lustre_cfg_bufs bufs;
+	struct lustre_cfg *lcfg;
+	int rc;
+	ENTRY;
+
+	if (mgc == NULL)
+		RETURN(-ENOENT);
+
+	/* mgc_process_config */
+	lustre_cfg_bufs_reset(&bufs, mgc->obd_name);
+	lustre_cfg_bufs_set_string(&bufs, 1, logname);
+	if (cfg != NULL)
+		lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg));
+
+	lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs);
+	rc = obd_process_config(mgc, sizeof(*lcfg), lcfg);
+	lustre_cfg_free(lcfg);
+
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_end_log);
+
+/**************** obd start *******************/
+
+/** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from
+ * lctl (and do for echo cli/srv).
+ *
+ * Builds a lustre_cfg for command @cmd addressed to @cfgname, with the
+ * non-NULL strings @s1..@s4 placed in buffers 1..4 and @nid stored in
+ * lcfg_nid, then hands it to class_process_config() and returns its result.
+ */
+int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd,
+	    char *s1, char *s2, char *s3, char *s4)
+{
+	char *args[] = { s1, s2, s3, s4 };
+	struct lustre_cfg_bufs bufs;
+	struct lustre_cfg *lcfg;
+	int slot;
+	int rc;
+
+	CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname,
+	       cmd, s1, s2, s3, s4);
+
+	lustre_cfg_bufs_reset(&bufs, cfgname);
+	/* string N goes into buffer N; absent strings leave their slot unset */
+	for (slot = 0; slot < (int)(sizeof(args) / sizeof(args[0])); slot++) {
+		if (args[slot] != NULL)
+			lustre_cfg_bufs_set_string(&bufs, slot + 1,
+						   args[slot]);
+	}
+
+	lcfg = lustre_cfg_new(cmd, &bufs);
+	lcfg->lcfg_nid = nid;
+	rc = class_process_config(lcfg);
+	lustre_cfg_free(lcfg);
+
+	return rc;
+}
+EXPORT_SYMBOL(do_lcfg);
+
+/** Call class_attach and class_setup. These methods in turn call
+ * obd type-specific methods.
+ *
+ * Issues LCFG_ATTACH (with @type and @uuid) followed by LCFG_SETUP (with
+ * @s1..@s4) for @obdname.  If setup fails the obd is detached again.
+ * Returns 0 or the error of the step that failed.
+ */
+int lustre_start_simple(char *obdname, char *type, char *uuid,
+			char *s1, char *s2, char *s3, char *s4)
+{
+	int rc;
+
+	CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type);
+
+	rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, NULL, NULL);
+	if (rc != 0) {
+		CERROR("%s attach error %d\n", obdname, rc);
+		return rc;
+	}
+
+	rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4);
+	if (rc == 0)
+		return 0;
+
+	CERROR("%s setup error %d\n", obdname, rc);
+	/* undo the attach; the detach result is intentionally ignored */
+	do_lcfg(obdname, 0, LCFG_DETACH, NULL, NULL, NULL, NULL);
+	return rc;
+}
+
+/* Held by lustre_start_mgc() and lustre_stop_mgc() while they look up,
+ * create, reference-count or tear down an MGC. */
+DEFINE_MUTEX(mgc_start_lock);
+
+/** Set up a mgc obd to process startup logs
+ *
+ * The MGC is named LUSTRE_MGC_OBDNAME followed by the first usable MGS nid.
+ * If an obd with that name already exists and is not stopping it is reused
+ * and its cl_mgc_refcount is incremented; otherwise the primary and
+ * failover nid uuids are registered, a new MGC is started and connected to
+ * the MGS.  The work is done under mgc_start_lock.  On the paths that reach
+ * the "out" label the MGC is stored in lsi->lsi_mgc.
+ *
+ * \param sb [in] super block of the mgc obd
+ *
+ * \retval 0 success, otherwise error code
+ */
+int lustre_start_mgc(struct super_block *sb)
+{
+	struct obd_connect_data *data = NULL;
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct obd_device *obd;
+	struct obd_export *exp;
+	struct obd_uuid *uuid;
+	class_uuid_t uuidc;
+	lnet_nid_t nid;
+	char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL;
+	char *ptr;
+	int recov_bk;
+	int rc = 0, i = 0, j, len, niduuid_len;
+	ENTRY;
+
+	LASSERT(lsi->lsi_lmd);
+
+	/* Find the first non-lo MGS nid for our MGC name */
+	if (IS_SERVER(lsi)) {
+		/* mount -o mgsnode=nid */
+		ptr = lsi->lsi_lmd->lmd_mgs;
+		if (lsi->lsi_lmd->lmd_mgs &&
+		    (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) {
+			i++;
+		} else if (IS_MGS(lsi)) {
+			lnet_process_id_t id;
+			/* NOTE(review): i is incremented by the loop condition
+			 * even when no non-lo nid is found, so the "i == 0"
+			 * check below cannot catch that case and nid would be
+			 * used unset -- confirm whether this can happen. */
+			while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
+				if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND)
+					continue;
+				nid = id.nid;
+				i++;
+				break;
+			}
+		}
+	} else { /* client */
+		/* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
+		ptr = lsi->lsi_lmd->lmd_dev;
+		if (class_parse_nid(ptr, &nid, &ptr) == 0)
+			i++;
+	}
+	if (i == 0) {
+		CERROR("No valid MGS nids found.\n");
+		RETURN(-EINVAL);
+	}
+
+	mutex_lock(&mgc_start_lock);
+
+	len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1;
+	/* niduuid is "<mgcname>_<hex index>": reserve room for the '_' and
+	 * up to 8 hex digits so a large failover index cannot overflow. */
+	niduuid_len = len + 9;
+	OBD_ALLOC(mgcname, len);
+	OBD_ALLOC(niduuid, niduuid_len);
+	if (!mgcname || !niduuid)
+		GOTO(out_free, rc = -ENOMEM);
+	sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid));
+
+	mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : "";
+
+	OBD_ALLOC_PTR(data);
+	if (data == NULL)
+		GOTO(out_free, rc = -ENOMEM);
+
+	obd = class_name2obd(mgcname);
+	if (obd && !obd->obd_stopping) {
+		rc = obd_set_info_async(NULL, obd->obd_self_export,
+					strlen(KEY_MGSSEC), KEY_MGSSEC,
+					strlen(mgssec), mgssec, NULL);
+		if (rc)
+			GOTO(out_free, rc);
+
+		/* Re-using an existing MGC */
+		atomic_inc(&obd->u.cli.cl_mgc_refcount);
+
+		/* IR compatibility check, only for clients */
+		if (lmd_is_client(lsi->lsi_lmd)) {
+			int has_ir;
+			int vallen = sizeof(*data);
+			__u32 *flags = &lsi->lsi_lmd->lmd_flags;
+
+			rc = obd_get_info(NULL, obd->obd_self_export,
+					  strlen(KEY_CONN_DATA), KEY_CONN_DATA,
+					  &vallen, data, NULL);
+			LASSERT(rc == 0);
+			has_ir = OCD_HAS_FLAG(data, IMP_RECOV);
+			if (has_ir ^ !(*flags & LMD_FLG_NOIR)) {
+				/* LMD_FLG_NOIR is for test purpose only */
+				LCONSOLE_WARN(
+				    "Trying to mount a client with IR setting "
+				    "not compatible with current mgc. "
+				    "Force to use current mgc setting that is "
+				    "IR %s.\n",
+				    has_ir ? "enabled" : "disabled");
+				if (has_ir)
+					*flags &= ~LMD_FLG_NOIR;
+				else
+					*flags |= LMD_FLG_NOIR;
+			}
+		}
+
+		recov_bk = 0;
+		/* If we are restarting the MGS, don't try to keep the MGC's
+		   old connection, or registration will fail. */
+		if (IS_MGS(lsi)) {
+			CDEBUG(D_MOUNT, "New MGS with live MGC\n");
+			recov_bk = 1;
+		}
+
+		/* Try all connections, but only once (again).
+		   We don't want to block another target from starting
+		   (using its local copy of the log), but we do want to connect
+		   if at all possible. */
+		recov_bk++;
+		CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk);
+		rc = obd_set_info_async(NULL, obd->obd_self_export,
+					sizeof(KEY_INIT_RECOV_BACKUP),
+					KEY_INIT_RECOV_BACKUP,
+					sizeof(recov_bk), &recov_bk, NULL);
+		/* the result of the call above is deliberately discarded */
+		GOTO(out, rc = 0);
+	}
+
+	CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname);
+
+	/* Add the primary nids for the MGS */
+	i = 0;
+	snprintf(niduuid, niduuid_len, "%s_%x", mgcname, i);
+	if (IS_SERVER(lsi)) {
+		ptr = lsi->lsi_lmd->lmd_mgs;
+		if (IS_MGS(lsi)) {
+			/* Use local nids (including LO) */
+			lnet_process_id_t id;
+			while ((rc = LNetGetId(i++, &id)) != -ENOENT) {
+				rc = do_lcfg(mgcname, id.nid,
+					     LCFG_ADD_UUID, niduuid, 0,0,0);
+			}
+		} else {
+			/* Use mgsnode= nids */
+			/* mount -o mgsnode=nid */
+			if (lsi->lsi_lmd->lmd_mgs) {
+				ptr = lsi->lsi_lmd->lmd_mgs;
+			} else if (class_find_param(ptr, PARAM_MGSNODE,
+						    &ptr) != 0) {
+				CERROR("No MGS nids given.\n");
+				GOTO(out_free, rc = -EINVAL);
+			}
+			while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+				rc = do_lcfg(mgcname, nid,
+					     LCFG_ADD_UUID, niduuid, 0,0,0);
+				i++;
+			}
+		}
+	} else { /* client */
+		/* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */
+		ptr = lsi->lsi_lmd->lmd_dev;
+		while (class_parse_nid(ptr, &nid, &ptr) == 0) {
+			rc = do_lcfg(mgcname, nid,
+				     LCFG_ADD_UUID, niduuid, 0,0,0);
+			i++;
+			/* Stop at the first failover nid */
+			if (*ptr == ':')
+				break;
+		}
+	}
+	if (i == 0) {
+		CERROR("No valid MGS nids found.\n");
+		GOTO(out_free, rc = -EINVAL);
+	}
+	lsi->lsi_lmd->lmd_mgs_failnodes = 1;
+
+	/* Random uuid for MGC allows easier reconnects */
+	OBD_ALLOC_PTR(uuid);
+	if (uuid == NULL)
+		GOTO(out_free, rc = -ENOMEM);
+	ll_generate_random_uuid(uuidc);
+	class_uuid_unparse(uuidc, uuid);
+
+	/* Start the MGC */
+	rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME,
+				 (char *)uuid->uuid, LUSTRE_MGS_OBDNAME,
+				 niduuid, 0, 0);
+	OBD_FREE_PTR(uuid);
+	if (rc)
+		GOTO(out_free, rc);
+
+	/* Add any failover MGS nids */
+	i = 1;
+	while (ptr && ((*ptr == ':' ||
+	       class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) {
+		/* New failover node */
+		snprintf(niduuid, niduuid_len, "%s_%x", mgcname, i);
+		j = 0;
+		while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) {
+			j++;
+			rc = do_lcfg(mgcname, nid,
+				     LCFG_ADD_UUID, niduuid, 0,0,0);
+			if (*ptr == ':')
+				break;
+		}
+		if (j > 0) {
+			rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN,
+				     niduuid, 0, 0, 0);
+			i++;
+		} else {
+			/* at ":/fsname" */
+			break;
+		}
+	}
+	/* remembered so lustre_stop_mgc() knows how many uuids to delete */
+	lsi->lsi_lmd->lmd_mgs_failnodes = i;
+
+	obd = class_name2obd(mgcname);
+	if (!obd) {
+		CERROR("Can't find mgcobd %s\n", mgcname);
+		GOTO(out_free, rc = -ENOTCONN);
+	}
+
+	rc = obd_set_info_async(NULL, obd->obd_self_export,
+				strlen(KEY_MGSSEC), KEY_MGSSEC,
+				strlen(mgssec), mgssec, NULL);
+	if (rc)
+		GOTO(out_free, rc);
+
+	/* Keep a refcount of servers/clients who started with "mount",
+	   so we know when we can get rid of the mgc. */
+	atomic_set(&obd->u.cli.cl_mgc_refcount, 1);
+
+	/* Try all connections, but only once. */
+	recov_bk = 1;
+	rc = obd_set_info_async(NULL, obd->obd_self_export,
+				sizeof(KEY_INIT_RECOV_BACKUP),
+				KEY_INIT_RECOV_BACKUP,
+				sizeof(recov_bk), &recov_bk, NULL);
+	if (rc)
+		/* nonfatal */
+		CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc);
+
+	/* We connect to the MGS at setup, and don't disconnect until cleanup */
+	data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT |
+				  OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV |
+				  OBD_CONNECT_LVB_TYPE;
+
+#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0)
+	data->ocd_connect_flags |= OBD_CONNECT_MNE_SWAB;
+#else
+#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab"
+#endif
+
+	if (lmd_is_client(lsi->lsi_lmd) &&
+	    lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR)
+		data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV;
+	data->ocd_version = LUSTRE_VERSION_CODE;
+	rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL);
+	if (rc) {
+		CERROR("connect failed %d\n", rc);
+		/* jumps to "out", so the MGC is still recorded in the sb */
+		GOTO(out, rc);
+	}
+
+	obd->u.cli.cl_mgc_mgsexp = exp;
+
+out:
+	/* Keep the mgc info in the sb. Note that many lsi's can point
+	   to the same mgc.*/
+	lsi->lsi_mgc = obd;
+out_free:
+	mutex_unlock(&mgc_start_lock);
+
+	if (data)
+		OBD_FREE_PTR(data);
+	if (mgcname)
+		OBD_FREE(mgcname, len);
+	if (niduuid)
+		OBD_FREE(niduuid, niduuid_len);
+	RETURN(rc);
+}
+
+/* Drop this superblock's reference on its MGC.
+ * The sb's lsi_mgc pointer is cleared unconditionally.  If other references
+ * remain, -EBUSY is returned and nothing else is done.  Otherwise the MGS
+ * export is disconnected (if any), the obd is cleaned up with
+ * class_manual_cleanup() and the "<obdname>_<hex>" nid uuids are deleted.
+ * Returns -ENOENT when there is no lsi or no MGC, -ENOMEM when the uuid
+ * name buffer could not be allocated, the class_manual_cleanup() error if
+ * that fails, otherwise the result of the last LCFG_DEL_UUID (0 if none).
+ */
+static int lustre_stop_mgc(struct super_block *sb)
+{
+ struct lustre_sb_info *lsi = s2lsi(sb);
+ struct obd_device *obd;
+ char *niduuid = 0, *ptr = 0;
+ int i, rc = 0, len = 0;
+ ENTRY;
+
+ if (!lsi)
+ RETURN(-ENOENT);
+ obd = lsi->lsi_mgc;
+ if (!obd)
+ RETURN(-ENOENT);
+ lsi->lsi_mgc = NULL;
+
+ mutex_lock(&mgc_start_lock);
+ LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0);
+ if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) {
+ /* This is not fatal, every client that stops
+ will call in here. */
+ CDEBUG(D_MOUNT, "mgc still has %d references.\n",
+ atomic_read(&obd->u.cli.cl_mgc_refcount));
+ GOTO(out, rc = -EBUSY);
+ }
+
+ /* The MGC has no recoverable data in any case.
+ * force shutdown set in umount_begin */
+ obd->obd_no_recov = 1;
+
+ if (obd->u.cli.cl_mgc_mgsexp) {
+ /* An error is not fatal, if we are unable to send the
+ disconnect mgs ping evictor cleans up the export */
+ rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp);
+ if (rc)
+ CDEBUG(D_MOUNT, "disconnect failed %d\n", rc);
+ }
+
+ /* Save the obdname for cleaning the nid uuids, which are
+ obdname_XX */
+ /* +6 leaves room for '_', up to four hex digits and the NUL */
+ len = strlen(obd->obd_name) + 6;
+ OBD_ALLOC(niduuid, len);
+ if (niduuid) {
+ strcpy(niduuid, obd->obd_name);
+ ptr = niduuid + strlen(niduuid);
+ }
+
+ /* The name was copied above because obd is not used again after this
+ call. */
+ rc = class_manual_cleanup(obd);
+ if (rc)
+ GOTO(out, rc);
+
+ /* Clean the nid uuids */
+ if (!niduuid)
+ GOTO(out, rc = -ENOMEM);
+
+ for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) {
+ sprintf(ptr, "_%x", i);
+ rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID,
+ niduuid, 0, 0, 0);
+ /* NOTE(review): the message says "MDC" although this deletes
+ MGC nid uuids -- presumably a typo, confirm before changing. */
+ if (rc)
+ CERROR("del MDC UUID %s failed: rc = %d\n",
+ niduuid, rc);
+ }
+out:
+ if (niduuid)
+ OBD_FREE(niduuid, len);
+
+ /* class_import_put will get rid of the additional connections */
+ mutex_unlock(&mgc_start_lock);
+ RETURN(rc);
+}
+
+/***************** lustre superblock **************/
+
+/* Allocate a lustre_sb_info together with its lustre_mount_data, attach it
+ * to @sb and give it one initial mount reference.  Returns the new lsi, or
+ * NULL if either allocation fails (nothing is attached in that case). */
+struct lustre_sb_info *lustre_init_lsi(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi;
+	ENTRY;
+
+	OBD_ALLOC_PTR(lsi);
+	if (lsi == NULL)
+		RETURN(NULL);
+
+	OBD_ALLOC_PTR(lsi->lsi_lmd);
+	if (lsi->lsi_lmd == NULL) {
+		OBD_FREE_PTR(lsi);
+		RETURN(NULL);
+	}
+
+	lsi->lsi_lmd->lmd_exclude_count = 0;
+	lsi->lsi_lmd->lmd_recovery_time_soft = 0;
+	lsi->lsi_lmd->lmd_recovery_time_hard = 0;
+
+	s2lsi_nocast(sb) = lsi;
+
+	/* we take 1 extra ref for our setup */
+	atomic_set(&lsi->lsi_mounts, 1);
+
+	/* Default umount style */
+	lsi->lsi_flags = LSI_UMOUNT_FAILOVER;
+
+	RETURN(lsi);
+}
+
+/* Release the lsi attached to @sb: every string owned by its mount data,
+ * the exclusion list, the mount data itself and finally the lsi, after
+ * which the sb's lsi pointer is cleared.  Asserts that no mount reference
+ * is left and that lsi_llsbi is already NULL.  Always returns 0. */
+static int lustre_free_lsi(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct lustre_mount_data *lmd;
+	ENTRY;
+
+	LASSERT(lsi != NULL);
+	CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi);
+
+	/* someone didn't call server_put_mount. */
+	LASSERT(atomic_read(&lsi->lsi_mounts) == 0);
+
+	lmd = lsi->lsi_lmd;
+	if (lmd != NULL) {
+		/* strings were allocated with strlen() + 1 bytes */
+		if (lmd->lmd_dev != NULL)
+			OBD_FREE(lmd->lmd_dev, strlen(lmd->lmd_dev) + 1);
+		if (lmd->lmd_profile != NULL)
+			OBD_FREE(lmd->lmd_profile,
+				 strlen(lmd->lmd_profile) + 1);
+		if (lmd->lmd_mgssec != NULL)
+			OBD_FREE(lmd->lmd_mgssec,
+				 strlen(lmd->lmd_mgssec) + 1);
+		if (lmd->lmd_opts != NULL)
+			OBD_FREE(lmd->lmd_opts, strlen(lmd->lmd_opts) + 1);
+		if (lmd->lmd_exclude_count)
+			OBD_FREE(lmd->lmd_exclude,
+				 sizeof(lmd->lmd_exclude[0]) *
+				 lmd->lmd_exclude_count);
+		if (lmd->lmd_mgs != NULL)
+			OBD_FREE(lmd->lmd_mgs, strlen(lmd->lmd_mgs) + 1);
+		if (lmd->lmd_osd_type != NULL)
+			OBD_FREE(lmd->lmd_osd_type,
+				 strlen(lmd->lmd_osd_type) + 1);
+		/* fixed-size buffer, allocated as 4096 bytes in lmd_parse() */
+		if (lmd->lmd_params != NULL)
+			OBD_FREE(lmd->lmd_params, 4096);
+
+		OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd));
+	}
+
+	LASSERT(lsi->lsi_llsbi == NULL);
+	OBD_FREE(lsi, sizeof(*lsi));
+	s2lsi_nocast(sb) = NULL;
+
+	RETURN(0);
+}
+
+/* The lsi has one reference for every server that is using the disk -
+   e.g. MDT, MGS, and potentially MGC.
+   Drops one reference; when it was the last one the OSD export (servers
+   only) is disconnected and the lsi is freed.  Returns 1 if the lsi was
+   freed, 0 otherwise. */
+int lustre_put_lsi(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	ENTRY;
+
+	LASSERT(lsi != NULL);
+
+	CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts));
+	if (!atomic_dec_and_test(&lsi->lsi_mounts))
+		RETURN(0);
+
+	if (IS_SERVER(lsi) && lsi->lsi_osd_exp) {
+		obd_disconnect(lsi->lsi_osd_exp);
+		/* wait till OSD is gone */
+		obd_zombie_barrier();
+	}
+	lustre_free_lsi(sb);
+
+	RETURN(1);
+}
+
+/** Get the fsname ("lustre") from the server name ("lustre-OST003F").
+ * The name is split at its last '-' (or, if it has none, its last ':').
+ * When the part after that separator is LUSTRE_MDC_NAME, the split point is
+ * moved back to the previous '-' or ':' so that "<fsname>-MDTXXXX-mdc" is
+ * treated like an MDT name.
+ * @param [in] svname server name including type and index
+ * @param [out] fsname Buffer to copy filesystem name prefix into, may be
+ * NULL. Must have room for the prefix plus a terminating NUL
+ * ('strlen(svname) + 1' chars is always enough).
+ * @param [out] endptr if endptr isn't NULL it is set to end of fsname,
+ * i.e. to the separator inside svname
+ * rc < 0 on error (-EINVAL when no separator can be found)
+ */
+int server_name2fsname(const char *svname, char *fsname, const char **endptr)
+{
+	const char *dash = strrchr(svname, '-');
+
+	if (!dash) {
+		dash = strrchr(svname, ':');
+		if (!dash)
+			return -EINVAL;
+	}
+
+	/* interpret <fsname>-MDTXXXXX-mdc as mdt, the better way is to pass
+	 * in the fsname, then determine the server index */
+	if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) {
+		/* Nothing precedes the separator: stepping back would leave
+		 * the string and later produce a negative copy length. */
+		if (dash == svname)
+			return -EINVAL;
+		dash--;
+		for (; dash > svname && *dash != '-' && *dash != ':'; dash--)
+			;
+		if (dash == svname)
+			return -EINVAL;
+	}
+
+	if (fsname != NULL) {
+		/* exact-length copy; the terminator is written explicitly */
+		memcpy(fsname, svname, dash - svname);
+		fsname[dash - svname] = '\0';
+	}
+
+	if (endptr != NULL)
+		*endptr = dash;
+
+	return 0;
+}
+EXPORT_SYMBOL(server_name2fsname);
+
+/**
+ * Get service name (svname) from string
+ * The service name is everything after the '-' located by
+ * server_name2fsname(), e.g. "OST0001" for "lustre-OST0001".
+ * @param [in] label full name to parse
+ * @param [out] svname buffer receiving the service name
+ * @param [out] endptr if endptr isn't NULL it is set to end of fsname
+ * (the '-' inside label)
+ * @param [in] svsize size of the svname buffer in bytes
+ * rc < 0 on error: the server_name2fsname() error, -EINVAL if the
+ * separator is not '-', -E2BIG if svname is too small
+ */
+int server_name2svname(const char *label, char *svname, const char **endptr,
+		       size_t svsize)
+{
+	int rc;
+	const char *dash;
+
+	/* We use server_name2fsname() just for parsing */
+	rc = server_name2fsname(label, NULL, &dash);
+	if (rc != 0)
+		return rc;
+
+	/* a ':' separated name carries no service part */
+	if (*dash != '-')
+		return -EINVAL;
+
+	if (endptr != NULL)
+		*endptr = dash;
+
+	if (strlcpy(svname, dash + 1, svsize) >= svsize)
+		return -E2BIG;
+
+	return 0;
+}
+EXPORT_SYMBOL(server_name2svname);
+
+
+/* Get the index from the obd name.
+   rc = server type (LDD_F_SV_TYPE_MDT or LDD_F_SV_TYPE_OST; or'ed with
+	LDD_F_SV_ALL when the index part is the literal "all"), or
+   rc < 0 on error
+   *idx receives the hexadecimal index; it is left untouched for "all"
+   if endptr isn't NULL it is set to end of name (on success only) */
+int server_name2index(const char *svname, __u32 *idx, const char **endptr)
+{
+	unsigned long index;
+	int rc;
+	const char *dash;
+
+	/* We use server_name2fsname() just for parsing */
+	rc = server_name2fsname(svname, NULL, &dash);
+	if (rc != 0)
+		return rc;
+
+	if (*dash != '-')
+		return -EINVAL;
+
+	dash++;
+
+	if (strncmp(dash, "MDT", 3) == 0)
+		rc = LDD_F_SV_TYPE_MDT;
+	else if (strncmp(dash, "OST", 3) == 0)
+		rc = LDD_F_SV_TYPE_OST;
+	else
+		return -EINVAL;
+
+	dash += 3;
+
+	if (strcmp(dash, "all") == 0) {
+		/* Callers that keep parsing from *endptr must not be left
+		 * with an unset pointer: "all" ends at the terminator. */
+		if (endptr != NULL)
+			*endptr = dash + 3;
+		return rc | LDD_F_SV_ALL;
+	}
+
+	index = simple_strtoul(dash, (char **)endptr, 16);
+	*idx = index;
+
+	return rc;
+}
+EXPORT_SYMBOL(server_name2index);
+
+/*************** mount common between server and client ***************/
+
+/* Common umount.
+ * Drops the superblock's MGC reference, then its lsi reference, then calls
+ * lu_types_stop().  An MGC error other than -ENOENT/-EBUSY aborts early and
+ * is returned; otherwise the lustre_stop_mgc() result is returned as is. */
+int lustre_common_put_super(struct super_block *sb)
+{
+	int rc;
+	ENTRY;
+
+	CDEBUG(D_MOUNT, "dropping sb %p\n", sb);
+
+	/* Drop a ref to the MGC */
+	rc = lustre_stop_mgc(sb);
+	if (rc == -EBUSY) {
+		/* BUSY just means that there's some other obd that
+		   needs the mgc. Let him clean it up. */
+		CDEBUG(D_MOUNT, "MGC still in use\n");
+	} else if (rc != 0 && rc != -ENOENT) {
+		CERROR("Can't stop MGC: %d\n", rc);
+		RETURN(rc);
+	}
+
+	/* Drop a ref to the mounted disk */
+	lustre_put_lsi(sb);
+	lu_types_stop();
+	RETURN(rc);
+}
+EXPORT_SYMBOL(lustre_common_put_super);
+
+/* Dump the parsed mount data through PRINT_CMD at D_MOUNT level.  The
+ * profile is printed for clients only; options and recovery times only
+ * when they are set. */
+static void lmd_print(struct lustre_mount_data *lmd)
+{
+	int n;
+
+	PRINT_CMD(D_MOUNT, " mount data:\n");
+	if (lmd_is_client(lmd)) {
+		PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile);
+	}
+	PRINT_CMD(D_MOUNT, "device: %s\n", lmd->lmd_dev);
+	PRINT_CMD(D_MOUNT, "flags: %x\n", lmd->lmd_flags);
+
+	if (lmd->lmd_opts) {
+		PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts);
+	}
+
+	if (lmd->lmd_recovery_time_soft) {
+		PRINT_CMD(D_MOUNT, "recovery time soft: %d\n",
+			  lmd->lmd_recovery_time_soft);
+	}
+
+	if (lmd->lmd_recovery_time_hard) {
+		PRINT_CMD(D_MOUNT, "recovery time hard: %d\n",
+			  lmd->lmd_recovery_time_hard);
+	}
+
+	for (n = 0; n < lmd->lmd_exclude_count; n++)
+		PRINT_CMD(D_MOUNT, "exclude %d: OST%04x\n", n,
+			  lmd->lmd_exclude[n]);
+}
+
+/* Is this server on the exclusion list?
+ * Returns 1 when @svname parses as an OST whose index appears in the
+ * superblock's lmd_exclude array, 0 in every other case (including names
+ * that are not plain OST names). */
+int lustre_check_exclusion(struct super_block *sb, char *svname)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+	struct lustre_mount_data *lmd = lsi->lsi_lmd;
+	__u32 index;
+	int slot;
+	ENTRY;
+
+	/* Only exclude OSTs */
+	if (server_name2index(svname, &index, NULL) != LDD_F_SV_TYPE_OST)
+		RETURN(0);
+
+	CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname,
+	       index, lmd->lmd_exclude_count, lmd->lmd_dev);
+
+	for (slot = 0; slot < lmd->lmd_exclude_count; slot++) {
+		if (lmd->lmd_exclude[slot] != index)
+			continue;
+		CWARN("Excluding %s (on exclusion list)\n", svname);
+		RETURN(1);
+	}
+	RETURN(0);
+}
+
+/* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ...
+ *
+ * Build lmd->lmd_exclude / lmd->lmd_exclude_count from an exclude list.
+ * @ptr points at the '=' that precedes the first server name; names are
+ * separated by ':' and the list ends at ',', ' ' or the end of the string.
+ * Only OST names contribute an index, other server types are skipped.
+ * A list left over from an earlier exclude= option is released first, so
+ * the last option wins.
+ * Returns 0, -ENOMEM, or the server_name2index() error for a bad name.
+ */
+static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr)
+{
+	const char *s1 = ptr, *s2;
+	__u32 index, *exclude_list;
+	int rc = 0, devmax;
+	ENTRY;
+
+	/* Start from an empty list: the temporary array below is indexed
+	 * by lmd_exclude_count and sized for this option only. */
+	if (lmd->lmd_exclude_count) {
+		OBD_FREE(lmd->lmd_exclude, sizeof(lmd->lmd_exclude[0]) *
+			 lmd->lmd_exclude_count);
+		lmd->lmd_exclude = NULL;
+		lmd->lmd_exclude_count = 0;
+	}
+
+	/* The shortest an ost name can be is 8 chars: -OST0000.
+	   We don't actually know the fsname at this time, so in fact
+	   a user could specify any fsname. */
+	devmax = strlen(ptr) / 8 + 1;
+
+	/* temp storage until we figure out how many we have */
+	OBD_ALLOC(exclude_list, sizeof(index) * devmax);
+	if (!exclude_list)
+		RETURN(-ENOMEM);
+
+	/* we enter this fn pointing at the '=' */
+	while (*s1 && *s1 != ' ' && *s1 != ',') {
+		s1++;
+		s2 = NULL;
+		rc = server_name2index(s1, &index, &s2);
+		if (rc < 0) {
+			CERROR("Can't parse server name '%s'\n", s1);
+			break;
+		}
+		if (rc == LDD_F_SV_TYPE_OST)
+			exclude_list[lmd->lmd_exclude_count++] = index;
+		else
+			CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1);
+		/* the parser did not report where the name ends, so there
+		 * is nothing safe to continue from */
+		if (s2 == NULL)
+			break;
+		s1 = s2;
+		/* now we are pointing at ':' (next exclude)
+		   or ',' (end of excludes) */
+		if (lmd->lmd_exclude_count >= devmax)
+			break;
+	}
+	if (rc >= 0) /* non-err */
+		rc = 0;
+
+	if (lmd->lmd_exclude_count) {
+		/* permanent, freed in lustre_free_lsi */
+		OBD_ALLOC(lmd->lmd_exclude, sizeof(index) *
+			  lmd->lmd_exclude_count);
+		if (lmd->lmd_exclude) {
+			memcpy(lmd->lmd_exclude, exclude_list,
+			       sizeof(index) * lmd->lmd_exclude_count);
+		} else {
+			rc = -ENOMEM;
+			lmd->lmd_exclude_count = 0;
+		}
+	}
+	OBD_FREE(exclude_list, sizeof(index) * devmax);
+	RETURN(rc);
+}
+
+/* Store an option value in lmd->lmd_mgssec.
+ * The value starts at @ptr and runs up to the next ',' or the end of the
+ * string.  A previously stored value is released first.
+ * Returns 0, or -ENOMEM if the copy cannot be allocated. */
+static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr)
+{
+	char *comma;
+	int vallen;
+
+	if (lmd->lmd_mgssec != NULL) {
+		OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1);
+		lmd->lmd_mgssec = NULL;
+	}
+
+	comma = strchr(ptr, ',');
+	if (comma != NULL)
+		vallen = comma - ptr;
+	else
+		vallen = strlen(ptr);
+
+	OBD_ALLOC(lmd->lmd_mgssec, vallen + 1);
+	if (lmd->lmd_mgssec == NULL)
+		return -ENOMEM;
+
+	memcpy(lmd->lmd_mgssec, ptr, vallen);
+	lmd->lmd_mgssec[vallen] = '\0';
+
+	return 0;
+}
+
+/* Replace the string held in *handle with a copy of an option value.
+ * The value starts at @ptr and runs up to the next ',' or the end of the
+ * string.  A previously stored string is released first.
+ * Returns 0, -EINVAL if either argument is NULL, or -ENOMEM. */
+static int lmd_parse_string(char **handle, char *ptr)
+{
+	char *comma;
+	char *copy;
+	int vallen;
+
+	if (handle == NULL || ptr == NULL)
+		return -EINVAL;
+
+	if (*handle != NULL) {
+		OBD_FREE(*handle, strlen(*handle) + 1);
+		*handle = NULL;
+	}
+
+	comma = strchr(ptr, ',');
+	if (comma != NULL)
+		vallen = comma - ptr;
+	else
+		vallen = strlen(ptr);
+
+	OBD_ALLOC(*handle, vallen + 1);
+	copy = *handle;
+	if (copy == NULL)
+		return -ENOMEM;
+
+	memcpy(copy, ptr, vallen);
+	copy[vallen] = '\0';
+
+	return 0;
+}
+
+/* Collect multiple values for mgsnid specifiers.
+ * Consumes the run of nids starting at *ptr and stores it in lmd->lmd_mgs;
+ * when lmd_mgs already holds a list the new run is appended after a ':'.
+ * On success *ptr is advanced past the consumed text.
+ * Returns 0, -EINVAL if no nid could be parsed, or -ENOMEM. */
+static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr)
+{
+	lnet_nid_t nid;
+	char *end = *ptr;
+	char *merged;
+	int addlen;
+	int prevlen = 0;
+
+	/* Find end of nidlist */
+	while (class_parse_nid_quiet(end, &nid, &end) == 0)
+		;
+
+	addlen = end - *ptr;
+	if (addlen == 0) {
+		LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr);
+		return -EINVAL;
+	}
+
+	/* prevlen counts the old string including its terminator */
+	if (lmd->lmd_mgs != NULL)
+		prevlen = strlen(lmd->lmd_mgs) + 1;
+
+	OBD_ALLOC(merged, prevlen + addlen + 1);
+	if (merged == NULL)
+		return -ENOMEM;
+
+	if (lmd->lmd_mgs != NULL) {
+		/* Multiple mgsnid= are taken to mean failover locations */
+		memcpy(merged, lmd->lmd_mgs, prevlen);
+		/* the old terminator becomes the ':' separator */
+		merged[prevlen - 1] = ':';
+		OBD_FREE(lmd->lmd_mgs, prevlen);
+	}
+	memcpy(merged + prevlen, *ptr, addlen);
+	merged[prevlen + addlen] = '\0';
+
+	lmd->lmd_mgs = merged;
+	*ptr = end;
+
+	return 0;
+}
+
+/** Parse mount line options
+ * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre
+ * dev is passed as device=uml1:/lustre by mount.lustre
+ *
+ * Options recognised here are recorded in @lmd and removed from @options
+ * in place; whatever is left is saved in lmd->lmd_opts. The "device="
+ * option terminates parsing and must therefore be last. A device that
+ * contains ":/" marks the mount as a client and sets lmd_profile to
+ * "<fsname>-client".
+ * @param options mount option string; it is modified by this function
+ * @param lmd mount data to fill in
+ * @return 0 on success, -ENOMEM on allocation failure, -EINVAL for missing,
+ * old-style binary or otherwise unparsable mount data
+ */
+static int lmd_parse(char *options, struct lustre_mount_data *lmd)
+{
+ char *s1, *s2, *devname = NULL;
+ struct lustre_mount_data *raw = (struct lustre_mount_data *)options;
+ int rc = 0;
+ ENTRY;
+
+ LASSERT(lmd);
+ if (!options) {
+ LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that "
+ "/sbin/mount.lustre is installed.\n");
+ RETURN(-EINVAL);
+ }
+
+ /* Options should be a string - try to detect old lmd data */
+ if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) {
+ LCONSOLE_ERROR_MSG(0x163, "You're using an old version of "
+ "/sbin/mount.lustre. Please install "
+ "version %s\n", LUSTRE_VERSION_STRING);
+ RETURN(-EINVAL);
+ }
+ lmd->lmd_magic = LMD_MAGIC;
+
+ /* Fixed 4096-byte buffer; lustre_free_lsi frees it with the same size */
+ OBD_ALLOC(lmd->lmd_params, 4096);
+ if (lmd->lmd_params == NULL)
+ RETURN(-ENOMEM);
+ lmd->lmd_params[0] = '\0';
+
+ /* Set default flags here */
+
+ s1 = options;
+ while (*s1) {
+ /* clear != 0 means the current option was consumed here and is
+ to be cut out of the string */
+ int clear = 0;
+ int time_min = OBD_RECOVERY_TIME_MIN;
+
+ /* Skip whitespace and extra commas */
+ while (*s1 == ' ' || *s1 == ',')
+ s1++;
+
+ /* Client options are parsed in ll_options: eg. flock,
+ user_xattr, acl */
+
+ /* Parse non-ldiskfs options here. Rather than modifying
+ ldiskfs, we just zero these out here */
+ /* The tests below are prefix matches, so their order matters:
+ e.g. the mgsnode and "mgssec=" tests come before the bare "mgs" one */
+ if (strncmp(s1, "abort_recov", 11) == 0) {
+ lmd->lmd_flags |= LMD_FLG_ABORT_RECOV;
+ clear++;
+ } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) {
+ lmd->lmd_recovery_time_soft = max_t(int,
+ simple_strtoul(s1 + 19, NULL, 10), time_min);
+ clear++;
+ } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) {
+ lmd->lmd_recovery_time_hard = max_t(int,
+ simple_strtoul(s1 + 19, NULL, 10), time_min);
+ clear++;
+ } else if (strncmp(s1, "noir", 4) == 0) {
+ lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */
+ clear++;
+ } else if (strncmp(s1, "nosvc", 5) == 0) {
+ lmd->lmd_flags |= LMD_FLG_NOSVC;
+ clear++;
+ } else if (strncmp(s1, "nomgs", 5) == 0) {
+ lmd->lmd_flags |= LMD_FLG_NOMGS;
+ clear++;
+ } else if (strncmp(s1, "noscrub", 7) == 0) {
+ lmd->lmd_flags |= LMD_FLG_NOSCRUB;
+ clear++;
+ } else if (strncmp(s1, PARAM_MGSNODE,
+ sizeof(PARAM_MGSNODE) - 1) == 0) {
+ s2 = s1 + sizeof(PARAM_MGSNODE) - 1;
+ /* Assume the next mount opt is the first
+ invalid nid we get to. */
+ rc = lmd_parse_mgs(lmd, &s2);
+ if (rc)
+ goto invalid;
+ clear++;
+ } else if (strncmp(s1, "writeconf", 9) == 0) {
+ lmd->lmd_flags |= LMD_FLG_WRITECONF;
+ clear++;
+ } else if (strncmp(s1, "update", 6) == 0) {
+ lmd->lmd_flags |= LMD_FLG_UPDATE;
+ clear++;
+ } else if (strncmp(s1, "virgin", 6) == 0) {
+ lmd->lmd_flags |= LMD_FLG_VIRGIN;
+ clear++;
+ } else if (strncmp(s1, "noprimnode", 10) == 0) {
+ lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE;
+ clear++;
+ } else if (strncmp(s1, "mgssec=", 7) == 0) {
+ rc = lmd_parse_mgssec(lmd, s1 + 7);
+ if (rc)
+ goto invalid;
+ clear++;
+ /* ost exclusion list */
+ } else if (strncmp(s1, "exclude=", 8) == 0) {
+ /* s1 + 7 is the '=': lmd_make_exclusion expects to start there */
+ rc = lmd_make_exclusion(lmd, s1 + 7);
+ if (rc)
+ goto invalid;
+ clear++;
+ } else if (strncmp(s1, "mgs", 3) == 0) {
+ /* We are an MGS */
+ lmd->lmd_flags |= LMD_FLG_MGS;
+ clear++;
+ } else if (strncmp(s1, "svname=", 7) == 0) {
+ rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7);
+ if (rc)
+ goto invalid;
+ clear++;
+ } else if (strncmp(s1, "param=", 6) == 0) {
+ int length;
+ char *tail = strchr(s1 + 6, ',');
+ if (tail == NULL)
+ length = strlen(s1);
+ else
+ length = tail - s1;
+ length -= 6;
+ /* NOTE(review): nothing here bounds the total against the
+ 4096-byte lmd_params buffer -- confirm that the size of the
+ incoming option string makes an overflow impossible. */
+ strncat(lmd->lmd_params, s1 + 6, length);
+ strcat(lmd->lmd_params, " ");
+ clear++;
+ } else if (strncmp(s1, "osd=", 4) == 0) {
+ rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4);
+ if (rc)
+ goto invalid;
+ clear++;
+ }
+ /* Linux 2.4 doesn't pass the device, so we stuck it at the
+ end of the options. */
+ else if (strncmp(s1, "device=", 7) == 0) {
+ devname = s1 + 7;
+ /* terminate options right before device. device
+ must be the last one. */
+ *s1 = '\0';
+ break;
+ }
+
+ /* Find next opt */
+ s2 = strchr(s1, ',');
+ if (s2 == NULL) {
+ if (clear)
+ *s1 = '\0';
+ break;
+ }
+ s2++;
+ /* A consumed option is removed by sliding the rest of the string
+ down over it; s1 then already points at the next option. */
+ if (clear)
+ memmove(s1, s2, strlen(s2) + 1);
+ else
+ s1 = s2;
+ }
+
+ if (!devname) {
+ LCONSOLE_ERROR_MSG(0x164, "Can't find the device name "
+ "(need mount option 'device=...')\n");
+ goto invalid;
+ }
+
+ s1 = strstr(devname, ":/");
+ if (s1) {
+ ++s1;
+ lmd->lmd_flags |= LMD_FLG_CLIENT;
+ /* Remove leading /s from fsname */
+ while (*++s1 == '/') ;
+ /* Freed in lustre_free_lsi */
+ /* strlen("-client") is 7, plus one byte for the terminator */
+ OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8);
+ if (!lmd->lmd_profile)
+ RETURN(-ENOMEM);
+ sprintf(lmd->lmd_profile, "%s-client", s1);
+ }
+
+ /* Freed in lustre_free_lsi */
+ OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1);
+ if (!lmd->lmd_dev)
+ RETURN(-ENOMEM);
+ strcpy(lmd->lmd_dev, devname);
+
+ /* Save mount options */
+ /* first strip trailing commas and spaces from what is left */
+ s1 = options + strlen(options) - 1;
+ while (s1 >= options && (*s1 == ',' || *s1 == ' '))
+ *s1-- = 0;
+ if (*options != 0) {
+ /* Freed in lustre_free_lsi */
+ OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1);
+ if (!lmd->lmd_opts)
+ RETURN(-ENOMEM);
+ strcpy(lmd->lmd_opts, options);
+ }
+
+ lmd_print(lmd);
+ lmd->lmd_magic = LMD_MAGIC;
+
+ RETURN(rc);
+
+invalid:
+ /* every parse failure is reported as -EINVAL, whatever rc held */
+ CERROR("Bad mount options %s\n", options);
+ RETURN(-EINVAL);
+}
+
+/* Bundle handed to lustre_fill_super() as its "data" argument. */
+struct lustre_mount_data2 {
+ void *lmd2_data; /* raw mount data; parsed as the option string */
+ struct vfsmount *lmd2_mnt; /* passed on to client_fill_super */
+};
+
+/** This is the entry point for the mount call into Lustre.
+ * This is called when a server or client is mounted,
+ * and this is where we start setting things up.
+ * In this file only client mounts proceed: the MGC is started and the
+ * registered client_fill_super callback is invoked; any other mount is
+ * rejected with -EINVAL.
+ * @param sb superblock being filled in
+ * @param data pointer to a struct lustre_mount_data2 whose lmd2_data holds
+ * the Mount options (e.g. -o flock,abort_recov)
+ * @param silent not used by this function
+ * @return 0 on success, negative errno otherwise
+ */
+int lustre_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct lustre_mount_data *lmd;
+ struct lustre_mount_data2 *lmd2 = data;
+ struct lustre_sb_info *lsi;
+ int rc;
+ ENTRY;
+
+ CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb);
+
+ lsi = lustre_init_lsi(sb);
+ if (!lsi)
+ RETURN(-ENOMEM);
+ lmd = lsi->lsi_lmd;
+
+ /*
+ * Disable lockdep during mount, because mount locking patterns are
+ * `special'.
+ */
+ lockdep_off();
+
+ /*
+ * LU-639: the obd cleanup of last mount may not finish yet, wait here.
+ */
+ obd_zombie_barrier();
+
+ /* Figure out the lmd from the mount options */
+ /* any lmd_parse failure is reported to the caller as -EINVAL */
+ if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) {
+ lustre_put_lsi(sb);
+ GOTO(out, rc = -EINVAL);
+ }
+
+ if (lmd_is_client(lmd)) {
+ CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile);
+ if (!client_fill_super) {
+ LCONSOLE_ERROR_MSG(0x165, "Nothing registered for "
+ "client mount! Is the 'lustre' "
+ "module loaded?\n");
+ lustre_put_lsi(sb);
+ rc = -ENODEV;
+ } else {
+ rc = lustre_start_mgc(sb);
+ if (rc) {
+ lustre_put_lsi(sb);
+ GOTO(out, rc);
+ }
+ /* Connect and start */
+ /* (should always be ll_fill_super) */
+ rc = (*client_fill_super)(sb, lmd2->lmd2_mnt);
+ /* c_f_s will call lustre_common_put_super on failure */
+ }
+ } else {
+ /* NOTE(review): this path does not drop the lsi reference taken by
+ lustre_init_lsi() -- confirm that the kill_sb path releases it. */
+ CERROR("This is client-side-only module, "
+ "cannot handle server mount.\n");
+ rc = -EINVAL;
+ }
+
+ /* If error happens in fill_super() call, @lsi will be killed there.
+ * This is why we do not put it here. */
+ GOTO(out, rc);
+out:
+ if (rc) {
+ /* lmd lives inside the lsi, so it is only dereferenced while the sb
+ still has an lsi attached */
+ CERROR("Unable to mount %s (%d)\n",
+ s2lsi(sb) ? lmd->lmd_dev : "", rc);
+ } else {
+ CDEBUG(D_SUPER, "Mount %s complete\n",
+ lmd->lmd_dev);
+ }
+ lockdep_on();
+ return rc;
+}
+
+
+/* We can't call ll_fill_super by name because it lives in a module that
+ must be loaded after this one.
+ Records @cfs as the callback lustre_fill_super() uses for client mounts;
+ the previous value is simply overwritten. */
+void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb,
+ struct vfsmount *mnt))
+{
+ client_fill_super = cfs;
+}
+EXPORT_SYMBOL(lustre_register_client_fill_super);
+
+/* Records @cfs as the hook lustre_kill_super() calls before
+ kill_anon_super(); the previous value is simply overwritten. */
+void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb))
+{
+ kill_super_cb = cfs;
+}
+EXPORT_SYMBOL(lustre_register_kill_super_cb);
+
+/***************** FS registration ******************/
+/* .mount handler: wraps the raw mount data in a lustre_mount_data2 (with no
+ * vfsmount) and lets mount_nodev() call lustre_fill_super() with it.
+ * @devname is not used here. */
+struct dentry *lustre_mount(struct file_system_type *fs_type, int flags,
+			    const char *devname, void *data)
+{
+	struct lustre_mount_data2 lmd2 = {
+		.lmd2_data = data,
+		.lmd2_mnt = NULL,
+	};
+
+	return mount_nodev(fs_type, flags, &lmd2, lustre_fill_super);
+}
+
+/* .kill_sb handler: gives the registered kill_super_cb a chance to run for
+ * non-server superblocks that still have an lsi, then always finishes with
+ * kill_anon_super(). */
+void lustre_kill_super(struct super_block *sb)
+{
+	struct lustre_sb_info *lsi = s2lsi(sb);
+
+	if (kill_super_cb != NULL && lsi != NULL) {
+		if (!IS_SERVER(lsi))
+			(*kill_super_cb)(sb);
+	}
+
+	kill_anon_super(sb);
+}
+
+/** Register the "lustre" fs type
+ * Descriptor passed to register_filesystem()/unregister_filesystem() by
+ * lustre_register_fs() and lustre_unregister_fs().
+ */
+struct file_system_type lustre_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "lustre",
+ .mount = lustre_mount,
+ .kill_sb = lustre_kill_super,
+ .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV |
+ FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE,
+};
+
+/* Register lustre_fs_type with the VFS; returns register_filesystem()'s
+ result. */
+int lustre_register_fs(void)
+{
+ return register_filesystem(&lustre_fs_type);
+}
+
+/* Unregister lustre_fs_type from the VFS; returns unregister_filesystem()'s
+ result. */
+int lustre_unregister_fs(void)
+{
+ return unregister_filesystem(&lustre_fs_type);
+}
diff --git a/drivers/staging/lustre/lustre/obdclass/obdo.c b/drivers/staging/lustre/lustre/obdclass/obdo.c
new file mode 100644
index 000000000000..01a0e1f83a68
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/obdo.c
@@ -0,0 +1,362 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ *
+ * Copyright (c) 2011, 2012, Intel Corporation.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/obdo.c
+ *
+ * Object Devices Class Driver
+ * These are the only exported functions; they provide some generic
+ * infrastructure for managing object devices.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+#include <obd_class.h>
+#include <lustre/lustre_idl.h>
+
+/**
+ * Record \a parent as the parent FID of \a dst.
+ *
+ * The sequence, object id and version of \a parent are stored in the
+ * o_parent_seq, o_parent_oid and o_parent_ver fields, and both
+ * OBD_MD_FLFID and OBD_MD_FLGENER are set in dst->o_valid.
+ */
+void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent)
+{
+	dst->o_parent_seq = fid_seq(parent);
+	dst->o_parent_oid = fid_oid(parent);
+	dst->o_parent_ver = fid_ver(parent);
+
+	dst->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER;
+}
+EXPORT_SYMBOL(obdo_set_parent_fid);
+
+/**
+ * Fill the attributes selected by \a valid in \a dst from inode \a src.
+ *
+ * Each OBD_MD_FL* bit that is both requested and handled below is OR-ed
+ * into dst->o_valid once the copy is done; requested bits this function
+ * does not handle are not added.
+ *
+ * WARNING: the file systems must take care not to tinker with
+ * attributes they don't manage (such as blocks).
+ */
+void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid)
+{
+	obd_flag copied = 0;
+
+	if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+		CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n",
+		       valid, LTIME_S(src->i_mtime),
+		       LTIME_S(src->i_ctime));
+
+	/* timestamps (seconds only) */
+	if (valid & OBD_MD_FLATIME) {
+		copied |= OBD_MD_FLATIME;
+		dst->o_atime = LTIME_S(src->i_atime);
+	}
+	if (valid & OBD_MD_FLMTIME) {
+		copied |= OBD_MD_FLMTIME;
+		dst->o_mtime = LTIME_S(src->i_mtime);
+	}
+	if (valid & OBD_MD_FLCTIME) {
+		copied |= OBD_MD_FLCTIME;
+		dst->o_ctime = LTIME_S(src->i_ctime);
+	}
+
+	/* size and space usage */
+	if (valid & OBD_MD_FLSIZE) {
+		copied |= OBD_MD_FLSIZE;
+		dst->o_size = i_size_read(src);
+	}
+	if (valid & OBD_MD_FLBLOCKS) {
+		/* allocation of space (x512 bytes) */
+		copied |= OBD_MD_FLBLOCKS;
+		dst->o_blocks = src->i_blocks;
+	}
+	if (valid & OBD_MD_FLBLKSZ) {
+		/* optimal block size */
+		copied |= OBD_MD_FLBLKSZ;
+		dst->o_blksize = ll_inode_blksize(src);
+	}
+
+	/* o_mode holds both the file type and the permission bits; each
+	 * request replaces only its own half and keeps the other one */
+	if (valid & OBD_MD_FLTYPE) {
+		copied |= OBD_MD_FLTYPE;
+		dst->o_mode = (src->i_mode & S_IFMT) |
+			      (dst->o_mode & S_IALLUGO);
+	}
+	if (valid & OBD_MD_FLMODE) {
+		copied |= OBD_MD_FLMODE;
+		dst->o_mode = (src->i_mode & S_IALLUGO) |
+			      (dst->o_mode & S_IFMT);
+	}
+
+	/* ownership and inode flags */
+	if (valid & OBD_MD_FLUID) {
+		copied |= OBD_MD_FLUID;
+		dst->o_uid = src->i_uid;
+	}
+	if (valid & OBD_MD_FLGID) {
+		copied |= OBD_MD_FLGID;
+		dst->o_gid = src->i_gid;
+	}
+	if (valid & OBD_MD_FLFLAGS) {
+		copied |= OBD_MD_FLFLAGS;
+		dst->o_flags = ll_inode_flags(src);
+	}
+
+	dst->o_valid |= copied;
+}
+EXPORT_SYMBOL(obdo_from_inode);
+
+/**
+ * Copy the obdo fields selected by \a valid from \a src to \a dst.
+ *
+ * OBD_MD_FLFID covers the parent sequence and version, OBD_MD_FLGENER the
+ * parent object id.  The whole of \a valid is OR-ed into dst->o_valid at
+ * the end, including any bits for which nothing is copied here.
+ */
+void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid)
+{
+	CDEBUG(D_INODE, "src obdo "DOSTID" valid "LPX64", dst obdo "DOSTID"\n",
+	       POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi));
+
+	/* timestamps */
+	if (valid & OBD_MD_FLATIME)
+		dst->o_atime = src->o_atime;
+	if (valid & OBD_MD_FLMTIME)
+		dst->o_mtime = src->o_mtime;
+	if (valid & OBD_MD_FLCTIME)
+		dst->o_ctime = src->o_ctime;
+
+	/* size and allocation of space */
+	if (valid & OBD_MD_FLSIZE)
+		dst->o_size = src->o_size;
+	if (valid & OBD_MD_FLBLOCKS)
+		dst->o_blocks = src->o_blocks;
+	if (valid & OBD_MD_FLBLKSZ)
+		dst->o_blksize = src->o_blksize;
+
+	/* file type and the remaining mode bits are replaced separately */
+	if (valid & OBD_MD_FLTYPE)
+		dst->o_mode = (src->o_mode & S_IFMT) | (dst->o_mode & ~S_IFMT);
+	if (valid & OBD_MD_FLMODE)
+		dst->o_mode = (src->o_mode & ~S_IFMT) | (dst->o_mode & S_IFMT);
+
+	/* ownership and flags */
+	if (valid & OBD_MD_FLUID)
+		dst->o_uid = src->o_uid;
+	if (valid & OBD_MD_FLGID)
+		dst->o_gid = src->o_gid;
+	if (valid & OBD_MD_FLFLAGS)
+		dst->o_flags = src->o_flags;
+
+	/* parent FID */
+	if (valid & OBD_MD_FLFID) {
+		dst->o_parent_seq = src->o_parent_seq;
+		dst->o_parent_ver = src->o_parent_ver;
+	}
+	if (valid & OBD_MD_FLGENER)
+		dst->o_parent_oid = src->o_parent_oid;
+
+	/* handle and llog cookie */
+	if (valid & OBD_MD_FLHANDLE)
+		dst->o_handle = src->o_handle;
+	if (valid & OBD_MD_FLCOOKIE)
+		dst->o_lcookie = src->o_lcookie;
+
+	dst->o_valid |= valid;
+}
+EXPORT_SYMBOL(obdo_cpy_md);
+
+/**
+ * Compare the fields of \a dst and \a src selected by \a compare.
+ *
+ * \retval 0 every selected field is the same in both obdos
+ * \retval 1 at least one selected field differs
+ */
+int obdo_cmp_md(struct obdo *dst, struct obdo *src, obd_flag compare)
+{
+	/* timestamps */
+	if ((compare & OBD_MD_FLATIME) && dst->o_atime != src->o_atime)
+		return 1;
+	if ((compare & OBD_MD_FLMTIME) && dst->o_mtime != src->o_mtime)
+		return 1;
+	if ((compare & OBD_MD_FLCTIME) && dst->o_ctime != src->o_ctime)
+		return 1;
+
+	/* size and allocation of space */
+	if ((compare & OBD_MD_FLSIZE) && dst->o_size != src->o_size)
+		return 1;
+	if ((compare & OBD_MD_FLBLOCKS) && dst->o_blocks != src->o_blocks)
+		return 1;
+	if ((compare & OBD_MD_FLBLKSZ) && dst->o_blksize != src->o_blksize)
+		return 1;
+
+	/* file type and the remaining mode bits are compared separately */
+	if ((compare & OBD_MD_FLTYPE) &&
+	    ((dst->o_mode ^ src->o_mode) & S_IFMT) != 0)
+		return 1;
+	if ((compare & OBD_MD_FLMODE) &&
+	    ((dst->o_mode ^ src->o_mode) & ~S_IFMT) != 0)
+		return 1;
+
+	/* ownership, flags and link count */
+	if ((compare & OBD_MD_FLUID) && dst->o_uid != src->o_uid)
+		return 1;
+	if ((compare & OBD_MD_FLGID) && dst->o_gid != src->o_gid)
+		return 1;
+	if ((compare & OBD_MD_FLFLAGS) && dst->o_flags != src->o_flags)
+		return 1;
+	if ((compare & OBD_MD_FLNLINK) && dst->o_nlink != src->o_nlink)
+		return 1;
+
+	/* parent FID: sequence and version under FLFID, object id under
+	 * FLGENER */
+	if ((compare & OBD_MD_FLFID) &&
+	    (dst->o_parent_seq != src->o_parent_seq ||
+	     dst->o_parent_ver != src->o_parent_ver))
+		return 1;
+	if ((compare & OBD_MD_FLGENER) &&
+	    dst->o_parent_oid != src->o_parent_oid)
+		return 1;
+
+	/* XXX Don't know if OBD_MD_FLINLINE (o_inline) should be compared
+	 * here as well - it wasn't previously. */
+	return 0;
+}
+EXPORT_SYMBOL(obdo_cmp_md);
+
+/**
+ * Initialise \a ioobj from \a oa.
+ *
+ * The object id is copied from oa->o_oi.  When \a oa does not have
+ * OBD_MD_FLGROUP set in o_valid, ostid_set_seq_mdt0() is applied to the
+ * copied id.
+ */
+void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj)
+{
+	int has_group = (oa->o_valid & OBD_MD_FLGROUP) != 0;
+
+	/* Since 2.4 this does not contain o_mode in the low 16 bits.
+	 * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs */
+	ioobj->ioo_max_brw = 0;
+
+	ioobj->ioo_oid = oa->o_oi;
+	if (unlikely(!has_group))
+		ostid_set_seq_mdt0(&ioobj->ioo_oid);
+}
+EXPORT_SYMBOL(obdo_to_ioobj);
+
+/**
+ * Translate the attributes selected by \a ia_valid (ATTR_* bits) from
+ * \a attr into \a oa, setting the matching OBD_MD_FL* bits in oa->o_valid.
+ *
+ * For ATTR_MODE both OBD_MD_FLTYPE and OBD_MD_FLMODE are set, and S_ISGID
+ * is dropped from the stored mode when the current task is neither in
+ * group oa->o_gid nor has CFS_CAP_FSETID.
+ */
+void obdo_from_iattr(struct obdo *oa, struct iattr *attr, unsigned int ia_valid)
+{
+	/* timestamps (seconds only) */
+	if (ia_valid & ATTR_ATIME) {
+		oa->o_valid |= OBD_MD_FLATIME;
+		oa->o_atime = LTIME_S(attr->ia_atime);
+	}
+	if (ia_valid & ATTR_MTIME) {
+		oa->o_valid |= OBD_MD_FLMTIME;
+		oa->o_mtime = LTIME_S(attr->ia_mtime);
+	}
+	if (ia_valid & ATTR_CTIME) {
+		oa->o_valid |= OBD_MD_FLCTIME;
+		oa->o_ctime = LTIME_S(attr->ia_ctime);
+	}
+
+	if (ia_valid & ATTR_SIZE) {
+		oa->o_valid |= OBD_MD_FLSIZE;
+		oa->o_size = attr->ia_size;
+	}
+
+	if (ia_valid & ATTR_MODE) {
+		oa->o_valid |= OBD_MD_FLTYPE | OBD_MD_FLMODE;
+		oa->o_mode = attr->ia_mode;
+		/* NOTE(review): o_gid is read before the ATTR_GID branch
+		 * below may update it, so the group check uses the gid
+		 * already present in the obdo - confirm this is intended */
+		if (!(current_is_in_group(oa->o_gid) ||
+		      cfs_capable(CFS_CAP_FSETID)))
+			oa->o_mode &= ~S_ISGID;
+	}
+
+	/* ownership */
+	if (ia_valid & ATTR_UID) {
+		oa->o_valid |= OBD_MD_FLUID;
+		oa->o_uid = attr->ia_uid;
+	}
+	if (ia_valid & ATTR_GID) {
+		oa->o_valid |= OBD_MD_FLGID;
+		oa->o_gid = attr->ia_gid;
+	}
+}
+EXPORT_SYMBOL(obdo_from_iattr);
+
+/**
+ * Translate obdo \a oa into \a attr.
+ *
+ * Only fields that are both requested in \a valid and marked in
+ * oa->o_valid are used.  attr->ia_valid is reset to 0 and then receives
+ * one ATTR_* bit for every field filled in.
+ */
+void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid)
+{
+	/* ignore requests for fields the obdo does not mark as valid */
+	valid &= oa->o_valid;
+
+	if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
+		CDEBUG(D_INODE, "valid "LPX64", new time "LPU64"/"LPU64"\n",
+		       oa->o_valid, oa->o_mtime, oa->o_ctime);
+
+	attr->ia_valid = 0;
+
+	/* timestamps (seconds only) */
+	if (valid & OBD_MD_FLATIME) {
+		attr->ia_valid |= ATTR_ATIME;
+		LTIME_S(attr->ia_atime) = oa->o_atime;
+	}
+	if (valid & OBD_MD_FLMTIME) {
+		attr->ia_valid |= ATTR_MTIME;
+		LTIME_S(attr->ia_mtime) = oa->o_mtime;
+	}
+	if (valid & OBD_MD_FLCTIME) {
+		attr->ia_valid |= ATTR_CTIME;
+		LTIME_S(attr->ia_ctime) = oa->o_ctime;
+	}
+
+	if (valid & OBD_MD_FLSIZE) {
+		attr->ia_valid |= ATTR_SIZE;
+		attr->ia_size = oa->o_size;
+	}
+
+	/* OBD_MD_FLTYPE is deliberately not translated: you shouldn't be
+	 * able to change a file's type with setattr */
+
+	if (valid & OBD_MD_FLMODE) {
+		attr->ia_valid |= ATTR_MODE;
+		/* the S_IFMT bits already present in attr->ia_mode are kept */
+		attr->ia_mode = (oa->o_mode & ~S_IFMT) |
+				(attr->ia_mode & S_IFMT);
+		if (!(current_is_in_group(oa->o_gid) ||
+		      cfs_capable(CFS_CAP_FSETID)))
+			attr->ia_mode &= ~S_ISGID;
+	}
+
+	/* ownership */
+	if (valid & OBD_MD_FLUID) {
+		attr->ia_valid |= ATTR_UID;
+		attr->ia_uid = oa->o_uid;
+	}
+	if (valid & OBD_MD_FLGID) {
+		attr->ia_valid |= ATTR_GID;
+		attr->ia_gid = oa->o_gid;
+	}
+}
+EXPORT_SYMBOL(iattr_from_obdo);
+
+/**
+ * Fill the attribute part of \a op_data from obdo \a oa.
+ *
+ * The iattr fields are handled by iattr_from_obdo(); block count and
+ * attribute flags are added on top of that.
+ *
+ * NOTE(review): iattr_from_obdo() masks its own copy of \a valid with
+ * oa->o_valid, while the two checks below use \a valid as passed in -
+ * confirm that the difference is intended.
+ */
+void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, obd_flag valid)
+{
+	iattr_from_obdo(&op_data->op_attr, oa, valid);
+
+	if (valid & OBD_MD_FLBLOCKS) {
+		op_data->op_attr.ia_valid |= ATTR_BLOCKS;
+		op_data->op_attr_blocks = oa->o_blocks;
+	}
+
+	if (valid & OBD_MD_FLFLAGS) {
+		/* op_attr is accessed through the ll_iattr layout here */
+		struct ll_iattr *lattr = (struct ll_iattr *)&op_data->op_attr;
+
+		lattr->ia_attr_flags = oa->o_flags;
+		op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG;
+	}
+}
+EXPORT_SYMBOL(md_from_obdo);
+
+/**
+ * Fill obdo \a oa from the attribute part of \a op_data.
+ *
+ * \a valid is a set of ATTR_* bits.  The iattr fields are handled by
+ * obdo_from_iattr(); ATTR_BLOCKS and ATTR_ATTR_FLAG additionally copy the
+ * block count and the attribute flags and set OBD_MD_FLBLOCKS and
+ * OBD_MD_FLFLAGS in oa->o_valid.
+ */
+void obdo_from_md(struct obdo *oa, struct md_op_data *op_data,
+		  unsigned int valid)
+{
+	obdo_from_iattr(oa, &op_data->op_attr, valid);
+
+	if (valid & ATTR_BLOCKS) {
+		oa->o_valid |= OBD_MD_FLBLOCKS;
+		oa->o_blocks = op_data->op_attr_blocks;
+	}
+
+	if (valid & ATTR_ATTR_FLAG) {
+		/* op_attr is accessed through the ll_iattr layout here */
+		struct ll_iattr *lattr = (struct ll_iattr *)&op_data->op_attr;
+
+		oa->o_valid |= OBD_MD_FLFLAGS;
+		oa->o_flags = lattr->ia_attr_flags;
+	}
+}
+EXPORT_SYMBOL(obdo_from_md);
+
+/**
+ * Store a little-endian copy of selected fields of \a sobdo in \a dobdo.
+ *
+ * Only the fields assigned below are written; every other member of
+ * \a dobdo is left as it was.
+ */
+void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo)
+{
+	/* 64-bit fields */
+	dobdo->o_valid = cpu_to_le64(sobdo->o_valid);
+	dobdo->o_size = cpu_to_le64(sobdo->o_size);
+	dobdo->o_blocks = cpu_to_le64(sobdo->o_blocks);
+	dobdo->o_atime = cpu_to_le64(sobdo->o_atime);
+	dobdo->o_mtime = cpu_to_le64(sobdo->o_mtime);
+	dobdo->o_ctime = cpu_to_le64(sobdo->o_ctime);
+
+	/* 32-bit fields */
+	dobdo->o_blksize = cpu_to_le32(sobdo->o_blksize);
+	dobdo->o_mode = cpu_to_le32(sobdo->o_mode);
+	dobdo->o_uid = cpu_to_le32(sobdo->o_uid);
+	dobdo->o_gid = cpu_to_le32(sobdo->o_gid);
+	dobdo->o_flags = cpu_to_le32(sobdo->o_flags);
+	dobdo->o_nlink = cpu_to_le32(sobdo->o_nlink);
+}
+EXPORT_SYMBOL(obdo_cpu_to_le);
+
+/**
+ * Store a CPU-endian copy of selected little-endian fields of \a sobdo in
+ * \a dobdo.
+ *
+ * Only the fields assigned below are written; every other member of
+ * \a dobdo is left as it was.
+ */
+void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo)
+{
+	/* 64-bit fields */
+	dobdo->o_valid = le64_to_cpu(sobdo->o_valid);
+	dobdo->o_size = le64_to_cpu(sobdo->o_size);
+	dobdo->o_blocks = le64_to_cpu(sobdo->o_blocks);
+	dobdo->o_atime = le64_to_cpu(sobdo->o_atime);
+	dobdo->o_mtime = le64_to_cpu(sobdo->o_mtime);
+	dobdo->o_ctime = le64_to_cpu(sobdo->o_ctime);
+
+	/* 32-bit fields */
+	dobdo->o_blksize = le32_to_cpu(sobdo->o_blksize);
+	dobdo->o_mode = le32_to_cpu(sobdo->o_mode);
+	dobdo->o_uid = le32_to_cpu(sobdo->o_uid);
+	dobdo->o_gid = le32_to_cpu(sobdo->o_gid);
+	dobdo->o_flags = le32_to_cpu(sobdo->o_flags);
+	dobdo->o_nlink = le32_to_cpu(sobdo->o_nlink);
+}
+EXPORT_SYMBOL(obdo_le_to_cpu);
diff --git a/drivers/staging/lustre/lustre/obdclass/statfs_pack.c b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c
new file mode 100644
index 000000000000..c3b7a78dba50
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c
@@ -0,0 +1,75 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/statfs_pack.c
+ *
+ * (Un)packing of OST/MDS requests
+ *
+ * Author: Andreas Dilger <adilger@clusterfs.com>
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+
+#include <lustre_export.h>
+#include <lustre_net.h>
+#include <obd_support.h>
+#include <obd_class.h>
+
+/**
+ * Convert a struct kstatfs into a struct obd_statfs.
+ *
+ * \a osfs is zeroed first, so every member that is not assigned below
+ * ends up as 0.
+ */
+void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs)
+{
+	memset(osfs, 0, sizeof(*osfs));
+
+	/* filesystem identity and geometry */
+	osfs->os_type = sfs->f_type;
+	osfs->os_bsize = sfs->f_bsize;
+	osfs->os_namelen = sfs->f_namelen;
+
+	/* block counters */
+	osfs->os_blocks = sfs->f_blocks;
+	osfs->os_bfree = sfs->f_bfree;
+	osfs->os_bavail = sfs->f_bavail;
+
+	/* inode counters */
+	osfs->os_files = sfs->f_files;
+	osfs->os_ffree = sfs->f_ffree;
+}
+EXPORT_SYMBOL(statfs_pack);
+
+/**
+ * Convert a struct obd_statfs back into a struct kstatfs.
+ *
+ * \a sfs is zeroed first, so every member that is not assigned below
+ * ends up as 0.
+ */
+void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs)
+{
+	memset(sfs, 0, sizeof(*sfs));
+
+	/* filesystem identity and geometry */
+	sfs->f_type = osfs->os_type;
+	sfs->f_bsize = osfs->os_bsize;
+	sfs->f_namelen = osfs->os_namelen;
+
+	/* block counters */
+	sfs->f_blocks = osfs->os_blocks;
+	sfs->f_bfree = osfs->os_bfree;
+	sfs->f_bavail = osfs->os_bavail;
+
+	/* inode counters */
+	sfs->f_files = osfs->os_files;
+	sfs->f_ffree = osfs->os_ffree;
+}
+EXPORT_SYMBOL(statfs_unpack);
diff --git a/drivers/staging/lustre/lustre/obdclass/uuid.c b/drivers/staging/lustre/lustre/obdclass/uuid.c
new file mode 100644
index 000000000000..af5f27f82bc5
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/uuid.c
@@ -0,0 +1,82 @@
+/*
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/uuid.c
+ *
+ * Conversion of binary class UUIDs to their textual representation
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+
+# include <linux/libcfs/libcfs.h>
+
+#include <obd_support.h>
+#include <obd_class.h>
+
+
+/**
+ * Read \a nob bytes at \a *ptr as one big-endian number and advance
+ * \a *ptr past them.
+ *
+ * \a nob must not exceed sizeof(__u32); this is asserted.  A \a nob of 0
+ * or less reads nothing and yields 0.
+ */
+static inline __u32 consume(int nob, __u8 **ptr)
+{
+	__u32 value = 0;
+
+	LASSERT(nob <= sizeof value);
+
+	while (nob > 0) {
+		/* earlier bytes end up in the more significant positions */
+		value = (value << 8) | **ptr;
+		++*ptr;
+		--nob;
+	}
+	return value;
+}
+
+/* Fill \a val with sizeof(val) bytes taken from \a *ptr through consume().
+ * The expansion is fully parenthesised so that it behaves as a single
+ * expression wherever the macro is used. */
+#define CONSUME(val, ptr) ((val) = consume(sizeof(val), (ptr)))
+
+/**
+ * Split the bytes of \a in into \a nr 16-bit words.
+ *
+ * \a nr words must cover class_uuid_t exactly; this is asserted.  The
+ * words are filled from the highest index down, so the first two bytes of
+ * \a in are stored in uu[nr - 1] and the last two in uu[0].
+ */
+static void uuid_unpack(class_uuid_t in, __u16 *uu, int nr)
+{
+	__u8 *cursor = in;
+	int idx;
+
+	LASSERT(nr * sizeof *uu == sizeof(class_uuid_t));
+
+	for (idx = nr - 1; idx >= 0; idx--)
+		CONSUME(uu[idx], &cursor);
+}
+
+/**
+ * Format the binary UUID \a uu as text into \a out.
+ *
+ * The UUID is split into eight 16-bit words by uuid_unpack() and printed
+ * as "%04x%04x-%04x-%04x-%04x-%04x%04x%04x", which is 36 characters plus
+ * the terminating NUL.
+ *
+ * \param uu   binary UUID to format
+ * \param out  receives the NUL-terminated textual form in out->uuid
+ */
+void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out)
+{
+	/* uu as an array of __u16's */
+	__u16 uuid[sizeof(class_uuid_t) / sizeof(__u16)];
+
+	CLASSERT(ARRAY_SIZE(uuid) == 8);
+
+	uuid_unpack(uu, uuid, ARRAY_SIZE(uuid));
+	/* bound the write by the destination size instead of relying on the
+	 * format never outgrowing it; assumes out->uuid is a char array */
+	snprintf(out->uuid, sizeof(out->uuid),
+		 "%04x%04x-%04x-%04x-%04x-%04x%04x%04x",
+		 uuid[0], uuid[1], uuid[2], uuid[3],
+		 uuid[4], uuid[5], uuid[6], uuid[7]);
+}
+EXPORT_SYMBOL(class_uuid_unparse);