diff options
Diffstat (limited to 'drivers/staging/lustre/lustre/obdclass')
42 files changed, 30096 insertions, 0 deletions
diff --git a/drivers/staging/lustre/lustre/obdclass/Makefile b/drivers/staging/lustre/lustre/obdclass/Makefile new file mode 100644 index 000000000000..b80c13c6f5dd --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/Makefile @@ -0,0 +1,13 @@ +obj-$(CONFIG_LUSTRE_FS) += obdclass.o llog_test.o + +obdclass-y := linux/linux-module.o linux/linux-obdo.o linux/linux-sysctl.o \ + llog.o llog_cat.o llog_obd.o llog_swab.o class_obd.o debug.o \ + genops.o uuid.o llog_ioctl.o lprocfs_status.o \ + lprocfs_jobstats.o lustre_handles.o lustre_peer.o llog_osd.o \ + local_storage.o statfs_pack.o obdo.o obd_config.o obd_mount.o\ + mea.o lu_object.o dt_object.o capa.o cl_object.o \ + cl_page.o cl_lock.o cl_io.o lu_ref.o acl.o idmap.o \ + lu_ucred.o + + +ccflags-y := -I$(src)/../include diff --git a/drivers/staging/lustre/lustre/obdclass/acl.c b/drivers/staging/lustre/lustre/obdclass/acl.c new file mode 100644 index 000000000000..c2a6702c9f2c --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/acl.c @@ -0,0 +1,546 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/acl.c + * + * Lustre Access Control List. + * + * Author: Fan Yong <fanyong@clusterfs.com> + */ + +#define DEBUG_SUBSYSTEM S_SEC +#include <lu_object.h> +#include <lustre_acl.h> +#include <lustre_eacl.h> +#include <obd_support.h> + +#ifdef CONFIG_FS_POSIX_ACL + +#define CFS_ACL_XATTR_VERSION POSIX_ACL_XATTR_VERSION + +enum { + ES_UNK = 0, /* unknown stat */ + ES_UNC = 1, /* ACL entry is not changed */ + ES_MOD = 2, /* ACL entry is modified */ + ES_ADD = 3, /* ACL entry is added */ + ES_DEL = 4 /* ACL entry is deleted */ +}; + +static inline void lustre_ext_acl_le_to_cpu(ext_acl_xattr_entry *d, + ext_acl_xattr_entry *s) +{ + d->e_tag = le16_to_cpu(s->e_tag); + d->e_perm = le16_to_cpu(s->e_perm); + d->e_id = le32_to_cpu(s->e_id); + d->e_stat = le32_to_cpu(s->e_stat); +} + +static inline void lustre_ext_acl_cpu_to_le(ext_acl_xattr_entry *d, + ext_acl_xattr_entry *s) +{ + d->e_tag = cpu_to_le16(s->e_tag); + d->e_perm = cpu_to_le16(s->e_perm); + d->e_id = cpu_to_le32(s->e_id); + d->e_stat = cpu_to_le32(s->e_stat); +} + +static inline void lustre_posix_acl_le_to_cpu(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) +{ + d->e_tag = le16_to_cpu(s->e_tag); + d->e_perm = le16_to_cpu(s->e_perm); + d->e_id = le32_to_cpu(s->e_id); +} + +static inline void 
lustre_posix_acl_cpu_to_le(posix_acl_xattr_entry *d, + posix_acl_xattr_entry *s) +{ + d->e_tag = cpu_to_le16(s->e_tag); + d->e_perm = cpu_to_le16(s->e_perm); + d->e_id = cpu_to_le32(s->e_id); +} + + +/* if "new_count == 0", then "new = {a_version, NULL}", NOT NULL. */ +static int lustre_posix_acl_xattr_reduce_space(posix_acl_xattr_header **header, + int old_count, int new_count) +{ + int old_size = CFS_ACL_XATTR_SIZE(old_count, posix_acl_xattr); + int new_size = CFS_ACL_XATTR_SIZE(new_count, posix_acl_xattr); + posix_acl_xattr_header *new; + + if (unlikely(old_count <= new_count)) + return old_size; + + OBD_ALLOC(new, new_size); + if (unlikely(new == NULL)) + return -ENOMEM; + + memcpy(new, *header, new_size); + OBD_FREE(*header, old_size); + *header = new; + return new_size; +} + +/* if "new_count == 0", then "new = {0, NULL}", NOT NULL. */ +static int lustre_ext_acl_xattr_reduce_space(ext_acl_xattr_header **header, + int old_count) +{ + int ext_count = le32_to_cpu((*header)->a_count); + int ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr); + int old_size = CFS_ACL_XATTR_SIZE(old_count, ext_acl_xattr); + ext_acl_xattr_header *new; + + if (unlikely(old_count <= ext_count)) + return 0; + + OBD_ALLOC(new, ext_size); + if (unlikely(new == NULL)) + return -ENOMEM; + + memcpy(new, *header, ext_size); + OBD_FREE(*header, old_size); + *header = new; + return 0; +} + +/* + * Generate new extended ACL based on the posix ACL. 
+ */ +ext_acl_xattr_header * +lustre_posix_acl_xattr_2ext(posix_acl_xattr_header *header, int size) +{ + int count, i, esize; + ext_acl_xattr_header *new; + ENTRY; + + if (unlikely(size < 0)) + RETURN(ERR_PTR(-EINVAL)); + else if (!size) + count = 0; + else + count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + esize = CFS_ACL_XATTR_SIZE(count, ext_acl_xattr); + OBD_ALLOC(new, esize); + if (unlikely(new == NULL)) + RETURN(ERR_PTR(-ENOMEM)); + + new->a_count = cpu_to_le32(count); + for (i = 0; i < count; i++) { + new->a_entries[i].e_tag = header->a_entries[i].e_tag; + new->a_entries[i].e_perm = header->a_entries[i].e_perm; + new->a_entries[i].e_id = header->a_entries[i].e_id; + new->a_entries[i].e_stat = cpu_to_le32(ES_UNK); + } + + RETURN(new); +} +EXPORT_SYMBOL(lustre_posix_acl_xattr_2ext); + +/* + * Filter out the "nobody" entries in the posix ACL. + */ +int lustre_posix_acl_xattr_filter(posix_acl_xattr_header *header, int size, + posix_acl_xattr_header **out) +{ + int count, i, j, rc = 0; + __u32 id; + posix_acl_xattr_header *new; + ENTRY; + + if (unlikely(size < 0)) + RETURN(-EINVAL); + else if (!size) + RETURN(0); + + OBD_ALLOC(new, size); + if (unlikely(new == NULL)) + RETURN(-ENOMEM); + + new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION); + count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + for (i = 0, j = 0; i < count; i++) { + id = le32_to_cpu(header->a_entries[i].e_id); + switch (le16_to_cpu(header->a_entries[i].e_tag)) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + if (id != ACL_UNDEFINED_ID) + GOTO(_out, rc = -EIO); + + memcpy(&new->a_entries[j++], &header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + break; + case ACL_USER: + if (id != NOBODY_UID) + memcpy(&new->a_entries[j++], + &header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + break; + case ACL_GROUP: + if (id != NOBODY_GID) + memcpy(&new->a_entries[j++], + &header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + break; + default: + GOTO(_out, 
rc = -EIO); + } + } + + /* free unused space. */ + rc = lustre_posix_acl_xattr_reduce_space(&new, count, j); + if (rc >= 0) { + size = rc; + *out = new; + rc = 0; + } + EXIT; + +_out: + if (rc) { + OBD_FREE(new, size); + size = rc; + } + return size; +} +EXPORT_SYMBOL(lustre_posix_acl_xattr_filter); + +/* + * Release the posix ACL space. + */ +void lustre_posix_acl_xattr_free(posix_acl_xattr_header *header, int size) +{ + OBD_FREE(header, size); +} +EXPORT_SYMBOL(lustre_posix_acl_xattr_free); + +/* + * Release the extended ACL space. + */ +void lustre_ext_acl_xattr_free(ext_acl_xattr_header *header) +{ + OBD_FREE(header, CFS_ACL_XATTR_SIZE(le32_to_cpu(header->a_count), \ + ext_acl_xattr)); +} +EXPORT_SYMBOL(lustre_ext_acl_xattr_free); + +static ext_acl_xattr_entry * +lustre_ext_acl_xattr_search(ext_acl_xattr_header *header, + posix_acl_xattr_entry *entry, int *pos) +{ + int once, start, end, i, j, count = le32_to_cpu(header->a_count); + + once = 0; + start = *pos; + end = count; + +again: + for (i = start; i < end; i++) { + if (header->a_entries[i].e_tag == entry->e_tag && + header->a_entries[i].e_id == entry->e_id) { + j = i; + if (++i >= count) + i = 0; + *pos = i; + return &header->a_entries[j]; + } + } + + if (!once) { + once = 1; + start = 0; + end = *pos; + goto again; + } + + return NULL; +} + +/* + * Merge the posix ACL and the extended ACL into new posix ACL. + */ +int lustre_acl_xattr_merge2posix(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header, + posix_acl_xattr_header **out) +{ + int posix_count, posix_size, i, j; + int ext_count = le32_to_cpu(ext_header->a_count), pos = 0, rc = 0; + posix_acl_xattr_entry pe = {ACL_MASK, 0, ACL_UNDEFINED_ID}; + posix_acl_xattr_header *new; + ext_acl_xattr_entry *ee, ae; + ENTRY; + + lustre_posix_acl_cpu_to_le(&pe, &pe); + ee = lustre_ext_acl_xattr_search(ext_header, &pe, &pos); + if (ee == NULL || le32_to_cpu(ee->e_stat) == ES_DEL) { + /* there are only base ACL entries at most. 
*/ + posix_count = 3; + posix_size = CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr); + OBD_ALLOC(new, posix_size); + if (unlikely(new == NULL)) + RETURN(-ENOMEM); + + new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION); + for (i = 0, j = 0; i < ext_count; i++) { + lustre_ext_acl_le_to_cpu(&ae, + &ext_header->a_entries[i]); + switch (ae.e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_OTHER: + if (ae.e_id != ACL_UNDEFINED_ID) + GOTO(_out, rc = -EIO); + + if (ae.e_stat != ES_DEL) { + new->a_entries[j].e_tag = + ext_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + ext_header->a_entries[i].e_perm; + new->a_entries[j++].e_id = + ext_header->a_entries[i].e_id; + } + break; + case ACL_MASK: + case ACL_USER: + case ACL_GROUP: + if (ae.e_stat == ES_DEL) + break; + default: + GOTO(_out, rc = -EIO); + } + } + } else { + /* maybe there are valid ACL_USER or ACL_GROUP entries in the + * original server-side ACL, they are regarded as ES_UNC stat.*/ + int ori_posix_count; + + if (unlikely(size < 0)) + RETURN(-EINVAL); + else if (!size) + ori_posix_count = 0; + else + ori_posix_count = + CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + posix_count = ori_posix_count + ext_count; + posix_size = + CFS_ACL_XATTR_SIZE(posix_count, posix_acl_xattr); + OBD_ALLOC(new, posix_size); + if (unlikely(new == NULL)) + RETURN(-ENOMEM); + + new->a_version = cpu_to_le32(CFS_ACL_XATTR_VERSION); + /* 1. process the unchanged ACL entries + * in the original server-side ACL. */ + pos = 0; + for (i = 0, j = 0; i < ori_posix_count; i++) { + ee = lustre_ext_acl_xattr_search(ext_header, + &posix_header->a_entries[i], &pos); + if (ee == NULL) + memcpy(&new->a_entries[j++], + &posix_header->a_entries[i], + sizeof(posix_acl_xattr_entry)); + } + + /* 2. process the non-deleted entries + * from client-side extended ACL. 
*/ + for (i = 0; i < ext_count; i++) { + if (le16_to_cpu(ext_header->a_entries[i].e_stat) != + ES_DEL) { + new->a_entries[j].e_tag = + ext_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + ext_header->a_entries[i].e_perm; + new->a_entries[j++].e_id = + ext_header->a_entries[i].e_id; + } + } + } + + /* free unused space. */ + rc = lustre_posix_acl_xattr_reduce_space(&new, posix_count, j); + if (rc >= 0) { + posix_size = rc; + *out = new; + rc = 0; + } + EXIT; + +_out: + if (rc) { + OBD_FREE(new, posix_size); + posix_size = rc; + } + return posix_size; +} +EXPORT_SYMBOL(lustre_acl_xattr_merge2posix); + +/* + * Merge the posix ACL and the extended ACL into new extended ACL. + */ +ext_acl_xattr_header * +lustre_acl_xattr_merge2ext(posix_acl_xattr_header *posix_header, int size, + ext_acl_xattr_header *ext_header) +{ + int ori_ext_count, posix_count, ext_count, ext_size; + int i, j, pos = 0, rc = 0; + posix_acl_xattr_entry pae; + ext_acl_xattr_header *new; + ext_acl_xattr_entry *ee, eae; + ENTRY; + + if (unlikely(size < 0)) + RETURN(ERR_PTR(-EINVAL)); + else if (!size) + posix_count = 0; + else + posix_count = CFS_ACL_XATTR_COUNT(size, posix_acl_xattr); + ori_ext_count = le32_to_cpu(ext_header->a_count); + ext_count = posix_count + ori_ext_count; + ext_size = CFS_ACL_XATTR_SIZE(ext_count, ext_acl_xattr); + + OBD_ALLOC(new, ext_size); + if (unlikely(new == NULL)) + RETURN(ERR_PTR(-ENOMEM)); + + for (i = 0, j = 0; i < posix_count; i++) { + lustre_posix_acl_le_to_cpu(&pae, &posix_header->a_entries[i]); + switch (pae.e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + if (pae.e_id != ACL_UNDEFINED_ID) + GOTO(out, rc = -EIO); + case ACL_USER: + /* ignore "nobody" entry. 
*/ + if (pae.e_id == NOBODY_UID) + break; + + new->a_entries[j].e_tag = + posix_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + posix_header->a_entries[i].e_perm; + new->a_entries[j].e_id = + posix_header->a_entries[i].e_id; + ee = lustre_ext_acl_xattr_search(ext_header, + &posix_header->a_entries[i], &pos); + if (ee) { + if (posix_header->a_entries[i].e_perm != + ee->e_perm) + /* entry modified. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_MOD); + else + /* entry unchanged. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_UNC); + } else { + /* new entry. */ + new->a_entries[j++].e_stat = + cpu_to_le32(ES_ADD); + } + break; + case ACL_GROUP: + /* ignore "nobody" entry. */ + if (pae.e_id == NOBODY_GID) + break; + new->a_entries[j].e_tag = + posix_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + posix_header->a_entries[i].e_perm; + new->a_entries[j].e_id = + posix_header->a_entries[i].e_id; + ee = lustre_ext_acl_xattr_search(ext_header, + &posix_header->a_entries[i], &pos); + if (ee) { + if (posix_header->a_entries[i].e_perm != + ee->e_perm) + /* entry modified. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_MOD); + else + /* entry unchanged. */ + ee->e_stat = + new->a_entries[j++].e_stat = + cpu_to_le32(ES_UNC); + } else { + /* new entry. */ + new->a_entries[j++].e_stat = + cpu_to_le32(ES_ADD); + } + break; + default: + GOTO(out, rc = -EIO); + } + } + + /* process deleted entries. */ + for (i = 0; i < ori_ext_count; i++) { + lustre_ext_acl_le_to_cpu(&eae, &ext_header->a_entries[i]); + if (eae.e_stat == ES_UNK) { + /* ignore "nobody" entry. 
*/ + if ((eae.e_tag == ACL_USER && eae.e_id == NOBODY_UID) || + (eae.e_tag == ACL_GROUP && eae.e_id == NOBODY_GID)) + continue; + + new->a_entries[j].e_tag = + ext_header->a_entries[i].e_tag; + new->a_entries[j].e_perm = + ext_header->a_entries[i].e_perm; + new->a_entries[j].e_id = ext_header->a_entries[i].e_id; + new->a_entries[j++].e_stat = cpu_to_le32(ES_DEL); + } + } + + new->a_count = cpu_to_le32(j); + /* free unused space. */ + rc = lustre_ext_acl_xattr_reduce_space(&new, ext_count); + EXIT; + +out: + if (rc) { + OBD_FREE(new, ext_size); + new = ERR_PTR(rc); + } + return new; +} +EXPORT_SYMBOL(lustre_acl_xattr_merge2ext); + +#endif diff --git a/drivers/staging/lustre/lustre/obdclass/capa.c b/drivers/staging/lustre/lustre/obdclass/capa.c new file mode 100644 index 000000000000..3e532f5106e4 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/capa.c @@ -0,0 +1,401 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. 
All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/capa.c + * + * Lustre Capability Hash Management + * + * Author: Lai Siyao<lsy@clusterfs.com> + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include <linux/version.h> +#include <linux/fs.h> +#include <asm/unistd.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/init.h> + +#include <obd_class.h> +#include <lustre_debug.h> +#include <lustre/lustre_idl.h> + +#include <linux/list.h> +#include <lustre_capa.h> + +#define NR_CAPAHASH 32 +#define CAPA_HASH_SIZE 3000 /* for MDS & OSS */ + +struct kmem_cache *capa_cachep = NULL; + +/* lock for capa hash/capa_list/fo_capa_keys */ +DEFINE_SPINLOCK(capa_lock); + +struct list_head capa_list[CAPA_SITE_MAX]; + +static struct capa_hmac_alg capa_hmac_algs[] = { + DEF_CAPA_HMAC_ALG("sha1", SHA1, 20, 20), +}; +/* capa count */ +int capa_count[CAPA_SITE_MAX] = { 0, }; + +EXPORT_SYMBOL(capa_cachep); +EXPORT_SYMBOL(capa_list); +EXPORT_SYMBOL(capa_lock); +EXPORT_SYMBOL(capa_count); + +struct hlist_head *init_capa_hash(void) +{ + struct hlist_head *hash; + int nr_hash, i; + + OBD_ALLOC(hash, PAGE_CACHE_SIZE); + if (!hash) + return NULL; + + nr_hash = PAGE_CACHE_SIZE / sizeof(struct hlist_head); + LASSERT(nr_hash > NR_CAPAHASH); + + for (i = 0; i < NR_CAPAHASH; i++) + INIT_HLIST_HEAD(hash + i); + return hash; +} +EXPORT_SYMBOL(init_capa_hash); + +static inline int capa_on_server(struct obd_capa *ocapa) +{ + return ocapa->c_site == CAPA_SITE_SERVER; +} + +static inline void capa_delete(struct obd_capa *ocapa) +{ + LASSERT(capa_on_server(ocapa)); + hlist_del_init(&ocapa->u.tgt.c_hash); + list_del_init(&ocapa->c_list); + capa_count[ocapa->c_site]--; + /* release the ref when alloc */ + capa_put(ocapa); +} + +void cleanup_capa_hash(struct hlist_head *hash) +{ + int i; + struct 
hlist_node *next; + struct obd_capa *oc; + + spin_lock(&capa_lock); + for (i = 0; i < NR_CAPAHASH; i++) { + hlist_for_each_entry_safe(oc, next, hash + i, + u.tgt.c_hash) + capa_delete(oc); + } + spin_unlock(&capa_lock); + + OBD_FREE(hash, PAGE_CACHE_SIZE); +} +EXPORT_SYMBOL(cleanup_capa_hash); + +static inline int capa_hashfn(struct lu_fid *fid) +{ + return (fid_oid(fid) ^ fid_ver(fid)) * + (unsigned long)(fid_seq(fid) + 1) % NR_CAPAHASH; +} + +/* capa renewal time check is earlier than that on client, which is to prevent + * client renew right after obtaining it. */ +static inline int capa_is_to_expire(struct obd_capa *oc) +{ + return cfs_time_before(cfs_time_sub(oc->c_expiry, + cfs_time_seconds(oc->c_capa.lc_timeout)*2/3), + cfs_time_current()); +} + +static struct obd_capa *find_capa(struct lustre_capa *capa, + struct hlist_head *head, int alive) +{ + struct obd_capa *ocapa; + int len = alive ? offsetof(struct lustre_capa, lc_keyid):sizeof(*capa); + + hlist_for_each_entry(ocapa, head, u.tgt.c_hash) { + if (memcmp(&ocapa->c_capa, capa, len)) + continue; + /* don't return one that will expire soon in this case */ + if (alive && capa_is_to_expire(ocapa)) + continue; + + LASSERT(capa_on_server(ocapa)); + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found"); + return ocapa; + } + + return NULL; +} + +#define LRU_CAPA_DELETE_COUNT 12 +static inline void capa_delete_lru(struct list_head *head) +{ + struct obd_capa *ocapa; + struct list_head *node = head->next; + int count = 0; + + /* free LRU_CAPA_DELETE_COUNT unused capa from head */ + while (count++ < LRU_CAPA_DELETE_COUNT) { + ocapa = list_entry(node, struct obd_capa, c_list); + node = node->next; + if (atomic_read(&ocapa->c_refc)) + continue; + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free lru"); + capa_delete(ocapa); + } +} + +/* add or update */ +struct obd_capa *capa_add(struct hlist_head *hash, struct lustre_capa *capa) +{ + struct hlist_head *head = hash + capa_hashfn(&capa->lc_fid); + struct obd_capa *ocapa, *old = NULL; 
+ struct list_head *list = &capa_list[CAPA_SITE_SERVER]; + + ocapa = alloc_capa(CAPA_SITE_SERVER); + if (IS_ERR(ocapa)) + return NULL; + + spin_lock(&capa_lock); + old = find_capa(capa, head, 0); + if (!old) { + ocapa->c_capa = *capa; + set_capa_expiry(ocapa); + hlist_add_head(&ocapa->u.tgt.c_hash, head); + list_add_tail(&ocapa->c_list, list); + capa_get(ocapa); + capa_count[CAPA_SITE_SERVER]++; + if (capa_count[CAPA_SITE_SERVER] > CAPA_HASH_SIZE) + capa_delete_lru(list); + spin_unlock(&capa_lock); + return ocapa; + } else { + capa_get(old); + spin_unlock(&capa_lock); + capa_put(ocapa); + return old; + } +} +EXPORT_SYMBOL(capa_add); + +struct obd_capa *capa_lookup(struct hlist_head *hash, struct lustre_capa *capa, + int alive) +{ + struct obd_capa *ocapa; + + spin_lock(&capa_lock); + ocapa = find_capa(capa, hash + capa_hashfn(&capa->lc_fid), alive); + if (ocapa) { + list_move_tail(&ocapa->c_list, + &capa_list[CAPA_SITE_SERVER]); + capa_get(ocapa); + } + spin_unlock(&capa_lock); + + return ocapa; +} +EXPORT_SYMBOL(capa_lookup); + +int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key) +{ + struct ll_crypto_hash *tfm; + struct capa_hmac_alg *alg; + int keylen; + struct scatterlist sl; + + if (capa_alg(capa) != CAPA_HMAC_ALG_SHA1) { + CERROR("unknown capability hmac algorithm!\n"); + return -EFAULT; + } + + alg = &capa_hmac_algs[capa_alg(capa)]; + + tfm = ll_crypto_alloc_hash(alg->ha_name, 0, 0); + if (!tfm) { + CERROR("crypto_alloc_tfm failed, check whether your kernel" + "has crypto support!\n"); + return -ENOMEM; + } + keylen = alg->ha_keylen; + + sg_set_page(&sl, virt_to_page(capa), + offsetof(struct lustre_capa, lc_hmac), + (unsigned long)(capa) % PAGE_CACHE_SIZE); + + ll_crypto_hmac(tfm, key, &keylen, &sl, sl.length, hmac); + ll_crypto_free_hash(tfm); + + return 0; +} +EXPORT_SYMBOL(capa_hmac); + +int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen) +{ + struct ll_crypto_cipher *tfm; + struct scatterlist sd; + struct scatterlist ss; + struct 
blkcipher_desc desc; + unsigned int min; + int rc; + char alg[CRYPTO_MAX_ALG_NAME+1] = "aes"; + ENTRY; + + /* passing "aes" in a variable instead of a constant string keeps gcc + * 4.3.2 happy */ + tfm = ll_crypto_alloc_blkcipher(alg, 0, 0 ); + if (IS_ERR(tfm)) { + CERROR("failed to load transform for aes\n"); + RETURN(PTR_ERR(tfm)); + } + + min = ll_crypto_tfm_alg_min_keysize(tfm); + if (keylen < min) { + CERROR("keylen at least %d bits for aes\n", min * 8); + GOTO(out, rc = -EINVAL); + } + + rc = ll_crypto_blkcipher_setkey(tfm, key, min); + if (rc) { + CERROR("failed to setting key for aes\n"); + GOTO(out, rc); + } + + sg_set_page(&sd, virt_to_page(d), 16, + (unsigned long)(d) % PAGE_CACHE_SIZE); + + sg_set_page(&ss, virt_to_page(s), 16, + (unsigned long)(s) % PAGE_CACHE_SIZE); + desc.tfm = tfm; + desc.info = NULL; + desc.flags = 0; + rc = ll_crypto_blkcipher_encrypt(&desc, &sd, &ss, 16); + if (rc) { + CERROR("failed to encrypt for aes\n"); + GOTO(out, rc); + } + + EXIT; + +out: + ll_crypto_free_blkcipher(tfm); + return rc; +} +EXPORT_SYMBOL(capa_encrypt_id); + +int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen) +{ + struct ll_crypto_cipher *tfm; + struct scatterlist sd; + struct scatterlist ss; + struct blkcipher_desc desc; + unsigned int min; + int rc; + char alg[CRYPTO_MAX_ALG_NAME+1] = "aes"; + ENTRY; + + /* passing "aes" in a variable instead of a constant string keeps gcc + * 4.3.2 happy */ + tfm = ll_crypto_alloc_blkcipher(alg, 0, 0 ); + if (IS_ERR(tfm)) { + CERROR("failed to load transform for aes\n"); + RETURN(PTR_ERR(tfm)); + } + + min = ll_crypto_tfm_alg_min_keysize(tfm); + if (keylen < min) { + CERROR("keylen at least %d bits for aes\n", min * 8); + GOTO(out, rc = -EINVAL); + } + + rc = ll_crypto_blkcipher_setkey(tfm, key, min); + if (rc) { + CERROR("failed to setting key for aes\n"); + GOTO(out, rc); + } + + sg_set_page(&sd, virt_to_page(d), 16, + (unsigned long)(d) % PAGE_CACHE_SIZE); + + sg_set_page(&ss, virt_to_page(s), 16, + (unsigned 
long)(s) % PAGE_CACHE_SIZE); + + desc.tfm = tfm; + desc.info = NULL; + desc.flags = 0; + rc = ll_crypto_blkcipher_decrypt(&desc, &sd, &ss, 16); + if (rc) { + CERROR("failed to decrypt for aes\n"); + GOTO(out, rc); + } + + EXIT; + +out: + ll_crypto_free_blkcipher(tfm); + return rc; +} +EXPORT_SYMBOL(capa_decrypt_id); + +void capa_cpy(void *capa, struct obd_capa *ocapa) +{ + spin_lock(&ocapa->c_lock); + *(struct lustre_capa *)capa = ocapa->c_capa; + spin_unlock(&ocapa->c_lock); +} +EXPORT_SYMBOL(capa_cpy); + +void _debug_capa(struct lustre_capa *c, + struct libcfs_debug_msg_data *msgdata, + const char *fmt, ... ) +{ + va_list args; + va_start(args, fmt); + libcfs_debug_vmsg2(msgdata, fmt, args, + " capability@%p fid "DFID" opc "LPX64" uid "LPU64 + " gid "LPU64" flags %u alg %d keyid %u timeout %u " + "expiry %u\n", c, PFID(capa_fid(c)), capa_opc(c), + capa_uid(c), capa_gid(c), capa_flags(c), + capa_alg(c), capa_keyid(c), capa_timeout(c), + capa_expiry(c)); + va_end(args); +} +EXPORT_SYMBOL(_debug_capa); diff --git a/drivers/staging/lustre/lustre/obdclass/cl_internal.h b/drivers/staging/lustre/lustre/obdclass/cl_internal.h new file mode 100644 index 000000000000..7eb0ad7b3644 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/cl_internal.h @@ -0,0 +1,121 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal cl interfaces. + * + * Author: Nikita Danilov <nikita.danilov@sun.com> + */ +#ifndef _CL_INTERNAL_H +#define _CL_INTERNAL_H + +#define CLT_PVEC_SIZE (14) + +/** + * Possible levels of the nesting. Currently this is 2: there are "top" + * entities (files, extent locks), and "sub" entities (stripes and stripe + * locks). This is used only for debugging counters right now. + */ +enum clt_nesting_level { + CNL_TOP, + CNL_SUB, + CNL_NR +}; + +/** + * Counters used to check correctness of cl_lock interface usage. + */ +struct cl_thread_counters { + /** + * Number of outstanding calls to cl_lock_mutex_get() made by the + * current thread. For debugging. + */ + int ctc_nr_locks_locked; + /** List of locked locks. */ + struct lu_ref ctc_locks_locked; + /** Number of outstanding holds on locks. */ + int ctc_nr_held; + /** Number of outstanding uses on locks. */ + int ctc_nr_used; + /** Number of held extent locks. */ + int ctc_nr_locks_acquired; +}; + +/** + * Thread local state internal for generic cl-code. + */ +struct cl_thread_info { + /* + * Common fields. + */ + struct cl_io clt_io; + struct cl_2queue clt_queue; + + /* + * Fields used by cl_lock.c + */ + struct cl_lock_descr clt_descr; + struct cl_page_list clt_list; + /** + * Counters for every level of lock nesting. 
+ */ + struct cl_thread_counters clt_counters[CNL_NR]; + /** @} debugging */ + + /* + * Fields used by cl_page.c + */ + struct cl_page *clt_pvec[CLT_PVEC_SIZE]; + + /* + * Fields used by cl_io.c + */ + /** + * Pointer to the topmost ongoing IO in this thread. + */ + struct cl_io *clt_current_io; + /** + * Used for submitting a sync io. + */ + struct cl_sync_io clt_anchor; + /** + * Fields used by cl_lock_discard_pages(). + */ + pgoff_t clt_next_index; + pgoff_t clt_fn_index; /* first non-overlapped index */ +}; + +struct cl_thread_info *cl_env_info(const struct lu_env *env); + +#endif /* _CL_INTERNAL_H */ diff --git a/drivers/staging/lustre/lustre/obdclass/cl_io.c b/drivers/staging/lustre/lustre/obdclass/cl_io.c new file mode 100644 index 000000000000..75c9be8875e0 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/cl_io.c @@ -0,0 +1,1753 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client IO. + * + * Author: Nikita Danilov <nikita.danilov@sun.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <obd_class.h> +#include <obd_support.h> +#include <lustre_fid.h> +#include <linux/list.h> +#include <cl_object.h> +#include "cl_internal.h" + +/***************************************************************************** + * + * cl_io interface. + * + */ + +#define cl_io_for_each(slice, io) \ + list_for_each_entry((slice), &io->ci_layers, cis_linkage) +#define cl_io_for_each_reverse(slice, io) \ + list_for_each_entry_reverse((slice), &io->ci_layers, cis_linkage) + +static inline int cl_io_type_is_valid(enum cl_io_type type) +{ + return CIT_READ <= type && type < CIT_OP_NR; +} + +static inline int cl_io_is_loopable(const struct cl_io *io) +{ + return cl_io_type_is_valid(io->ci_type) && io->ci_type != CIT_MISC; +} + +/** + * Returns true iff there is an IO ongoing in the given environment. + */ +int cl_io_is_going(const struct lu_env *env) +{ + return cl_env_info(env)->clt_current_io != NULL; +} +EXPORT_SYMBOL(cl_io_is_going); + +/** + * cl_io invariant that holds at all times when exported cl_io_*() functions + * are entered and left. + */ +static int cl_io_invariant(const struct cl_io *io) +{ + struct cl_io *up; + + up = io->ci_parent; + return + /* + * io can own pages only when it is ongoing. Sub-io might + * still be in CIS_LOCKED state when top-io is in + * CIS_IO_GOING. + */ + ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING || + (io->ci_state == CIS_LOCKED && up != NULL)); +} + +/** + * Finalize \a io, by calling cl_io_operations::cio_fini() bottom-to-top. 
+ */ +void cl_io_fini(const struct lu_env *env, struct cl_io *io) +{ + struct cl_io_slice *slice; + struct cl_thread_info *info; + + LINVRNT(cl_io_type_is_valid(io->ci_type)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + while (!list_empty(&io->ci_layers)) { + slice = container_of(io->ci_layers.prev, struct cl_io_slice, + cis_linkage); + list_del_init(&slice->cis_linkage); + if (slice->cis_iop->op[io->ci_type].cio_fini != NULL) + slice->cis_iop->op[io->ci_type].cio_fini(env, slice); + /* + * Invalidate slice to catch use after free. This assumes that + * slices are allocated within session and can be touched + * after ->cio_fini() returns. + */ + slice->cis_io = NULL; + } + io->ci_state = CIS_FINI; + info = cl_env_info(env); + if (info->clt_current_io == io) + info->clt_current_io = NULL; + + /* sanity check for layout change */ + switch(io->ci_type) { + case CIT_READ: + case CIT_WRITE: + break; + case CIT_FAULT: + case CIT_FSYNC: + LASSERT(!io->ci_need_restart); + break; + case CIT_SETATTR: + case CIT_MISC: + /* Check ignore layout change conf */ + LASSERT(ergo(io->ci_ignore_layout || !io->ci_verify_layout, + !io->ci_need_restart)); + break; + default: + LBUG(); + } + EXIT; +} +EXPORT_SYMBOL(cl_io_fini); + +static int cl_io_init0(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_object *scan; + int result; + + LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI); + LINVRNT(cl_io_type_is_valid(iot)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_type = iot; + INIT_LIST_HEAD(&io->ci_lockset.cls_todo); + INIT_LIST_HEAD(&io->ci_lockset.cls_curr); + INIT_LIST_HEAD(&io->ci_lockset.cls_done); + INIT_LIST_HEAD(&io->ci_layers); + + result = 0; + cl_object_for_each(scan, obj) { + if (scan->co_ops->coo_io_init != NULL) { + result = scan->co_ops->coo_io_init(env, scan, io); + if (result != 0) + break; + } + } + if (result == 0) + io->ci_state = CIS_INIT; + RETURN(result); +} + +/** + * Initialize sub-io, by 
calling cl_io_operations::cio_init() top-to-bottom. + * + * \pre obj != cl_object_top(obj) + */ +int cl_io_sub_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_thread_info *info = cl_env_info(env); + + LASSERT(obj != cl_object_top(obj)); + if (info->clt_current_io == NULL) + info->clt_current_io = io; + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_sub_init); + +/** + * Initialize \a io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * Caller has to call cl_io_fini() after a call to cl_io_init(), no matter + * what the latter returned. + * + * \pre obj == cl_object_top(obj) + * \pre cl_io_type_is_valid(iot) + * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot + */ +int cl_io_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_thread_info *info = cl_env_info(env); + + LASSERT(obj == cl_object_top(obj)); + LASSERT(info->clt_current_io == NULL); + + info->clt_current_io = io; + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_init); + +/** + * Initialize read or write io. 
+ * + * \pre iot == CIT_READ || iot == CIT_WRITE + */ +int cl_io_rw_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count) +{ + LINVRNT(iot == CIT_READ || iot == CIT_WRITE); + LINVRNT(io->ci_obj != NULL); + ENTRY; + + LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu, + "io range: %u ["LPU64", "LPU64") %u %u\n", + iot, (__u64)pos, (__u64)pos + count, + io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append); + io->u.ci_rw.crw_pos = pos; + io->u.ci_rw.crw_count = count; + RETURN(cl_io_init(env, io, iot, io->ci_obj)); +} +EXPORT_SYMBOL(cl_io_rw_init); + +static inline const struct lu_fid * +cl_lock_descr_fid(const struct cl_lock_descr *descr) +{ + return lu_object_fid(&descr->cld_obj->co_lu); +} + +static int cl_lock_descr_sort(const struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + return lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1)) ?: + __diff_normalize(d0->cld_start, d1->cld_start); +} + +static int cl_lock_descr_cmp(const struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + int ret; + + ret = lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1)); + if (ret) + return ret; + if (d0->cld_end < d1->cld_start) + return -1; + if (d0->cld_start > d0->cld_end) + return 1; + return 0; +} + +static void cl_lock_descr_merge(struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + d0->cld_start = min(d0->cld_start, d1->cld_start); + d0->cld_end = max(d0->cld_end, d1->cld_end); + + if (d1->cld_mode == CLM_WRITE && d0->cld_mode != CLM_WRITE) + d0->cld_mode = CLM_WRITE; + + if (d1->cld_mode == CLM_GROUP && d0->cld_mode != CLM_GROUP) + d0->cld_mode = CLM_GROUP; +} + +/* + * Sort locks in lexicographical order of their (fid, start-offset) pairs. + */ +static void cl_io_locks_sort(struct cl_io *io) +{ + int done = 0; + + ENTRY; + /* hidden treasure: bubble sort for now. 
*/ + do { + struct cl_io_lock_link *curr; + struct cl_io_lock_link *prev; + struct cl_io_lock_link *temp; + + done = 1; + prev = NULL; + + list_for_each_entry_safe(curr, temp, + &io->ci_lockset.cls_todo, + cill_linkage) { + if (prev != NULL) { + switch (cl_lock_descr_sort(&prev->cill_descr, + &curr->cill_descr)) { + case 0: + /* + * IMPOSSIBLE: Identical locks are + * already removed at + * this point. + */ + default: + LBUG(); + case +1: + list_move_tail(&curr->cill_linkage, + &prev->cill_linkage); + done = 0; + continue; /* don't change prev: it's + * still "previous" */ + case -1: /* already in order */ + break; + } + } + prev = curr; + } + } while (!done); + EXIT; +} + +/** + * Check whether \a queue contains locks matching \a need. + * + * \retval +ve there is a matching lock in the \a queue + * \retval 0 there are no matching locks in the \a queue + */ +int cl_queue_match(const struct list_head *queue, + const struct cl_lock_descr *need) +{ + struct cl_io_lock_link *scan; + + ENTRY; + list_for_each_entry(scan, queue, cill_linkage) { + if (cl_lock_descr_match(&scan->cill_descr, need)) + RETURN(+1); + } + RETURN(0); +} +EXPORT_SYMBOL(cl_queue_match); + +static int cl_queue_merge(const struct list_head *queue, + const struct cl_lock_descr *need) +{ + struct cl_io_lock_link *scan; + + ENTRY; + list_for_each_entry(scan, queue, cill_linkage) { + if (cl_lock_descr_cmp(&scan->cill_descr, need)) + continue; + cl_lock_descr_merge(&scan->cill_descr, need); + CDEBUG(D_VFSTRACE, "lock: %d: [%lu, %lu]\n", + scan->cill_descr.cld_mode, scan->cill_descr.cld_start, + scan->cill_descr.cld_end); + RETURN(+1); + } + RETURN(0); + +} + +static int cl_lockset_match(const struct cl_lockset *set, + const struct cl_lock_descr *need) +{ + return cl_queue_match(&set->cls_curr, need) || + cl_queue_match(&set->cls_done, need); +} + +static int cl_lockset_merge(const struct cl_lockset *set, + const struct cl_lock_descr *need) +{ + return cl_queue_merge(&set->cls_todo, need) || + 
cl_lockset_match(set, need); +} + +static int cl_lockset_lock_one(const struct lu_env *env, + struct cl_io *io, struct cl_lockset *set, + struct cl_io_lock_link *link) +{ + struct cl_lock *lock; + int result; + + ENTRY; + + lock = cl_lock_request(env, io, &link->cill_descr, "io", io); + + if (!IS_ERR(lock)) { + link->cill_lock = lock; + list_move(&link->cill_linkage, &set->cls_curr); + if (!(link->cill_descr.cld_enq_flags & CEF_ASYNC)) { + result = cl_wait(env, lock); + if (result == 0) + list_move(&link->cill_linkage, + &set->cls_done); + } else + result = 0; + } else + result = PTR_ERR(lock); + RETURN(result); +} + +static void cl_lock_link_fini(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + struct cl_lock *lock = link->cill_lock; + + ENTRY; + list_del_init(&link->cill_linkage); + if (lock != NULL) { + cl_lock_release(env, lock, "io", io); + link->cill_lock = NULL; + } + if (link->cill_fini != NULL) + link->cill_fini(env, link); + EXIT; +} + +static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io, + struct cl_lockset *set) +{ + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + struct cl_lock *lock; + int result; + + ENTRY; + result = 0; + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) { + if (!cl_lockset_match(set, &link->cill_descr)) { + /* XXX some locking to guarantee that locks aren't + * expanded in between. */ + result = cl_lockset_lock_one(env, io, set, link); + if (result != 0) + break; + } else + cl_lock_link_fini(env, io, link); + } + if (result == 0) { + list_for_each_entry_safe(link, temp, + &set->cls_curr, cill_linkage) { + lock = link->cill_lock; + result = cl_wait(env, lock); + if (result == 0) + list_move(&link->cill_linkage, + &set->cls_done); + else + break; + } + } + RETURN(result); +} + +/** + * Takes locks necessary for the current iteration of io. 
+ * + * Calls cl_io_operations::cio_lock() top-to-bottom to collect locks required + * by layers for the current iteration. Then sort locks (to avoid dead-locks), + * and acquire them. + */ +int cl_io_lock(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IT_STARTED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_lock == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan); + if (result != 0) + break; + } + if (result == 0) { + cl_io_locks_sort(io); + result = cl_lockset_lock(env, io, &io->ci_lockset); + } + if (result != 0) + cl_io_unlock(env, io); + else + io->ci_state = CIS_LOCKED; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock); + +/** + * Release locks takes by io. + */ +void cl_io_unlock(const struct lu_env *env, struct cl_io *io) +{ + struct cl_lockset *set; + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + const struct cl_io_slice *scan; + + LASSERT(cl_io_is_loopable(io)); + LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + set = &io->ci_lockset; + + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) + cl_lock_link_fini(env, io, link); + + list_for_each_entry_safe(link, temp, &set->cls_curr, cill_linkage) + cl_lock_link_fini(env, io, link); + + list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) { + cl_unuse(env, link->cill_lock); + cl_lock_link_fini(env, io, link); + } + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_unlock != NULL) + scan->cis_iop->op[io->ci_type].cio_unlock(env, scan); + } + io->ci_state = CIS_UNLOCKED; + LASSERT(!cl_env_info(env)->clt_counters[CNL_TOP].ctc_nr_locks_acquired); + EXIT; +} +EXPORT_SYMBOL(cl_io_unlock); + +/** + * Prepares next iteration of io. 
+ * + * Calls cl_io_operations::cio_iter_init() top-to-bottom. This exists to give + * layers a chance to modify io parameters, e.g., so that lov can restrict io + * to a single stripe. + */ +int cl_io_iter_init(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + result = 0; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_iter_init(env, + scan); + if (result != 0) + break; + } + if (result == 0) + io->ci_state = CIS_IT_STARTED; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_iter_init); + +/** + * Finalizes io iteration. + * + * Calls cl_io_operations::cio_iter_fini() bottom-to-top. + */ +void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL) + scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan); + } + io->ci_state = CIS_IT_ENDED; + EXIT; +} +EXPORT_SYMBOL(cl_io_iter_fini); + +/** + * Records that read or write io progressed \a nob bytes forward. + */ +void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob) +{ + const struct cl_io_slice *scan; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + nob == 0); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + + io->u.ci_rw.crw_pos += nob; + io->u.ci_rw.crw_count -= nob; + + /* layers have to be notified. 
*/ + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) + scan->cis_iop->op[io->ci_type].cio_advance(env, scan, + nob); + } + EXIT; +} +EXPORT_SYMBOL(cl_io_rw_advance); + +/** + * Adds a lock to a lockset. + */ +int cl_io_lock_add(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + int result; + + ENTRY; + if (cl_lockset_merge(&io->ci_lockset, &link->cill_descr)) + result = +1; + else { + list_add(&link->cill_linkage, &io->ci_lockset.cls_todo); + result = 0; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_add); + +static void cl_free_io_lock_link(const struct lu_env *env, + struct cl_io_lock_link *link) +{ + OBD_FREE_PTR(link); +} + +/** + * Allocates new lock link, and uses it to add a lock to a lockset. + */ +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr) +{ + struct cl_io_lock_link *link; + int result; + + ENTRY; + OBD_ALLOC_PTR(link); + if (link != NULL) { + link->cill_descr = *descr; + link->cill_fini = cl_free_io_lock_link; + result = cl_io_lock_add(env, io, link); + if (result) /* lock match */ + link->cill_fini(env, link); + } else + result = -ENOMEM; + + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_alloc_add); + +/** + * Starts io by calling cl_io_operations::cio_start() top-to-bottom. + */ +int cl_io_start(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_state = CIS_IO_GOING; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_start == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_start(env, scan); + if (result != 0) + break; + } + if (result >= 0) + result = 0; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_start); + +/** + * Wait until current io iteration is finished by calling + * cl_io_operations::cio_end() bottom-to-top. 
+ */ +void cl_io_end(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IO_GOING); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_end != NULL) + scan->cis_iop->op[io->ci_type].cio_end(env, scan); + /* TODO: error handling. */ + } + io->ci_state = CIS_IO_FINISHED; + EXIT; +} +EXPORT_SYMBOL(cl_io_end); + +static const struct cl_page_slice * +cl_io_slice_page(const struct cl_io_slice *ios, struct cl_page *page) +{ + const struct cl_page_slice *slice; + + slice = cl_page_at(page, ios->cis_obj->co_lu.lo_dev->ld_type); + LINVRNT(slice != NULL); + return slice; +} + +/** + * True iff \a page is within \a io range. + */ +static int cl_page_in_io(const struct cl_page *page, const struct cl_io *io) +{ + int result = 1; + loff_t start; + loff_t end; + pgoff_t idx; + + idx = page->cp_index; + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + /* + * check that [start, end) and [pos, pos + count) extents + * overlap. + */ + if (!cl_io_is_append(io)) { + const struct cl_io_rw_common *crw = &(io->u.ci_rw); + start = cl_offset(page->cp_obj, idx); + end = cl_offset(page->cp_obj, idx + 1); + result = crw->crw_pos < end && + start < crw->crw_pos + crw->crw_count; + } + break; + case CIT_FAULT: + result = io->u.ci_fault.ft_index == idx; + break; + default: + LBUG(); + } + return result; +} + +/** + * Called by read io, when page has to be read from the server. 
+ * + * \see cl_io_operations::cio_read_page() + */ +int cl_io_read_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + const struct cl_io_slice *scan; + struct cl_2queue *queue; + int result = 0; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT); + LINVRNT(cl_page_is_owned(page, io)); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_page_in_io(page, io)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + queue = &io->ci_queue; + + cl_2queue_init(queue); + /* + * ->cio_read_page() methods called in the loop below are supposed to + * never block waiting for network (the only subtle point is the + * creation of new pages for read-ahead that might result in cache + * shrinking, but currently only clean pages are shrunk and this + * requires no network io). + * + * Should this ever starts blocking, retry loop would be needed for + * "parallel io" (see CLO_REPEAT loops in cl_lock.c). + */ + cl_io_for_each(scan, io) { + if (scan->cis_iop->cio_read_page != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + LINVRNT(slice != NULL); + result = scan->cis_iop->cio_read_page(env, scan, slice); + if (result != 0) + break; + } + } + if (result == 0) + result = cl_io_submit_rw(env, io, CRT_READ, queue); + /* + * Unlock unsent pages in case of error. + */ + cl_page_list_disown(env, io, &queue->c2_qin); + cl_2queue_fini(env, queue); + RETURN(result); +} +EXPORT_SYMBOL(cl_io_read_page); + +/** + * Called by write io to prepare page to receive data from user buffer. 
+ * + * \see cl_io_operations::cio_prepare_write() + */ +int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_WRITE); + LINVRNT(cl_page_is_owned(page, io)); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + LASSERT(cl_page_in_io(page, io)); + ENTRY; + + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->cio_prepare_write != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + result = scan->cis_iop->cio_prepare_write(env, scan, + slice, + from, to); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_io_prepare_write); + +/** + * Called by write io after user data were copied into a page. + * + * \see cl_io_operations::cio_commit_write() + */ +int cl_io_commit_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_WRITE); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + /* + * XXX Uh... not nice. Top level cl_io_commit_write() call (vvp->lov) + * already called cl_page_cache_add(), moving page into CPS_CACHED + * state. Better (and more general) way of dealing with such situation + * is needed. + */ + LASSERT(cl_page_is_owned(page, io) || page->cp_parent != NULL); + LASSERT(cl_page_in_io(page, io)); + ENTRY; + + cl_io_for_each(scan, io) { + if (scan->cis_iop->cio_commit_write != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + result = scan->cis_iop->cio_commit_write(env, scan, + slice, + from, to); + if (result != 0) + break; + } + } + LINVRNT(result <= 0); + RETURN(result); +} +EXPORT_SYMBOL(cl_io_commit_write); + +/** + * Submits a list of pages for immediate io. 
+ * + * After the function gets returned, The submitted pages are moved to + * queue->c2_qout queue, and queue->c2_qin contain both the pages don't need + * to be submitted, and the pages are errant to submit. + * + * \returns 0 if at least one page was submitted, error code otherwise. + * \see cl_io_operations::cio_submit() + */ +int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io, + enum cl_req_type crt, struct cl_2queue *queue) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(crt < ARRAY_SIZE(scan->cis_iop->req_op)); + ENTRY; + + cl_io_for_each(scan, io) { + if (scan->cis_iop->req_op[crt].cio_submit == NULL) + continue; + result = scan->cis_iop->req_op[crt].cio_submit(env, scan, crt, + queue); + if (result != 0) + break; + } + /* + * If ->cio_submit() failed, no pages were sent. + */ + LASSERT(ergo(result != 0, list_empty(&queue->c2_qout.pl_pages))); + RETURN(result); +} +EXPORT_SYMBOL(cl_io_submit_rw); + +/** + * Submit a sync_io and wait for the IO to be finished, or error happens. + * If \a timeout is zero, it means to wait for the IO unconditionally. + */ +int cl_io_submit_sync(const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue, + long timeout) +{ + struct cl_sync_io *anchor = &cl_env_info(env)->clt_anchor; + struct cl_page *pg; + int rc; + + cl_page_list_for_each(pg, &queue->c2_qin) { + LASSERT(pg->cp_sync_io == NULL); + pg->cp_sync_io = anchor; + } + + cl_sync_io_init(anchor, queue->c2_qin.pl_nr); + rc = cl_io_submit_rw(env, io, iot, queue); + if (rc == 0) { + /* + * If some pages weren't sent for any reason (e.g., + * read found up-to-date pages in the cache, or write found + * clean pages), count them as completed to avoid infinite + * wait. + */ + cl_page_list_for_each(pg, &queue->c2_qin) { + pg->cp_sync_io = NULL; + cl_sync_io_note(anchor, +1); + } + + /* wait for the IO to be finished. 
*/ + rc = cl_sync_io_wait(env, io, &queue->c2_qout, + anchor, timeout); + } else { + LASSERT(list_empty(&queue->c2_qout.pl_pages)); + cl_page_list_for_each(pg, &queue->c2_qin) + pg->cp_sync_io = NULL; + } + return rc; +} +EXPORT_SYMBOL(cl_io_submit_sync); + +/** + * Cancel an IO which has been submitted by cl_io_submit_rw. + */ +int cl_io_cancel(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue) +{ + struct cl_page *page; + int result = 0; + + CERROR("Canceling ongoing page trasmission\n"); + cl_page_list_for_each(page, queue) { + int rc; + + LINVRNT(cl_page_in_io(page, io)); + rc = cl_page_cancel(env, page); + result = result ?: rc; + } + return result; +} +EXPORT_SYMBOL(cl_io_cancel); + +/** + * Main io loop. + * + * Pumps io through iterations calling + * + * - cl_io_iter_init() + * + * - cl_io_lock() + * + * - cl_io_start() + * + * - cl_io_end() + * + * - cl_io_unlock() + * + * - cl_io_iter_fini() + * + * repeatedly until there is no more io to do. + */ +int cl_io_loop(const struct lu_env *env, struct cl_io *io) +{ + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + ENTRY; + + do { + size_t nob; + + io->ci_continue = 0; + result = cl_io_iter_init(env, io); + if (result == 0) { + nob = io->ci_nob; + result = cl_io_lock(env, io); + if (result == 0) { + /* + * Notify layers that locks has been taken, + * and do actual i/o. + * + * - llite: kms, short read; + * - llite: generic_file_read(); + */ + result = cl_io_start(env, io); + /* + * Send any remaining pending + * io, etc. + * + * - llite: ll_rw_stats_tally. + */ + cl_io_end(env, io); + cl_io_unlock(env, io); + cl_io_rw_advance(env, io, io->ci_nob - nob); + } + } + cl_io_iter_fini(env, io); + } while (result == 0 && io->ci_continue); + if (result == 0) + result = io->ci_result; + RETURN(result < 0 ? result : 0); +} +EXPORT_SYMBOL(cl_io_loop); + +/** + * Adds io slice to the cl_io. 
+ * + * This is called by cl_object_operations::coo_io_init() methods to add a + * per-layer state to the io. New state is added at the end of + * cl_io::ci_layers list, that is, it is at the bottom of the stack. + * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add() + */ +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, + const struct cl_io_operations *ops) +{ + struct list_head *linkage = &slice->cis_linkage; + + LASSERT((linkage->prev == NULL && linkage->next == NULL) || + list_empty(linkage)); + ENTRY; + + list_add_tail(linkage, &io->ci_layers); + slice->cis_io = io; + slice->cis_obj = obj; + slice->cis_iop = ops; + EXIT; +} +EXPORT_SYMBOL(cl_io_slice_add); + + +/** + * Initializes page list. + */ +void cl_page_list_init(struct cl_page_list *plist) +{ + ENTRY; + plist->pl_nr = 0; + INIT_LIST_HEAD(&plist->pl_pages); + plist->pl_owner = current; + EXIT; +} +EXPORT_SYMBOL(cl_page_list_init); + +/** + * Adds a page to a page list. + */ +void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page) +{ + ENTRY; + /* it would be better to check that page is owned by "current" io, but + * it is not passed here. */ + LASSERT(page->cp_owner != NULL); + LINVRNT(plist->pl_owner == current); + + lockdep_off(); + mutex_lock(&page->cp_mutex); + lockdep_on(); + LASSERT(list_empty(&page->cp_batch)); + list_add_tail(&page->cp_batch, &plist->pl_pages); + ++plist->pl_nr; + page->cp_queue_ref = lu_ref_add(&page->cp_reference, "queue", plist); + cl_page_get(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_add); + +/** + * Removes a page from a page list. 
+ */ +void cl_page_list_del(const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page) +{ + LASSERT(plist->pl_nr > 0); + LINVRNT(plist->pl_owner == current); + + ENTRY; + list_del_init(&page->cp_batch); + lockdep_off(); + mutex_unlock(&page->cp_mutex); + lockdep_on(); + --plist->pl_nr; + lu_ref_del_at(&page->cp_reference, page->cp_queue_ref, "queue", plist); + cl_page_put(env, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_del); + +/** + * Moves a page from one page list to another. + */ +void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page) +{ + LASSERT(src->pl_nr > 0); + LINVRNT(dst->pl_owner == current); + LINVRNT(src->pl_owner == current); + + ENTRY; + list_move_tail(&page->cp_batch, &dst->pl_pages); + --src->pl_nr; + ++dst->pl_nr; + lu_ref_set_at(&page->cp_reference, + page->cp_queue_ref, "queue", src, dst); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_move); + +/** + * splice the cl_page_list, just as list head does + */ +void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head) +{ + struct cl_page *page; + struct cl_page *tmp; + + LINVRNT(list->pl_owner == current); + LINVRNT(head->pl_owner == current); + + ENTRY; + cl_page_list_for_each_safe(page, tmp, list) + cl_page_list_move(head, list, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_splice); + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); + +/** + * Disowns pages in a queue. 
+ */ +void cl_page_list_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == current); + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) { + LASSERT(plist->pl_nr > 0); + + list_del_init(&page->cp_batch); + lockdep_off(); + mutex_unlock(&page->cp_mutex); + lockdep_on(); + --plist->pl_nr; + /* + * cl_page_disown0 rather than usual cl_page_disown() is used, + * because pages are possibly in CPS_FREEING state already due + * to the call to cl_page_list_discard(). + */ + /* + * XXX cl_page_disown0() will fail if page is not locked. + */ + cl_page_disown0(env, io, page); + lu_ref_del(&page->cp_reference, "queue", plist); + cl_page_put(env, page); + } + EXIT; +} +EXPORT_SYMBOL(cl_page_list_disown); + +/** + * Releases pages from queue. + */ +void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == current); + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) + cl_page_list_del(env, plist, page); + LASSERT(plist->pl_nr == 0); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_fini); + +/** + * Owns all pages in a queue. + */ +int cl_page_list_own(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + pgoff_t index = 0; + int result; + + LINVRNT(plist->pl_owner == current); + + ENTRY; + result = 0; + cl_page_list_for_each_safe(page, temp, plist) { + LASSERT(index <= page->cp_index); + index = page->cp_index; + if (cl_page_own(env, io, page) == 0) + result = result ?: page->cp_error; + else + cl_page_list_del(env, plist, page); + } + RETURN(result); +} +EXPORT_SYMBOL(cl_page_list_own); + +/** + * Assumes all pages in a queue. 
+ */ +void cl_page_list_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == current); + + cl_page_list_for_each(page, plist) + cl_page_assume(env, io, page); +} +EXPORT_SYMBOL(cl_page_list_assume); + +/** + * Discards all pages in a queue. + */ +void cl_page_list_discard(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == current); + ENTRY; + cl_page_list_for_each(page, plist) + cl_page_discard(env, io, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_discard); + +/** + * Unmaps all pages in a queue from user virtual memory. + */ +int cl_page_list_unmap(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + int result; + + LINVRNT(plist->pl_owner == current); + ENTRY; + result = 0; + cl_page_list_for_each(page, plist) { + result = cl_page_unmap(env, io, page); + if (result != 0) + break; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_page_list_unmap); + +/** + * Initialize dual page queue. + */ +void cl_2queue_init(struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_init(&queue->c2_qin); + cl_page_list_init(&queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init); + +/** + * Add a page to the incoming page list of 2-queue. + */ +void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page) +{ + ENTRY; + cl_page_list_add(&queue->c2_qin, page); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_add); + +/** + * Disown pages in both lists of a 2-queue. + */ +void cl_2queue_disown(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_disown(env, io, &queue->c2_qin); + cl_page_list_disown(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_disown); + +/** + * Discard (truncate) pages in both lists of a 2-queue. 
+ */ +void cl_2queue_discard(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_discard(env, io, &queue->c2_qin); + cl_page_list_discard(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_discard); + +/** + * Assume to own the pages in cl_2queue + */ +void cl_2queue_assume(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + cl_page_list_assume(env, io, &queue->c2_qin); + cl_page_list_assume(env, io, &queue->c2_qout); +} +EXPORT_SYMBOL(cl_2queue_assume); + +/** + * Finalize both page lists of a 2-queue. + */ +void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_fini(env, &queue->c2_qout); + cl_page_list_fini(env, &queue->c2_qin); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_fini); + +/** + * Initialize a 2-queue to contain \a page in its incoming page list. + */ +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page) +{ + ENTRY; + cl_2queue_init(queue); + cl_2queue_add(queue, page); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init_page); + +/** + * Returns top-level io. + * + * \see cl_object_top(), cl_page_top(). + */ +struct cl_io *cl_io_top(struct cl_io *io) +{ + ENTRY; + while (io->ci_parent != NULL) + io = io->ci_parent; + RETURN(io); +} +EXPORT_SYMBOL(cl_io_top); + +/** + * Prints human readable representation of \a io to the \a f. + */ +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io) +{ +} + +/** + * Adds request slice to the compound request. + * + * This is called by cl_device_operations::cdo_req_init() methods to add a + * per-layer state to the request. New state is added at the end of + * cl_req::crq_layers list, that is, it is at the bottom of the stack. 
+ * + * \see cl_lock_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice, + struct cl_device *dev, + const struct cl_req_operations *ops) +{ + ENTRY; + list_add_tail(&slice->crs_linkage, &req->crq_layers); + slice->crs_dev = dev; + slice->crs_ops = ops; + slice->crs_req = req; + EXIT; +} +EXPORT_SYMBOL(cl_req_slice_add); + +static void cl_req_free(const struct lu_env *env, struct cl_req *req) +{ + unsigned i; + + LASSERT(list_empty(&req->crq_pages)); + LASSERT(req->crq_nrpages == 0); + LINVRNT(list_empty(&req->crq_layers)); + LINVRNT(equi(req->crq_nrobjs > 0, req->crq_o != NULL)); + ENTRY; + + if (req->crq_o != NULL) { + for (i = 0; i < req->crq_nrobjs; ++i) { + struct cl_object *obj = req->crq_o[i].ro_obj; + if (obj != NULL) { + lu_object_ref_del_at(&obj->co_lu, + req->crq_o[i].ro_obj_ref, + "cl_req", req); + cl_object_put(env, obj); + } + } + OBD_FREE(req->crq_o, req->crq_nrobjs * sizeof req->crq_o[0]); + } + OBD_FREE_PTR(req); + EXIT; +} + +static int cl_req_init(const struct lu_env *env, struct cl_req *req, + struct cl_page *page) +{ + struct cl_device *dev; + struct cl_page_slice *slice; + int result; + + ENTRY; + result = 0; + page = cl_page_top(page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + dev = lu2cl_dev(slice->cpl_obj->co_lu.lo_dev); + if (dev->cd_ops->cdo_req_init != NULL) { + result = dev->cd_ops->cdo_req_init(env, + dev, req); + if (result != 0) + break; + } + } + page = page->cp_child; + } while (page != NULL && result == 0); + RETURN(result); +} + +/** + * Invokes per-request transfer completion call-backs + * (cl_req_operations::cro_completion()) bottom-to-top. + */ +void cl_req_completion(const struct lu_env *env, struct cl_req *req, int rc) +{ + struct cl_req_slice *slice; + + ENTRY; + /* + * for the lack of list_for_each_entry_reverse_safe()... 
+ */ + while (!list_empty(&req->crq_layers)) { + slice = list_entry(req->crq_layers.prev, + struct cl_req_slice, crs_linkage); + list_del_init(&slice->crs_linkage); + if (slice->crs_ops->cro_completion != NULL) + slice->crs_ops->cro_completion(env, slice, rc); + } + cl_req_free(env, req); + EXIT; +} +EXPORT_SYMBOL(cl_req_completion); + +/** + * Allocates new transfer request. + */ +struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page, + enum cl_req_type crt, int nr_objects) +{ + struct cl_req *req; + + LINVRNT(nr_objects > 0); + ENTRY; + + OBD_ALLOC_PTR(req); + if (req != NULL) { + int result; + + OBD_ALLOC(req->crq_o, nr_objects * sizeof req->crq_o[0]); + if (req->crq_o != NULL) { + req->crq_nrobjs = nr_objects; + req->crq_type = crt; + INIT_LIST_HEAD(&req->crq_pages); + INIT_LIST_HEAD(&req->crq_layers); + result = cl_req_init(env, req, page); + } else + result = -ENOMEM; + if (result != 0) { + cl_req_completion(env, req, result); + req = ERR_PTR(result); + } + } else + req = ERR_PTR(-ENOMEM); + RETURN(req); +} +EXPORT_SYMBOL(cl_req_alloc); + +/** + * Adds a page to a request. + */ +void cl_req_page_add(const struct lu_env *env, + struct cl_req *req, struct cl_page *page) +{ + struct cl_object *obj; + struct cl_req_obj *rqo; + int i; + + ENTRY; + page = cl_page_top(page); + + LASSERT(list_empty(&page->cp_flight)); + LASSERT(page->cp_req == NULL); + + CL_PAGE_DEBUG(D_PAGE, env, page, "req %p, %d, %u\n", + req, req->crq_type, req->crq_nrpages); + + list_add_tail(&page->cp_flight, &req->crq_pages); + ++req->crq_nrpages; + page->cp_req = req; + obj = cl_object_top(page->cp_obj); + for (i = 0, rqo = req->crq_o; obj != rqo->ro_obj; ++i, ++rqo) { + if (rqo->ro_obj == NULL) { + rqo->ro_obj = obj; + cl_object_get(obj); + rqo->ro_obj_ref = lu_object_ref_add(&obj->co_lu, + "cl_req", req); + break; + } + } + LASSERT(i < req->crq_nrobjs); + EXIT; +} +EXPORT_SYMBOL(cl_req_page_add); + +/** + * Removes a page from a request. 
+ */ +void cl_req_page_done(const struct lu_env *env, struct cl_page *page) +{ + struct cl_req *req = page->cp_req; + + ENTRY; + page = cl_page_top(page); + + LASSERT(!list_empty(&page->cp_flight)); + LASSERT(req->crq_nrpages > 0); + + list_del_init(&page->cp_flight); + --req->crq_nrpages; + page->cp_req = NULL; + EXIT; +} +EXPORT_SYMBOL(cl_req_page_done); + +/** + * Notifies layers that request is about to depart by calling + * cl_req_operations::cro_prep() top-to-bottom. + */ +int cl_req_prep(const struct lu_env *env, struct cl_req *req) +{ + int i; + int result; + const struct cl_req_slice *slice; + + ENTRY; + /* + * Check that the caller of cl_req_alloc() didn't lie about the number + * of objects. + */ + for (i = 0; i < req->crq_nrobjs; ++i) + LASSERT(req->crq_o[i].ro_obj != NULL); + + result = 0; + list_for_each_entry(slice, &req->crq_layers, crs_linkage) { + if (slice->crs_ops->cro_prep != NULL) { + result = slice->crs_ops->cro_prep(env, slice); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_req_prep); + +/** + * Fills in attributes that are passed to server together with transfer. Only + * attributes from \a flags may be touched. This can be called multiple times + * for the same request. + */ +void cl_req_attr_set(const struct lu_env *env, struct cl_req *req, + struct cl_req_attr *attr, obd_valid flags) +{ + const struct cl_req_slice *slice; + struct cl_page *page; + int i; + + LASSERT(!list_empty(&req->crq_pages)); + ENTRY; + + /* Take any page to use as a model. 
*/ + page = list_entry(req->crq_pages.next, struct cl_page, cp_flight); + + for (i = 0; i < req->crq_nrobjs; ++i) { + list_for_each_entry(slice, &req->crq_layers, crs_linkage) { + const struct cl_page_slice *scan; + const struct cl_object *obj; + + scan = cl_page_at(page, + slice->crs_dev->cd_lu_dev.ld_type); + LASSERT(scan != NULL); + obj = scan->cpl_obj; + if (slice->crs_ops->cro_attr_set != NULL) + slice->crs_ops->cro_attr_set(env, slice, obj, + attr + i, flags); + } + } + EXIT; +} +EXPORT_SYMBOL(cl_req_attr_set); + +/* XXX complete(), init_completion(), and wait_for_completion(), until they are + * implemented in libcfs. */ +# include <linux/sched.h> + +/** + * Initialize synchronous io wait anchor, for transfer of \a nrpages pages. + */ +void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages) +{ + ENTRY; + init_waitqueue_head(&anchor->csi_waitq); + atomic_set(&anchor->csi_sync_nr, nrpages); + atomic_set(&anchor->csi_barrier, nrpages > 0); + anchor->csi_sync_rc = 0; + EXIT; +} +EXPORT_SYMBOL(cl_sync_io_init); + +/** + * Wait until all transfer completes. Transfer completion routine has to call + * cl_sync_io_note() for every page. 
+ */ +int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, struct cl_sync_io *anchor, + long timeout) +{ + struct l_wait_info lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout), + NULL, NULL, NULL); + int rc; + ENTRY; + + LASSERT(timeout >= 0); + + rc = l_wait_event(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + &lwi); + if (rc < 0) { + CERROR("SYNC IO failed with error: %d, try to cancel " + "%d remaining pages\n", + rc, atomic_read(&anchor->csi_sync_nr)); + + (void)cl_io_cancel(env, io, queue); + + lwi = (struct l_wait_info) { 0 }; + (void)l_wait_event(anchor->csi_waitq, + atomic_read(&anchor->csi_sync_nr) == 0, + &lwi); + } else { + rc = anchor->csi_sync_rc; + } + LASSERT(atomic_read(&anchor->csi_sync_nr) == 0); + cl_page_list_assume(env, io, queue); + + /* wait until cl_sync_io_note() has done wakeup */ + while (unlikely(atomic_read(&anchor->csi_barrier) != 0)) { + cpu_relax(); + } + + POISON(anchor, 0x5a, sizeof *anchor); + RETURN(rc); +} +EXPORT_SYMBOL(cl_sync_io_wait); + +/** + * Indicate that transfer of a single page completed. + */ +void cl_sync_io_note(struct cl_sync_io *anchor, int ioret) +{ + ENTRY; + if (anchor->csi_sync_rc == 0 && ioret < 0) + anchor->csi_sync_rc = ioret; + /* + * Synchronous IO done without releasing page lock (e.g., as a part of + * ->{prepare,commit}_write(). Completion is used to signal the end of + * IO. 
+ */ + LASSERT(atomic_read(&anchor->csi_sync_nr) > 0); + if (atomic_dec_and_test(&anchor->csi_sync_nr)) { + wake_up_all(&anchor->csi_waitq); + /* it's safe to nuke or reuse anchor now */ + atomic_set(&anchor->csi_barrier, 0); + } + EXIT; +} +EXPORT_SYMBOL(cl_sync_io_note); diff --git a/drivers/staging/lustre/lustre/obdclass/cl_lock.c b/drivers/staging/lustre/lustre/obdclass/cl_lock.c new file mode 100644 index 000000000000..d34e044fc854 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/cl_lock.c @@ -0,0 +1,2304 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Extent Lock. 
 *
 * Author: Nikita Danilov <nikita.danilov@sun.com>
 */

#define DEBUG_SUBSYSTEM S_CLASS

#include <obd_class.h>
#include <obd_support.h>
#include <lustre_fid.h>
#include <linux/list.h>
#include <cl_object.h>
#include "cl_internal.h"

/** Lock class of cl_lock::cll_guard */
static struct lock_class_key cl_lock_guard_class;
static struct kmem_cache *cl_lock_kmem;

/* Slab cache descriptor for struct cl_lock allocations. */
static struct lu_kmem_descr cl_lock_caches[] = {
	{
		.ckd_cache = &cl_lock_kmem,
		.ckd_name = "cl_lock_kmem",
		.ckd_size = sizeof (struct cl_lock)
	},
	{
		.ckd_cache = NULL
	}
};

/* Lock statistics are compiled out: these macros are no-op stubs. */
#define CS_LOCK_INC(o, item)
#define CS_LOCK_DEC(o, item)
#define CS_LOCKSTATE_INC(o, state)
#define CS_LOCKSTATE_DEC(o, state)

/**
 * Basic lock invariant that is maintained at all times. Caller either has a
 * reference to \a lock, or somehow assures that \a lock cannot be freed.
 *
 * \see cl_lock_invariant()
 */
static int cl_lock_invariant_trusted(const struct lu_env *env,
				     const struct cl_lock *lock)
{
	return ergo(lock->cll_state == CLS_FREEING, lock->cll_holds == 0) &&
		atomic_read(&lock->cll_ref) >= lock->cll_holds &&
		lock->cll_holds >= lock->cll_users &&
		lock->cll_holds >= 0 &&
		lock->cll_users >= 0 &&
		lock->cll_depth >= 0;
}

/**
 * Stronger lock invariant, checking that caller has a reference on a lock.
 *
 * \see cl_lock_invariant_trusted()
 */
static int cl_lock_invariant(const struct lu_env *env,
			     const struct cl_lock *lock)
{
	int result;

	result = atomic_read(&lock->cll_ref) > 0 &&
		 cl_lock_invariant_trusted(env, lock);
	/* env may legitimately be NULL (\see cl_lock_get()): only log a
	 * broken invariant when an environment is available. */
	if (!result && env != NULL)
		CL_LOCK_DEBUG(D_ERROR, env, lock, "invariant broken");
	return result;
}

/**
 * Returns lock "nesting": 0 for a top-lock and 1 for a sub-lock.
 */
static enum clt_nesting_level cl_lock_nesting(const struct cl_lock *lock)
{
	return cl_object_header(lock->cll_descr.cld_obj)->coh_nesting;
}

/**
 * Returns a set of counters for this lock, depending on a lock nesting.
+ */ +static struct cl_thread_counters *cl_lock_counters(const struct lu_env *env, + const struct cl_lock *lock) +{ + struct cl_thread_info *info; + enum clt_nesting_level nesting; + + info = cl_env_info(env); + nesting = cl_lock_nesting(lock); + LASSERT(nesting < ARRAY_SIZE(info->clt_counters)); + return &info->clt_counters[nesting]; +} + +static void cl_lock_trace0(int level, const struct lu_env *env, + const char *prefix, const struct cl_lock *lock, + const char *func, const int line) +{ + struct cl_object_header *h = cl_object_header(lock->cll_descr.cld_obj); + CDEBUG(level, "%s: %p@(%d %p %d %d %d %d %d %lx)" + "(%p/%d/%d) at %s():%d\n", + prefix, lock, atomic_read(&lock->cll_ref), + lock->cll_guarder, lock->cll_depth, + lock->cll_state, lock->cll_error, lock->cll_holds, + lock->cll_users, lock->cll_flags, + env, h->coh_nesting, cl_lock_nr_mutexed(env), + func, line); +} +#define cl_lock_trace(level, env, prefix, lock) \ + cl_lock_trace0(level, env, prefix, lock, __FUNCTION__, __LINE__) + +#define RETIP ((unsigned long)__builtin_return_address(0)) + +#ifdef CONFIG_LOCKDEP +static struct lock_class_key cl_lock_key; + +static void cl_lock_lockdep_init(struct cl_lock *lock) +{ + lockdep_set_class_and_name(lock, &cl_lock_key, "EXT"); +} + +static void cl_lock_lockdep_acquire(const struct lu_env *env, + struct cl_lock *lock, __u32 enqflags) +{ + cl_lock_counters(env, lock)->ctc_nr_locks_acquired++; + lock_map_acquire(&lock->dep_map); +} + +static void cl_lock_lockdep_release(const struct lu_env *env, + struct cl_lock *lock) +{ + cl_lock_counters(env, lock)->ctc_nr_locks_acquired--; + lock_release(&lock->dep_map, 0, RETIP); +} + +#else /* !CONFIG_LOCKDEP */ + +static void cl_lock_lockdep_init(struct cl_lock *lock) +{} +static void cl_lock_lockdep_acquire(const struct lu_env *env, + struct cl_lock *lock, __u32 enqflags) +{} +static void cl_lock_lockdep_release(const struct lu_env *env, + struct cl_lock *lock) +{} + +#endif /* !CONFIG_LOCKDEP */ + +/** + * Adds lock 
slice to the compound lock. + * + * This is called by cl_object_operations::coo_lock_init() methods to add a + * per-layer state to the lock. New state is added at the end of + * cl_lock::cll_layers list, that is, it is at the bottom of the stack. + * + * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops) +{ + ENTRY; + slice->cls_lock = lock; + list_add_tail(&slice->cls_linkage, &lock->cll_layers); + slice->cls_obj = obj; + slice->cls_ops = ops; + EXIT; +} +EXPORT_SYMBOL(cl_lock_slice_add); + +/** + * Returns true iff a lock with the mode \a has provides at least the same + * guarantees as a lock with the mode \a need. + */ +int cl_lock_mode_match(enum cl_lock_mode has, enum cl_lock_mode need) +{ + LINVRNT(need == CLM_READ || need == CLM_WRITE || + need == CLM_PHANTOM || need == CLM_GROUP); + LINVRNT(has == CLM_READ || has == CLM_WRITE || + has == CLM_PHANTOM || has == CLM_GROUP); + CLASSERT(CLM_PHANTOM < CLM_READ); + CLASSERT(CLM_READ < CLM_WRITE); + CLASSERT(CLM_WRITE < CLM_GROUP); + + if (has != CLM_GROUP) + return need <= has; + else + return need == has; +} +EXPORT_SYMBOL(cl_lock_mode_match); + +/** + * Returns true iff extent portions of lock descriptions match. + */ +int cl_lock_ext_match(const struct cl_lock_descr *has, + const struct cl_lock_descr *need) +{ + return + has->cld_start <= need->cld_start && + has->cld_end >= need->cld_end && + cl_lock_mode_match(has->cld_mode, need->cld_mode) && + (has->cld_mode != CLM_GROUP || has->cld_gid == need->cld_gid); +} +EXPORT_SYMBOL(cl_lock_ext_match); + +/** + * Returns true iff a lock with the description \a has provides at least the + * same guarantees as a lock with the description \a need. 
 */
int cl_lock_descr_match(const struct cl_lock_descr *has,
			const struct cl_lock_descr *need)
{
	return
		cl_object_same(has->cld_obj, need->cld_obj) &&
		cl_lock_ext_match(has, need);
}
EXPORT_SYMBOL(cl_lock_descr_match);

/**
 * Destroys \a lock: finalizes and unlinks every slice, drops the object
 * reference and releases the lock memory back to the slab cache.
 *
 * \pre lock mutex is not held; no references remain.
 */
static void cl_lock_free(const struct lu_env *env, struct cl_lock *lock)
{
	struct cl_object *obj = lock->cll_descr.cld_obj;

	LINVRNT(!cl_lock_is_mutexed(lock));

	ENTRY;
	cl_lock_trace(D_DLMTRACE, env, "free lock", lock);
	might_sleep();
	/* Unlink each slice before handing it to its layer's clo_fini(). */
	while (!list_empty(&lock->cll_layers)) {
		struct cl_lock_slice *slice;

		slice = list_entry(lock->cll_layers.next,
				   struct cl_lock_slice, cls_linkage);
		list_del_init(lock->cll_layers.next);
		slice->cls_ops->clo_fini(env, slice);
	}
	CS_LOCK_DEC(obj, total);
	CS_LOCKSTATE_DEC(obj, lock->cll_state);
	lu_object_ref_del_at(&obj->co_lu, lock->cll_obj_ref, "cl_lock", lock);
	cl_object_put(env, obj);
	lu_ref_fini(&lock->cll_reference);
	lu_ref_fini(&lock->cll_holders);
	mutex_destroy(&lock->cll_guard);
	OBD_SLAB_FREE_PTR(lock, cl_lock_kmem);
	EXIT;
}

/**
 * Releases a reference on a lock.
 *
 * When last reference is released, lock is returned to the cache, unless it
 * is in cl_lock_state::CLS_FREEING state, in which case it is destroyed
 * immediately.
 *
 * \see cl_object_put(), cl_page_put()
 */
void cl_lock_put(const struct lu_env *env, struct cl_lock *lock)
{
	struct cl_object *obj;

	LINVRNT(cl_lock_invariant(env, lock));
	ENTRY;
	obj = lock->cll_descr.cld_obj;
	LINVRNT(obj != NULL);

	CDEBUG(D_TRACE, "releasing reference: %d %p %lu\n",
	       atomic_read(&lock->cll_ref), lock, RETIP);

	if (atomic_dec_and_test(&lock->cll_ref)) {
		if (lock->cll_state == CLS_FREEING) {
			LASSERT(list_empty(&lock->cll_linkage));
			cl_lock_free(env, lock);
		}
		/* NOTE(review): CS_LOCK_DEC is currently a no-op stub; if the
		 * statistics macros are ever re-enabled, \a obj is used here
		 * after cl_lock_free() dropped its reference -- verify. */
		CS_LOCK_DEC(obj, busy);
	}
	EXIT;
}
EXPORT_SYMBOL(cl_lock_put);

/**
 * Acquires an additional reference to a lock.
 *
 * This can be called only by caller already possessing a reference to \a
 * lock.
 *
 * \see cl_object_get(), cl_page_get()
 */
void cl_lock_get(struct cl_lock *lock)
{
	/* NULL env: invariant is checked without debug-log access. */
	LINVRNT(cl_lock_invariant(NULL, lock));
	CDEBUG(D_TRACE, "acquiring reference: %d %p %lu\n",
	       atomic_read(&lock->cll_ref), lock, RETIP);
	atomic_inc(&lock->cll_ref);
}
EXPORT_SYMBOL(cl_lock_get);

/**
 * Acquires a reference to a lock.
 *
 * This is much like cl_lock_get(), except that this function can be used to
 * acquire initial reference to the cached lock. Caller has to deal with all
 * possible races. Use with care!
 *
 * \see cl_page_get_trust()
 */
void cl_lock_get_trust(struct cl_lock *lock)
{
	CDEBUG(D_TRACE, "acquiring trusted reference: %d %p %lu\n",
	       atomic_read(&lock->cll_ref), lock, RETIP);
	/* A 0 -> 1 transition resurrects a cached lock: account it busy. */
	if (atomic_inc_return(&lock->cll_ref) == 1)
		CS_LOCK_INC(lock->cll_descr.cld_obj, busy);
}
EXPORT_SYMBOL(cl_lock_get_trust);

/**
 * Helper function destroying the lock that wasn't completely initialized.
 *
 * Other threads can acquire references to the top-lock through its
 * sub-locks. Hence, it cannot be cl_lock_free()-ed immediately.
 */
static void cl_lock_finish(const struct lu_env *env, struct cl_lock *lock)
{
	/* Cancel and delete under the lock mutex, then drop the caller's
	 * reference; the lock is freed once all references are gone. */
	cl_lock_mutex_get(env, lock);
	cl_lock_cancel(env, lock);
	cl_lock_delete(env, lock);
	cl_lock_mutex_put(env, lock);
	cl_lock_put(env, lock);
}

/**
 * Allocates a new lock for \a descr, initializes common state, and calls
 * cl_object_operations::coo_lock_init() on every layer of \a obj.
 *
 * \retval new lock with a single reference on success,
 *	   ERR_PTR(-ENOMEM) or the first layer's error otherwise.
 */
static struct cl_lock *cl_lock_alloc(const struct lu_env *env,
				     struct cl_object *obj,
				     const struct cl_io *io,
				     const struct cl_lock_descr *descr)
{
	struct cl_lock *lock;
	struct lu_object_header *head;

	ENTRY;
	OBD_SLAB_ALLOC_PTR_GFP(lock, cl_lock_kmem, __GFP_IO);
	if (lock != NULL) {
		atomic_set(&lock->cll_ref, 1);
		lock->cll_descr = *descr;
		lock->cll_state = CLS_NEW;
		cl_object_get(obj);
		lock->cll_obj_ref = lu_object_ref_add(&obj->co_lu,
						      "cl_lock", lock);
		INIT_LIST_HEAD(&lock->cll_layers);
		INIT_LIST_HEAD(&lock->cll_linkage);
		INIT_LIST_HEAD(&lock->cll_inclosure);
		lu_ref_init(&lock->cll_reference);
		lu_ref_init(&lock->cll_holders);
		mutex_init(&lock->cll_guard);
		lockdep_set_class(&lock->cll_guard, &cl_lock_guard_class);
		init_waitqueue_head(&lock->cll_wq);
		head = obj->co_lu.lo_header;
		CS_LOCKSTATE_INC(obj, CLS_NEW);
		CS_LOCK_INC(obj, total);
		CS_LOCK_INC(obj, create);
		cl_lock_lockdep_init(lock);
		/* NOTE: \a obj is reused below as the per-layer iteration
		 * cursor; the top object is not needed past this point. */
		list_for_each_entry(obj, &head->loh_layers,
				    co_lu.lo_linkage) {
			int err;

			err = obj->co_ops->coo_lock_init(env, obj, lock, io);
			if (err != 0) {
				/* Partially initialized lock is torn down
				 * through the regular destruction path. */
				cl_lock_finish(env, lock);
				lock = ERR_PTR(err);
				break;
			}
		}
	} else
		lock = ERR_PTR(-ENOMEM);
	RETURN(lock);
}

/**
 * Transfer the lock into INTRANSIT state and return the original state.
 *
 * \pre state: CLS_CACHED, CLS_HELD or CLS_ENQUEUED
 * \post state: CLS_INTRANSIT
 * \see CLS_INTRANSIT
 */
enum cl_lock_state cl_lock_intransit(const struct lu_env *env,
				     struct cl_lock *lock)
{
	enum cl_lock_state state = lock->cll_state;

	LASSERT(cl_lock_is_mutexed(lock));
	LASSERT(state != CLS_INTRANSIT);
	LASSERTF(state >= CLS_ENQUEUED && state <= CLS_CACHED,
		 "Malformed lock state %d.\n", state);

	cl_lock_state_set(env, lock, CLS_INTRANSIT);
	lock->cll_intransit_owner = current;
	/* Extra hold keeps the lock pinned for the duration of transit. */
	cl_lock_hold_add(env, lock, "intransit", current);
	return state;
}
EXPORT_SYMBOL(cl_lock_intransit);

/**
 * Exit the intransit state and restore the lock state to the original state
 */
void cl_lock_extransit(const struct lu_env *env, struct cl_lock *lock,
		       enum cl_lock_state state)
{
	LASSERT(cl_lock_is_mutexed(lock));
	LASSERT(lock->cll_state == CLS_INTRANSIT);
	LASSERT(state != CLS_INTRANSIT);
	LASSERT(lock->cll_intransit_owner == current);

	lock->cll_intransit_owner = NULL;
	cl_lock_state_set(env, lock, state);
	cl_lock_unhold(env, lock, "intransit", current);
}
EXPORT_SYMBOL(cl_lock_extransit);

/**
 * Checking whether the lock is intransit state
 *
 * Returns false for the transit owner itself, so the owning thread may keep
 * operating on the lock while it is in transit.
 */
int cl_lock_is_intransit(struct cl_lock *lock)
{
	LASSERT(cl_lock_is_mutexed(lock));
	return lock->cll_state == CLS_INTRANSIT &&
	       lock->cll_intransit_owner != current;
}
EXPORT_SYMBOL(cl_lock_is_intransit);
/**
 * Returns true iff lock is "suitable" for given io. E.g., locks acquired by
 * truncate and O_APPEND cannot be reused for read/non-append-write, as they
 * cover multiple stripes and can trigger cascading timeouts.
 */
static int cl_lock_fits_into(const struct lu_env *env,
			     const struct cl_lock *lock,
			     const struct cl_lock_descr *need,
			     const struct cl_io *io)
{
	const struct cl_lock_slice *slice;

	LINVRNT(cl_lock_invariant_trusted(env, lock));
	ENTRY;
	/* Every layer must agree; a layer without clo_fits_into() accepts. */
	list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
		if (slice->cls_ops->clo_fits_into != NULL &&
		    !slice->cls_ops->clo_fits_into(env, slice, need, io))
			RETURN(0);
	}
	RETURN(1);
}

/**
 * Scans the per-object lock list for a lock matching \a need.
 *
 * \pre caller holds coh_lock_guard
 * \retval matching lock with an extra trusted reference, or NULL
 */
static struct cl_lock *cl_lock_lookup(const struct lu_env *env,
				      struct cl_object *obj,
				      const struct cl_io *io,
				      const struct cl_lock_descr *need)
{
	struct cl_lock *lock;
	struct cl_object_header *head;

	ENTRY;

	head = cl_object_header(obj);
	LINVRNT(spin_is_locked(&head->coh_lock_guard));
	CS_LOCK_INC(obj, lookup);
	list_for_each_entry(lock, &head->coh_locks, cll_linkage) {
		int matched;

		/* Reject locks that are dying, errored or cancelled. */
		matched = cl_lock_ext_match(&lock->cll_descr, need) &&
			  lock->cll_state < CLS_FREEING &&
			  lock->cll_error == 0 &&
			  !(lock->cll_flags & CLF_CANCELLED) &&
			  cl_lock_fits_into(env, lock, need, io);
		CDEBUG(D_DLMTRACE, "has: "DDESCR"(%d) need: "DDESCR": %d\n",
		       PDESCR(&lock->cll_descr), lock->cll_state, PDESCR(need),
		       matched);
		if (matched) {
			cl_lock_get_trust(lock);
			CS_LOCK_INC(obj, hit);
			RETURN(lock);
		}
	}
	RETURN(NULL);
}

/**
 * Returns a lock matching description \a need.
 *
 * This is the main entry point into the cl_lock caching interface. First, a
 * cache (implemented as a per-object linked list) is consulted. If lock is
 * found there, it is returned immediately. Otherwise new lock is allocated
 * and returned. In any case, additional reference to lock is acquired.
+ * + * \see cl_object_find(), cl_page_find() + */ +static struct cl_lock *cl_lock_find(const struct lu_env *env, + const struct cl_io *io, + const struct cl_lock_descr *need) +{ + struct cl_object_header *head; + struct cl_object *obj; + struct cl_lock *lock; + + ENTRY; + + obj = need->cld_obj; + head = cl_object_header(obj); + + spin_lock(&head->coh_lock_guard); + lock = cl_lock_lookup(env, obj, io, need); + spin_unlock(&head->coh_lock_guard); + + if (lock == NULL) { + lock = cl_lock_alloc(env, obj, io, need); + if (!IS_ERR(lock)) { + struct cl_lock *ghost; + + spin_lock(&head->coh_lock_guard); + ghost = cl_lock_lookup(env, obj, io, need); + if (ghost == NULL) { + list_add_tail(&lock->cll_linkage, + &head->coh_locks); + spin_unlock(&head->coh_lock_guard); + CS_LOCK_INC(obj, busy); + } else { + spin_unlock(&head->coh_lock_guard); + /* + * Other threads can acquire references to the + * top-lock through its sub-locks. Hence, it + * cannot be cl_lock_free()-ed immediately. + */ + cl_lock_finish(env, lock); + lock = ghost; + } + } + } + RETURN(lock); +} + +/** + * Returns existing lock matching given description. This is similar to + * cl_lock_find() except that no new lock is created, and returned lock is + * guaranteed to be in enum cl_lock_state::CLS_HELD state. + */ +struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_object_header *head; + struct cl_object *obj; + struct cl_lock *lock; + + obj = need->cld_obj; + head = cl_object_header(obj); + + do { + spin_lock(&head->coh_lock_guard); + lock = cl_lock_lookup(env, obj, io, need); + spin_unlock(&head->coh_lock_guard); + if (lock == NULL) + return NULL; + + cl_lock_mutex_get(env, lock); + if (lock->cll_state == CLS_INTRANSIT) + /* Don't care return value. 
*/ + cl_lock_state_wait(env, lock); + if (lock->cll_state == CLS_FREEING) { + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + lock = NULL; + } + } while (lock == NULL); + + cl_lock_hold_add(env, lock, scope, source); + cl_lock_user_add(env, lock); + if (lock->cll_state == CLS_CACHED) + cl_use_try(env, lock, 1); + if (lock->cll_state == CLS_HELD) { + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_acquire(env, lock, 0); + cl_lock_put(env, lock); + } else { + cl_unuse_try(env, lock); + cl_lock_unhold(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + lock = NULL; + } + + return lock; +} +EXPORT_SYMBOL(cl_lock_peek); + +/** + * Returns a slice within a lock, corresponding to the given layer in the + * device stack. + * + * \see cl_page_at() + */ +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype) +{ + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_invariant_trusted(NULL, lock)); + ENTRY; + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_obj->co_lu.lo_dev->ld_type == dtype) + RETURN(slice); + } + RETURN(NULL); +} +EXPORT_SYMBOL(cl_lock_at); + +static void cl_lock_mutex_tail(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_counters *counters; + + counters = cl_lock_counters(env, lock); + lock->cll_depth++; + counters->ctc_nr_locks_locked++; + lu_ref_add(&counters->ctc_locks_locked, "cll_guard", lock); + cl_lock_trace(D_TRACE, env, "got mutex", lock); +} + +/** + * Locks cl_lock object. + * + * This is used to manipulate cl_lock fields, and to serialize state + * transitions in the lock state machine. 
 *
 * \post cl_lock_is_mutexed(lock)
 *
 * \see cl_lock_mutex_put()
 */
void cl_lock_mutex_get(const struct lu_env *env, struct cl_lock *lock)
{
	LINVRNT(cl_lock_invariant(env, lock));

	if (lock->cll_guarder == current) {
		/* Recursive acquisition by the current owner: depth is
		 * bumped in cl_lock_mutex_tail() below. */
		LINVRNT(cl_lock_is_mutexed(lock));
		LINVRNT(lock->cll_depth > 0);
	} else {
		struct cl_object_header *hdr;
		struct cl_thread_info *info;
		int i;

		LINVRNT(lock->cll_guarder != current);
		hdr = cl_object_header(lock->cll_descr.cld_obj);
		/*
		 * Check that mutices are taken in the bottom-to-top order.
		 */
		info = cl_env_info(env);
		for (i = 0; i < hdr->coh_nesting; ++i)
			LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0);
		mutex_lock_nested(&lock->cll_guard, hdr->coh_nesting);
		lock->cll_guarder = current;
		LINVRNT(lock->cll_depth == 0);
	}
	cl_lock_mutex_tail(env, lock);
}
EXPORT_SYMBOL(cl_lock_mutex_get);

/**
 * Try-locks cl_lock object.
 *
 * \retval 0 \a lock was successfully locked
 *
 * \retval -EBUSY \a lock cannot be locked right now
 *
 * \post ergo(result == 0, cl_lock_is_mutexed(lock))
 *
 * \see cl_lock_mutex_get()
 */
int cl_lock_mutex_try(const struct lu_env *env, struct cl_lock *lock)
{
	int result;

	LINVRNT(cl_lock_invariant_trusted(env, lock));
	ENTRY;

	result = 0;
	if (lock->cll_guarder == current) {
		/* Already owned: recursive try-lock always succeeds. */
		LINVRNT(lock->cll_depth > 0);
		cl_lock_mutex_tail(env, lock);
	} else if (mutex_trylock(&lock->cll_guard)) {
		LINVRNT(lock->cll_depth == 0);
		lock->cll_guarder = current;
		cl_lock_mutex_tail(env, lock);
	} else
		result = -EBUSY;
	RETURN(result);
}
EXPORT_SYMBOL(cl_lock_mutex_try);

/**
 * Unlocks cl_lock object.
+ * + * \pre cl_lock_is_mutexed(lock) + * + * \see cl_lock_mutex_get() + */ +void cl_lock_mutex_put(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_counters *counters; + + LINVRNT(cl_lock_invariant(env, lock)); + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(lock->cll_guarder == current); + LINVRNT(lock->cll_depth > 0); + + counters = cl_lock_counters(env, lock); + LINVRNT(counters->ctc_nr_locks_locked > 0); + + cl_lock_trace(D_TRACE, env, "put mutex", lock); + lu_ref_del(&counters->ctc_locks_locked, "cll_guard", lock); + counters->ctc_nr_locks_locked--; + if (--lock->cll_depth == 0) { + lock->cll_guarder = NULL; + mutex_unlock(&lock->cll_guard); + } +} +EXPORT_SYMBOL(cl_lock_mutex_put); + +/** + * Returns true iff lock's mutex is owned by the current thread. + */ +int cl_lock_is_mutexed(struct cl_lock *lock) +{ + return lock->cll_guarder == current; +} +EXPORT_SYMBOL(cl_lock_is_mutexed); + +/** + * Returns number of cl_lock mutices held by the current thread (environment). + */ +int cl_lock_nr_mutexed(const struct lu_env *env) +{ + struct cl_thread_info *info; + int i; + int locked; + + /* + * NOTE: if summation across all nesting levels (currently 2) proves + * too expensive, a summary counter can be added to + * struct cl_thread_info. 
+ */ + info = cl_env_info(env); + for (i = 0, locked = 0; i < ARRAY_SIZE(info->clt_counters); ++i) + locked += info->clt_counters[i].ctc_nr_locks_locked; + return locked; +} +EXPORT_SYMBOL(cl_lock_nr_mutexed); + +static void cl_lock_cancel0(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + if (!(lock->cll_flags & CLF_CANCELLED)) { + const struct cl_lock_slice *slice; + + lock->cll_flags |= CLF_CANCELLED; + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_cancel != NULL) + slice->cls_ops->clo_cancel(env, slice); + } + } + EXIT; +} + +static void cl_lock_delete0(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_object_header *head; + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + ENTRY; + if (lock->cll_state < CLS_FREEING) { + LASSERT(lock->cll_state != CLS_INTRANSIT); + cl_lock_state_set(env, lock, CLS_FREEING); + + head = cl_object_header(lock->cll_descr.cld_obj); + + spin_lock(&head->coh_lock_guard); + list_del_init(&lock->cll_linkage); + spin_unlock(&head->coh_lock_guard); + + /* + * From now on, no new references to this lock can be acquired + * by cl_lock_lookup(). + */ + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_delete != NULL) + slice->cls_ops->clo_delete(env, slice); + } + /* + * From now on, no new references to this lock can be acquired + * by layer-specific means (like a pointer from struct + * ldlm_lock in osc, or a pointer from top-lock to sub-lock in + * lov). + * + * Lock will be finally freed in cl_lock_put() when last of + * existing references goes away. + */ + } + EXIT; +} + +/** + * Mod(ifie)s cl_lock::cll_holds counter for a given lock. Also, for a + * top-lock (nesting == 0) accounts for this modification in the per-thread + * debugging counters. 
Sub-lock holds can be released by a thread different
+ * from one that acquired it.
+ *
+ * \param delta +1 when a hold is taken, -1 when it is dropped; the
+ * per-thread debugging counter is maintained for top-locks (CNL_TOP) only.
+ */
+static void cl_lock_hold_mod(const struct lu_env *env, struct cl_lock *lock,
+ int delta)
+{
+ struct cl_thread_counters *counters;
+ enum clt_nesting_level nesting;
+
+ lock->cll_holds += delta;
+ nesting = cl_lock_nesting(lock);
+ if (nesting == CNL_TOP) {
+ counters = &cl_env_info(env)->clt_counters[CNL_TOP];
+ counters->ctc_nr_held += delta;
+ LASSERT(counters->ctc_nr_held >= 0);
+ }
+}
+
+/**
+ * Mod(ifie)s cl_lock::cll_users counter for a given lock. See
+ * cl_lock_hold_mod() for the explanation of the debugging code.
+ */
+static void cl_lock_used_mod(const struct lu_env *env, struct cl_lock *lock,
+ int delta)
+{
+ struct cl_thread_counters *counters;
+ enum clt_nesting_level nesting;
+
+ lock->cll_users += delta;
+ nesting = cl_lock_nesting(lock);
+ if (nesting == CNL_TOP) {
+ counters = &cl_env_info(env)->clt_counters[CNL_TOP];
+ counters->ctc_nr_used += delta;
+ LASSERT(counters->ctc_nr_used >= 0);
+ }
+}
+
+/**
+ * Releases a hold on \a lock previously taken under \a scope/\a source.
+ *
+ * When the last hold goes away, a pending cancellation (CLF_CANCELPEND)
+ * and/or deletion (CLF_DOOMED) recorded while holds were outstanding is
+ * carried out here; a lock that is still PHANTOM/GROUP, or not CACHED,
+ * is marked for both.
+ *
+ * \pre cl_lock_is_mutexed(lock) && lock->cll_holds > 0
+ */
+void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+ const char *scope, const void *source)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_holds > 0);
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "hold release lock", lock);
+ lu_ref_del(&lock->cll_holders, scope, source);
+ cl_lock_hold_mod(env, lock, -1);
+ if (lock->cll_holds == 0) {
+ CL_LOCK_ASSERT(lock->cll_state != CLS_HELD, env, lock);
+ if (lock->cll_descr.cld_mode == CLM_PHANTOM ||
+ lock->cll_descr.cld_mode == CLM_GROUP ||
+ lock->cll_state != CLS_CACHED)
+ /*
+ * If lock is still phantom or grouplock when user is
+ * done with it---destroy the lock.
+ */
+ lock->cll_flags |= CLF_CANCELPEND|CLF_DOOMED;
+ if (lock->cll_flags & CLF_CANCELPEND) {
+ lock->cll_flags &= ~CLF_CANCELPEND;
+ cl_lock_cancel0(env, lock);
+ }
+ if (lock->cll_flags & CLF_DOOMED) {
+ /* no longer doomed: it's dead... Jim.
*/ + lock->cll_flags &= ~CLF_DOOMED; + cl_lock_delete0(env, lock); + } + } + EXIT; +} +EXPORT_SYMBOL(cl_lock_hold_release); + +/** + * Waits until lock state is changed. + * + * This function is called with cl_lock mutex locked, atomically releases + * mutex and goes to sleep, waiting for a lock state change (signaled by + * cl_lock_signal()), and re-acquires the mutex before return. + * + * This function is used to wait until lock state machine makes some progress + * and to emulate synchronous operations on top of asynchronous lock + * interface. + * + * \retval -EINTR wait was interrupted + * + * \retval 0 wait wasn't interrupted + * + * \pre cl_lock_is_mutexed(lock) + * + * \see cl_lock_signal() + */ +int cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock) +{ + wait_queue_t waiter; + sigset_t blocked; + int result; + + ENTRY; + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_depth == 1); + LASSERT(lock->cll_state != CLS_FREEING); /* too late to wait */ + + cl_lock_trace(D_DLMTRACE, env, "state wait lock", lock); + result = lock->cll_error; + if (result == 0) { + /* To avoid being interrupted by the 'non-fatal' signals + * (SIGCHLD, for instance), we'd block them temporarily. 
+ * LU-305 */ + blocked = cfs_block_sigsinv(LUSTRE_FATAL_SIGS); + + init_waitqueue_entry_current(&waiter); + add_wait_queue(&lock->cll_wq, &waiter); + set_current_state(TASK_INTERRUPTIBLE); + cl_lock_mutex_put(env, lock); + + LASSERT(cl_lock_nr_mutexed(env) == 0); + + /* Returning ERESTARTSYS instead of EINTR so syscalls + * can be restarted if signals are pending here */ + result = -ERESTARTSYS; + if (likely(!OBD_FAIL_CHECK(OBD_FAIL_LOCK_STATE_WAIT_INTR))) { + waitq_wait(&waiter, TASK_INTERRUPTIBLE); + if (!cfs_signal_pending()) + result = 0; + } + + cl_lock_mutex_get(env, lock); + set_current_state(TASK_RUNNING); + remove_wait_queue(&lock->cll_wq, &waiter); + + /* Restore old blocked signals */ + cfs_restore_sigs(blocked); + } + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_state_wait); + +static void cl_lock_state_signal(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state) +{ + const struct cl_lock_slice *slice; + + ENTRY; + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) + if (slice->cls_ops->clo_state != NULL) + slice->cls_ops->clo_state(env, slice, state); + wake_up_all(&lock->cll_wq); + EXIT; +} + +/** + * Notifies waiters that lock state changed. + * + * Wakes up all waiters sleeping in cl_lock_state_wait(), also notifies all + * layers about state change by calling cl_lock_operations::clo_state() + * top-to-bottom. + */ +void cl_lock_signal(const struct lu_env *env, struct cl_lock *lock) +{ + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "state signal lock", lock); + cl_lock_state_signal(env, lock, lock->cll_state); + EXIT; +} +EXPORT_SYMBOL(cl_lock_signal); + +/** + * Changes lock state. + * + * This function is invoked to notify layers that lock state changed, possible + * as a result of an asynchronous event such as call-back reception. 
+ * + * \post lock->cll_state == state + * + * \see cl_lock_operations::clo_state() + */ +void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state) +{ + ENTRY; + LASSERT(lock->cll_state <= state || + (lock->cll_state == CLS_CACHED && + (state == CLS_HELD || /* lock found in cache */ + state == CLS_NEW || /* sub-lock canceled */ + state == CLS_INTRANSIT)) || + /* lock is in transit state */ + lock->cll_state == CLS_INTRANSIT); + + if (lock->cll_state != state) { + CS_LOCKSTATE_DEC(lock->cll_descr.cld_obj, lock->cll_state); + CS_LOCKSTATE_INC(lock->cll_descr.cld_obj, state); + + cl_lock_state_signal(env, lock, state); + lock->cll_state = state; + } + EXIT; +} +EXPORT_SYMBOL(cl_lock_state_set); + +static int cl_unuse_try_internal(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + int result; + + do { + result = 0; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_state == CLS_INTRANSIT); + + result = -ENOSYS; + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_unuse != NULL) { + result = slice->cls_ops->clo_unuse(env, slice); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + } while (result == CLO_REPEAT); + + return result; +} + +/** + * Yanks lock from the cache (cl_lock_state::CLS_CACHED state) by calling + * cl_lock_operations::clo_use() top-to-bottom to notify layers. 
+ * If @atomic = 1 and the use fails part-way, the lock is unused again so
+ * that the whole use operation stays atomic.
+ */
+int cl_use_try(const struct lu_env *env, struct cl_lock *lock, int atomic)
+{
+ const struct cl_lock_slice *slice;
+ int result;
+ enum cl_lock_state state;
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "use lock", lock);
+
+ LASSERT(lock->cll_state == CLS_CACHED);
+ if (lock->cll_error)
+ RETURN(lock->cll_error);
+
+ result = -ENOSYS;
+ /* Mark the lock in-transit; the returned value is the previous state,
+ * restored via cl_lock_extransit() if no layer changes it below. */
+ state = cl_lock_intransit(env, lock);
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_use != NULL) {
+ result = slice->cls_ops->clo_use(env, slice);
+ if (result != 0)
+ break;
+ }
+ }
+ /* at least one layer must implement ->clo_use() */
+ LASSERT(result != -ENOSYS);
+
+ LASSERTF(lock->cll_state == CLS_INTRANSIT, "Wrong state %d.\n",
+ lock->cll_state);
+
+ if (result == 0) {
+ state = CLS_HELD;
+ } else {
+ if (result == -ESTALE) {
+ /*
+ * ESTALE means sublock being cancelled
+ * at this time, and set lock state to
+ * be NEW here and ask the caller to repeat.
+ */
+ state = CLS_NEW;
+ result = CLO_REPEAT;
+ }
+
+ /* @atomic means back-off-on-failure. */
+ if (atomic) {
+ int rc;
+ rc = cl_unuse_try_internal(env, lock);
+ /* Vet the results. */
+ if (rc < 0 && result > 0)
+ result = rc;
+ }
+
+ }
+ cl_lock_extransit(env, lock, state);
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_use_try);
+
+/**
+ * Helper for cl_enqueue_try() that calls ->clo_enqueue() across all layers
+ * top-to-bottom.
+ */
+static int cl_enqueue_kick(const struct lu_env *env,
+ struct cl_lock *lock,
+ struct cl_io *io, __u32 flags)
+{
+ int result;
+ const struct cl_lock_slice *slice;
+
+ ENTRY;
+ result = -ENOSYS;
+ list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_enqueue != NULL) {
+ result = slice->cls_ops->clo_enqueue(env,
+ slice, io, flags);
+ if (result != 0)
+ break;
+ }
+ }
+ /* at least one layer must implement ->clo_enqueue() */
+ LASSERT(result != -ENOSYS);
+ RETURN(result);
+}
+
+/**
+ * Tries to enqueue a lock.
+ *
+ * This function is called repeatedly by cl_enqueue() until either lock is
+ * enqueued, or error occurs. This function does not block waiting for
+ * networking communication to complete.
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ * lock->cll_state == CLS_HELD)
+ *
+ * \see cl_enqueue() cl_lock_operations::clo_enqueue()
+ * \see cl_lock_state::CLS_ENQUEUED
+ */
+int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_io *io, __u32 flags)
+{
+ int result;
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "enqueue lock", lock);
+ /* Drive the state machine until it settles; CLO_REPEAT from a layer
+ * (e.g., via cl_use_try() on a CACHED lock) requests another pass. */
+ do {
+ LINVRNT(cl_lock_is_mutexed(lock));
+
+ result = lock->cll_error;
+ if (result != 0)
+ break;
+
+ switch (lock->cll_state) {
+ case CLS_NEW:
+ cl_lock_state_set(env, lock, CLS_QUEUING);
+ /* fall-through */
+ case CLS_QUEUING:
+ /* kick layers. */
+ result = cl_enqueue_kick(env, lock, io, flags);
+ /* For AGL case, the cl_lock::cll_state may
+ * become CLS_HELD already. */
+ if (result == 0 && lock->cll_state == CLS_QUEUING)
+ cl_lock_state_set(env, lock, CLS_ENQUEUED);
+ break;
+ case CLS_INTRANSIT:
+ LASSERT(cl_lock_is_intransit(lock));
+ result = CLO_WAIT;
+ break;
+ case CLS_CACHED:
+ /* yank lock from the cache. */
+ result = cl_use_try(env, lock, 0);
+ break;
+ case CLS_ENQUEUED:
+ case CLS_HELD:
+ result = 0;
+ break;
+ default:
+ case CLS_FREEING:
+ /*
+ * impossible, only held locks with increased
+ * ->cll_holds can be enqueued, and they cannot be
+ * freed.
+ */
+ LBUG();
+ }
+ } while (result == CLO_REPEAT);
+ RETURN(result);
+}
+EXPORT_SYMBOL(cl_enqueue_try);
+
+/**
+ * Cancel the conflicting lock found during previous enqueue.
+ *
+ * \retval 0 conflicting lock has been canceled.
+ * \retval -ve error code.
+ */
+int cl_lock_enqueue_wait(const struct lu_env *env,
+ struct cl_lock *lock,
+ int keep_mutex)
+{
+ struct cl_lock *conflict;
+ int rc = 0;
+ ENTRY;
+
+ LASSERT(cl_lock_is_mutexed(lock));
+ LASSERT(lock->cll_state == CLS_QUEUING);
+ LASSERT(lock->cll_conflict != NULL);
+
+ /* Take ownership of the recorded conflict and clear it on the lock. */
+ conflict = lock->cll_conflict;
+ lock->cll_conflict = NULL;
+
+ /* Drop our mutex before taking the conflict's to avoid holding two
+ * lock mutices at once. */
+ cl_lock_mutex_put(env, lock);
+ LASSERT(cl_lock_nr_mutexed(env) == 0);
+
+ cl_lock_mutex_get(env, conflict);
+ cl_lock_trace(D_DLMTRACE, env, "enqueue wait", conflict);
+ cl_lock_cancel(env, conflict);
+ cl_lock_delete(env, conflict);
+
+ /* Wait until the conflicting lock actually reaches CLS_FREEING. */
+ while (conflict->cll_state != CLS_FREEING) {
+ rc = cl_lock_state_wait(env, conflict);
+ if (rc != 0)
+ break;
+ }
+ cl_lock_mutex_put(env, conflict);
+ lu_ref_del(&conflict->cll_reference, "cancel-wait", lock);
+ cl_lock_put(env, conflict);
+
+ if (keep_mutex)
+ cl_lock_mutex_get(env, lock);
+
+ LASSERT(rc <= 0);
+ RETURN(rc);
+}
+EXPORT_SYMBOL(cl_lock_enqueue_wait);
+
+/**
+ * Enqueue \a lock with its mutex already held: adds a user reference and
+ * loops over cl_enqueue_try(), sleeping on CLO_WAIT (either waiting for a
+ * conflicting lock to be cancelled, or for a state change).  On failure the
+ * lock is unused again.
+ *
+ * \pre cl_lock_is_mutexed(lock) && lock->cll_holds > 0
+ */
+static int cl_enqueue_locked(const struct lu_env *env, struct cl_lock *lock,
+ struct cl_io *io, __u32 enqflags)
+{
+ int result;
+
+ ENTRY;
+
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(lock->cll_holds > 0);
+
+ cl_lock_user_add(env, lock);
+ do {
+ result = cl_enqueue_try(env, lock, io, enqflags);
+ if (result == CLO_WAIT) {
+ if (lock->cll_conflict != NULL)
+ result = cl_lock_enqueue_wait(env, lock, 1);
+ else
+ result = cl_lock_state_wait(env, lock);
+ if (result == 0)
+ continue;
+ }
+ break;
+ } while (1);
+ if (result != 0)
+ cl_unuse_try(env, lock);
+ LASSERT(ergo(result == 0 && !(enqflags & CEF_AGL),
+ lock->cll_state == CLS_ENQUEUED ||
+ lock->cll_state == CLS_HELD));
+ RETURN(result);
+}
+
+/**
+ * Enqueues a lock.
+ *
+ * \pre current thread or io owns a hold on lock.
+ * + * \post ergo(result == 0, lock->users increased) + * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + * lock->cll_state == CLS_HELD) + */ +int cl_enqueue(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 enqflags) +{ + int result; + + ENTRY; + + cl_lock_lockdep_acquire(env, lock, enqflags); + cl_lock_mutex_get(env, lock); + result = cl_enqueue_locked(env, lock, io, enqflags); + cl_lock_mutex_put(env, lock); + if (result != 0) + cl_lock_lockdep_release(env, lock); + LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + lock->cll_state == CLS_HELD)); + RETURN(result); +} +EXPORT_SYMBOL(cl_enqueue); + +/** + * Tries to unlock a lock. + * + * This function is called to release underlying resource: + * 1. for top lock, the resource is sublocks it held; + * 2. for sublock, the resource is the reference to dlmlock. + * + * cl_unuse_try is a one-shot operation, so it must NOT return CLO_WAIT. + * + * \see cl_unuse() cl_lock_operations::clo_unuse() + * \see cl_lock_state::CLS_CACHED + */ +int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + enum cl_lock_state state = CLS_NEW; + + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "unuse lock", lock); + + if (lock->cll_users > 1) { + cl_lock_user_del(env, lock); + RETURN(0); + } + + /* Only if the lock is in CLS_HELD or CLS_ENQUEUED state, it can hold + * underlying resources. */ + if (!(lock->cll_state == CLS_HELD || lock->cll_state == CLS_ENQUEUED)) { + cl_lock_user_del(env, lock); + RETURN(0); + } + + /* + * New lock users (->cll_users) are not protecting unlocking + * from proceeding. From this point, lock eventually reaches + * CLS_CACHED, is reinitialized to CLS_NEW or fails into + * CLS_FREEING. 
+ */ + state = cl_lock_intransit(env, lock); + + result = cl_unuse_try_internal(env, lock); + LASSERT(lock->cll_state == CLS_INTRANSIT); + LASSERT(result != CLO_WAIT); + cl_lock_user_del(env, lock); + if (result == 0 || result == -ESTALE) { + /* + * Return lock back to the cache. This is the only + * place where lock is moved into CLS_CACHED state. + * + * If one of ->clo_unuse() methods returned -ESTALE, lock + * cannot be placed into cache and has to be + * re-initialized. This happens e.g., when a sub-lock was + * canceled while unlocking was in progress. + */ + if (state == CLS_HELD && result == 0) + state = CLS_CACHED; + else + state = CLS_NEW; + cl_lock_extransit(env, lock, state); + + /* + * Hide -ESTALE error. + * If the lock is a glimpse lock, and it has multiple + * stripes. Assuming that one of its sublock returned -ENAVAIL, + * and other sublocks are matched write locks. In this case, + * we can't set this lock to error because otherwise some of + * its sublocks may not be canceled. This causes some dirty + * pages won't be written to OSTs. -jay + */ + result = 0; + } else { + CERROR("result = %d, this is unlikely!\n", result); + state = CLS_NEW; + cl_lock_extransit(env, lock, state); + } + RETURN(result ?: lock->cll_error); +} +EXPORT_SYMBOL(cl_unuse_try); + +static void cl_unuse_locked(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + ENTRY; + + result = cl_unuse_try(env, lock); + if (result) + CL_LOCK_DEBUG(D_ERROR, env, lock, "unuse return %d\n", result); + + EXIT; +} + +/** + * Unlocks a lock. + */ +void cl_unuse(const struct lu_env *env, struct cl_lock *lock) +{ + ENTRY; + cl_lock_mutex_get(env, lock); + cl_unuse_locked(env, lock); + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_release(env, lock); + EXIT; +} +EXPORT_SYMBOL(cl_unuse); + +/** + * Tries to wait for a lock. + * + * This function is called repeatedly by cl_wait() until either lock is + * granted, or error occurs. 
This function does not block waiting for network + * communication to complete. + * + * \see cl_wait() cl_lock_operations::clo_wait() + * \see cl_lock_state::CLS_HELD + */ +int cl_wait_try(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + int result; + + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "wait lock try", lock); + do { + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERTF(lock->cll_state == CLS_QUEUING || + lock->cll_state == CLS_ENQUEUED || + lock->cll_state == CLS_HELD || + lock->cll_state == CLS_INTRANSIT, + "lock state: %d\n", lock->cll_state); + LASSERT(lock->cll_users > 0); + LASSERT(lock->cll_holds > 0); + + result = lock->cll_error; + if (result != 0) + break; + + if (cl_lock_is_intransit(lock)) { + result = CLO_WAIT; + break; + } + + if (lock->cll_state == CLS_HELD) + /* nothing to do */ + break; + + result = -ENOSYS; + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_wait != NULL) { + result = slice->cls_ops->clo_wait(env, slice); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + if (result == 0) { + LASSERT(lock->cll_state != CLS_INTRANSIT); + cl_lock_state_set(env, lock, CLS_HELD); + } + } while (result == CLO_REPEAT); + RETURN(result); +} +EXPORT_SYMBOL(cl_wait_try); + +/** + * Waits until enqueued lock is granted. 
+ * + * \pre current thread or io owns a hold on the lock + * \pre ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + * lock->cll_state == CLS_HELD) + * + * \post ergo(result == 0, lock->cll_state == CLS_HELD) + */ +int cl_wait(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + + ENTRY; + cl_lock_mutex_get(env, lock); + + LINVRNT(cl_lock_invariant(env, lock)); + LASSERTF(lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD, + "Wrong state %d \n", lock->cll_state); + LASSERT(lock->cll_holds > 0); + + do { + result = cl_wait_try(env, lock); + if (result == CLO_WAIT) { + result = cl_lock_state_wait(env, lock); + if (result == 0) + continue; + } + break; + } while (1); + if (result < 0) { + cl_unuse_try(env, lock); + cl_lock_lockdep_release(env, lock); + } + cl_lock_trace(D_DLMTRACE, env, "wait lock", lock); + cl_lock_mutex_put(env, lock); + LASSERT(ergo(result == 0, lock->cll_state == CLS_HELD)); + RETURN(result); +} +EXPORT_SYMBOL(cl_wait); + +/** + * Executes cl_lock_operations::clo_weigh(), and sums results to estimate lock + * value. + */ +unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + unsigned long pound; + unsigned long ounce; + + ENTRY; + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + pound = 0; + list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_weigh != NULL) { + ounce = slice->cls_ops->clo_weigh(env, slice); + pound += ounce; + if (pound < ounce) /* over-weight^Wflow */ + pound = ~0UL; + } + } + RETURN(pound); +} +EXPORT_SYMBOL(cl_lock_weigh); + +/** + * Notifies layers that lock description changed. + * + * The server can grant client a lock different from one that was requested + * (e.g., larger in extent). This method is called when actually granted lock + * description becomes known to let layers to accommodate for changed lock + * description. 
+ *
+ * \see cl_lock_operations::clo_modify()
+ */
+int cl_lock_modify(const struct lu_env *env, struct cl_lock *lock,
+ const struct cl_lock_descr *desc)
+{
+ const struct cl_lock_slice *slice;
+ struct cl_object *obj = lock->cll_descr.cld_obj;
+ struct cl_object_header *hdr = cl_object_header(obj);
+ int result;
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "modify lock", lock);
+ /* don't allow object to change */
+ LASSERT(obj == desc->cld_obj);
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ /* notify layers bottom-to-top; first failure aborts the modification */
+ list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+ if (slice->cls_ops->clo_modify != NULL) {
+ result = slice->cls_ops->clo_modify(env, slice, desc);
+ if (result != 0)
+ RETURN(result);
+ }
+ }
+ CL_LOCK_DEBUG(D_DLMTRACE, env, lock, " -> "DDESCR"@"DFID"\n",
+ PDESCR(desc), PFID(lu_object_fid(&desc->cld_obj->co_lu)));
+ /*
+ * Just replace description in place. Nothing more is needed for
+ * now. If locks were indexed according to their extent and/or mode,
+ * that index would have to be updated here.
+ */
+ /* coh_lock_guard serializes the update against concurrent readers of
+ * cll_descr (presumably cl_lock_lookup() — confirm against header). */
+ spin_lock(&hdr->coh_lock_guard);
+ lock->cll_descr = *desc;
+ spin_unlock(&hdr->coh_lock_guard);
+ RETURN(0);
+}
+EXPORT_SYMBOL(cl_lock_modify);
+
+/**
+ * Initializes lock closure with a given origin.
+ *
+ * \see cl_lock_closure
+ */
+void cl_lock_closure_init(const struct lu_env *env,
+ struct cl_lock_closure *closure,
+ struct cl_lock *origin, int wait)
+{
+ LINVRNT(cl_lock_is_mutexed(origin));
+ LINVRNT(cl_lock_invariant(env, origin));
+
+ INIT_LIST_HEAD(&closure->clc_list);
+ closure->clc_origin = origin;
+ closure->clc_wait = wait;
+ closure->clc_nr = 0;
+}
+EXPORT_SYMBOL(cl_lock_closure_init);
+
+/**
+ * Builds a closure of \a lock.
+ *
+ * Building of a closure consists of adding initial lock (\a lock) into it,
+ * and calling cl_lock_operations::clo_closure() methods of \a lock. These
+ * methods might call cl_lock_closure_build() recursively again, adding more
+ * locks to the closure, etc.
+ * + * \see cl_lock_closure + */ +int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure) +{ + const struct cl_lock_slice *slice; + int result; + + ENTRY; + LINVRNT(cl_lock_is_mutexed(closure->clc_origin)); + LINVRNT(cl_lock_invariant(env, closure->clc_origin)); + + result = cl_lock_enclosure(env, lock, closure); + if (result == 0) { + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_closure != NULL) { + result = slice->cls_ops->clo_closure(env, slice, + closure); + if (result != 0) + break; + } + } + } + if (result != 0) + cl_lock_disclosure(env, closure); + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_closure_build); + +/** + * Adds new lock to a closure. + * + * Try-locks \a lock and if succeeded, adds it to the closure (never more than + * once). If try-lock failed, returns CLO_REPEAT, after optionally waiting + * until next try-lock is likely to succeed. + */ +int cl_lock_enclosure(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure) +{ + int result = 0; + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "enclosure lock", lock); + if (!cl_lock_mutex_try(env, lock)) { + /* + * If lock->cll_inclosure is not empty, lock is already in + * this closure. 
+ */ + if (list_empty(&lock->cll_inclosure)) { + cl_lock_get_trust(lock); + lu_ref_add(&lock->cll_reference, "closure", closure); + list_add(&lock->cll_inclosure, &closure->clc_list); + closure->clc_nr++; + } else + cl_lock_mutex_put(env, lock); + result = 0; + } else { + cl_lock_disclosure(env, closure); + if (closure->clc_wait) { + cl_lock_get_trust(lock); + lu_ref_add(&lock->cll_reference, "closure-w", closure); + cl_lock_mutex_put(env, closure->clc_origin); + + LASSERT(cl_lock_nr_mutexed(env) == 0); + cl_lock_mutex_get(env, lock); + cl_lock_mutex_put(env, lock); + + cl_lock_mutex_get(env, closure->clc_origin); + lu_ref_del(&lock->cll_reference, "closure-w", closure); + cl_lock_put(env, lock); + } + result = CLO_REPEAT; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_enclosure); + +/** Releases mutices of enclosed locks. */ +void cl_lock_disclosure(const struct lu_env *env, + struct cl_lock_closure *closure) +{ + struct cl_lock *scan; + struct cl_lock *temp; + + cl_lock_trace(D_DLMTRACE, env, "disclosure lock", closure->clc_origin); + list_for_each_entry_safe(scan, temp, &closure->clc_list, + cll_inclosure){ + list_del_init(&scan->cll_inclosure); + cl_lock_mutex_put(env, scan); + lu_ref_del(&scan->cll_reference, "closure", closure); + cl_lock_put(env, scan); + closure->clc_nr--; + } + LASSERT(closure->clc_nr == 0); +} +EXPORT_SYMBOL(cl_lock_disclosure); + +/** Finalizes a closure. */ +void cl_lock_closure_fini(struct cl_lock_closure *closure) +{ + LASSERT(closure->clc_nr == 0); + LASSERT(list_empty(&closure->clc_list)); +} +EXPORT_SYMBOL(cl_lock_closure_fini); + +/** + * Destroys this lock. Notifies layers (bottom-to-top) that lock is being + * destroyed, then destroy the lock. If there are holds on the lock, postpone + * destruction until all holds are released. This is called when a decision is + * made to destroy the lock in the future. E.g., when a blocking AST is + * received on it, or fatal communication error happens. 
+ *
+ * Caller must have a reference on this lock to prevent a situation, when
+ * deleted lock lingers in memory for indefinite time, because nobody calls
+ * cl_lock_put() to finish it.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ * \pre ergo(cl_lock_nesting(lock) == CNL_TOP,
+ * cl_lock_nr_mutexed(env) == 1)
+ * [i.e., if a top-lock is deleted, mutices of no other locks can be
+ * held, as deletion of sub-locks might require releasing a top-lock
+ * mutex]
+ *
+ * \see cl_lock_operations::clo_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+ LASSERT(ergo(cl_lock_nesting(lock) == CNL_TOP,
+ cl_lock_nr_mutexed(env) == 1));
+
+ ENTRY;
+ cl_lock_trace(D_DLMTRACE, env, "delete lock", lock);
+ /* with holds outstanding, deletion is deferred to
+ * cl_lock_hold_release() via CLF_DOOMED */
+ if (lock->cll_holds == 0)
+ cl_lock_delete0(env, lock);
+ else
+ lock->cll_flags |= CLF_DOOMED;
+ EXIT;
+}
+EXPORT_SYMBOL(cl_lock_delete);
+
+/**
+ * Mark lock as irrecoverably failed, and mark it for destruction. This
+ * happens when, e.g., server fails to grant a lock to us, or networking
+ * time-out happens.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ *
+ * \see clo_lock_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_error(const struct lu_env *env, struct cl_lock *lock, int error)
+{
+ LINVRNT(cl_lock_is_mutexed(lock));
+ LINVRNT(cl_lock_invariant(env, lock));
+
+ ENTRY;
+ /* only the first error is recorded; later errors are ignored */
+ if (lock->cll_error == 0 && error != 0) {
+ cl_lock_trace(D_DLMTRACE, env, "set lock error", lock);
+ lock->cll_error = error;
+ cl_lock_signal(env, lock);
+ cl_lock_cancel(env, lock);
+ cl_lock_delete(env, lock);
+ }
+ EXIT;
+}
+EXPORT_SYMBOL(cl_lock_error);
+
+/**
+ * Cancels this lock. Notifies layers
+ * (bottom-to-top) that lock is being cancelled, then destroy the lock. If
+ * there are holds on the lock, postpone cancellation until
+ * all holds are released.
+ *
+ * Cancellation notification is delivered to layers at most once.
+ * + * \see cl_lock_operations::clo_cancel() + * \see cl_lock::cll_holds + */ +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "cancel lock", lock); + if (lock->cll_holds == 0) + cl_lock_cancel0(env, lock); + else + lock->cll_flags |= CLF_CANCELPEND; + EXIT; +} +EXPORT_SYMBOL(cl_lock_cancel); + +/** + * Finds an existing lock covering given index and optionally different from a + * given \a except lock. + */ +struct cl_lock *cl_lock_at_pgoff(const struct lu_env *env, + struct cl_object *obj, pgoff_t index, + struct cl_lock *except, + int pending, int canceld) +{ + struct cl_object_header *head; + struct cl_lock *scan; + struct cl_lock *lock; + struct cl_lock_descr *need; + + ENTRY; + + head = cl_object_header(obj); + need = &cl_env_info(env)->clt_descr; + lock = NULL; + + need->cld_mode = CLM_READ; /* CLM_READ matches both READ & WRITE, but + * not PHANTOM */ + need->cld_start = need->cld_end = index; + need->cld_enq_flags = 0; + + spin_lock(&head->coh_lock_guard); + /* It is fine to match any group lock since there could be only one + * with a uniq gid and it conflicts with all other lock modes too */ + list_for_each_entry(scan, &head->coh_locks, cll_linkage) { + if (scan != except && + (scan->cll_descr.cld_mode == CLM_GROUP || + cl_lock_ext_match(&scan->cll_descr, need)) && + scan->cll_state >= CLS_HELD && + scan->cll_state < CLS_FREEING && + /* + * This check is racy as the lock can be canceled right + * after it is done, but this is fine, because page exists + * already. + */ + (canceld || !(scan->cll_flags & CLF_CANCELLED)) && + (pending || !(scan->cll_flags & CLF_CANCELPEND))) { + /* Don't increase cs_hit here since this + * is just a helper function. 
*/ + cl_lock_get_trust(scan); + lock = scan; + break; + } + } + spin_unlock(&head->coh_lock_guard); + RETURN(lock); +} +EXPORT_SYMBOL(cl_lock_at_pgoff); + +/** + * Calculate the page offset at the layer of @lock. + * At the time of this writing, @page is top page and @lock is sub lock. + */ +static pgoff_t pgoff_at_lock(struct cl_page *page, struct cl_lock *lock) +{ + struct lu_device_type *dtype; + const struct cl_page_slice *slice; + + dtype = lock->cll_descr.cld_obj->co_lu.lo_dev->ld_type; + slice = cl_page_at(page, dtype); + LASSERT(slice != NULL); + return slice->cpl_page->cp_index; +} + +/** + * Check if page @page is covered by an extra lock or discard it. + */ +static int check_and_discard_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_lock *lock = cbdata; + pgoff_t index = pgoff_at_lock(page, lock); + + if (index >= info->clt_fn_index) { + struct cl_lock *tmp; + + /* refresh non-overlapped index */ + tmp = cl_lock_at_pgoff(env, lock->cll_descr.cld_obj, index, + lock, 1, 0); + if (tmp != NULL) { + /* Cache the first-non-overlapped index so as to skip + * all pages within [index, clt_fn_index). This + * is safe because if tmp lock is canceled, it will + * discard these pages. 
*/ + info->clt_fn_index = tmp->cll_descr.cld_end + 1; + if (tmp->cll_descr.cld_end == CL_PAGE_EOF) + info->clt_fn_index = CL_PAGE_EOF; + cl_lock_put(env, tmp); + } else if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + } + + info->clt_next_index = index + 1; + return CLP_GANG_OKAY; +} + +static int discard_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_lock *lock = cbdata; + + LASSERT(lock->cll_descr.cld_mode >= CLM_WRITE); + KLASSERT(ergo(page->cp_type == CPT_CACHEABLE, + !PageWriteback(cl_page_vmpage(env, page)))); + KLASSERT(ergo(page->cp_type == CPT_CACHEABLE, + !PageDirty(cl_page_vmpage(env, page)))); + + info->clt_next_index = pgoff_at_lock(page, lock) + 1; + if (cl_page_own(env, io, page) == 0) { + /* discard the page */ + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + } else { + LASSERT(page->cp_state == CPS_FREEING); + } + + return CLP_GANG_OKAY; +} + +/** + * Discard pages protected by the given lock. This function traverses radix + * tree to find all covering pages and discard them. If a page is being covered + * by other locks, it should remain in cache. + * + * If error happens on any step, the process continues anyway (the reasoning + * behind this being that lock cancellation cannot be delayed indefinitely). 
+ */ +int cl_lock_discard_pages(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_io *io = &info->clt_io; + struct cl_lock_descr *descr = &lock->cll_descr; + cl_page_gang_cb_t cb; + int res; + int result; + + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + + io->ci_obj = cl_object_top(descr->cld_obj); + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result != 0) + GOTO(out, result); + + cb = descr->cld_mode == CLM_READ ? check_and_discard_cb : discard_cb; + info->clt_fn_index = info->clt_next_index = descr->cld_start; + do { + res = cl_page_gang_lookup(env, descr->cld_obj, io, + info->clt_next_index, descr->cld_end, + cb, (void *)lock); + if (info->clt_next_index > descr->cld_end) + break; + + if (res == CLP_GANG_RESCHED) + cond_resched(); + } while (res != CLP_GANG_OKAY); +out: + cl_io_fini(env, io); + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_discard_pages); + +/** + * Eliminate all locks for a given object. + * + * Caller has to guarantee that no lock is in active use. + * + * \param cancel when this is set, cl_locks_prune() cancels locks before + * destroying. + */ +void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int cancel) +{ + struct cl_object_header *head; + struct cl_lock *lock; + + ENTRY; + head = cl_object_header(obj); + /* + * If locks are destroyed without cancellation, all pages must be + * already destroyed (as otherwise they will be left unprotected). 
+ */ + LASSERT(ergo(!cancel, + head->coh_tree.rnode == NULL && head->coh_pages == 0)); + + spin_lock(&head->coh_lock_guard); + while (!list_empty(&head->coh_locks)) { + lock = container_of(head->coh_locks.next, + struct cl_lock, cll_linkage); + cl_lock_get_trust(lock); + spin_unlock(&head->coh_lock_guard); + lu_ref_add(&lock->cll_reference, "prune", current); + +again: + cl_lock_mutex_get(env, lock); + if (lock->cll_state < CLS_FREEING) { + LASSERT(lock->cll_users <= 1); + if (unlikely(lock->cll_users == 1)) { + struct l_wait_info lwi = { 0 }; + + cl_lock_mutex_put(env, lock); + l_wait_event(lock->cll_wq, + lock->cll_users == 0, + &lwi); + goto again; + } + + if (cancel) + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + } + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, "prune", current); + cl_lock_put(env, lock); + spin_lock(&head->coh_lock_guard); + } + spin_unlock(&head->coh_lock_guard); + EXIT; +} +EXPORT_SYMBOL(cl_locks_prune); + +static struct cl_lock *cl_lock_hold_mutex(const struct lu_env *env, + const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + + ENTRY; + + while (1) { + lock = cl_lock_find(env, io, need); + if (IS_ERR(lock)) + break; + cl_lock_mutex_get(env, lock); + if (lock->cll_state < CLS_FREEING && + !(lock->cll_flags & CLF_CANCELLED)) { + cl_lock_hold_mod(env, lock, +1); + lu_ref_add(&lock->cll_holders, scope, source); + lu_ref_add(&lock->cll_reference, scope, source); + break; + } + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + } + RETURN(lock); +} + +/** + * Returns a lock matching \a need description with a reference and a hold on + * it. + * + * This is much like cl_lock_find(), except that cl_lock_hold() additionally + * guarantees that lock is not in the CLS_FREEING state on return. 
+ */ +struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + + ENTRY; + + lock = cl_lock_hold_mutex(env, io, need, scope, source); + if (!IS_ERR(lock)) + cl_lock_mutex_put(env, lock); + RETURN(lock); +} +EXPORT_SYMBOL(cl_lock_hold); + +/** + * Main high-level entry point of cl_lock interface that finds existing or + * enqueues new lock matching given description. + */ +struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + int rc; + __u32 enqflags = need->cld_enq_flags; + + ENTRY; + do { + lock = cl_lock_hold_mutex(env, io, need, scope, source); + if (IS_ERR(lock)) + break; + + rc = cl_enqueue_locked(env, lock, io, enqflags); + if (rc == 0) { + if (cl_lock_fits_into(env, lock, need, io)) { + if (!(enqflags & CEF_AGL)) { + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_acquire(env, lock, + enqflags); + break; + } + rc = 1; + } + cl_unuse_locked(env, lock); + } + cl_lock_trace(D_DLMTRACE, env, + rc <= 0 ? "enqueue failed" : "agl succeed", lock); + cl_lock_hold_release(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); + if (rc > 0) { + LASSERT(enqflags & CEF_AGL); + lock = NULL; + } else if (rc != 0) { + lock = ERR_PTR(rc); + } + } while (rc == 0); + RETURN(lock); +} +EXPORT_SYMBOL(cl_lock_request); + +/** + * Adds a hold to a known lock. 
+ */ +void cl_lock_hold_add(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_state != CLS_FREEING); + + ENTRY; + cl_lock_hold_mod(env, lock, +1); + cl_lock_get(lock); + lu_ref_add(&lock->cll_holders, scope, source); + lu_ref_add(&lock->cll_reference, scope, source); + EXIT; +} +EXPORT_SYMBOL(cl_lock_hold_add); + +/** + * Releases a hold and a reference on a lock, on which caller acquired a + * mutex. + */ +void cl_lock_unhold(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + cl_lock_hold_release(env, lock, scope, source); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); + EXIT; +} +EXPORT_SYMBOL(cl_lock_unhold); + +/** + * Releases a hold and a reference on a lock, obtained by cl_lock_hold(). + */ +void cl_lock_release(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + cl_lock_trace(D_DLMTRACE, env, "release lock", lock); + cl_lock_mutex_get(env, lock); + cl_lock_hold_release(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); + EXIT; +} +EXPORT_SYMBOL(cl_lock_release); + +void cl_lock_user_add(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + ENTRY; + cl_lock_used_mod(env, lock, +1); + EXIT; +} +EXPORT_SYMBOL(cl_lock_user_add); + +void cl_lock_user_del(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_users > 0); + + ENTRY; + cl_lock_used_mod(env, lock, -1); + if (lock->cll_users == 0) + wake_up_all(&lock->cll_wq); + EXIT; +} 
+EXPORT_SYMBOL(cl_lock_user_del); + +const char *cl_lock_mode_name(const enum cl_lock_mode mode) +{ + static const char *names[] = { + [CLM_PHANTOM] = "P", + [CLM_READ] = "R", + [CLM_WRITE] = "W", + [CLM_GROUP] = "G" + }; + if (0 <= mode && mode < ARRAY_SIZE(names)) + return names[mode]; + else + return "U"; +} +EXPORT_SYMBOL(cl_lock_mode_name); + +/** + * Prints human readable representation of a lock description. + */ +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr) +{ + const struct lu_fid *fid; + + fid = lu_object_fid(&descr->cld_obj->co_lu); + (*printer)(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid)); +} +EXPORT_SYMBOL(cl_lock_descr_print); + +/** + * Prints human readable representation of \a lock to the \a f. + */ +void cl_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + (*printer)(env, cookie, "lock@%p[%d %d %d %d %d %08lx] ", + lock, atomic_read(&lock->cll_ref), + lock->cll_state, lock->cll_error, lock->cll_holds, + lock->cll_users, lock->cll_flags); + cl_lock_descr_print(env, cookie, printer, &lock->cll_descr); + (*printer)(env, cookie, " {\n"); + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + (*printer)(env, cookie, " %s@%p: ", + slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name, + slice); + if (slice->cls_ops->clo_print != NULL) + slice->cls_ops->clo_print(env, cookie, printer, slice); + (*printer)(env, cookie, "\n"); + } + (*printer)(env, cookie, "} lock@%p\n", lock); +} +EXPORT_SYMBOL(cl_lock_print); + +int cl_lock_init(void) +{ + return lu_kmem_init(cl_lock_caches); +} + +void cl_lock_fini(void) +{ + lu_kmem_fini(cl_lock_caches); +} diff --git a/drivers/staging/lustre/lustre/obdclass/cl_object.c b/drivers/staging/lustre/lustre/obdclass/cl_object.c new file mode 100644 index 000000000000..cdb5fba04591 --- /dev/null +++ 
b/drivers/staging/lustre/lustre/obdclass/cl_object.c @@ -0,0 +1,1148 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Lustre Object. + * + * Author: Nikita Danilov <nikita.danilov@sun.com> + */ + +/* + * Locking. 
+ * + * i_mutex + * PG_locked + * ->coh_page_guard + * ->coh_lock_guard + * ->coh_attr_guard + * ->ls_guard + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <linux/libcfs/libcfs.h> +/* class_put_type() */ +#include <obd_class.h> +#include <obd_support.h> +#include <lustre_fid.h> +#include <linux/list.h> +#include <linux/libcfs/libcfs_hash.h> /* for cfs_hash stuff */ +#include <cl_object.h> +#include "cl_internal.h" + +static struct kmem_cache *cl_env_kmem; + +/** Lock class of cl_object_header::coh_page_guard */ +static struct lock_class_key cl_page_guard_class; +/** Lock class of cl_object_header::coh_lock_guard */ +static struct lock_class_key cl_lock_guard_class; +/** Lock class of cl_object_header::coh_attr_guard */ +static struct lock_class_key cl_attr_guard_class; + +extern __u32 lu_context_tags_default; +extern __u32 lu_session_tags_default; +/** + * Initialize cl_object_header. + */ +int cl_object_header_init(struct cl_object_header *h) +{ + int result; + + ENTRY; + result = lu_object_header_init(&h->coh_lu); + if (result == 0) { + spin_lock_init(&h->coh_page_guard); + spin_lock_init(&h->coh_lock_guard); + spin_lock_init(&h->coh_attr_guard); + lockdep_set_class(&h->coh_page_guard, &cl_page_guard_class); + lockdep_set_class(&h->coh_lock_guard, &cl_lock_guard_class); + lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class); + h->coh_pages = 0; + /* XXX hard coded GFP_* mask. */ + INIT_RADIX_TREE(&h->coh_tree, GFP_ATOMIC); + INIT_LIST_HEAD(&h->coh_locks); + h->coh_page_bufsize = ALIGN(sizeof(struct cl_page), 8); + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_header_init); + +/** + * Finalize cl_object_header. + */ +void cl_object_header_fini(struct cl_object_header *h) +{ + LASSERT(list_empty(&h->coh_locks)); + lu_object_header_fini(&h->coh_lu); +} +EXPORT_SYMBOL(cl_object_header_fini); + +/** + * Returns a cl_object with a given \a fid. + * + * Returns either cached or newly created object. 
Additional reference on the + * returned object is acquired. + * + * \see lu_object_find(), cl_page_find(), cl_lock_find() + */ +struct cl_object *cl_object_find(const struct lu_env *env, + struct cl_device *cd, const struct lu_fid *fid, + const struct cl_object_conf *c) +{ + might_sleep(); + return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu)); +} +EXPORT_SYMBOL(cl_object_find); + +/** + * Releases a reference on \a o. + * + * When last reference is released object is returned to the cache, unless + * lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in its header. + * + * \see cl_page_put(), cl_lock_put(). + */ +void cl_object_put(const struct lu_env *env, struct cl_object *o) +{ + lu_object_put(env, &o->co_lu); +} +EXPORT_SYMBOL(cl_object_put); + +/** + * Acquire an additional reference to the object \a o. + * + * This can only be used to acquire _additional_ reference, i.e., caller + * already has to possess at least one reference to \a o before calling this. + * + * \see cl_page_get(), cl_lock_get(). + */ +void cl_object_get(struct cl_object *o) +{ + lu_object_get(&o->co_lu); +} +EXPORT_SYMBOL(cl_object_get); + +/** + * Returns the top-object for a given \a o. + * + * \see cl_page_top(), cl_io_top() + */ +struct cl_object *cl_object_top(struct cl_object *o) +{ + struct cl_object_header *hdr = cl_object_header(o); + struct cl_object *top; + + while (hdr->coh_parent != NULL) + hdr = hdr->coh_parent; + + top = lu2cl(lu_object_top(&hdr->coh_lu)); + CDEBUG(D_TRACE, "%p -> %p\n", o, top); + return top; +} +EXPORT_SYMBOL(cl_object_top); + +/** + * Returns pointer to the lock protecting data-attributes for the given object + * \a o. + * + * Data-attributes are protected by the cl_object_header::coh_attr_guard + * spin-lock in the top-object. + * + * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get(). 
+ */ +static spinlock_t *cl_object_attr_guard(struct cl_object *o) +{ + return &cl_object_header(cl_object_top(o))->coh_attr_guard; +} + +/** + * Locks data-attributes. + * + * Prevents data-attributes from changing, until lock is released by + * cl_object_attr_unlock(). This has to be called before calls to + * cl_object_attr_get(), cl_object_attr_set(). + */ +void cl_object_attr_lock(struct cl_object *o) +{ + spin_lock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_lock); + +/** + * Releases data-attributes lock, acquired by cl_object_attr_lock(). + */ +void cl_object_attr_unlock(struct cl_object *o) +{ + spin_unlock(cl_object_attr_guard(o)); +} +EXPORT_SYMBOL(cl_object_attr_unlock); + +/** + * Returns data-attributes of an object \a obj. + * + * Every layer is asked (by calling cl_object_operations::coo_attr_get()) + * top-to-bottom to fill in parts of \a attr that this layer is responsible + * for. + */ +int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lu_object_header *top; + int result; + + LASSERT(spin_is_locked(cl_object_attr_guard(obj))); + ENTRY; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_attr_get != NULL) { + result = obj->co_ops->coo_attr_get(env, obj, attr); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_attr_get); + +/** + * Updates data-attributes of an object \a obj. + * + * Only attributes, mentioned in a validness bit-mask \a v are + * updated. Calls cl_object_operations::coo_attr_set() on every layer, bottom + * to top. 
+ */ +int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned v) +{ + struct lu_object_header *top; + int result; + + LASSERT(spin_is_locked(cl_object_attr_guard(obj))); + ENTRY; + + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry_reverse(obj, &top->loh_layers, + co_lu.lo_linkage) { + if (obj->co_ops->coo_attr_set != NULL) { + result = obj->co_ops->coo_attr_set(env, obj, attr, v); + if (result != 0) { + if (result > 0) + result = 0; + break; + } + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_object_attr_set); + +/** + * Notifies layers (bottom-to-top) that glimpse AST was received. + * + * Layers have to fill \a lvb fields with information that will be shipped + * back to glimpse issuer. + * + * \see cl_lock_operations::clo_glimpse() + */ +int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lu_object_header *top; + int result; + + ENTRY; + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry_reverse(obj, &top->loh_layers, + co_lu.lo_linkage) { + if (obj->co_ops->coo_glimpse != NULL) { + result = obj->co_ops->coo_glimpse(env, obj, lvb); + if (result != 0) + break; + } + } + LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top), + "size: "LPU64" mtime: "LPU64" atime: "LPU64" " + "ctime: "LPU64" blocks: "LPU64"\n", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); + RETURN(result); +} +EXPORT_SYMBOL(cl_object_glimpse); + +/** + * Updates a configuration of an object \a obj. 
+ */ +int cl_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct lu_object_header *top; + int result; + + ENTRY; + top = obj->co_lu.lo_header; + result = 0; + list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) { + if (obj->co_ops->coo_conf_set != NULL) { + result = obj->co_ops->coo_conf_set(env, obj, conf); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_conf_set); + +/** + * Helper function removing all object locks, and marking object for + * deletion. All object pages must have been deleted at this point. + * + * This is called by cl_inode_fini() and lov_object_delete() to destroy top- + * and sub- objects respectively. + */ +void cl_object_kill(const struct lu_env *env, struct cl_object *obj) +{ + struct cl_object_header *hdr; + + hdr = cl_object_header(obj); + LASSERT(hdr->coh_tree.rnode == NULL); + LASSERT(hdr->coh_pages == 0); + + set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags); + /* + * Destroy all locks. Object destruction (including cl_inode_fini()) + * cannot cancel the locks, because in the case of a local client, + * where client and server share the same thread running + * prune_icache(), this can dead-lock with ldlm_cancel_handler() + * waiting on __wait_on_freeing_inode(). + */ + cl_locks_prune(env, obj, 0); +} +EXPORT_SYMBOL(cl_object_kill); + +/** + * Prunes caches of pages and locks for this object. + */ +void cl_object_prune(const struct lu_env *env, struct cl_object *obj) +{ + ENTRY; + cl_pages_prune(env, obj); + cl_locks_prune(env, obj, 1); + EXIT; +} +EXPORT_SYMBOL(cl_object_prune); + +/** + * Check if the object has locks. 
+ */ +int cl_object_has_locks(struct cl_object *obj) +{ + struct cl_object_header *head = cl_object_header(obj); + int has; + + spin_lock(&head->coh_lock_guard); + has = list_empty(&head->coh_locks); + spin_unlock(&head->coh_lock_guard); + + return (has == 0); +} +EXPORT_SYMBOL(cl_object_has_locks); + +void cache_stats_init(struct cache_stats *cs, const char *name) +{ + int i; + + cs->cs_name = name; + for (i = 0; i < CS_NR; i++) + atomic_set(&cs->cs_stats[i], 0); +} + +int cache_stats_print(const struct cache_stats *cs, struct seq_file *m, int h) +{ + int i; + /* + * lookup hit total cached create + * env: ...... ...... ...... ...... ...... + */ + if (h) { + const char *names[CS_NR] = CS_NAMES; + + seq_printf(m, "%6s", " "); + for (i = 0; i < CS_NR; i++) + seq_printf(m, "%8s", names[i]); + seq_printf(m, "\n"); + } + + seq_printf(m, "%5.5s:", cs->cs_name); + for (i = 0; i < CS_NR; i++) + seq_printf(m, "%8u", atomic_read(&cs->cs_stats[i])); + return 0; +} + +/** + * Initialize client site. + * + * Perform common initialization (lu_site_init()), and initialize statistical + * counters. Also perform global initializations on the first call. + */ +int cl_site_init(struct cl_site *s, struct cl_device *d) +{ + int i; + int result; + + result = lu_site_init(&s->cs_lu, &d->cd_lu_dev); + if (result == 0) { + cache_stats_init(&s->cs_pages, "pages"); + cache_stats_init(&s->cs_locks, "locks"); + for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i) + atomic_set(&s->cs_pages_state[0], 0); + for (i = 0; i < ARRAY_SIZE(s->cs_locks_state); ++i) + atomic_set(&s->cs_locks_state[i], 0); + } + return result; +} +EXPORT_SYMBOL(cl_site_init); + +/** + * Finalize client site. Dual to cl_site_init(). + */ +void cl_site_fini(struct cl_site *s) +{ + lu_site_fini(&s->cs_lu); +} +EXPORT_SYMBOL(cl_site_fini); + +static struct cache_stats cl_env_stats = { + .cs_name = "envs", + .cs_stats = { ATOMIC_INIT(0), } +}; + +/** + * Outputs client site statistical counters into a buffer. 
Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *site, struct seq_file *m) +{ + int i; + static const char *pstate[] = { + [CPS_CACHED] = "c", + [CPS_OWNED] = "o", + [CPS_PAGEOUT] = "w", + [CPS_PAGEIN] = "r", + [CPS_FREEING] = "f" + }; + static const char *lstate[] = { + [CLS_NEW] = "n", + [CLS_QUEUING] = "q", + [CLS_ENQUEUED] = "e", + [CLS_HELD] = "h", + [CLS_INTRANSIT] = "t", + [CLS_CACHED] = "c", + [CLS_FREEING] = "f" + }; +/* + lookup hit total busy create +pages: ...... ...... ...... ...... ...... [...... ...... ...... ......] +locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......] + env: ...... ...... ...... ...... ...... + */ + lu_site_stats_print(&site->cs_lu, m); + cache_stats_print(&site->cs_pages, m, 1); + seq_printf(m, " ["); + for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i) + seq_printf(m, "%s: %u ", pstate[i], + atomic_read(&site->cs_pages_state[i])); + seq_printf(m, "]\n"); + cache_stats_print(&site->cs_locks, m, 0); + seq_printf(m, " ["); + for (i = 0; i < ARRAY_SIZE(site->cs_locks_state); ++i) + seq_printf(m, "%s: %u ", lstate[i], + atomic_read(&site->cs_locks_state[i])); + seq_printf(m, "]\n"); + cache_stats_print(&cl_env_stats, m, 0); + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(cl_site_stats_print); + +/***************************************************************************** + * + * lu_env handling on client. + * + */ + +/** + * The most efficient way is to store cl_env pointer in task specific + * structures. On Linux, it wont' be easy to use task_struct->journal_info + * because Lustre code may call into other fs which has certain assumptions + * about journal_info. Currently following fields in task_struct are identified + * can be used for this purpose: + * - cl_env: for liblustre. + * - tux_info: ony on RedHat kernel. + * - ... 
+ * \note As long as we use task_struct to store cl_env, we assume that once + * called into Lustre, we'll never call into the other part of the kernel + * which will use those fields in task_struct without explicitly exiting + * Lustre. + * + * If there's no space in task_struct is available, hash will be used. + * bz20044, bz22683. + */ + +struct cl_env { + void *ce_magic; + struct lu_env ce_lu; + struct lu_context ce_ses; + + /** + * This allows cl_env to be entered into cl_env_hash which implements + * the current thread -> client environment lookup. + */ + struct hlist_node ce_node; + /** + * Owner for the current cl_env. + * + * If LL_TASK_CL_ENV is defined, this point to the owning current, + * only for debugging purpose ; + * Otherwise hash is used, and this is the key for cfs_hash. + * Now current thread pid is stored. Note using thread pointer would + * lead to unbalanced hash because of its specific allocation locality + * and could be varied for different platforms and OSes, even different + * OS versions. + */ + void *ce_owner; + + /* + * Linkage into global list of all client environments. Used for + * garbage collection. + */ + struct list_head ce_linkage; + /* + * + */ + int ce_ref; + /* + * Debugging field: address of the caller who made original + * allocation. 
+ */ + void *ce_debug; +}; + +#define CL_ENV_INC(counter) +#define CL_ENV_DEC(counter) + +static void cl_env_init0(struct cl_env *cle, void *debug) +{ + LASSERT(cle->ce_ref == 0); + LASSERT(cle->ce_magic == &cl_env_init0); + LASSERT(cle->ce_debug == NULL && cle->ce_owner == NULL); + + cle->ce_ref = 1; + cle->ce_debug = debug; + CL_ENV_INC(busy); +} + + +/* + * The implementation of using hash table to connect cl_env and thread + */ + +static cfs_hash_t *cl_env_hash; + +static unsigned cl_env_hops_hash(cfs_hash_t *lh, + const void *key, unsigned mask) +{ +#if BITS_PER_LONG == 64 + return cfs_hash_u64_hash((__u64)key, mask); +#else + return cfs_hash_u32_hash((__u32)key, mask); +#endif +} + +static void *cl_env_hops_obj(struct hlist_node *hn) +{ + struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node); + LASSERT(cle->ce_magic == &cl_env_init0); + return (void *)cle; +} + +static int cl_env_hops_keycmp(const void *key, struct hlist_node *hn) +{ + struct cl_env *cle = cl_env_hops_obj(hn); + + LASSERT(cle->ce_owner != NULL); + return (key == cle->ce_owner); +} + +static void cl_env_hops_noop(cfs_hash_t *hs, struct hlist_node *hn) +{ + struct cl_env *cle = hlist_entry(hn, struct cl_env, ce_node); + LASSERT(cle->ce_magic == &cl_env_init0); +} + +static cfs_hash_ops_t cl_env_hops = { + .hs_hash = cl_env_hops_hash, + .hs_key = cl_env_hops_obj, + .hs_keycmp = cl_env_hops_keycmp, + .hs_object = cl_env_hops_obj, + .hs_get = cl_env_hops_noop, + .hs_put_locked = cl_env_hops_noop, +}; + +static inline struct cl_env *cl_env_fetch(void) +{ + struct cl_env *cle; + + cle = cfs_hash_lookup(cl_env_hash, (void *) (long) current->pid); + LASSERT(ergo(cle, cle->ce_magic == &cl_env_init0)); + return cle; +} + +static inline void cl_env_attach(struct cl_env *cle) +{ + if (cle) { + int rc; + + LASSERT(cle->ce_owner == NULL); + cle->ce_owner = (void *) (long) current->pid; + rc = cfs_hash_add_unique(cl_env_hash, cle->ce_owner, + &cle->ce_node); + LASSERT(rc == 0); + } +} + +static inline 
void cl_env_do_detach(struct cl_env *cle) +{ + void *cookie; + + LASSERT(cle->ce_owner == (void *) (long) current->pid); + cookie = cfs_hash_del(cl_env_hash, cle->ce_owner, + &cle->ce_node); + LASSERT(cookie == cle); + cle->ce_owner = NULL; +} + +static int cl_env_store_init(void) { + cl_env_hash = cfs_hash_create("cl_env", + HASH_CL_ENV_BITS, HASH_CL_ENV_BITS, + HASH_CL_ENV_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &cl_env_hops, + CFS_HASH_RW_BKTLOCK); + return cl_env_hash != NULL ? 0 :-ENOMEM; +} + +static void cl_env_store_fini(void) { + cfs_hash_putref(cl_env_hash); +} + + +static inline struct cl_env *cl_env_detach(struct cl_env *cle) +{ + if (cle == NULL) + cle = cl_env_fetch(); + + if (cle && cle->ce_owner) + cl_env_do_detach(cle); + + return cle; +} + +static struct lu_env *cl_env_new(__u32 ctx_tags, __u32 ses_tags, void *debug) +{ + struct lu_env *env; + struct cl_env *cle; + + OBD_SLAB_ALLOC_PTR_GFP(cle, cl_env_kmem, __GFP_IO); + if (cle != NULL) { + int rc; + + INIT_LIST_HEAD(&cle->ce_linkage); + cle->ce_magic = &cl_env_init0; + env = &cle->ce_lu; + rc = lu_env_init(env, LCT_CL_THREAD|ctx_tags); + if (rc == 0) { + rc = lu_context_init(&cle->ce_ses, + LCT_SESSION | ses_tags); + if (rc == 0) { + lu_context_enter(&cle->ce_ses); + env->le_ses = &cle->ce_ses; + cl_env_init0(cle, debug); + } else + lu_env_fini(env); + } + if (rc != 0) { + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); + env = ERR_PTR(rc); + } else { + CL_ENV_INC(create); + CL_ENV_INC(total); + } + } else + env = ERR_PTR(-ENOMEM); + return env; +} + +static void cl_env_fini(struct cl_env *cle) +{ + CL_ENV_DEC(total); + lu_context_fini(&cle->ce_lu.le_ctx); + lu_context_fini(&cle->ce_ses); + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); +} + +static inline struct cl_env *cl_env_container(struct lu_env *env) +{ + return container_of(env, struct cl_env, ce_lu); +} + +struct lu_env *cl_env_peek(int *refcheck) +{ + struct lu_env *env; + struct cl_env *cle; + + CL_ENV_INC(lookup); + + /* check that we 
don't go far from untrusted pointer */ + CLASSERT(offsetof(struct cl_env, ce_magic) == 0); + + env = NULL; + cle = cl_env_fetch(); + if (cle != NULL) { + CL_ENV_INC(hit); + env = &cle->ce_lu; + *refcheck = ++cle->ce_ref; + } + CDEBUG(D_OTHER, "%d@%p\n", cle ? cle->ce_ref : 0, cle); + return env; +} +EXPORT_SYMBOL(cl_env_peek); + +/** + * Returns lu_env: if there already is an environment associated with the + * current thread, it is returned, otherwise, new environment is allocated. + * + * \param refcheck pointer to a counter used to detect environment leaks. In + * the usual case cl_env_get() and cl_env_put() are called in the same lexical + * scope and pointer to the same integer is passed as \a refcheck. This is + * used to detect missed cl_env_put(). + * + * \see cl_env_put() + */ +struct lu_env *cl_env_get(int *refcheck) +{ + struct lu_env *env; + + env = cl_env_peek(refcheck); + if (env == NULL) { + env = cl_env_new(lu_context_tags_default, + lu_session_tags_default, + __builtin_return_address(0)); + + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + cl_env_attach(cle); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + } + return env; +} +EXPORT_SYMBOL(cl_env_get); + +/** + * Forces an allocation of a fresh environment with given tags. + * + * \see cl_env_get() + */ +struct lu_env *cl_env_alloc(int *refcheck, __u32 tags) +{ + struct lu_env *env; + + LASSERT(cl_env_peek(refcheck) == NULL); + env = cl_env_new(tags, tags, __builtin_return_address(0)); + if (!IS_ERR(env)) { + struct cl_env *cle; + + cle = cl_env_container(env); + *refcheck = cle->ce_ref; + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + } + return env; +} +EXPORT_SYMBOL(cl_env_alloc); + +static void cl_env_exit(struct cl_env *cle) +{ + LASSERT(cle->ce_owner == NULL); + lu_context_exit(&cle->ce_lu.le_ctx); + lu_context_exit(&cle->ce_ses); +} + +/** + * Release an environment. + * + * Decrement \a env reference counter. 
When counter drops to 0, nothing in + * this thread is using environment and it is returned to the allocation + * cache, or freed straight away, if cache is large enough. + */ +void cl_env_put(struct lu_env *env, int *refcheck) +{ + struct cl_env *cle; + + cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 0); + LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck)); + + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + if (--cle->ce_ref == 0) { + CL_ENV_DEC(busy); + cl_env_detach(cle); + cle->ce_debug = NULL; + cl_env_exit(cle); + cl_env_fini(cle); + } +} +EXPORT_SYMBOL(cl_env_put); + +/** + * Declares a point of re-entrancy. + * + * \see cl_env_reexit() + */ +void *cl_env_reenter(void) +{ + return cl_env_detach(NULL); +} +EXPORT_SYMBOL(cl_env_reenter); + +/** + * Exits re-entrancy. + */ +void cl_env_reexit(void *cookie) +{ + cl_env_detach(NULL); + cl_env_attach(cookie); +} +EXPORT_SYMBOL(cl_env_reexit); + +/** + * Setup user-supplied \a env as a current environment. This is to be used to + * guaranteed that environment exists even when cl_env_get() fails. It is up + * to user to ensure proper concurrency control. + * + * \see cl_env_unplant() + */ +void cl_env_implant(struct lu_env *env, int *refcheck) +{ + struct cl_env *cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 0); + + cl_env_attach(cle); + cl_env_get(refcheck); + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); +} +EXPORT_SYMBOL(cl_env_implant); + +/** + * Detach environment installed earlier by cl_env_implant(). 
+ */ +void cl_env_unplant(struct lu_env *env, int *refcheck) +{ + struct cl_env *cle = cl_env_container(env); + + LASSERT(cle->ce_ref > 1); + + CDEBUG(D_OTHER, "%d@%p\n", cle->ce_ref, cle); + + cl_env_detach(cle); + cl_env_put(env, refcheck); +} +EXPORT_SYMBOL(cl_env_unplant); + +struct lu_env *cl_env_nested_get(struct cl_env_nest *nest) +{ + struct lu_env *env; + + nest->cen_cookie = NULL; + env = cl_env_peek(&nest->cen_refcheck); + if (env != NULL) { + if (!cl_io_is_going(env)) + return env; + else { + cl_env_put(env, &nest->cen_refcheck); + nest->cen_cookie = cl_env_reenter(); + } + } + env = cl_env_get(&nest->cen_refcheck); + if (IS_ERR(env)) { + cl_env_reexit(nest->cen_cookie); + return env; + } + + LASSERT(!cl_io_is_going(env)); + return env; +} +EXPORT_SYMBOL(cl_env_nested_get); + +void cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env) +{ + cl_env_put(env, &nest->cen_refcheck); + cl_env_reexit(nest->cen_cookie); +} +EXPORT_SYMBOL(cl_env_nested_put); + +/** + * Converts struct cl_attr to struct ost_lvb. + * + * \see cl_lvb2attr + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr) +{ + ENTRY; + lvb->lvb_size = attr->cat_size; + lvb->lvb_mtime = attr->cat_mtime; + lvb->lvb_atime = attr->cat_atime; + lvb->lvb_ctime = attr->cat_ctime; + lvb->lvb_blocks = attr->cat_blocks; + EXIT; +} +EXPORT_SYMBOL(cl_attr2lvb); + +/** + * Converts struct ost_lvb to struct cl_attr. + * + * \see cl_attr2lvb + */ +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb) +{ + ENTRY; + attr->cat_size = lvb->lvb_size; + attr->cat_mtime = lvb->lvb_mtime; + attr->cat_atime = lvb->lvb_atime; + attr->cat_ctime = lvb->lvb_ctime; + attr->cat_blocks = lvb->lvb_blocks; + EXIT; +} +EXPORT_SYMBOL(cl_lvb2attr); + +/***************************************************************************** + * + * Temporary prototype thing: mirror obd-devices into cl devices. 
+ * + */ + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next) +{ + const char *typename; + struct lu_device *d; + + LASSERT(ldt != NULL); + + typename = ldt->ldt_name; + d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL); + if (!IS_ERR(d)) { + int rc; + + if (site != NULL) + d->ld_site = site; + rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next); + if (rc == 0) { + lu_device_get(d); + lu_ref_add(&d->ld_reference, + "lu-stack", &lu_site_init); + } else { + ldt->ldt_ops->ldto_device_free(env, d); + CERROR("can't init device '%s', %d\n", typename, rc); + d = ERR_PTR(rc); + } + } else + CERROR("Cannot allocate device: '%s'\n", typename); + return lu2cl_dev(d); +} +EXPORT_SYMBOL(cl_type_setup); + +/** + * Finalize device stack by calling lu_stack_fini(). + */ +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl) +{ + lu_stack_fini(env, cl2lu_dev(cl)); +} +EXPORT_SYMBOL(cl_stack_fini); + +int cl_lock_init(void); +void cl_lock_fini(void); + +int cl_page_init(void); +void cl_page_fini(void); + +static struct lu_context_key cl_key; + +struct cl_thread_info *cl_env_info(const struct lu_env *env) +{ + return lu_context_key_get(&env->le_ctx, &cl_key); +} + +/* defines cl0_key_{init,fini}() */ +LU_KEY_INIT_FINI(cl0, struct cl_thread_info); + +static void *cl_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct cl_thread_info *info; + + info = cl0_key_init(ctx, key); + if (!IS_ERR(info)) { + int i; + + for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) + lu_ref_init(&info->clt_counters[i].ctc_locks_locked); + } + return info; +} + +static void cl_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct cl_thread_info *info; + int i; + + info = data; + for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) + lu_ref_fini(&info->clt_counters[i].ctc_locks_locked); + cl0_key_fini(ctx, key, data); +} + +static 
void cl_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct cl_thread_info *info = data; + int i; + + for (i = 0; i < ARRAY_SIZE(info->clt_counters); ++i) { + LASSERT(info->clt_counters[i].ctc_nr_held == 0); + LASSERT(info->clt_counters[i].ctc_nr_used == 0); + LASSERT(info->clt_counters[i].ctc_nr_locks_acquired == 0); + LASSERT(info->clt_counters[i].ctc_nr_locks_locked == 0); + lu_ref_fini(&info->clt_counters[i].ctc_locks_locked); + lu_ref_init(&info->clt_counters[i].ctc_locks_locked); + } +} + +static struct lu_context_key cl_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = cl_key_init, + .lct_fini = cl_key_fini, + .lct_exit = cl_key_exit +}; + +static struct lu_kmem_descr cl_object_caches[] = { + { + .ckd_cache = &cl_env_kmem, + .ckd_name = "cl_env_kmem", + .ckd_size = sizeof (struct cl_env) + }, + { + .ckd_cache = NULL + } +}; + +/** + * Global initialization of cl-data. Create kmem caches, register + * lu_context_key's, etc. + * + * \see cl_global_fini() + */ +int cl_global_init(void) +{ + int result; + + result = cl_env_store_init(); + if (result) + return result; + + result = lu_kmem_init(cl_object_caches); + if (result) + goto out_store; + + LU_CONTEXT_KEY_INIT(&cl_key); + result = lu_context_key_register(&cl_key); + if (result) + goto out_kmem; + + result = cl_lock_init(); + if (result) + goto out_context; + + result = cl_page_init(); + if (result) + goto out_lock; + + return 0; +out_lock: + cl_lock_fini(); +out_context: + lu_context_key_degister(&cl_key); +out_kmem: + lu_kmem_fini(cl_object_caches); +out_store: + cl_env_store_fini(); + return result; +} + +/** + * Finalization of global cl-data. Dual to cl_global_init(). 
+ */ +void cl_global_fini(void) +{ + cl_lock_fini(); + cl_page_fini(); + lu_context_key_degister(&cl_key); + lu_kmem_fini(cl_object_caches); + cl_env_store_fini(); +} diff --git a/drivers/staging/lustre/lustre/obdclass/cl_page.c b/drivers/staging/lustre/lustre/obdclass/cl_page.c new file mode 100644 index 000000000000..bb9335911c34 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/cl_page.c @@ -0,0 +1,1605 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Lustre Page. 
+ * + * Author: Nikita Danilov <nikita.danilov@sun.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <linux/libcfs/libcfs.h> +#include <obd_class.h> +#include <obd_support.h> +#include <linux/list.h> + +#include <cl_object.h> +#include "cl_internal.h" + +static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg, + int radix); + +# define PASSERT(env, page, expr) \ + do { \ + if (unlikely(!(expr))) { \ + CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \ + LASSERT(0); \ + } \ + } while (0) + +# define PINVRNT(env, page, exp) \ + ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp)) + +/* Disable page statistics by default due to huge performance penalty. */ +#define CS_PAGE_INC(o, item) +#define CS_PAGE_DEC(o, item) +#define CS_PAGESTATE_INC(o, state) +#define CS_PAGESTATE_DEC(o, state) + +/** + * Internal version of cl_page_top, it should be called if the page is + * known to be not freed, say, with page referenced, or radix tree lock held, + * or page owned. + */ +static struct cl_page *cl_page_top_trusted(struct cl_page *page) +{ + while (page->cp_parent != NULL) + page = page->cp_parent; + return page; +} + +/** + * Internal version of cl_page_get(). + * + * This function can be used to obtain initial reference to previously + * unreferenced cached object. It can be called only if concurrent page + * reclamation is somehow prevented, e.g., by locking page radix-tree + * (cl_object_header::hdr->coh_page_guard), or by keeping a lock on a VM page, + * associated with \a page. + * + * Use with care! Not exported. + */ +static void cl_page_get_trust(struct cl_page *page) +{ + LASSERT(atomic_read(&page->cp_ref) > 0); + atomic_inc(&page->cp_ref); +} + +/** + * Returns a slice within a page, corresponding to the given layer in the + * device stack. 
+ * + * \see cl_lock_at() + */ +static const struct cl_page_slice * +cl_page_at_trusted(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + const struct cl_page_slice *slice; + ENTRY; + + page = cl_page_top_trusted((struct cl_page *)page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype) + RETURN(slice); + } + page = page->cp_child; + } while (page != NULL); + RETURN(NULL); +} + +/** + * Returns a page with given index in the given object, or NULL if no page is + * found. Acquires a reference on \a page. + * + * Locking: called under cl_object_header::coh_page_guard spin-lock. + */ +struct cl_page *cl_page_lookup(struct cl_object_header *hdr, pgoff_t index) +{ + struct cl_page *page; + + LASSERT(spin_is_locked(&hdr->coh_page_guard)); + + page = radix_tree_lookup(&hdr->coh_tree, index); + if (page != NULL) + cl_page_get_trust(page); + return page; +} +EXPORT_SYMBOL(cl_page_lookup); + +/** + * Returns a list of pages by a given [start, end] of \a obj. + * + * \param resched If not NULL, then we give up before hogging CPU for too + * long and set *resched = 1, in that case caller should implement a retry + * logic. + * + * Gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely + * crucial in the face of [offset, EOF] locks. + * + * Return at least one page in @queue unless there is no covered page. 
+ */ +int cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, pgoff_t start, pgoff_t end, + cl_page_gang_cb_t cb, void *cbdata) +{ + struct cl_object_header *hdr; + struct cl_page *page; + struct cl_page **pvec; + const struct cl_page_slice *slice; + const struct lu_device_type *dtype; + pgoff_t idx; + unsigned int nr; + unsigned int i; + unsigned int j; + int res = CLP_GANG_OKAY; + int tree_lock = 1; + ENTRY; + + idx = start; + hdr = cl_object_header(obj); + pvec = cl_env_info(env)->clt_pvec; + dtype = cl_object_top(obj)->co_lu.lo_dev->ld_type; + spin_lock(&hdr->coh_page_guard); + while ((nr = radix_tree_gang_lookup(&hdr->coh_tree, (void **)pvec, + idx, CLT_PVEC_SIZE)) > 0) { + int end_of_region = 0; + idx = pvec[nr - 1]->cp_index + 1; + for (i = 0, j = 0; i < nr; ++i) { + page = pvec[i]; + pvec[i] = NULL; + + LASSERT(page->cp_type == CPT_CACHEABLE); + if (page->cp_index > end) { + end_of_region = 1; + break; + } + if (page->cp_state == CPS_FREEING) + continue; + + slice = cl_page_at_trusted(page, dtype); + /* + * Pages for lsm-less file has no underneath sub-page + * for osc, in case of ... + */ + PASSERT(env, page, slice != NULL); + + page = slice->cpl_page; + /* + * Can safely call cl_page_get_trust() under + * radix-tree spin-lock. + * + * XXX not true, because @page is from object another + * than @hdr and protected by different tree lock. + */ + cl_page_get_trust(page); + lu_ref_add_atomic(&page->cp_reference, + "gang_lookup", current); + pvec[j++] = page; + } + + /* + * Here a delicate locking dance is performed. Current thread + * holds a reference to a page, but has to own it before it + * can be placed into queue. Owning implies waiting, so + * radix-tree lock is to be released. After a wait one has to + * check that pages weren't truncated (cl_page_own() returns + * error in the latter case). 
+ */ + spin_unlock(&hdr->coh_page_guard); + tree_lock = 0; + + for (i = 0; i < j; ++i) { + page = pvec[i]; + if (res == CLP_GANG_OKAY) + res = (*cb)(env, io, page, cbdata); + lu_ref_del(&page->cp_reference, + "gang_lookup", current); + cl_page_put(env, page); + } + if (nr < CLT_PVEC_SIZE || end_of_region) + break; + + if (res == CLP_GANG_OKAY && need_resched()) + res = CLP_GANG_RESCHED; + if (res != CLP_GANG_OKAY) + break; + + spin_lock(&hdr->coh_page_guard); + tree_lock = 1; + } + if (tree_lock) + spin_unlock(&hdr->coh_page_guard); + RETURN(res); +} +EXPORT_SYMBOL(cl_page_gang_lookup); + +static void cl_page_free(const struct lu_env *env, struct cl_page *page) +{ + struct cl_object *obj = page->cp_obj; + int pagesize = cl_object_header(obj)->coh_page_bufsize; + + PASSERT(env, page, list_empty(&page->cp_batch)); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, page->cp_req == NULL); + PASSERT(env, page, page->cp_parent == NULL); + PASSERT(env, page, page->cp_state == CPS_FREEING); + + ENTRY; + might_sleep(); + while (!list_empty(&page->cp_layers)) { + struct cl_page_slice *slice; + + slice = list_entry(page->cp_layers.next, + struct cl_page_slice, cpl_linkage); + list_del_init(page->cp_layers.next); + slice->cpl_ops->cpo_fini(env, slice); + } + CS_PAGE_DEC(obj, total); + CS_PAGESTATE_DEC(obj, page->cp_state); + lu_object_ref_del_at(&obj->co_lu, page->cp_obj_ref, "cl_page", page); + cl_object_put(env, obj); + lu_ref_fini(&page->cp_reference); + OBD_FREE(page, pagesize); + EXIT; +} + +/** + * Helper function updating page state. This is the only place in the code + * where cl_page::cp_state field is mutated. + */ +static inline void cl_page_state_set_trust(struct cl_page *page, + enum cl_page_state state) +{ + /* bypass const. 
*/ + *(enum cl_page_state *)&page->cp_state = state; +} + +static struct cl_page *cl_page_alloc(const struct lu_env *env, + struct cl_object *o, pgoff_t ind, struct page *vmpage, + enum cl_page_type type) +{ + struct cl_page *page; + struct lu_object_header *head; + + ENTRY; + OBD_ALLOC_GFP(page, cl_object_header(o)->coh_page_bufsize, + __GFP_IO); + if (page != NULL) { + int result = 0; + atomic_set(&page->cp_ref, 1); + if (type == CPT_CACHEABLE) /* for radix tree */ + atomic_inc(&page->cp_ref); + page->cp_obj = o; + cl_object_get(o); + page->cp_obj_ref = lu_object_ref_add(&o->co_lu, "cl_page",page); + page->cp_index = ind; + cl_page_state_set_trust(page, CPS_CACHED); + page->cp_type = type; + INIT_LIST_HEAD(&page->cp_layers); + INIT_LIST_HEAD(&page->cp_batch); + INIT_LIST_HEAD(&page->cp_flight); + mutex_init(&page->cp_mutex); + lu_ref_init(&page->cp_reference); + head = o->co_lu.lo_header; + list_for_each_entry(o, &head->loh_layers, + co_lu.lo_linkage) { + if (o->co_ops->coo_page_init != NULL) { + result = o->co_ops->coo_page_init(env, o, + page, vmpage); + if (result != 0) { + cl_page_delete0(env, page, 0); + cl_page_free(env, page); + page = ERR_PTR(result); + break; + } + } + } + if (result == 0) { + CS_PAGE_INC(o, total); + CS_PAGE_INC(o, create); + CS_PAGESTATE_DEC(o, CPS_CACHED); + } + } else { + page = ERR_PTR(-ENOMEM); + } + RETURN(page); +} + +/** + * Returns a cl_page with index \a idx at the object \a o, and associated with + * the VM page \a vmpage. + * + * This is the main entry point into the cl_page caching interface. First, a + * cache (implemented as a per-object radix tree) is consulted. If page is + * found there, it is returned immediately. Otherwise new page is allocated + * and returned. In any case, additional reference to page is acquired. 
+ * + * \see cl_object_find(), cl_lock_find() + */ +static struct cl_page *cl_page_find0(const struct lu_env *env, + struct cl_object *o, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type, + struct cl_page *parent) +{ + struct cl_page *page = NULL; + struct cl_page *ghost = NULL; + struct cl_object_header *hdr; + int err; + + LASSERT(type == CPT_CACHEABLE || type == CPT_TRANSIENT); + might_sleep(); + + ENTRY; + + hdr = cl_object_header(o); + CS_PAGE_INC(o, lookup); + + CDEBUG(D_PAGE, "%lu@"DFID" %p %lx %d\n", + idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type); + /* fast path. */ + if (type == CPT_CACHEABLE) { + /* vmpage lock is used to protect the child/parent + * relationship */ + KLASSERT(PageLocked(vmpage)); + /* + * cl_vmpage_page() can be called here without any locks as + * + * - "vmpage" is locked (which prevents ->private from + * concurrent updates), and + * + * - "o" cannot be destroyed while current thread holds a + * reference on it. + */ + page = cl_vmpage_page(vmpage, o); + PINVRNT(env, page, + ergo(page != NULL, + cl_page_vmpage(env, page) == vmpage && + (void *)radix_tree_lookup(&hdr->coh_tree, + idx) == page)); + } + + if (page != NULL) { + CS_PAGE_INC(o, hit); + RETURN(page); + } + + /* allocate and initialize cl_page */ + page = cl_page_alloc(env, o, idx, vmpage, type); + if (IS_ERR(page)) + RETURN(page); + + if (type == CPT_TRANSIENT) { + if (parent) { + LASSERT(page->cp_parent == NULL); + page->cp_parent = parent; + parent->cp_child = page; + } + RETURN(page); + } + + /* + * XXX optimization: use radix_tree_preload() here, and change tree + * gfp mask to GFP_KERNEL in cl_object_header_init(). + */ + spin_lock(&hdr->coh_page_guard); + err = radix_tree_insert(&hdr->coh_tree, idx, page); + if (err != 0) { + ghost = page; + /* + * Noted by Jay: a lock on \a vmpage protects cl_page_find() + * from this race, but + * + * 0. 
it's better to have cl_page interface "locally + * consistent" so that its correctness can be reasoned + * about without appealing to the (obscure world of) VM + * locking. + * + * 1. handling this race allows ->coh_tree to remain + * consistent even when VM locking is somehow busted, + * which is very useful during diagnosing and debugging. + */ + page = ERR_PTR(err); + CL_PAGE_DEBUG(D_ERROR, env, ghost, + "fail to insert into radix tree: %d\n", err); + } else { + if (parent) { + LASSERT(page->cp_parent == NULL); + page->cp_parent = parent; + parent->cp_child = page; + } + hdr->coh_pages++; + } + spin_unlock(&hdr->coh_page_guard); + + if (unlikely(ghost != NULL)) { + cl_page_delete0(env, ghost, 0); + cl_page_free(env, ghost); + } + RETURN(page); +} + +struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *o, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type) +{ + return cl_page_find0(env, o, idx, vmpage, type, NULL); +} +EXPORT_SYMBOL(cl_page_find); + + +struct cl_page *cl_page_find_sub(const struct lu_env *env, struct cl_object *o, + pgoff_t idx, struct page *vmpage, + struct cl_page *parent) +{ + return cl_page_find0(env, o, idx, vmpage, parent->cp_type, parent); +} +EXPORT_SYMBOL(cl_page_find_sub); + +static inline int cl_page_invariant(const struct cl_page *pg) +{ + struct cl_object_header *header; + struct cl_page *parent; + struct cl_page *child; + struct cl_io *owner; + + /* + * Page invariant is protected by a VM lock. 
+ */ + LINVRNT(cl_page_is_vmlocked(NULL, pg)); + + header = cl_object_header(pg->cp_obj); + parent = pg->cp_parent; + child = pg->cp_child; + owner = pg->cp_owner; + + return cl_page_in_use(pg) && + ergo(parent != NULL, parent->cp_child == pg) && + ergo(child != NULL, child->cp_parent == pg) && + ergo(child != NULL, pg->cp_obj != child->cp_obj) && + ergo(parent != NULL, pg->cp_obj != parent->cp_obj) && + ergo(owner != NULL && parent != NULL, + parent->cp_owner == pg->cp_owner->ci_parent) && + ergo(owner != NULL && child != NULL, + child->cp_owner->ci_parent == owner) && + /* + * Either page is early in initialization (has neither child + * nor parent yet), or it is in the object radix tree. + */ + ergo(pg->cp_state < CPS_FREEING && pg->cp_type == CPT_CACHEABLE, + (void *)radix_tree_lookup(&header->coh_tree, + pg->cp_index) == pg || + (child == NULL && parent == NULL)); +} + +static void cl_page_state_set0(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + enum cl_page_state old; + + /* + * Matrix of allowed state transitions [old][new], for sanity + * checking. 
+ */ + static const int allowed_transitions[CPS_NR][CPS_NR] = { + [CPS_CACHED] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 1, /* io finds existing cached page */ + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 1, /* write-out from the cache */ + [CPS_FREEING] = 1, /* eviction on the memory pressure */ + }, + [CPS_OWNED] = { + [CPS_CACHED] = 1, /* release to the cache */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 1, /* start read immediately */ + [CPS_PAGEOUT] = 1, /* start write immediately */ + [CPS_FREEING] = 1, /* lock invalidation or truncate */ + }, + [CPS_PAGEIN] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_PAGEOUT] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_FREEING] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + } + }; + + ENTRY; + old = page->cp_state; + PASSERT(env, page, allowed_transitions[old][state]); + CL_PAGE_HEADER(D_TRACE, env, page, "%d -> %d\n", old, state); + for (; page != NULL; page = page->cp_child) { + PASSERT(env, page, page->cp_state == old); + PASSERT(env, page, + equi(state == CPS_OWNED, page->cp_owner != NULL)); + + CS_PAGESTATE_DEC(page->cp_obj, page->cp_state); + CS_PAGESTATE_INC(page->cp_obj, state); + cl_page_state_set_trust(page, state); + } + EXIT; +} + +static void cl_page_state_set(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + cl_page_state_set0(env, page, state); +} + +/** + * Acquires an additional reference to a page. + * + * This can be called only by caller already possessing a reference to \a + * page. + * + * \see cl_object_get(), cl_lock_get(). + */ +void cl_page_get(struct cl_page *page) +{ + ENTRY; + cl_page_get_trust(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_get); + +/** + * Releases a reference to a page. 
+ * + * When last reference is released, page is returned to the cache, unless it + * is in cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * \see cl_object_put(), cl_lock_put(). + */ +void cl_page_put(const struct lu_env *env, struct cl_page *page) +{ + PASSERT(env, page, atomic_read(&page->cp_ref) > !!page->cp_parent); + + ENTRY; + CL_PAGE_HEADER(D_TRACE, env, page, "%d\n", + atomic_read(&page->cp_ref)); + + if (atomic_dec_and_test(&page->cp_ref)) { + LASSERT(page->cp_state == CPS_FREEING); + + LASSERT(atomic_read(&page->cp_ref) == 0); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, list_empty(&page->cp_batch)); + /* + * Page is no longer reachable by other threads. Tear + * it down. + */ + cl_page_free(env, page); + } + + EXIT; +} +EXPORT_SYMBOL(cl_page_put); + +/** + * Returns a VM page associated with a given cl_page. + */ +struct page *cl_page_vmpage(const struct lu_env *env, struct cl_page *page) +{ + const struct cl_page_slice *slice; + + /* + * Find uppermost layer with ->cpo_vmpage() method, and return its + * result. + */ + page = cl_page_top(page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_vmpage != NULL) + RETURN(slice->cpl_ops->cpo_vmpage(env, slice)); + } + page = page->cp_child; + } while (page != NULL); + LBUG(); /* ->cpo_vmpage() has to be defined somewhere in the stack */ +} +EXPORT_SYMBOL(cl_page_vmpage); + +/** + * Returns a cl_page associated with a VM page, and given cl_object. + */ +struct cl_page *cl_vmpage_page(struct page *vmpage, struct cl_object *obj) +{ + struct cl_page *top; + struct cl_page *page; + + ENTRY; + KLASSERT(PageLocked(vmpage)); + + /* + * NOTE: absence of races and liveness of data are guaranteed by page + * lock on a "vmpage". That works because object destruction has + * bottom-to-top pass. + */ + + /* + * This loop assumes that ->private points to the top-most page. This + * can be rectified easily. 
+ */ + top = (struct cl_page *)vmpage->private; + if (top == NULL) + RETURN(NULL); + + for (page = top; page != NULL; page = page->cp_child) { + if (cl_object_same(page->cp_obj, obj)) { + cl_page_get_trust(page); + break; + } + } + LASSERT(ergo(page, page->cp_type == CPT_CACHEABLE)); + RETURN(page); +} +EXPORT_SYMBOL(cl_vmpage_page); + +/** + * Returns the top-page for a given page. + * + * \see cl_object_top(), cl_io_top() + */ +struct cl_page *cl_page_top(struct cl_page *page) +{ + return cl_page_top_trusted(page); +} +EXPORT_SYMBOL(cl_page_top); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + return cl_page_at_trusted(page, dtype); +} +EXPORT_SYMBOL(cl_page_at); + +#define CL_PAGE_OP(opname) offsetof(struct cl_page_operations, opname) + +#define CL_PAGE_INVOKE(_env, _page, _op, _proto, ...) \ +({ \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + int __result; \ + ptrdiff_t __op = (_op); \ + int (*__method)_proto; \ + \ + __result = 0; \ + __page = cl_page_top(__page); \ + do { \ + list_for_each_entry(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) { \ + __result = (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + if (__result != 0) \ + break; \ + } \ + } \ + __page = __page->cp_child; \ + } while (__page != NULL && __result == 0); \ + if (__result > 0) \ + __result = 0; \ + __result; \ +}) + +#define CL_PAGE_INVOID(_env, _page, _op, _proto, ...) 
\ +do { \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + ptrdiff_t __op = (_op); \ + void (*__method)_proto; \ + \ + __page = cl_page_top(__page); \ + do { \ + list_for_each_entry(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) \ + (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + } \ + __page = __page->cp_child; \ + } while (__page != NULL); \ +} while (0) + +#define CL_PAGE_INVOID_REVERSE(_env, _page, _op, _proto, ...) \ +do { \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + ptrdiff_t __op = (_op); \ + void (*__method)_proto; \ + \ + /* get to the bottom page. */ \ + while (__page->cp_child != NULL) \ + __page = __page->cp_child; \ + do { \ + list_for_each_entry_reverse(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) \ + (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + } \ + __page = __page->cp_parent; \ + } while (__page != NULL); \ +} while (0) + +static int cl_page_invoke(const struct lu_env *env, + struct cl_io *io, struct cl_page *page, ptrdiff_t op) + +{ + PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj)); + ENTRY; + RETURN(CL_PAGE_INVOKE(env, page, op, + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io)); +} + +static void cl_page_invoid(const struct lu_env *env, + struct cl_io *io, struct cl_page *page, ptrdiff_t op) + +{ + PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj)); + ENTRY; + CL_PAGE_INVOID(env, page, op, + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), io); + EXIT; +} + +static void cl_page_owner_clear(struct cl_page *page) +{ + ENTRY; + for (page = cl_page_top(page); page != NULL; page = page->cp_child) { + if (page->cp_owner != NULL) { + 
LASSERT(page->cp_owner->ci_owned_nr > 0); + page->cp_owner->ci_owned_nr--; + page->cp_owner = NULL; + page->cp_task = NULL; + } + } + EXIT; +} + +static void cl_page_owner_set(struct cl_page *page) +{ + ENTRY; + for (page = cl_page_top(page); page != NULL; page = page->cp_child) { + LASSERT(page->cp_owner != NULL); + page->cp_owner->ci_owned_nr++; + } + EXIT; +} + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + enum cl_page_state state; + + ENTRY; + state = pg->cp_state; + PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING); + PINVRNT(env, pg, cl_page_invariant(pg)); + cl_page_owner_clear(pg); + + if (state == CPS_OWNED) + cl_page_state_set(env, pg, CPS_CACHED); + /* + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for VFS/VM interaction runs + * last and can release locks safely. + */ + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_disown), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); + EXIT; +} + +/** + * returns true, iff page is owned by the given io. + */ +int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io) +{ + LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj)); + ENTRY; + RETURN(pg->cp_state == CPS_OWNED && pg->cp_owner == io); +} +EXPORT_SYMBOL(cl_page_is_owned); + +/** + * Try to own a page by IO. + * + * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it + * into cl_page_state::CPS_OWNED state. + * + * \pre !cl_page_is_owned(pg, io) + * \post result == 0 iff cl_page_is_owned(pg, io) + * + * \retval 0 success + * + * \retval -ve failure, e.g., page was destroyed (and landed in + * cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED). + * or, page was owned by another thread, or in IO. 
+ * + * \see cl_page_disown() + * \see cl_page_operations::cpo_own() + * \see cl_page_own_try() + * \see cl_page_own + */ +static int cl_page_own0(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, int nonblock) +{ + int result; + + PINVRNT(env, pg, !cl_page_is_owned(pg, io)); + + ENTRY; + pg = cl_page_top(pg); + io = cl_io_top(io); + + if (pg->cp_state == CPS_FREEING) { + result = -ENOENT; + } else { + result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(cpo_own), + (const struct lu_env *, + const struct cl_page_slice *, + struct cl_io *, int), + io, nonblock); + if (result == 0) { + PASSERT(env, pg, pg->cp_owner == NULL); + PASSERT(env, pg, pg->cp_req == NULL); + pg->cp_owner = io; + pg->cp_task = current; + cl_page_owner_set(pg); + if (pg->cp_state != CPS_FREEING) { + cl_page_state_set(env, pg, CPS_OWNED); + } else { + cl_page_disown0(env, io, pg); + result = -ENOENT; + } + } + } + PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg))); + RETURN(result); +} + +/** + * Own a page, might be blocked. + * + * \see cl_page_own0() + */ +int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 0); +} +EXPORT_SYMBOL(cl_page_own); + +/** + * Nonblock version of cl_page_own(). + * + * \see cl_page_own0() + */ +int cl_page_own_try(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + return cl_page_own0(env, io, pg, 1); +} +EXPORT_SYMBOL(cl_page_own_try); + + +/** + * Assume page ownership. + * + * Called when page is already locked by the hosting VM. 
+ * + * \pre !cl_page_is_owned(pg, io) + * \post cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_assume() + */ +void cl_page_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj)); + + ENTRY; + pg = cl_page_top(pg); + io = cl_io_top(io); + + cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume)); + PASSERT(env, pg, pg->cp_owner == NULL); + pg->cp_owner = io; + pg->cp_task = current; + cl_page_owner_set(pg); + cl_page_state_set(env, pg, CPS_OWNED); + EXIT; +} +EXPORT_SYMBOL(cl_page_assume); + +/** + * Releases page ownership without unlocking the page. + * + * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the + * underlying VM page (as VM is supposed to do this itself). + * + * \pre cl_page_is_owned(pg, io) + * \post !cl_page_is_owned(pg, io) + * + * \see cl_page_assume() + */ +void cl_page_unassume(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + ENTRY; + pg = cl_page_top(pg); + io = cl_io_top(io); + cl_page_owner_clear(pg); + cl_page_state_set(env, pg, CPS_CACHED); + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_unassume), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); + EXIT; +} +EXPORT_SYMBOL(cl_page_unassume); + +/** + * Releases page ownership. + * + * Moves page into cl_page_state::CPS_CACHED. + * + * \pre cl_page_is_owned(pg, io) + * \post !cl_page_is_owned(pg, io) + * + * \see cl_page_own() + * \see cl_page_operations::cpo_disown() + */ +void cl_page_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + + ENTRY; + pg = cl_page_top(pg); + io = cl_io_top(io); + cl_page_disown0(env, io, pg); + EXIT; +} +EXPORT_SYMBOL(cl_page_disown); + +/** + * Called when page is to be removed from the object, e.g., as a result of + * truncate. 
+ * + * Calls cl_page_operations::cpo_discard() top-to-bottom. + * + * \pre cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_discard() + */ +void cl_page_discard(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_discard)); +} +EXPORT_SYMBOL(cl_page_discard); + +/** + * Version of cl_page_delete() that can be called for not fully constructed + * pages, e.g., in an error handling cl_page_find()->cl_page_delete0() + * path. Doesn't check page invariant. + */ +static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg, + int radix) +{ + struct cl_page *tmp = pg; + ENTRY; + + PASSERT(env, pg, pg == cl_page_top(pg)); + PASSERT(env, pg, pg->cp_state != CPS_FREEING); + + /* + * Sever all ways to obtain new pointers to @pg. + */ + cl_page_owner_clear(pg); + + /* + * unexport the page firstly before freeing it so that + * the page content is considered to be invalid. + * We have to do this because a CPS_FREEING cl_page may + * be NOT under the protection of a cl_lock. + * Afterwards, if this page is found by other threads, then this + * page will be forced to reread. + */ + cl_page_export(env, pg, 0); + cl_page_state_set0(env, pg, CPS_FREEING); + + CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_delete), + (const struct lu_env *, const struct cl_page_slice *)); + + if (tmp->cp_type == CPT_CACHEABLE) { + if (!radix) + /* !radix means that @pg is not yet in the radix tree, + * skip removing it. 
+ */ + tmp = pg->cp_child; + for (; tmp != NULL; tmp = tmp->cp_child) { + void *value; + struct cl_object_header *hdr; + + hdr = cl_object_header(tmp->cp_obj); + spin_lock(&hdr->coh_page_guard); + value = radix_tree_delete(&hdr->coh_tree, + tmp->cp_index); + PASSERT(env, tmp, value == tmp); + PASSERT(env, tmp, hdr->coh_pages > 0); + hdr->coh_pages--; + spin_unlock(&hdr->coh_page_guard); + cl_page_put(env, tmp); + } + } + + EXIT; +} + +/** + * Called when a decision is made to throw page out of memory. + * + * Notifies all layers about page destruction by calling + * cl_page_operations::cpo_delete() method top-to-bottom. + * + * Moves page into cl_page_state::CPS_FREEING state (this is the only place + * where transition to this state happens). + * + * Eliminates all venues through which new references to the page can be + * obtained: + * + * - removes page from the radix trees, + * + * - breaks linkage from VM page to cl_page. + * + * Once page reaches cl_page_state::CPS_FREEING, all remaining references will + * drain after some time, at which point page will be recycled. + * + * \pre pg == cl_page_top(pg) + * \pre VM page is locked + * \post pg->cp_state == CPS_FREEING + * + * \see cl_page_operations::cpo_delete() + */ +void cl_page_delete(const struct lu_env *env, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + ENTRY; + cl_page_delete0(env, pg, 1); + EXIT; +} +EXPORT_SYMBOL(cl_page_delete); + +/** + * Unmaps page from user virtual memory. + * + * Calls cl_page_operations::cpo_unmap() through all layers top-to-bottom. The + * layer responsible for VM interaction has to unmap page from user space + * virtual memory. 
+ * + * \see cl_page_operations::cpo_unmap() + */ +int cl_page_unmap(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + return cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_unmap)); +} +EXPORT_SYMBOL(cl_page_unmap); + +/** + * Marks page up-to-date. + * + * Call cl_page_operations::cpo_export() through all layers top-to-bottom. The + * layer responsible for VM interaction has to mark/clear page as up-to-date + * by the \a uptodate argument. + * + * \see cl_page_operations::cpo_export() + */ +void cl_page_export(const struct lu_env *env, struct cl_page *pg, int uptodate) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_export), + (const struct lu_env *, + const struct cl_page_slice *, int), uptodate); +} +EXPORT_SYMBOL(cl_page_export); + +/** + * Returns true, iff \a pg is VM locked in a suitable sense by the calling + * thread. + */ +int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg) +{ + int result; + const struct cl_page_slice *slice; + + ENTRY; + pg = cl_page_top_trusted((struct cl_page *)pg); + slice = container_of(pg->cp_layers.next, + const struct cl_page_slice, cpl_linkage); + PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked != NULL); + /* + * Call ->cpo_is_vmlocked() directly instead of going through + * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by + * cl_page_invariant(). + */ + result = slice->cpl_ops->cpo_is_vmlocked(env, slice); + PASSERT(env, pg, result == -EBUSY || result == -ENODATA); + RETURN(result == -EBUSY); +} +EXPORT_SYMBOL(cl_page_is_vmlocked); + +static enum cl_page_state cl_req_type_state(enum cl_req_type crt) +{ + ENTRY; + RETURN(crt == CRT_WRITE ? CPS_PAGEOUT : CPS_PAGEIN); +} + +static void cl_page_io_start(const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt) +{ + /* + * Page is queued for IO, change its state. 
+ */ + ENTRY; + cl_page_owner_clear(pg); + cl_page_state_set(env, pg, cl_req_type_state(crt)); + EXIT; +} + +/** + * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is + * called top-to-bottom. Every layer either agrees to submit this page (by + * returning 0), or requests to omit this page (by returning -EALREADY). Layer + * handling interactions with the VM also has to inform VM that page is under + * transfer now. + */ +int cl_page_prep(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt) +{ + int result; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + PINVRNT(env, pg, crt < CRT_NR); + + /* + * XXX this has to be called bottom-to-top, so that llite can set up + * PG_writeback without risking other layers deciding to skip this + * page. + */ + if (crt >= CRT_NR) + return -EINVAL; + result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_prep)); + if (result == 0) + cl_page_io_start(env, pg, crt); + + KLASSERT(ergo(crt == CRT_WRITE && pg->cp_type == CPT_CACHEABLE, + equi(result == 0, + PageWriteback(cl_page_vmpage(env, pg))))); + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + return result; +} +EXPORT_SYMBOL(cl_page_prep); + +/** + * Notify layers about transfer completion. + * + * Invoked by transfer sub-system (which is a part of osc) to notify layers + * that a transfer, of which this page is a part of has completed. + * + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for the VFS/VM interaction runs last + * and can release locks safely. 
+ * + * \pre pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT + * \post pg->cp_state == CPS_CACHED + * + * \see cl_page_operations::cpo_completion() + */ +void cl_page_completion(const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt, int ioret) +{ + struct cl_sync_io *anchor = pg->cp_sync_io; + + PASSERT(env, pg, crt < CRT_NR); + /* cl_page::cp_req already cleared by the caller (osc_completion()) */ + PASSERT(env, pg, pg->cp_req == NULL); + PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt)); + + ENTRY; + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, ioret); + if (crt == CRT_READ && ioret == 0) { + PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED)); + pg->cp_flags |= CPF_READ_COMPLETED; + } + + cl_page_state_set(env, pg, CPS_CACHED); + if (crt >= CRT_NR) + return; + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(io[crt].cpo_completion), + (const struct lu_env *, + const struct cl_page_slice *, int), ioret); + if (anchor) { + LASSERT(cl_page_is_vmlocked(env, pg)); + LASSERT(pg->cp_sync_io == anchor); + pg->cp_sync_io = NULL; + } + /* + * As page->cp_obj is pinned by a reference from page->cp_req, it is + * safe to call cl_page_put() without risking object destruction in a + * non-blocking context. + */ + cl_page_put(env, pg); + + if (anchor) + cl_sync_io_note(anchor, ioret); + + EXIT; +} +EXPORT_SYMBOL(cl_page_completion); + +/** + * Notify layers that transfer formation engine decided to yank this page from + * the cache and to make it a part of a transfer. 
+ * + * \pre pg->cp_state == CPS_CACHED + * \post pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT + * + * \see cl_page_operations::cpo_make_ready() + */ +int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg, + enum cl_req_type crt) +{ + int result; + + PINVRNT(env, pg, crt < CRT_NR); + + ENTRY; + if (crt >= CRT_NR) + RETURN(-EINVAL); + result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(io[crt].cpo_make_ready), + (const struct lu_env *, + const struct cl_page_slice *)); + if (result == 0) { + PASSERT(env, pg, pg->cp_state == CPS_CACHED); + cl_page_io_start(env, pg, crt); + } + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + RETURN(result); +} +EXPORT_SYMBOL(cl_page_make_ready); + +/** + * Notify layers that high level io decided to place this page into a cache + * for future transfer. + * + * The layer implementing transfer engine (osc) has to register this page in + * its queues. + * + * \pre cl_page_is_owned(pg, io) + * \post cl_page_is_owned(pg, io) + * + * \see cl_page_operations::cpo_cache_add() + */ +int cl_page_cache_add(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt) +{ + const struct cl_page_slice *scan; + int result = 0; + + PINVRNT(env, pg, crt < CRT_NR); + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + ENTRY; + + if (crt >= CRT_NR) + RETURN(-EINVAL); + + list_for_each_entry(scan, &pg->cp_layers, cpl_linkage) { + if (scan->cpl_ops->io[crt].cpo_cache_add == NULL) + continue; + + result = scan->cpl_ops->io[crt].cpo_cache_add(env, scan, io); + if (result != 0) + break; + } + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", crt, result); + RETURN(result); +} +EXPORT_SYMBOL(cl_page_cache_add); + +/** + * Called if a pge is being written back by kernel's intention. 
+ * + * \pre cl_page_is_owned(pg, io) + * \post ergo(result == 0, pg->cp_state == CPS_PAGEOUT) + * + * \see cl_page_operations::cpo_flush() + */ +int cl_page_flush(const struct lu_env *env, struct cl_io *io, + struct cl_page *pg) +{ + int result; + + PINVRNT(env, pg, cl_page_is_owned(pg, io)); + PINVRNT(env, pg, cl_page_invariant(pg)); + + ENTRY; + + result = cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_flush)); + + CL_PAGE_HEADER(D_TRACE, env, pg, "%d\n", result); + RETURN(result); +} +EXPORT_SYMBOL(cl_page_flush); + +/** + * Checks whether page is protected by any extent lock is at least required + * mode. + * + * \return the same as in cl_page_operations::cpo_is_under_lock() method. + * \see cl_page_operations::cpo_is_under_lock() + */ +int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + int rc; + + PINVRNT(env, page, cl_page_invariant(page)); + + ENTRY; + rc = CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_is_under_lock), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); + PASSERT(env, page, rc != 0); + RETURN(rc); +} +EXPORT_SYMBOL(cl_page_is_under_lock); + +static int page_prune_cb(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, void *cbdata) +{ + cl_page_own(env, io, page); + cl_page_unmap(env, io, page); + cl_page_discard(env, io, page); + cl_page_disown(env, io, page); + return CLP_GANG_OKAY; +} + +/** + * Purges all cached pages belonging to the object \a obj. + */ +int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj) +{ + struct cl_thread_info *info; + struct cl_object *obj = cl_object_top(clobj); + struct cl_io *io; + int result; + + ENTRY; + info = cl_env_info(env); + io = &info->clt_io; + + /* + * initialize the io. This is ugly since we never do IO in this + * function, we just make cl_page_list functions happy. 
-jay + */ + io->ci_obj = obj; + io->ci_ignore_layout = 1; + result = cl_io_init(env, io, CIT_MISC, obj); + if (result != 0) { + cl_io_fini(env, io); + RETURN(io->ci_result); + } + + do { + result = cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF, + page_prune_cb, NULL); + if (result == CLP_GANG_RESCHED) + cond_resched(); + } while (result != CLP_GANG_OKAY); + + cl_io_fini(env, io); + RETURN(result); +} +EXPORT_SYMBOL(cl_pages_prune); + +/** + * Tells transfer engine that only part of a page is to be transmitted. + * + * \see cl_page_operations::cpo_clip() + */ +void cl_page_clip(const struct lu_env *env, struct cl_page *pg, + int from, int to) +{ + PINVRNT(env, pg, cl_page_invariant(pg)); + + CL_PAGE_HEADER(D_TRACE, env, pg, "%d %d\n", from, to); + CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_clip), + (const struct lu_env *, + const struct cl_page_slice *,int, int), + from, to); +} +EXPORT_SYMBOL(cl_page_clip); + +/** + * Prints human readable representation of \a pg to the \a f. + */ +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_page *pg) +{ + (*printer)(env, cookie, + "page@%p[%d %p:%lu ^%p_%p %d %d %d %p %p %#x]\n", + pg, atomic_read(&pg->cp_ref), pg->cp_obj, + pg->cp_index, pg->cp_parent, pg->cp_child, + pg->cp_state, pg->cp_error, pg->cp_type, + pg->cp_owner, pg->cp_req, pg->cp_flags); +} +EXPORT_SYMBOL(cl_page_header_print); + +/** + * Prints human readable representation of \a pg to the \a f. 
+ */ +void cl_page_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_page *pg) +{ + struct cl_page *scan; + + for (scan = cl_page_top((struct cl_page *)pg); + scan != NULL; scan = scan->cp_child) + cl_page_header_print(env, cookie, printer, scan); + CL_PAGE_INVOKE(env, (struct cl_page *)pg, CL_PAGE_OP(cpo_print), + (const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t p), cookie, printer); + (*printer)(env, cookie, "end page@%p\n", pg); +} +EXPORT_SYMBOL(cl_page_print); + +/** + * Cancel a page which is still in a transfer. + */ +int cl_page_cancel(const struct lu_env *env, struct cl_page *page) +{ + return CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_cancel), + (const struct lu_env *, + const struct cl_page_slice *)); +} +EXPORT_SYMBOL(cl_page_cancel); + +/** + * Converts a byte offset within object \a obj into a page index. + */ +loff_t cl_offset(const struct cl_object *obj, pgoff_t idx) +{ + /* + * XXX for now. + */ + return (loff_t)idx << PAGE_CACHE_SHIFT; +} +EXPORT_SYMBOL(cl_offset); + +/** + * Converts a page index into a byte offset within object \a obj. + */ +pgoff_t cl_index(const struct cl_object *obj, loff_t offset) +{ + /* + * XXX for now. + */ + return offset >> PAGE_CACHE_SHIFT; +} +EXPORT_SYMBOL(cl_index); + +int cl_page_size(const struct cl_object *obj) +{ + return 1 << PAGE_CACHE_SHIFT; +} +EXPORT_SYMBOL(cl_page_size); + +/** + * Adds page slice to the compound page. + * + * This is called by cl_object_operations::coo_page_init() methods to add a + * per-layer state to the page. New state is added at the end of + * cl_page::cp_layers list, that is, it is at the bottom of the stack. 
+ * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add() + */ +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, + const struct cl_page_operations *ops) +{ + ENTRY; + list_add_tail(&slice->cpl_linkage, &page->cp_layers); + slice->cpl_obj = obj; + slice->cpl_ops = ops; + slice->cpl_page = page; + EXIT; +} +EXPORT_SYMBOL(cl_page_slice_add); + +int cl_page_init(void) +{ + return 0; +} + +void cl_page_fini(void) +{ +} diff --git a/drivers/staging/lustre/lustre/obdclass/class_obd.c b/drivers/staging/lustre/lustre/obdclass/class_obd.c new file mode 100644 index 000000000000..af1c2d09c47b --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/class_obd.c @@ -0,0 +1,689 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ + +#define DEBUG_SUBSYSTEM S_CLASS +# include <asm/atomic.h> + +#include <obd_support.h> +#include <obd_class.h> +#include <linux/lnet/lnetctl.h> +#include <lustre_debug.h> +#include <lprocfs_status.h> +#include <lustre/lustre_build_version.h> +#include <linux/list.h> +#include <cl_object.h> +#include "llog_internal.h" + + +struct obd_device *obd_devs[MAX_OBD_DEVICES]; +EXPORT_SYMBOL(obd_devs); +struct list_head obd_types; +DEFINE_RWLOCK(obd_dev_lock); + +__u64 obd_max_pages = 0; +__u64 obd_max_alloc = 0; +DEFINE_SPINLOCK(obd_updatemax_lock); + +/* The following are visible and mutable through /proc/sys/lustre/. */ +unsigned int obd_alloc_fail_rate = 0; +EXPORT_SYMBOL(obd_alloc_fail_rate); +unsigned int obd_debug_peer_on_timeout; +EXPORT_SYMBOL(obd_debug_peer_on_timeout); +unsigned int obd_dump_on_timeout; +EXPORT_SYMBOL(obd_dump_on_timeout); +unsigned int obd_dump_on_eviction; +EXPORT_SYMBOL(obd_dump_on_eviction); +unsigned int obd_max_dirty_pages = 256; +EXPORT_SYMBOL(obd_max_dirty_pages); +atomic_t obd_dirty_pages; +EXPORT_SYMBOL(obd_dirty_pages); +unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(obd_timeout); +unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */ +EXPORT_SYMBOL(ldlm_timeout); +unsigned int obd_timeout_set; +EXPORT_SYMBOL(obd_timeout_set); +unsigned int ldlm_timeout_set; +EXPORT_SYMBOL(ldlm_timeout_set); +/* Adaptive timeout defs here instead of ptlrpc module for /proc/sys/ access */ +unsigned int at_min = 0; +EXPORT_SYMBOL(at_min); +unsigned int at_max = 600; +EXPORT_SYMBOL(at_max); +unsigned int at_history = 600; +EXPORT_SYMBOL(at_history); +int at_early_margin = 5; +EXPORT_SYMBOL(at_early_margin); +int at_extra = 30; +EXPORT_SYMBOL(at_extra); + +atomic_t obd_dirty_transit_pages; +EXPORT_SYMBOL(obd_dirty_transit_pages); + +char obd_jobid_var[JOBSTATS_JOBID_VAR_MAX_LEN + 1] = 
JOBSTATS_DISABLE; +EXPORT_SYMBOL(obd_jobid_var); + +/* Get jobid of current process by reading the environment variable + * stored in between the "env_start" & "env_end" of task struct. + * + * TODO: + * It's better to cache the jobid for later use if there is any + * efficient way, the cl_env code probably could be reused for this + * purpose. + * + * If some job scheduler doesn't store jobid in the "env_start/end", + * then an upcall could be issued here to get the jobid by utilizing + * the userspace tools/api. Then, the jobid must be cached. + */ +int lustre_get_jobid(char *jobid) +{ + int jobid_len = JOBSTATS_JOBID_SIZE; + int rc = 0; + ENTRY; + + memset(jobid, 0, JOBSTATS_JOBID_SIZE); + /* Jobstats isn't enabled */ + if (strcmp(obd_jobid_var, JOBSTATS_DISABLE) == 0) + RETURN(0); + + /* Use process name + fsuid as jobid */ + if (strcmp(obd_jobid_var, JOBSTATS_PROCNAME_UID) == 0) { + snprintf(jobid, JOBSTATS_JOBID_SIZE, "%s.%u", + current_comm(), current_fsuid()); + RETURN(0); + } + + rc = cfs_get_environ(obd_jobid_var, jobid, &jobid_len); + if (rc) { + if (rc == -EOVERFLOW) { + /* For the PBS_JOBID and LOADL_STEP_ID keys (which are + * variable length strings instead of just numbers), it + * might make sense to keep the unique parts for JobID, + * instead of just returning an error. That means a + * larger temp buffer for cfs_get_environ(), then + * truncating the string at some separator to fit into + * the specified jobid_len. Fix later if needed. */ + static bool printed; + if (unlikely(!printed)) { + LCONSOLE_ERROR_MSG(0x16b, "%s value too large " + "for JobID buffer (%d)\n", + obd_jobid_var, jobid_len); + printed = true; + } + } else { + CDEBUG((rc == -ENOENT || rc == -EINVAL || + rc == -EDEADLK) ? 
D_INFO : D_ERROR, + "Get jobid for (%s) failed: rc = %d\n", + obd_jobid_var, rc); + } + } + RETURN(rc); +} +EXPORT_SYMBOL(lustre_get_jobid); + +int obd_alloc_fail(const void *ptr, const char *name, const char *type, + size_t size, const char *file, int line) +{ + if (ptr == NULL || + (cfs_rand() & OBD_ALLOC_FAIL_MASK) < obd_alloc_fail_rate) { + CERROR("%s%salloc of %s ("LPU64" bytes) failed at %s:%d\n", + ptr ? "force " :"", type, name, (__u64)size, file, + line); + CERROR(LPU64" total bytes and "LPU64" total pages " + "("LPU64" bytes) allocated by Lustre, " + "%d total bytes by LNET\n", + obd_memory_sum(), + obd_pages_sum() << PAGE_CACHE_SHIFT, + obd_pages_sum(), + atomic_read(&libcfs_kmemory)); + return 1; + } + return 0; +} +EXPORT_SYMBOL(obd_alloc_fail); + +static inline void obd_data2conn(struct lustre_handle *conn, + struct obd_ioctl_data *data) +{ + memset(conn, 0, sizeof *conn); + conn->cookie = data->ioc_cookie; +} + +static inline void obd_conn2data(struct obd_ioctl_data *data, + struct lustre_handle *conn) +{ + data->ioc_cookie = conn->cookie; +} + +int class_resolve_dev_name(__u32 len, const char *name) +{ + int rc; + int dev; + + ENTRY; + if (!len || !name) { + CERROR("No name passed,!\n"); + GOTO(out, rc = -EINVAL); + } + if (name[len - 1] != 0) { + CERROR("Name not nul terminated!\n"); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s\n", name); + dev = class_name2dev(name); + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for name %s!\n", name); + GOTO(out, rc = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", name, dev); + rc = dev; + +out: + RETURN(rc); +} + +int class_handle_ioctl(unsigned int cmd, unsigned long arg) +{ + char *buf = NULL; + struct obd_ioctl_data *data; + struct libcfs_debug_ioctl_data *debug_data; + struct obd_device *obd = NULL; + int err = 0, len = 0; + ENTRY; + + /* only for debugging */ + if (cmd == LIBCFS_IOC_DEBUG_MASK) { + debug_data = (struct libcfs_debug_ioctl_data*)arg; + 
libcfs_subsystem_debug = debug_data->subs; + libcfs_debug = debug_data->debug; + return 0; + } + + CDEBUG(D_IOCTL, "cmd = %x\n", cmd); + if (obd_ioctl_getdata(&buf, &len, (void *)arg)) { + CERROR("OBD ioctl: data error\n"); + RETURN(-EINVAL); + } + data = (struct obd_ioctl_data *)buf; + + switch (cmd) { + case OBD_IOC_PROCESS_CFG: { + struct lustre_cfg *lcfg; + + if (!data->ioc_plen1 || !data->ioc_pbuf1) { + CERROR("No config buffer passed!\n"); + GOTO(out, err = -EINVAL); + } + OBD_ALLOC(lcfg, data->ioc_plen1); + if (lcfg == NULL) + GOTO(out, err = -ENOMEM); + err = copy_from_user(lcfg, data->ioc_pbuf1, + data->ioc_plen1); + if (!err) + err = lustre_cfg_sanity_check(lcfg, data->ioc_plen1); + if (!err) + err = class_process_config(lcfg); + + OBD_FREE(lcfg, data->ioc_plen1); + GOTO(out, err); + } + + case OBD_GET_VERSION: + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + GOTO(out, err = -EINVAL); + } + + if (strlen(BUILD_VERSION) + 1 > data->ioc_inllen1) { + CERROR("ioctl buffer too small to hold version\n"); + GOTO(out, err = -EINVAL); + } + + memcpy(data->ioc_bulk, BUILD_VERSION, + strlen(BUILD_VERSION) + 1); + + err = obd_ioctl_popdata((void *)arg, data, len); + if (err) + err = -EFAULT; + GOTO(out, err); + + case OBD_IOC_NAME2DEV: { + /* Resolve a device name. This does not change the + * currently selected device. + */ + int dev; + + dev = class_resolve_dev_name(data->ioc_inllen1, + data->ioc_inlbuf1); + data->ioc_dev = dev; + if (dev < 0) + GOTO(out, err = -EINVAL); + + err = obd_ioctl_popdata((void *)arg, data, sizeof(*data)); + if (err) + err = -EFAULT; + GOTO(out, err); + } + + case OBD_IOC_UUID2DEV: { + /* Resolve a device uuid. This does not change the + * currently selected device. 
+ */ + int dev; + struct obd_uuid uuid; + + if (!data->ioc_inllen1 || !data->ioc_inlbuf1) { + CERROR("No UUID passed!\n"); + GOTO(out, err = -EINVAL); + } + if (data->ioc_inlbuf1[data->ioc_inllen1 - 1] != 0) { + CERROR("UUID not NUL terminated!\n"); + GOTO(out, err = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s\n", data->ioc_inlbuf1); + obd_str2uuid(&uuid, data->ioc_inlbuf1); + dev = class_uuid2dev(&uuid); + data->ioc_dev = dev; + if (dev == -1) { + CDEBUG(D_IOCTL, "No device for UUID %s!\n", + data->ioc_inlbuf1); + GOTO(out, err = -EINVAL); + } + + CDEBUG(D_IOCTL, "device name %s, dev %d\n", data->ioc_inlbuf1, + dev); + err = obd_ioctl_popdata((void *)arg, data, sizeof(*data)); + if (err) + err = -EFAULT; + GOTO(out, err); + } + + case OBD_IOC_CLOSE_UUID: { + CDEBUG(D_IOCTL, "closing all connections to uuid %s (NOOP)\n", + data->ioc_inlbuf1); + GOTO(out, err = 0); + } + + case OBD_IOC_GETDEVICE: { + int index = data->ioc_count; + char *status, *str; + + if (!data->ioc_inlbuf1) { + CERROR("No buffer passed in ioctl\n"); + GOTO(out, err = -EINVAL); + } + if (data->ioc_inllen1 < 128) { + CERROR("ioctl buffer too small to hold version\n"); + GOTO(out, err = -EINVAL); + } + + obd = class_num2obd(index); + if (!obd) + GOTO(out, err = -ENOENT); + + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + str = (char *)data->ioc_bulk; + snprintf(str, len - sizeof(*data), "%3d %s %s %s %s %d", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + err = obd_ioctl_popdata((void *)arg, data, len); + + GOTO(out, err = 0); + } + + } + + if (data->ioc_dev == OBD_DEV_BY_DEVNAME) { + if (data->ioc_inllen4 <= 0 || data->ioc_inlbuf4 == NULL) + GOTO(out, err = -EINVAL); + if (strnlen(data->ioc_inlbuf4, MAX_OBD_NAME) >= MAX_OBD_NAME) + GOTO(out, err = -EINVAL); + obd = class_name2obd(data->ioc_inlbuf4); + } else 
if (data->ioc_dev < class_devno_max()) { + obd = class_num2obd(data->ioc_dev); + } else { + CERROR("OBD ioctl: No device\n"); + GOTO(out, err = -EINVAL); + } + + if (obd == NULL) { + CERROR("OBD ioctl : No Device %d\n", data->ioc_dev); + GOTO(out, err = -EINVAL); + } + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + + if (!obd->obd_set_up || obd->obd_stopping) { + CERROR("OBD ioctl: device not setup %d \n", data->ioc_dev); + GOTO(out, err = -EINVAL); + } + + switch(cmd) { + case OBD_IOC_NO_TRANSNO: { + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + GOTO(out, err = -ENODEV); + } + CDEBUG(D_HA, "%s: disabling committed-transno notification\n", + obd->obd_name); + obd->obd_no_transno = 1; + GOTO(out, err = 0); + } + + default: { + err = obd_iocontrol(cmd, obd->obd_self_export, len, data, NULL); + if (err) + GOTO(out, err); + + err = obd_ioctl_popdata((void *)arg, data, len); + if (err) + err = -EFAULT; + GOTO(out, err); + } + } + + out: + if (buf) + obd_ioctl_freedata(buf, len); + RETURN(err); +} /* class_handle_ioctl */ + +extern psdev_t obd_psdev; + +#define OBD_INIT_CHECK +int obd_init_checks(void) +{ + __u64 u64val, div64val; + char buf[64]; + int len, ret = 0; + + CDEBUG(D_INFO, "LPU64=%s, LPD64=%s, LPX64=%s\n", LPU64, LPD64, LPX64); + + CDEBUG(D_INFO, "OBD_OBJECT_EOF = "LPX64"\n", (__u64)OBD_OBJECT_EOF); + + u64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), LPX64, u64val); + if (len != 18) { + CWARN("LPX64 wrong length! 
strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + + div64val = OBD_OBJECT_EOF; + CDEBUG(D_INFO, "u64val OBD_OBJECT_EOF = "LPX64"\n", u64val); + if (u64val != OBD_OBJECT_EOF) { + CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + ret = -EOVERFLOW; + } + if (u64val >> 8 != OBD_OBJECT_EOF >> 8) { + CERROR("__u64 "LPX64"(%d) != 0xffffffffffffffff\n", + u64val, (int)sizeof(u64val)); + return -EOVERFLOW; + } + if (do_div(div64val, 256) != (u64val & 255)) { + CERROR("do_div("LPX64",256) != "LPU64"\n", u64val, u64val &255); + return -EOVERFLOW; + } + if (u64val >> 8 != div64val) { + CERROR("do_div("LPX64",256) "LPU64" != "LPU64"\n", + u64val, div64val, u64val >> 8); + return -EOVERFLOW; + } + len = snprintf(buf, sizeof(buf), LPX64, u64val); + if (len != 18) { + CWARN("LPX64 wrong length! strlen(%s)=%d != 18\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), LPU64, u64val); + if (len != 20) { + CWARN("LPU64 wrong length! strlen(%s)=%d != 20\n", buf, len); + ret = -EINVAL; + } + len = snprintf(buf, sizeof(buf), LPD64, u64val); + if (len != 2) { + CWARN("LPD64 wrong length! 
strlen(%s)=%d != 2\n", buf, len); + ret = -EINVAL; + } + if ((u64val & ~CFS_PAGE_MASK) >= PAGE_CACHE_SIZE) { + CWARN("mask failed: u64val "LPU64" >= "LPU64"\n", u64val, + (__u64)PAGE_CACHE_SIZE); + ret = -EINVAL; + } + + return ret; +} + +extern spinlock_t obd_types_lock; +extern int class_procfs_init(void); +extern int class_procfs_clean(void); + +static int __init init_obdclass(void) +{ + int i, err; + int lustre_register_fs(void); + + for (i = CAPA_SITE_CLIENT; i < CAPA_SITE_MAX; i++) + INIT_LIST_HEAD(&capa_list[i]); + + LCONSOLE_INFO("Lustre: Build Version: "BUILD_VERSION"\n"); + + spin_lock_init(&obd_types_lock); + obd_zombie_impexp_init(); +#ifdef LPROCFS + obd_memory = lprocfs_alloc_stats(OBD_STATS_NUM, + LPROCFS_STATS_FLAG_NONE | + LPROCFS_STATS_FLAG_IRQ_SAFE); + if (obd_memory == NULL) { + CERROR("kmalloc of 'obd_memory' failed\n"); + RETURN(-ENOMEM); + } + + lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT, + LPROCFS_CNTR_AVGMINMAX, + "memused", "bytes"); + lprocfs_counter_init(obd_memory, OBD_MEMORY_PAGES_STAT, + LPROCFS_CNTR_AVGMINMAX, + "pagesused", "pages"); +#endif + err = obd_init_checks(); + if (err == -EOVERFLOW) + return err; + + class_init_uuidlist(); + err = class_handle_init(); + if (err) + return err; + + INIT_LIST_HEAD(&obd_types); + + err = misc_register(&obd_psdev); + if (err) { + CERROR("cannot register %d err %d\n", OBD_DEV_MINOR, err); + return err; + } + + /* This struct is already zeroed for us (static global) */ + for (i = 0; i < class_devno_max(); i++) + obd_devs[i] = NULL; + + /* Default the dirty page cache cap to 1/2 of system memory. + * For clients with less memory, a larger fraction is needed + * for other purposes (mostly for BGL). 
*/ + if (num_physpages <= 512 << (20 - PAGE_CACHE_SHIFT)) + obd_max_dirty_pages = num_physpages / 4; + else + obd_max_dirty_pages = num_physpages / 2; + + err = obd_init_caches(); + if (err) + return err; + err = class_procfs_init(); + if (err) + return err; + + err = lu_global_init(); + if (err) + return err; + + err = cl_global_init(); + if (err != 0) + return err; + + + err = llog_info_init(); + if (err) + return err; + + err = lustre_register_fs(); + + return err; +} + +void obd_update_maxusage(void) +{ + __u64 max1, max2; + + max1 = obd_pages_sum(); + max2 = obd_memory_sum(); + + spin_lock(&obd_updatemax_lock); + if (max1 > obd_max_pages) + obd_max_pages = max1; + if (max2 > obd_max_alloc) + obd_max_alloc = max2; + spin_unlock(&obd_updatemax_lock); +} +EXPORT_SYMBOL(obd_update_maxusage); + +#ifdef LPROCFS +__u64 obd_memory_max(void) +{ + __u64 ret; + + spin_lock(&obd_updatemax_lock); + ret = obd_max_alloc; + spin_unlock(&obd_updatemax_lock); + + return ret; +} +EXPORT_SYMBOL(obd_memory_max); + +__u64 obd_pages_max(void) +{ + __u64 ret; + + spin_lock(&obd_updatemax_lock); + ret = obd_max_pages; + spin_unlock(&obd_updatemax_lock); + + return ret; +} +EXPORT_SYMBOL(obd_pages_max); +#endif + +/* liblustre doesn't call cleanup_obdclass, apparently. we carry on in this + * ifdef to the end of the file to cover module and versioning goo.*/ +static void cleanup_obdclass(void) +{ + int i; + int lustre_unregister_fs(void); + __u64 memory_leaked, pages_leaked; + __u64 memory_max, pages_max; + ENTRY; + + lustre_unregister_fs(); + + misc_deregister(&obd_psdev); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + if (obd && obd->obd_set_up && + OBT(obd) && OBP(obd, detach)) { + /* XXX should this call generic detach otherwise? 
*/ + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + OBP(obd, detach)(obd); + } + } + llog_info_fini(); + cl_global_fini(); + lu_global_fini(); + + obd_cleanup_caches(); + obd_sysctl_clean(); + + class_procfs_clean(); + + class_handle_cleanup(); + class_exit_uuidlist(); + obd_zombie_impexp_stop(); + + memory_leaked = obd_memory_sum(); + pages_leaked = obd_pages_sum(); + + memory_max = obd_memory_max(); + pages_max = obd_pages_max(); + + lprocfs_free_stats(&obd_memory); + CDEBUG((memory_leaked) ? D_ERROR : D_INFO, + "obd_memory max: "LPU64", leaked: "LPU64"\n", + memory_max, memory_leaked); + CDEBUG((pages_leaked) ? D_ERROR : D_INFO, + "obd_memory_pages max: "LPU64", leaked: "LPU64"\n", + pages_max, pages_leaked); + + EXIT; +} + +MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>"); +MODULE_DESCRIPTION("Lustre Class Driver Build Version: " BUILD_VERSION); +MODULE_LICENSE("GPL"); + +cfs_module(obdclass, LUSTRE_VERSION_STRING, init_obdclass, cleanup_obdclass); diff --git a/drivers/staging/lustre/lustre/obdclass/debug.c b/drivers/staging/lustre/lustre/obdclass/debug.c new file mode 100644 index 000000000000..15f71bbb7276 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/debug.c @@ -0,0 +1,124 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/debug.c + * + * Helper routines for dumping data structs for debugging. + */ + +#define DEBUG_SUBSYSTEM D_OTHER + + +#include <obd_ost.h> +#include <obd_support.h> +#include <lustre_debug.h> +#include <lustre_net.h> + +void dump_lniobuf(struct niobuf_local *nb) +{ + CDEBUG(D_RPCTRACE, + "niobuf_local: file_offset="LPD64", len=%d, page=%p, rc=%d\n", + nb->lnb_file_offset, nb->len, nb->page, nb->rc); + CDEBUG(D_RPCTRACE, "nb->page: index = %ld\n", + nb->page ? 
page_index(nb->page) : -1); +} +EXPORT_SYMBOL(dump_lniobuf); + +void dump_lsm(int level, struct lov_stripe_md *lsm) +{ + CDEBUG(level, "lsm %p, objid "DOSTID", maxbytes "LPX64", magic 0x%08X," + " stripe_size %u, stripe_count %u, refc: %d," + " layout_gen %u, pool ["LOV_POOLNAMEF"]\n", lsm, + POSTID(&lsm->lsm_oi), lsm->lsm_maxbytes, lsm->lsm_magic, + lsm->lsm_stripe_size, lsm->lsm_stripe_count, + atomic_read(&lsm->lsm_refc), lsm->lsm_layout_gen, + lsm->lsm_pool_name); +} +EXPORT_SYMBOL(dump_lsm); + +#define LPDS sizeof(__u64) +int block_debug_setup(void *addr, int len, __u64 off, __u64 id) +{ + LASSERT(addr); + + off = cpu_to_le64 (off); + id = cpu_to_le64 (id); + memcpy(addr, (char *)&off, LPDS); + memcpy(addr + LPDS, (char *)&id, LPDS); + + addr += len - LPDS - LPDS; + memcpy(addr, (char *)&off, LPDS); + memcpy(addr + LPDS, (char *)&id, LPDS); + + return 0; +} +EXPORT_SYMBOL(block_debug_setup); + +int block_debug_check(char *who, void *addr, int end, __u64 off, __u64 id) +{ + __u64 ne_off; + int err = 0; + + LASSERT(addr); + + ne_off = le64_to_cpu (off); + id = le64_to_cpu (id); + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" off: "LPX64" != " + LPX64"\n", who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" id: "LPX64" != "LPX64"\n", + who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + addr += end - LPDS - LPDS; + if (memcmp(addr, (char *)&ne_off, LPDS)) { + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end off: "LPX64" != " + LPX64"\n", who, id, off, *(__u64 *)addr, ne_off); + err = -EINVAL; + } + if (memcmp(addr + LPDS, (char *)&id, LPDS)) { + CDEBUG(D_ERROR, "%s: id "LPX64" offset "LPU64" end id: "LPX64" != " + LPX64"\n", who, id, off, *(__u64 *)(addr + LPDS), id); + err = -EINVAL; + } + + return err; +} +EXPORT_SYMBOL(block_debug_check); +#undef LPDS diff --git 
a/drivers/staging/lustre/lustre/obdclass/dt_object.c b/drivers/staging/lustre/lustre/obdclass/dt_object.c new file mode 100644 index 000000000000..1c962dd3bd2f --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/dt_object.c @@ -0,0 +1,1055 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/dt_object.c + * + * Dt Object. 
+ * Generic functions from dt_object.h + * + * Author: Nikita Danilov <nikita@clusterfs.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <obd.h> +#include <dt_object.h> +#include <linux/list.h> +/* fid_be_to_cpu() */ +#include <lustre_fid.h> + +#include <lustre_quota.h> + +/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini */ +LU_KEY_INIT(dt_global, struct dt_thread_info); +LU_KEY_FINI(dt_global, struct dt_thread_info); + +struct lu_context_key dt_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | LCT_MG_THREAD | LCT_LOCAL, + .lct_init = dt_global_key_init, + .lct_fini = dt_global_key_fini +}; +EXPORT_SYMBOL(dt_key); + +/* no lock is necessary to protect the list, because call-backs + * are added during system startup. Please refer to "struct dt_device". + */ +void dt_txn_callback_add(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_add(&cb->dtc_linkage, &dev->dd_txn_callbacks); +} +EXPORT_SYMBOL(dt_txn_callback_add); + +void dt_txn_callback_del(struct dt_device *dev, struct dt_txn_callback *cb) +{ + list_del_init(&cb->dtc_linkage); +} +EXPORT_SYMBOL(dt_txn_callback_del); + +int dt_txn_hook_start(const struct lu_env *env, + struct dt_device *dev, struct thandle *th) +{ + int rc = 0; + struct dt_txn_callback *cb; + + if (th->th_local) + return 0; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + if (cb->dtc_txn_start == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + rc = cb->dtc_txn_start(env, th, cb->dtc_cookie); + if (rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_start); + +int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn) +{ + struct dt_device *dev = txn->th_dev; + struct dt_txn_callback *cb; + int rc = 0; + + if (txn->th_local) + return 0; + + list_for_each_entry(cb, &dev->dd_txn_callbacks, dtc_linkage) { + if (cb->dtc_txn_stop == NULL || + !(cb->dtc_tag & env->le_ctx.lc_tags)) + continue; + rc = cb->dtc_txn_stop(env, txn, cb->dtc_cookie); + if 
(rc < 0) + break; + } + return rc; +} +EXPORT_SYMBOL(dt_txn_hook_stop); + +void dt_txn_hook_commit(struct thandle *txn) +{ + struct dt_txn_callback *cb; + + if (txn->th_local) + return; + + list_for_each_entry(cb, &txn->th_dev->dd_txn_callbacks, + dtc_linkage) { + if (cb->dtc_txn_commit) + cb->dtc_txn_commit(txn, cb->dtc_cookie); + } +} +EXPORT_SYMBOL(dt_txn_hook_commit); + +int dt_device_init(struct dt_device *dev, struct lu_device_type *t) +{ + + INIT_LIST_HEAD(&dev->dd_txn_callbacks); + return lu_device_init(&dev->dd_lu_dev, t); +} +EXPORT_SYMBOL(dt_device_init); + +void dt_device_fini(struct dt_device *dev) +{ + lu_device_fini(&dev->dd_lu_dev); +} +EXPORT_SYMBOL(dt_device_fini); + +int dt_object_init(struct dt_object *obj, + struct lu_object_header *h, struct lu_device *d) + +{ + return lu_object_init(&obj->do_lu, h, d); +} +EXPORT_SYMBOL(dt_object_init); + +void dt_object_fini(struct dt_object *obj) +{ + lu_object_fini(&obj->do_lu); +} +EXPORT_SYMBOL(dt_object_fini); + +int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj) +{ + if (obj->do_index_ops == NULL) + obj->do_ops->do_index_try(env, obj, &dt_directory_features); + return obj->do_index_ops != NULL; +} +EXPORT_SYMBOL(dt_try_as_dir); + +enum dt_format_type dt_mode_to_dft(__u32 mode) +{ + enum dt_format_type result; + + switch (mode & S_IFMT) { + case S_IFDIR: + result = DFT_DIR; + break; + case S_IFREG: + result = DFT_REGULAR; + break; + case S_IFLNK: + result = DFT_SYM; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + result = DFT_NODE; + break; + default: + LBUG(); + break; + } + return result; +} +EXPORT_SYMBOL(dt_mode_to_dft); + +/** + * lookup fid for object named \a name in directory \a dir. 
+ */ + +int dt_lookup_dir(const struct lu_env *env, struct dt_object *dir, + const char *name, struct lu_fid *fid) +{ + if (dt_try_as_dir(env, dir)) + return dt_lookup(env, dir, (struct dt_rec *)fid, + (const struct dt_key *)name, BYPASS_CAPA); + return -ENOTDIR; +} +EXPORT_SYMBOL(dt_lookup_dir); + +/* this differs from dt_locate by top_dev as parameter + * but not one from lu_site */ +struct dt_object *dt_locate_at(const struct lu_env *env, + struct dt_device *dev, const struct lu_fid *fid, + struct lu_device *top_dev) +{ + struct lu_object *lo, *n; + ENTRY; + + lo = lu_object_find_at(env, top_dev, fid, NULL); + if (IS_ERR(lo)) + return (void *)lo; + + LASSERT(lo != NULL); + + list_for_each_entry(n, &lo->lo_header->loh_layers, lo_linkage) { + if (n->lo_dev == &dev->dd_lu_dev) + return container_of0(n, struct dt_object, do_lu); + } + return ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL(dt_locate_at); + +/** + * find a object named \a entry in given \a dfh->dfh_o directory. + */ +static int dt_find_entry(const struct lu_env *env, const char *entry, void *data) +{ + struct dt_find_hint *dfh = data; + struct dt_device *dt = dfh->dfh_dt; + struct lu_fid *fid = dfh->dfh_fid; + struct dt_object *obj = dfh->dfh_o; + int result; + + result = dt_lookup_dir(env, obj, entry, fid); + lu_object_put(env, &obj->do_lu); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (IS_ERR(obj)) + result = PTR_ERR(obj); + } + dfh->dfh_o = obj; + return result; +} + +/** + * Abstract function which parses path name. This function feeds + * path component to \a entry_func. 
+ */ +int dt_path_parser(const struct lu_env *env, + char *path, dt_entry_func_t entry_func, + void *data) +{ + char *e; + int rc = 0; + + while (1) { + e = strsep(&path, "/"); + if (e == NULL) + break; + + if (e[0] == 0) { + if (!path || path[0] == '\0') + break; + continue; + } + rc = entry_func(env, e, data); + if (rc) + break; + } + + return rc; +} + +struct dt_object * +dt_store_resolve(const struct lu_env *env, struct dt_device *dt, + const char *path, struct lu_fid *fid) +{ + struct dt_thread_info *info = dt_info(env); + struct dt_find_hint *dfh = &info->dti_dfh; + struct dt_object *obj; + char *local = info->dti_buf; + int result; + + + dfh->dfh_dt = dt; + dfh->dfh_fid = fid; + + strncpy(local, path, DT_MAX_PATH); + local[DT_MAX_PATH - 1] = '\0'; + + result = dt->dd_ops->dt_root_get(env, dt, fid); + if (result == 0) { + obj = dt_locate(env, dt, fid); + if (!IS_ERR(obj)) { + dfh->dfh_o = obj; + result = dt_path_parser(env, local, dt_find_entry, dfh); + if (result != 0) + obj = ERR_PTR(result); + else + obj = dfh->dfh_o; + } + } else { + obj = ERR_PTR(result); + } + return obj; +} +EXPORT_SYMBOL(dt_store_resolve); + +static struct dt_object *dt_reg_open(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + const char *name, + struct lu_fid *fid) +{ + struct dt_object *o; + int result; + + result = dt_lookup_dir(env, p, name, fid); + if (result == 0){ + o = dt_locate(env, dt, fid); + } + else + o = ERR_PTR(result); + + return o; +} + +/** + * Open dt object named \a filename from \a dirname directory. 
+ * \param dt dt device + * \param fid on success, object fid is stored in *fid + */ +struct dt_object *dt_store_open(const struct lu_env *env, + struct dt_device *dt, + const char *dirname, + const char *filename, + struct lu_fid *fid) +{ + struct dt_object *file; + struct dt_object *dir; + + dir = dt_store_resolve(env, dt, dirname, fid); + if (!IS_ERR(dir)) { + file = dt_reg_open(env, dt, dir, + filename, fid); + lu_object_put(env, &dir->do_lu); + } else { + file = dir; + } + return file; +} +EXPORT_SYMBOL(dt_store_open); + +struct dt_object *dt_find_or_create(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object_format *dof, + struct lu_attr *at) +{ + struct dt_object *dto; + struct thandle *th; + int rc; + + ENTRY; + + dto = dt_locate(env, dt, fid); + if (IS_ERR(dto)) + RETURN(dto); + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + RETURN(dto); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(trans_stop, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(trans_stop, rc); + + dt_write_lock(env, dto, 0); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); + + CDEBUG(D_OTHER, "create new object "DFID"\n", PFID(fid)); + + rc = dt_create(env, dto, at, NULL, dof, th); + if (rc) + GOTO(unlock, rc); + LASSERT(dt_object_exists(dto)); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, dt, th); +out: + if (rc) { + lu_object_put(env, &dto->do_lu); + RETURN(ERR_PTR(rc)); + } + RETURN(dto); +} +EXPORT_SYMBOL(dt_find_or_create); + +/* dt class init function. */ +int dt_global_init(void) +{ + int result; + + LU_CONTEXT_KEY_INIT(&dt_key); + result = lu_context_key_register(&dt_key); + return result; +} + +void dt_global_fini(void) +{ + lu_context_key_degister(&dt_key); +} + +/** + * Generic read helper. May return an error for partial reads. 
+ * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval real size of data read + * \retval -ve errno on failure + */ +int dt_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + return dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA); +} +EXPORT_SYMBOL(dt_read); + +/** + * Read structures of fixed size from storage. Unlike dt_read(), using + * dt_record_read() will return an error for partial reads. + * + * \param env lustre environment + * \param dt object to be read + * \param buf lu_buf to be filled, with buffer pointer and length + * \param pos position to start reading, updated as data is read + * + * \retval 0 on successfully reading full buffer + * \retval -EFAULT on short read + * \retval -ve errno on failure + */ +int dt_record_read(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, loff_t *pos) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to read record\n"); + + rc = dt->do_body_ops->dbo_read(env, dt, buf, pos, BYPASS_CAPA); + + if (rc == buf->lb_len) + rc = 0; + else if (rc >= 0) + rc = -EFAULT; + return rc; +} +EXPORT_SYMBOL(dt_record_read); + +int dt_record_write(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, loff_t *pos, struct thandle *th) +{ + int rc; + + LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); + LASSERT(th != NULL); + LASSERT(dt->do_body_ops); + LASSERT(dt->do_body_ops->dbo_write); + rc = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, BYPASS_CAPA, 1); + if (rc == buf->lb_len) + rc = 0; + else if (rc >= 0) + rc = -EFAULT; + return rc; +} +EXPORT_SYMBOL(dt_record_write); + +int dt_declare_version_set(const struct lu_env *env, struct dt_object *o, + struct thandle *th) +{ + struct 
lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + + LASSERT(o); + vbuf.lb_buf = NULL; + vbuf.lb_len = sizeof(dt_obj_version_t); + return dt_declare_xattr_set(env, o, &vbuf, xname, 0, th); + +} +EXPORT_SYMBOL(dt_declare_version_set); + +void dt_version_set(const struct lu_env *env, struct dt_object *o, + dt_obj_version_t version, struct thandle *th) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + + rc = dt_xattr_set(env, o, &vbuf, xname, 0, th, BYPASS_CAPA); + if (rc < 0) + CDEBUG(D_INODE, "Can't set version, rc %d\n", rc); + return; +} +EXPORT_SYMBOL(dt_version_set); + +dt_obj_version_t dt_version_get(const struct lu_env *env, struct dt_object *o) +{ + struct lu_buf vbuf; + char *xname = XATTR_NAME_VERSION; + dt_obj_version_t version; + int rc; + + LASSERT(o); + vbuf.lb_buf = &version; + vbuf.lb_len = sizeof(version); + rc = dt_xattr_get(env, o, &vbuf, xname, BYPASS_CAPA); + if (rc != sizeof(version)) { + CDEBUG(D_INODE, "Can't get version, rc %d\n", rc); + version = 0; + } + return version; +} +EXPORT_SYMBOL(dt_version_get); + +/* list of all supported index types */ + +/* directories */ +const struct dt_index_features dt_directory_features; +EXPORT_SYMBOL(dt_directory_features); + +/* scrub iterator */ +const struct dt_index_features dt_otable_features; +EXPORT_SYMBOL(dt_otable_features); + +/* lfsck */ +const struct dt_index_features dt_lfsck_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(struct lu_fid), + .dif_keysize_max = sizeof(struct lu_fid), + .dif_recsize_min = sizeof(__u8), + .dif_recsize_max = sizeof(__u8), + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_lfsck_features); + +/* accounting indexes */ +const struct dt_index_features dt_acct_features = { + .dif_flags = DT_IND_UPDATE, + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct 
lquota_acct_rec), /* 16 bytes */ + .dif_recsize_max = sizeof(struct lquota_acct_rec), /* 16 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_acct_features); + +/* global quota files */ +const struct dt_index_features dt_quota_glb_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_recsize_max = sizeof(struct lquota_glb_rec), /* 32 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_quota_glb_features); + +/* slave quota files */ +const struct dt_index_features dt_quota_slv_features = { + .dif_flags = DT_IND_UPDATE, + /* a different key would have to be used for per-directory quota */ + .dif_keysize_min = sizeof(__u64), /* 64-bit uid/gid */ + .dif_keysize_max = sizeof(__u64), /* 64-bit uid/gid */ + .dif_recsize_min = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_recsize_max = sizeof(struct lquota_slv_rec), /* 8 bytes */ + .dif_ptrsize = 4 +}; +EXPORT_SYMBOL(dt_quota_slv_features); + +/* helper function returning what dt_index_features structure should be used + * based on the FID sequence. 
This is used by OBD_IDX_READ RPC */ +static inline const struct dt_index_features *dt_index_feat_select(__u64 seq, + __u32 mode) +{ + if (seq == FID_SEQ_QUOTA_GLB) { + /* global quota index */ + if (!S_ISREG(mode)) + /* global quota index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_glb_features; + } else if (seq == FID_SEQ_QUOTA) { + /* quota slave index */ + if (!S_ISREG(mode)) + /* slave index should be a regular file */ + return ERR_PTR(-ENOENT); + return &dt_quota_slv_features; + } else if (seq >= FID_SEQ_NORMAL) { + /* object is part of the namespace, verify that it is a + * directory */ + if (!S_ISDIR(mode)) + /* sorry, we can only deal with directory */ + return ERR_PTR(-ENOTDIR); + return &dt_directory_features; + } + + return ERR_PTR(-EOPNOTSUPP); +} + +/* + * Fill a lu_idxpage with key/record pairs read for transfer via OBD_IDX_READ + * RPC + * + * \param env - is the environment passed by the caller + * \param lp - is a pointer to the lu_page to fill + * \param nob - is the maximum number of bytes that should be copied + * \param iops - is the index operation vector associated with the index object + * \param it - is a pointer to the current iterator + * \param attr - is the index attribute to pass to iops->rec() + * \param arg - is a pointer to the idx_info structure + */ +static int dt_index_page_build(const struct lu_env *env, union lu_page *lp, + int nob, const struct dt_it_ops *iops, + struct dt_it *it, __u32 attr, void *arg) +{ + struct idx_info *ii = (struct idx_info *)arg; + struct lu_idxpage *lip = &lp->lp_idx; + char *entry; + int rc, size; + ENTRY; + + /* no support for variable key & record size for now */ + LASSERT((ii->ii_flags & II_FL_VARKEY) == 0); + LASSERT((ii->ii_flags & II_FL_VARREC) == 0); + + /* initialize the header of the new container */ + memset(lip, 0, LIP_HDR_SIZE); + lip->lip_magic = LIP_MAGIC; + nob -= LIP_HDR_SIZE; + + /* compute size needed to store a key/record pair */ + size = ii->ii_recsize 
+ ii->ii_keysize; + if ((ii->ii_flags & II_FL_NOHASH) == 0) + /* add hash if the client wants it */ + size += sizeof(__u64); + + entry = lip->lip_entries; + do { + char *tmp_entry = entry; + struct dt_key *key; + __u64 hash; + + /* fetch 64-bit hash value */ + hash = iops->store(env, it); + ii->ii_hash_end = hash; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_IDX_READ_BREAK)) { + if (lip->lip_nr != 0) + GOTO(out, rc = 0); + } + + if (nob < size) { + if (lip->lip_nr == 0) + GOTO(out, rc = -EINVAL); + GOTO(out, rc = 0); + } + + if ((ii->ii_flags & II_FL_NOHASH) == 0) { + /* client wants to the 64-bit hash value associated with + * each record */ + memcpy(tmp_entry, &hash, sizeof(hash)); + tmp_entry += sizeof(hash); + } + + /* then the key value */ + LASSERT(iops->key_size(env, it) == ii->ii_keysize); + key = iops->key(env, it); + memcpy(tmp_entry, key, ii->ii_keysize); + tmp_entry += ii->ii_keysize; + + /* and finally the record */ + rc = iops->rec(env, it, (struct dt_rec *)tmp_entry, attr); + if (rc != -ESTALE) { + if (rc != 0) + GOTO(out, rc); + + /* hash/key/record successfully copied! 
*/ + lip->lip_nr++; + if (unlikely(lip->lip_nr == 1 && ii->ii_count == 0)) + ii->ii_hash_start = hash; + entry = tmp_entry + ii->ii_recsize; + nob -= size; + } + + /* move on to the next record */ + do { + rc = iops->next(env, it); + } while (rc == -ESTALE); + + } while (rc == 0); + + GOTO(out, rc); +out: + if (rc >= 0 && lip->lip_nr > 0) + /* one more container */ + ii->ii_count++; + if (rc > 0) + /* no more entries */ + ii->ii_hash_end = II_END_OFF; + return rc; +} + +/* + * Walk index and fill lu_page containers with key/record pairs + * + * \param env - is the environment passed by the caller + * \param obj - is the index object to parse + * \param rdpg - is the lu_rdpg descriptor associated with the transfer + * \param filler - is the callback function responsible for filling a lu_page + * with key/record pairs in the format wanted by the caller + * \param arg - is an opaq argument passed to the filler function + * + * \retval sum (in bytes) of all filled lu_pages + * \retval -ve errno on failure + */ +int dt_index_walk(const struct lu_env *env, struct dt_object *obj, + const struct lu_rdpg *rdpg, dt_index_page_build_t filler, + void *arg) +{ + struct dt_it *it; + const struct dt_it_ops *iops; + unsigned int pageidx, nob, nlupgs = 0; + int rc; + ENTRY; + + LASSERT(rdpg->rp_pages != NULL); + LASSERT(obj->do_index_ops != NULL); + + nob = rdpg->rp_count; + if (nob <= 0) + RETURN(-EFAULT); + + /* Iterate through index and fill containers from @rdpg */ + iops = &obj->do_index_ops->dio_it; + LASSERT(iops != NULL); + it = iops->init(env, obj, rdpg->rp_attrs, BYPASS_CAPA); + if (IS_ERR(it)) + RETURN(PTR_ERR(it)); + + rc = iops->load(env, it, rdpg->rp_hash); + if (rc == 0) { + /* + * Iterator didn't find record with exactly the key requested. + * + * It is currently either + * + * - positioned above record with key less than + * requested---skip it. + * - or not positioned at all (is in IAM_IT_SKEWED + * state)---position it on the next item. 
+ */ + rc = iops->next(env, it); + } else if (rc > 0) { + rc = 0; + } + + /* Fill containers one after the other. There might be multiple + * containers per physical page. + * + * At this point and across for-loop: + * rc == 0 -> ok, proceed. + * rc > 0 -> end of index. + * rc < 0 -> error. */ + for (pageidx = 0; rc == 0 && nob > 0; pageidx++) { + union lu_page *lp; + int i; + + LASSERT(pageidx < rdpg->rp_npages); + lp = kmap(rdpg->rp_pages[pageidx]); + + /* fill lu pages */ + for (i = 0; i < LU_PAGE_COUNT; i++, lp++, nob -= LU_PAGE_SIZE) { + rc = filler(env, lp, min_t(int, nob, LU_PAGE_SIZE), + iops, it, rdpg->rp_attrs, arg); + if (rc < 0) + break; + /* one more lu_page */ + nlupgs++; + if (rc > 0) + /* end of index */ + break; + } + kunmap(rdpg->rp_pages[i]); + } + + iops->put(env, it); + iops->fini(env, it); + + if (rc >= 0) + rc = min_t(unsigned int, nlupgs * LU_PAGE_SIZE, rdpg->rp_count); + + RETURN(rc); +} +EXPORT_SYMBOL(dt_index_walk); + +/** + * Walk key/record pairs of an index and copy them into 4KB containers to be + * transferred over the network. This is the common handler for OBD_IDX_READ + * RPC processing. + * + * \param env - is the environment passed by the caller + * \param dev - is the dt_device storing the index + * \param ii - is the idx_info structure packed by the client in the + * OBD_IDX_READ request + * \param rdpg - is the lu_rdpg descriptor + * + * \retval on success, return sum (in bytes) of all filled containers + * \retval appropriate error otherwise. 
+ */ +int dt_index_read(const struct lu_env *env, struct dt_device *dev, + struct idx_info *ii, const struct lu_rdpg *rdpg) +{ + const struct dt_index_features *feat; + struct dt_object *obj; + int rc; + ENTRY; + + /* rp_count shouldn't be null and should be a multiple of the container + * size */ + if (rdpg->rp_count <= 0 && (rdpg->rp_count & (LU_PAGE_SIZE - 1)) != 0) + RETURN(-EFAULT); + + if (fid_seq(&ii->ii_fid) >= FID_SEQ_NORMAL) + /* we don't support directory transfer via OBD_IDX_READ for the + * time being */ + RETURN(-EOPNOTSUPP); + + if (!fid_is_quota(&ii->ii_fid)) + /* block access to all local files except quota files */ + RETURN(-EPERM); + + /* lookup index object subject to the transfer */ + obj = dt_locate(env, dev, &ii->ii_fid); + if (IS_ERR(obj)) + RETURN(PTR_ERR(obj)); + if (dt_object_exists(obj) == 0) + GOTO(out, rc = -ENOENT); + + /* fetch index features associated with index object */ + feat = dt_index_feat_select(fid_seq(&ii->ii_fid), + lu_object_attr(&obj->do_lu)); + if (IS_ERR(feat)) + GOTO(out, rc = PTR_ERR(feat)); + + /* load index feature if not done already */ + if (obj->do_index_ops == NULL) { + rc = obj->do_ops->do_index_try(env, obj, feat); + if (rc) + GOTO(out, rc); + } + + /* fill ii_flags with supported index features */ + ii->ii_flags &= II_FL_NOHASH; + + ii->ii_keysize = feat->dif_keysize_max; + if ((feat->dif_flags & DT_IND_VARKEY) != 0) { + /* key size is variable */ + ii->ii_flags |= II_FL_VARKEY; + /* we don't support variable key size for the time being */ + GOTO(out, rc = -EOPNOTSUPP); + } + + ii->ii_recsize = feat->dif_recsize_max; + if ((feat->dif_flags & DT_IND_VARREC) != 0) { + /* record size is variable */ + ii->ii_flags |= II_FL_VARREC; + /* we don't support variable record size for the time being */ + GOTO(out, rc = -EOPNOTSUPP); + } + + if ((feat->dif_flags & DT_IND_NONUNQ) != 0) + /* key isn't necessarily unique */ + ii->ii_flags |= II_FL_NONUNQ; + + dt_read_lock(env, obj, 0); + /* fetch object version before 
walking the index */ + ii->ii_version = dt_version_get(env, obj); + + /* walk the index and fill lu_idxpages with key/record pairs */ + rc = dt_index_walk(env, obj, rdpg, dt_index_page_build ,ii); + dt_read_unlock(env, obj); + + if (rc == 0) { + /* index is empty */ + LASSERT(ii->ii_count == 0); + ii->ii_hash_end = II_END_OFF; + } + + GOTO(out, rc); +out: + lu_object_put(env, &obj->do_lu); + return rc; +} +EXPORT_SYMBOL(dt_index_read); + +#ifdef LPROCFS + +int lprocfs_dt_rd_blksize(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + *eof = 1; + rc = snprintf(page, count, "%u\n", + (unsigned) osfs.os_bsize); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_blksize); + +int lprocfs_dt_rd_kbytestotal(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, LPU64"\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_kbytestotal); + +int lprocfs_dt_rd_kbytesfree(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, LPU64"\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_kbytesfree); + +int lprocfs_dt_rd_kbytesavail(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + __u32 blk_size = 
osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + *eof = 1; + rc = snprintf(page, count, LPU64"\n", result); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_kbytesavail); + +int lprocfs_dt_rd_filestotal(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + *eof = 1; + rc = snprintf(page, count, LPU64"\n", osfs.os_files); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_filestotal); + +int lprocfs_dt_rd_filesfree(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct dt_device *dt = data; + struct obd_statfs osfs; + + int rc = dt_statfs(NULL, dt, &osfs); + if (rc == 0) { + *eof = 1; + rc = snprintf(page, count, LPU64"\n", osfs.os_ffree); + } + + return rc; +} +EXPORT_SYMBOL(lprocfs_dt_rd_filesfree); + +#endif /* LPROCFS */ diff --git a/drivers/staging/lustre/lustre/obdclass/genops.c b/drivers/staging/lustre/lustre/obdclass/genops.c new file mode 100644 index 000000000000..d96876e0bc68 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/genops.c @@ -0,0 +1,1853 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/genops.c + * + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#include <obd_ost.h> +#include <obd_class.h> +#include <lprocfs_status.h> + +extern struct list_head obd_types; +spinlock_t obd_types_lock; + +struct kmem_cache *obd_device_cachep; +struct kmem_cache *obdo_cachep; +EXPORT_SYMBOL(obdo_cachep); +struct kmem_cache *import_cachep; + +struct list_head obd_zombie_imports; +struct list_head obd_zombie_exports; +spinlock_t obd_zombie_impexp_lock; +static void obd_zombie_impexp_notify(void); +static void obd_zombie_export_add(struct obd_export *exp); +static void obd_zombie_import_add(struct obd_import *imp); +static void print_export_data(struct obd_export *exp, + const char *status, int locks); + +int (*ptlrpc_put_connection_superhack)(struct ptlrpc_connection *c); +EXPORT_SYMBOL(ptlrpc_put_connection_superhack); + +/* + * support functions: we could use inter-module communication, but this + * is more portable to other OS's + */ +static struct obd_device *obd_device_alloc(void) +{ + struct obd_device *obd; + + OBD_SLAB_ALLOC_PTR_GFP(obd, obd_device_cachep, __GFP_IO); + if (obd != NULL) { + obd->obd_magic = OBD_DEVICE_MAGIC; + } + return obd; +} + +static 
void obd_device_free(struct obd_device *obd) +{ + LASSERT(obd != NULL); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + if (obd->obd_namespace != NULL) { + CERROR("obd %p: namespace %p was not properly cleaned up (obd_force=%d)!\n", + obd, obd->obd_namespace, obd->obd_force); + LBUG(); + } + lu_ref_fini(&obd->obd_reference); + OBD_SLAB_FREE_PTR(obd, obd_device_cachep); +} + +struct obd_type *class_search_type(const char *name) +{ + struct list_head *tmp; + struct obd_type *type; + + spin_lock(&obd_types_lock); + list_for_each(tmp, &obd_types) { + type = list_entry(tmp, struct obd_type, typ_chain); + if (strcmp(type->typ_name, name) == 0) { + spin_unlock(&obd_types_lock); + return type; + } + } + spin_unlock(&obd_types_lock); + return NULL; +} +EXPORT_SYMBOL(class_search_type); + +struct obd_type *class_get_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + + if (!type) { + const char *modname = name; + + if (strcmp(modname, "obdfilter") == 0) + modname = "ofd"; + + if (strcmp(modname, LUSTRE_LWP_NAME) == 0) + modname = LUSTRE_OSP_NAME; + + if (!strncmp(modname, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME))) + modname = LUSTRE_MDT_NAME; + + if (!request_module("%s", modname)) { + CDEBUG(D_INFO, "Loaded module '%s'\n", modname); + type = class_search_type(name); + } else { + LCONSOLE_ERROR_MSG(0x158, "Can't load module '%s'\n", + modname); + } + } + if (type) { + spin_lock(&type->obd_type_lock); + type->typ_refcnt++; + try_module_get(type->typ_dt_ops->o_owner); + spin_unlock(&type->obd_type_lock); + } + return type; +} +EXPORT_SYMBOL(class_get_type); + +void class_put_type(struct obd_type *type) +{ + LASSERT(type); + spin_lock(&type->obd_type_lock); + type->typ_refcnt--; + module_put(type->typ_dt_ops->o_owner); + spin_unlock(&type->obd_type_lock); +} +EXPORT_SYMBOL(class_put_type); + +#define CLASS_MAX_NAME 1024 + +int class_register_type(struct obd_ops *dt_ops, 
struct md_ops *md_ops, + struct lprocfs_vars *vars, const char *name, + struct lu_device_type *ldt) +{ + struct obd_type *type; + int rc = 0; + ENTRY; + + /* sanity check */ + LASSERT(strnlen(name, CLASS_MAX_NAME) < CLASS_MAX_NAME); + + if (class_search_type(name)) { + CDEBUG(D_IOCTL, "Type %s already registered\n", name); + RETURN(-EEXIST); + } + + rc = -ENOMEM; + OBD_ALLOC(type, sizeof(*type)); + if (type == NULL) + RETURN(rc); + + OBD_ALLOC_PTR(type->typ_dt_ops); + OBD_ALLOC_PTR(type->typ_md_ops); + OBD_ALLOC(type->typ_name, strlen(name) + 1); + + if (type->typ_dt_ops == NULL || + type->typ_md_ops == NULL || + type->typ_name == NULL) + GOTO (failed, rc); + + *(type->typ_dt_ops) = *dt_ops; + /* md_ops is optional */ + if (md_ops) + *(type->typ_md_ops) = *md_ops; + strcpy(type->typ_name, name); + spin_lock_init(&type->obd_type_lock); + +#ifdef LPROCFS + type->typ_procroot = lprocfs_register(type->typ_name, proc_lustre_root, + vars, type); + if (IS_ERR(type->typ_procroot)) { + rc = PTR_ERR(type->typ_procroot); + type->typ_procroot = NULL; + GOTO (failed, rc); + } +#endif + if (ldt != NULL) { + type->typ_lu = ldt; + rc = lu_device_type_init(ldt); + if (rc != 0) + GOTO (failed, rc); + } + + spin_lock(&obd_types_lock); + list_add(&type->typ_chain, &obd_types); + spin_unlock(&obd_types_lock); + + RETURN (0); + + failed: + if (type->typ_name != NULL) + OBD_FREE(type->typ_name, strlen(name) + 1); + if (type->typ_md_ops != NULL) + OBD_FREE_PTR(type->typ_md_ops); + if (type->typ_dt_ops != NULL) + OBD_FREE_PTR(type->typ_dt_ops); + OBD_FREE(type, sizeof(*type)); + RETURN(rc); +} +EXPORT_SYMBOL(class_register_type); + +int class_unregister_type(const char *name) +{ + struct obd_type *type = class_search_type(name); + ENTRY; + + if (!type) { + CERROR("unknown obd type\n"); + RETURN(-EINVAL); + } + + if (type->typ_refcnt) { + CERROR("type %s has refcount (%d)\n", name, type->typ_refcnt); + /* This is a bad situation, let's make the best of it */ + /* Remove ops, but leave the 
name for debugging */ + OBD_FREE_PTR(type->typ_dt_ops); + OBD_FREE_PTR(type->typ_md_ops); + RETURN(-EBUSY); + } + + if (type->typ_procroot) { + lprocfs_remove(&type->typ_procroot); + } + + if (type->typ_lu) + lu_device_type_fini(type->typ_lu); + + spin_lock(&obd_types_lock); + list_del(&type->typ_chain); + spin_unlock(&obd_types_lock); + OBD_FREE(type->typ_name, strlen(name) + 1); + if (type->typ_dt_ops != NULL) + OBD_FREE_PTR(type->typ_dt_ops); + if (type->typ_md_ops != NULL) + OBD_FREE_PTR(type->typ_md_ops); + OBD_FREE(type, sizeof(*type)); + RETURN(0); +} /* class_unregister_type */ +EXPORT_SYMBOL(class_unregister_type); + +/** + * Create a new obd device. + * + * Find an empty slot in ::obd_devs[], create a new obd device in it. + * + * \param[in] type_name obd device type string. + * \param[in] name obd device name. + * + * \retval NULL if create fails, otherwise return the obd device + * pointer created. + */ +struct obd_device *class_newdev(const char *type_name, const char *name) +{ + struct obd_device *result = NULL; + struct obd_device *newdev; + struct obd_type *type = NULL; + int i; + int new_obd_minor = 0; + ENTRY; + + if (strlen(name) >= MAX_OBD_NAME) { + CERROR("name/uuid must be < %u bytes long\n", MAX_OBD_NAME); + RETURN(ERR_PTR(-EINVAL)); + } + + type = class_get_type(type_name); + if (type == NULL){ + CERROR("OBD: unknown type: %s\n", type_name); + RETURN(ERR_PTR(-ENODEV)); + } + + newdev = obd_device_alloc(); + if (newdev == NULL) + GOTO(out_type, result = ERR_PTR(-ENOMEM)); + + LASSERT(newdev->obd_magic == OBD_DEVICE_MAGIC); + + write_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && (strcmp(name, obd->obd_name) == 0)) { + CERROR("Device %s already exists at %d, won't add\n", + name, i); + if (result) { + LASSERTF(result->obd_magic == OBD_DEVICE_MAGIC, + "%p obd_magic %08x != %08x\n", result, + result->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(result->obd_minor == 
new_obd_minor, + "%p obd_minor %d != %d\n", result, + result->obd_minor, new_obd_minor); + + obd_devs[result->obd_minor] = NULL; + result->obd_name[0]='\0'; + } + result = ERR_PTR(-EEXIST); + break; + } + if (!result && !obd) { + result = newdev; + result->obd_minor = i; + new_obd_minor = i; + result->obd_type = type; + strncpy(result->obd_name, name, + sizeof(result->obd_name) - 1); + obd_devs[i] = result; + } + } + write_unlock(&obd_dev_lock); + + if (result == NULL && i >= class_devno_max()) { + CERROR("all %u OBD devices used, increase MAX_OBD_DEVICES\n", + class_devno_max()); + GOTO(out, result = ERR_PTR(-EOVERFLOW)); + } + + if (IS_ERR(result)) + GOTO(out, result); + + CDEBUG(D_IOCTL, "Adding new device %s (%p)\n", + result->obd_name, result); + + RETURN(result); +out: + obd_device_free(newdev); +out_type: + class_put_type(type); + return result; +} + +void class_release_dev(struct obd_device *obd) +{ + struct obd_type *obd_type = obd->obd_type; + + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, "%p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd == obd_devs[obd->obd_minor], "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, obd_devs[obd->obd_minor]); + LASSERT(obd_type != NULL); + + CDEBUG(D_INFO, "Release obd device %s at %d obd_type name =%s\n", + obd->obd_name, obd->obd_minor, obd->obd_type->typ_name); + + write_lock(&obd_dev_lock); + obd_devs[obd->obd_minor] = NULL; + write_unlock(&obd_dev_lock); + obd_device_free(obd); + + class_put_type(obd_type); +} + +int class_name2dev(const char *name) +{ + int i; + + if (!name) + return -1; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && strcmp(name, obd->obd_name) == 0) { + /* Make sure we finished attaching before we give + out any references */ + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_attached) { + read_unlock(&obd_dev_lock); + return i; + } + break; + } + } + 
read_unlock(&obd_dev_lock); + + return -1; +} +EXPORT_SYMBOL(class_name2dev); + +struct obd_device *class_name2obd(const char *name) +{ + int dev = class_name2dev(name); + + if (dev < 0 || dev > class_devno_max()) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_name2obd); + +int class_uuid2dev(struct obd_uuid *uuid) +{ + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd && obd_uuid_equals(uuid, &obd->obd_uuid)) { + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + read_unlock(&obd_dev_lock); + return i; + } + } + read_unlock(&obd_dev_lock); + + return -1; +} +EXPORT_SYMBOL(class_uuid2dev); + +struct obd_device *class_uuid2obd(struct obd_uuid *uuid) +{ + int dev = class_uuid2dev(uuid); + if (dev < 0) + return NULL; + return class_num2obd(dev); +} +EXPORT_SYMBOL(class_uuid2obd); + +/** + * Get obd device from ::obd_devs[] + * + * \param num [in] array index + * + * \retval NULL if ::obd_devs[\a num] does not contains an obd device + * otherwise return the obd device there. + */ +struct obd_device *class_num2obd(int num) +{ + struct obd_device *obd = NULL; + + if (num < class_devno_max()) { + obd = obd_devs[num]; + if (obd == NULL) + return NULL; + + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "%p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(obd->obd_minor == num, + "%p obd_minor %0d != %0d\n", + obd, obd->obd_minor, num); + } + + return obd; +} +EXPORT_SYMBOL(class_num2obd); + +/** + * Get obd devices count. 
Device in any + * state are counted + * \retval obd device count + */ +int get_devices_count(void) +{ + int index, max_index = class_devno_max(), dev_count = 0; + + read_lock(&obd_dev_lock); + for (index = 0; index <= max_index; index++) { + struct obd_device *obd = class_num2obd(index); + if (obd != NULL) + dev_count++; + } + read_unlock(&obd_dev_lock); + + return dev_count; +} +EXPORT_SYMBOL(get_devices_count); + +void class_obd_list(void) +{ + char *status; + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + LCONSOLE(D_CONFIG, "%3d %s %s %s %s %d\n", + i, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); + } + read_unlock(&obd_dev_lock); + return; +} + +/* Search for a client OBD connected to tgt_uuid. If grp_uuid is + specified, then only the client with that uuid is returned, + otherwise any client connected to the tgt is returned. */ +struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid, + const char * typ_name, + struct obd_uuid *grp_uuid) +{ + int i; + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if ((strncmp(obd->obd_type->typ_name, typ_name, + strlen(typ_name)) == 0)) { + if (obd_uuid_equals(tgt_uuid, + &obd->u.cli.cl_target_uuid) && + ((grp_uuid)? obd_uuid_equals(grp_uuid, + &obd->obd_uuid) : 1)) { + read_unlock(&obd_dev_lock); + return obd; + } + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_find_client_obd); + +/* Iterate the obd_device list looking devices have grp_uuid. Start + searching at *next, and if a device is found, the next index to look + at is saved in *next. 
If next is NULL, then the first matching device + will always be returned. */ +struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, int *next) +{ + int i; + + if (next == NULL) + i = 0; + else if (*next >= 0 && *next < class_devno_max()) + i = *next; + else + return NULL; + + read_lock(&obd_dev_lock); + for (; i < class_devno_max(); i++) { + struct obd_device *obd = class_num2obd(i); + + if (obd == NULL) + continue; + if (obd_uuid_equals(grp_uuid, &obd->obd_uuid)) { + if (next != NULL) + *next = i+1; + read_unlock(&obd_dev_lock); + return obd; + } + } + read_unlock(&obd_dev_lock); + + return NULL; +} +EXPORT_SYMBOL(class_devices_in_group); + +/** + * to notify sptlrpc log for \a fsname has changed, let every relevant OBD + * adjust sptlrpc settings accordingly. + */ +int class_notify_sptlrpc_conf(const char *fsname, int namelen) +{ + struct obd_device *obd; + const char *type; + int i, rc = 0, rc2; + + LASSERT(namelen > 0); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + obd = class_num2obd(i); + + if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping) + continue; + + /* only notify mdc, osc, mdt, ost */ + type = obd->obd_type->typ_name; + if (strcmp(type, LUSTRE_MDC_NAME) != 0 && + strcmp(type, LUSTRE_OSC_NAME) != 0 && + strcmp(type, LUSTRE_MDT_NAME) != 0 && + strcmp(type, LUSTRE_OST_NAME) != 0) + continue; + + if (strncmp(obd->obd_name, fsname, namelen)) + continue; + + class_incref(obd, __FUNCTION__, obd); + read_unlock(&obd_dev_lock); + rc2 = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_SPTLRPC_CONF), + KEY_SPTLRPC_CONF, 0, NULL, NULL); + rc = rc ? 
rc : rc2; + class_decref(obd, __FUNCTION__, obd); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + return rc; +} +EXPORT_SYMBOL(class_notify_sptlrpc_conf); + +void obd_cleanup_caches(void) +{ + ENTRY; + if (obd_device_cachep) { + kmem_cache_destroy(obd_device_cachep); + obd_device_cachep = NULL; + } + if (obdo_cachep) { + kmem_cache_destroy(obdo_cachep); + obdo_cachep = NULL; + } + if (import_cachep) { + kmem_cache_destroy(import_cachep); + import_cachep = NULL; + } + if (capa_cachep) { + kmem_cache_destroy(capa_cachep); + capa_cachep = NULL; + } + EXIT; +} + +int obd_init_caches(void) +{ + ENTRY; + + LASSERT(obd_device_cachep == NULL); + obd_device_cachep = kmem_cache_create("ll_obd_dev_cache", + sizeof(struct obd_device), + 0, 0, NULL); + if (!obd_device_cachep) + GOTO(out, -ENOMEM); + + LASSERT(obdo_cachep == NULL); + obdo_cachep = kmem_cache_create("ll_obdo_cache", sizeof(struct obdo), + 0, 0, NULL); + if (!obdo_cachep) + GOTO(out, -ENOMEM); + + LASSERT(import_cachep == NULL); + import_cachep = kmem_cache_create("ll_import_cache", + sizeof(struct obd_import), + 0, 0, NULL); + if (!import_cachep) + GOTO(out, -ENOMEM); + + LASSERT(capa_cachep == NULL); + capa_cachep = kmem_cache_create("capa_cache", + sizeof(struct obd_capa), 0, 0, NULL); + if (!capa_cachep) + GOTO(out, -ENOMEM); + + RETURN(0); + out: + obd_cleanup_caches(); + RETURN(-ENOMEM); + +} + +/* map connection to client */ +struct obd_export *class_conn2export(struct lustre_handle *conn) +{ + struct obd_export *export; + ENTRY; + + if (!conn) { + CDEBUG(D_CACHE, "looking for null handle\n"); + RETURN(NULL); + } + + if (conn->cookie == -1) { /* this means assign a new connection */ + CDEBUG(D_CACHE, "want a new connection\n"); + RETURN(NULL); + } + + CDEBUG(D_INFO, "looking for export cookie "LPX64"\n", conn->cookie); + export = class_handle2object(conn->cookie); + RETURN(export); +} +EXPORT_SYMBOL(class_conn2export); + +struct obd_device *class_exp2obd(struct obd_export *exp) +{ + if (exp) 
+ return exp->exp_obd; + return NULL; +} +EXPORT_SYMBOL(class_exp2obd); + +struct obd_device *class_conn2obd(struct lustre_handle *conn) +{ + struct obd_export *export; + export = class_conn2export(conn); + if (export) { + struct obd_device *obd = export->exp_obd; + class_export_put(export); + return obd; + } + return NULL; +} +EXPORT_SYMBOL(class_conn2obd); + +struct obd_import *class_exp2cliimp(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} +EXPORT_SYMBOL(class_exp2cliimp); + +struct obd_import *class_conn2cliimp(struct lustre_handle *conn) +{ + struct obd_device *obd = class_conn2obd(conn); + if (obd == NULL) + return NULL; + return obd->u.cli.cl_import; +} +EXPORT_SYMBOL(class_conn2cliimp); + +/* Export management functions */ +static void class_export_destroy(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + ENTRY; + + LASSERT_ATOMIC_ZERO(&exp->exp_refcount); + LASSERT(obd != NULL); + + CDEBUG(D_IOCTL, "destroying export %p/%s for %s\n", exp, + exp->exp_client_uuid.uuid, obd->obd_name); + + /* "Local" exports (lctl, LOV->{mdc,osc}) have no connection. 
/*
 * Drop one reference on an export.
 *
 * The last put does not free the export inline: after releasing the
 * lprocfs nid-stat reference it queues the export on the zombie list
 * via obd_zombie_export_add(), and actual destruction happens later
 * (deferred teardown).
 *
 * \param exp  export to release; must be non-NULL with a live
 *             (0 < refcount < LI_POISON) reference count.
 */
void class_export_put(struct obd_export *exp)
{
	LASSERT(exp != NULL);
	LASSERT_ATOMIC_GT_LT(&exp->exp_refcount, 0, LI_POISON);
	CDEBUG(D_INFO, "PUTting export %p : new refcount %d\n", exp,
	       atomic_read(&exp->exp_refcount) - 1);

	if (atomic_dec_and_test(&exp->exp_refcount)) {
		/* the export must still be linked on one of the obd's
		 * export lists; the zombie thread unlinks it */
		LASSERT(!list_empty(&exp->exp_obd_chain));
		CDEBUG(D_IOCTL, "final put %p/%s\n",
		       exp, exp->exp_client_uuid.uuid);

		/* release nid stat reference */
		lprocfs_exp_cleanup(exp);

		/* hand the export to the zombie thread for destruction */
		obd_zombie_export_add(exp);
	}
}
EXPORT_SYMBOL(class_export_put);
*/ +struct obd_export *class_new_export(struct obd_device *obd, + struct obd_uuid *cluuid) +{ + struct obd_export *export; + cfs_hash_t *hash = NULL; + int rc = 0; + ENTRY; + + OBD_ALLOC_PTR(export); + if (!export) + return ERR_PTR(-ENOMEM); + + export->exp_conn_cnt = 0; + export->exp_lock_hash = NULL; + export->exp_flock_hash = NULL; + atomic_set(&export->exp_refcount, 2); + atomic_set(&export->exp_rpc_count, 0); + atomic_set(&export->exp_cb_count, 0); + atomic_set(&export->exp_locks_count, 0); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + INIT_LIST_HEAD(&export->exp_locks_list); + spin_lock_init(&export->exp_locks_list_guard); +#endif + atomic_set(&export->exp_replay_count, 0); + export->exp_obd = obd; + INIT_LIST_HEAD(&export->exp_outstanding_replies); + spin_lock_init(&export->exp_uncommitted_replies_lock); + INIT_LIST_HEAD(&export->exp_uncommitted_replies); + INIT_LIST_HEAD(&export->exp_req_replay_queue); + INIT_LIST_HEAD(&export->exp_handle.h_link); + INIT_LIST_HEAD(&export->exp_hp_rpcs); + class_handle_hash(&export->exp_handle, &export_handle_ops); + export->exp_last_request_time = cfs_time_current_sec(); + spin_lock_init(&export->exp_lock); + spin_lock_init(&export->exp_rpc_lock); + INIT_HLIST_NODE(&export->exp_uuid_hash); + INIT_HLIST_NODE(&export->exp_nid_hash); + spin_lock_init(&export->exp_bl_list_lock); + INIT_LIST_HEAD(&export->exp_bl_list); + + export->exp_sp_peer = LUSTRE_SP_ANY; + export->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; + export->exp_client_uuid = *cluuid; + obd_init_export(export); + + spin_lock(&obd->obd_dev_lock); + /* shouldn't happen, but might race */ + if (obd->obd_stopping) + GOTO(exit_unlock, rc = -ENODEV); + + hash = cfs_hash_getref(obd->obd_uuid_hash); + if (hash == NULL) + GOTO(exit_unlock, rc = -ENODEV); + spin_unlock(&obd->obd_dev_lock); + + if (!obd_uuid_equals(cluuid, &obd->obd_uuid)) { + rc = cfs_hash_add_unique(hash, cluuid, &export->exp_uuid_hash); + if (rc != 0) { + LCONSOLE_WARN("%s: denying duplicate export for %s, %d\n", + 
/*
 * Unlink an export from all lookup structures: the global handle table,
 * the obd's uuid hash, and the timed-export list.  The export itself is
 * moved to obd_unlinked_exports (still under obd_dev_lock) and the hash
 * table's reference is dropped, so only remaining callers keep it alive.
 */
void class_unlink_export(struct obd_export *exp)
{
	/* first make the handle unreachable so no new lookups succeed */
	class_handle_unhash(&exp->exp_handle);

	spin_lock(&exp->exp_obd->obd_dev_lock);
	/* delete an uuid-export hashitem from hashtables */
	if (!hlist_unhashed(&exp->exp_uuid_hash))
		cfs_hash_del(exp->exp_obd->obd_uuid_hash,
			     &exp->exp_client_uuid,
			     &exp->exp_uuid_hash);

	/* keep the export findable for debugging until its final put */
	list_move(&exp->exp_obd_chain, &exp->exp_obd->obd_unlinked_exports);
	list_del_init(&exp->exp_obd_chain_timed);
	exp->exp_obd->obd_num_exports--;
	spin_unlock(&exp->exp_obd->obd_dev_lock);
	/* drop the reference the hash table held */
	class_export_put(exp);
}
EXPORT_SYMBOL(class_unlink_export);
/*
 * Drop one reference on an import.  The final put queues the import on
 * the zombie list (obd_zombie_import_add()) for deferred destruction by
 * the zombie thread; it is not freed inline.
 *
 * \param imp  import to release; must not already be on the zombie
 *             list and must hold a live (0 < refcount < LI_POISON)
 *             reference count.
 */
void class_import_put(struct obd_import *imp)
{
	ENTRY;

	LASSERT(list_empty(&imp->imp_zombie_chain));
	LASSERT_ATOMIC_GT_LT(&imp->imp_refcount, 0, LI_POISON);

	CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", imp,
	       atomic_read(&imp->imp_refcount) - 1,
	       imp->imp_obd->obd_name);

	if (atomic_dec_and_test(&imp->imp_refcount)) {
		CDEBUG(D_INFO, "final put import %p\n", imp);
		obd_zombie_import_add(imp);
	}

	/* catch possible import put race */
	/* NOTE(review): this assert reads imp->imp_refcount after the
	 * final reference may have been handed to the zombie thread --
	 * it relies on zombie destruction not racing with this check;
	 * confirm against the zombie-thread implementation. */
	LASSERT_ATOMIC_GE_LT(&imp->imp_refcount, 0, LI_POISON);
	EXIT;
}
EXPORT_SYMBOL(class_import_put);
(But keep hist for proc histogram, worst_ever) */ + at_init(&at->iat_service_estimate[i], INITIAL_CONNECT_TIMEOUT, + AT_FLG_NOHIST); + } +} + +struct obd_import *class_new_import(struct obd_device *obd) +{ + struct obd_import *imp; + + OBD_ALLOC(imp, sizeof(*imp)); + if (imp == NULL) + return NULL; + + INIT_LIST_HEAD(&imp->imp_pinger_chain); + INIT_LIST_HEAD(&imp->imp_zombie_chain); + INIT_LIST_HEAD(&imp->imp_replay_list); + INIT_LIST_HEAD(&imp->imp_sending_list); + INIT_LIST_HEAD(&imp->imp_delayed_list); + spin_lock_init(&imp->imp_lock); + imp->imp_last_success_conn = 0; + imp->imp_state = LUSTRE_IMP_NEW; + imp->imp_obd = class_incref(obd, "import", imp); + mutex_init(&imp->imp_sec_mutex); + init_waitqueue_head(&imp->imp_recovery_waitq); + + atomic_set(&imp->imp_refcount, 2); + atomic_set(&imp->imp_unregistering, 0); + atomic_set(&imp->imp_inflight, 0); + atomic_set(&imp->imp_replay_inflight, 0); + atomic_set(&imp->imp_inval_count, 0); + INIT_LIST_HEAD(&imp->imp_conn_list); + INIT_LIST_HEAD(&imp->imp_handle.h_link); + class_handle_hash(&imp->imp_handle, &import_handle_ops); + init_imp_at(&imp->imp_at); + + /* the default magic is V2, will be used in connect RPC, and + * then adjusted according to the flags in request/reply. 
/*
 * Begin tearing down an import: make its handle unreachable, bump the
 * generation under imp_lock (presumably so in-flight users of the old
 * generation can detect staleness -- confirm against import callers),
 * then drop the caller's reference.  Actual destruction happens via
 * class_import_put() / the zombie path once the refcount reaches zero.
 */
void class_destroy_import(struct obd_import *import)
{
	LASSERT(import != NULL);
	LASSERT(import != LP_POISON);

	class_handle_unhash(&import->imp_handle);

	spin_lock(&import->imp_lock);
	import->imp_generation++;
	spin_unlock(&import->imp_lock);
	class_import_put(import);
}
EXPORT_SYMBOL(class_destroy_import);
*/ +int class_connect(struct lustre_handle *conn, struct obd_device *obd, + struct obd_uuid *cluuid) +{ + struct obd_export *export; + LASSERT(conn != NULL); + LASSERT(obd != NULL); + LASSERT(cluuid != NULL); + ENTRY; + + export = class_new_export(obd, cluuid); + if (IS_ERR(export)) + RETURN(PTR_ERR(export)); + + conn->cookie = export->exp_handle.h_cookie; + class_export_put(export); + + CDEBUG(D_IOCTL, "connect: client %s, cookie "LPX64"\n", + cluuid->uuid, conn->cookie); + RETURN(0); +} +EXPORT_SYMBOL(class_connect); + +/* if export is involved in recovery then clean up related things */ +void class_export_recovery_cleanup(struct obd_export *exp) +{ + struct obd_device *obd = exp->exp_obd; + + spin_lock(&obd->obd_recovery_task_lock); + if (exp->exp_delayed) + obd->obd_delayed_clients--; + if (obd->obd_recovering) { + if (exp->exp_in_recovery) { + spin_lock(&exp->exp_lock); + exp->exp_in_recovery = 0; + spin_unlock(&exp->exp_lock); + LASSERT_ATOMIC_POS(&obd->obd_connected_clients); + atomic_dec(&obd->obd_connected_clients); + } + + /* if called during recovery then should update + * obd_stale_clients counter, + * lightweight exports are not counted */ + if (exp->exp_failed && + (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) == 0) + exp->exp_obd->obd_stale_clients++; + } + spin_unlock(&obd->obd_recovery_task_lock); + /** Cleanup req replay fields */ + if (exp->exp_req_replay_needed) { + spin_lock(&exp->exp_lock); + exp->exp_req_replay_needed = 0; + spin_unlock(&exp->exp_lock); + LASSERT(atomic_read(&obd->obd_req_replay_clients)); + atomic_dec(&obd->obd_req_replay_clients); + } + /** Cleanup lock replay data */ + if (exp->exp_lock_replay_needed) { + spin_lock(&exp->exp_lock); + exp->exp_lock_replay_needed = 0; + spin_unlock(&exp->exp_lock); + LASSERT(atomic_read(&obd->obd_lock_replay_clients)); + atomic_dec(&obd->obd_lock_replay_clients); + } +} + +/* This function removes 1-3 references from the export: + * 1 - for export pointer passed + * and if disconnect 
really need + * 2 - removing from hash + * 3 - in client_unlink_export + * The export pointer passed to this function can destroyed */ +int class_disconnect(struct obd_export *export) +{ + int already_disconnected; + ENTRY; + + if (export == NULL) { + CWARN("attempting to free NULL export %p\n", export); + RETURN(-EINVAL); + } + + spin_lock(&export->exp_lock); + already_disconnected = export->exp_disconnected; + export->exp_disconnected = 1; + spin_unlock(&export->exp_lock); + + /* class_cleanup(), abort_recovery(), and class_fail_export() + * all end up in here, and if any of them race we shouldn't + * call extra class_export_puts(). */ + if (already_disconnected) { + LASSERT(hlist_unhashed(&export->exp_nid_hash)); + GOTO(no_disconn, already_disconnected); + } + + CDEBUG(D_IOCTL, "disconnect: cookie "LPX64"\n", + export->exp_handle.h_cookie); + + if (!hlist_unhashed(&export->exp_nid_hash)) + cfs_hash_del(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); + + class_export_recovery_cleanup(export); + class_unlink_export(export); +no_disconn: + class_export_put(export); + RETURN(0); +} +EXPORT_SYMBOL(class_disconnect); + +/* Return non-zero for a fully connected export */ +int class_connected_export(struct obd_export *exp) +{ + if (exp) { + int connected; + spin_lock(&exp->exp_lock); + connected = (exp->exp_conn_cnt > 0); + spin_unlock(&exp->exp_lock); + return connected; + } + return 0; +} +EXPORT_SYMBOL(class_connected_export); + +static void class_disconnect_export_list(struct list_head *list, + enum obd_option flags) +{ + int rc; + struct obd_export *exp; + ENTRY; + + /* It's possible that an export may disconnect itself, but + * nothing else will be added to this list. 
*/ + while (!list_empty(list)) { + exp = list_entry(list->next, struct obd_export, + exp_obd_chain); + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + spin_lock(&exp->exp_lock); + exp->exp_flags = flags; + spin_unlock(&exp->exp_lock); + + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) { + CDEBUG(D_HA, + "exp %p export uuid == obd uuid, don't discon\n", + exp); + /* Need to delete this now so we don't end up pointing + * to work_list later when this export is cleaned up. */ + list_del_init(&exp->exp_obd_chain); + class_export_put(exp); + continue; + } + + class_export_get(exp); + CDEBUG(D_HA, "%s: disconnecting export at %s (%p), " + "last request at "CFS_TIME_T"\n", + exp->exp_obd->obd_name, obd_export_nid2str(exp), + exp, exp->exp_last_request_time); + /* release one export reference anyway */ + rc = obd_disconnect(exp); + + CDEBUG(D_HA, "disconnected export at %s (%p): rc %d\n", + obd_export_nid2str(exp), exp, rc); + class_export_put(exp); + } + EXIT; +} + +void class_disconnect_exports(struct obd_device *obd) +{ + struct list_head work_list; + ENTRY; + + /* Move all of the exports from obd_exports to a work list, en masse. */ + INIT_LIST_HEAD(&work_list); + spin_lock(&obd->obd_dev_lock); + list_splice_init(&obd->obd_exports, &work_list); + list_splice_init(&obd->obd_delayed_exports, &work_list); + spin_unlock(&obd->obd_dev_lock); + + if (!list_empty(&work_list)) { + CDEBUG(D_HA, "OBD device %d (%p) has exports, " + "disconnecting them\n", obd->obd_minor, obd); + class_disconnect_export_list(&work_list, + exp_flags_from_obd(obd)); + } else + CDEBUG(D_HA, "OBD device %d (%p) has no exports\n", + obd->obd_minor, obd); + EXIT; +} +EXPORT_SYMBOL(class_disconnect_exports); + +/* Remove exports that have not completed recovery. 
+ */ +void class_disconnect_stale_exports(struct obd_device *obd, + int (*test_export)(struct obd_export *)) +{ + struct list_head work_list; + struct obd_export *exp, *n; + int evicted = 0; + ENTRY; + + INIT_LIST_HEAD(&work_list); + spin_lock(&obd->obd_dev_lock); + list_for_each_entry_safe(exp, n, &obd->obd_exports, + exp_obd_chain) { + /* don't count self-export as client */ + if (obd_uuid_equals(&exp->exp_client_uuid, + &exp->exp_obd->obd_uuid)) + continue; + + /* don't evict clients which have no slot in last_rcvd + * (e.g. lightweight connection) */ + if (exp->exp_target_data.ted_lr_idx == -1) + continue; + + spin_lock(&exp->exp_lock); + if (exp->exp_failed || test_export(exp)) { + spin_unlock(&exp->exp_lock); + continue; + } + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + list_move(&exp->exp_obd_chain, &work_list); + evicted++; + CDEBUG(D_HA, "%s: disconnect stale client %s@%s\n", + obd->obd_name, exp->exp_client_uuid.uuid, + exp->exp_connection == NULL ? "<unknown>" : + libcfs_nid2str(exp->exp_connection->c_peer.nid)); + print_export_data(exp, "EVICTING", 0); + } + spin_unlock(&obd->obd_dev_lock); + + if (evicted) + LCONSOLE_WARN("%s: disconnecting %d stale clients\n", + obd->obd_name, evicted); + + class_disconnect_export_list(&work_list, exp_flags_from_obd(obd) | + OBD_OPT_ABORT_RECOV); + EXIT; +} +EXPORT_SYMBOL(class_disconnect_stale_exports); + +void class_fail_export(struct obd_export *exp) +{ + int rc, already_failed; + + spin_lock(&exp->exp_lock); + already_failed = exp->exp_failed; + exp->exp_failed = 1; + spin_unlock(&exp->exp_lock); + + if (already_failed) { + CDEBUG(D_HA, "disconnecting dead export %p/%s; skipping\n", + exp, exp->exp_client_uuid.uuid); + return; + } + + CDEBUG(D_HA, "disconnecting export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + + if (obd_dump_on_timeout) + libcfs_debug_dumplog(); + + /* need for safe call CDEBUG after obd_disconnect */ + class_export_get(exp); + + /* Most callers into obd_disconnect are removing 
their own reference + * (request, for example) in addition to the one from the hash table. + * We don't have such a reference here, so make one. */ + class_export_get(exp); + rc = obd_disconnect(exp); + if (rc) + CERROR("disconnecting export %p failed: %d\n", exp, rc); + else + CDEBUG(D_HA, "disconnected export %p/%s\n", + exp, exp->exp_client_uuid.uuid); + class_export_put(exp); +} +EXPORT_SYMBOL(class_fail_export); + +char *obd_export_nid2str(struct obd_export *exp) +{ + if (exp->exp_connection != NULL) + return libcfs_nid2str(exp->exp_connection->c_peer.nid); + + return "(no nid)"; +} +EXPORT_SYMBOL(obd_export_nid2str); + +int obd_export_evict_by_nid(struct obd_device *obd, const char *nid) +{ + cfs_hash_t *nid_hash; + struct obd_export *doomed_exp = NULL; + int exports_evicted = 0; + + lnet_nid_t nid_key = libcfs_str2nid((char *)nid); + + spin_lock(&obd->obd_dev_lock); + /* umount has run already, so evict thread should leave + * its task to umount thread now */ + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + nid_hash = obd->obd_nid_hash; + cfs_hash_getref(nid_hash); + spin_unlock(&obd->obd_dev_lock); + + do { + doomed_exp = cfs_hash_lookup(nid_hash, &nid_key); + if (doomed_exp == NULL) + break; + + LASSERTF(doomed_exp->exp_connection->c_peer.nid == nid_key, + "nid %s found, wanted nid %s, requested nid %s\n", + obd_export_nid2str(doomed_exp), + libcfs_nid2str(nid_key), nid); + LASSERTF(doomed_exp != obd->obd_self_export, + "self-export is hashed by NID?\n"); + exports_evicted++; + LCONSOLE_WARN("%s: evicting %s (at %s) by administrative " + "request\n", obd->obd_name, + obd_uuid2str(&doomed_exp->exp_client_uuid), + obd_export_nid2str(doomed_exp)); + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + } while (1); + + cfs_hash_putref(nid_hash); + + if (!exports_evicted) + CDEBUG(D_HA,"%s: can't disconnect NID '%s': no exports found\n", + obd->obd_name, nid); + return exports_evicted; +} 
+EXPORT_SYMBOL(obd_export_evict_by_nid); + +int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid) +{ + cfs_hash_t *uuid_hash; + struct obd_export *doomed_exp = NULL; + struct obd_uuid doomed_uuid; + int exports_evicted = 0; + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + return exports_evicted; + } + uuid_hash = obd->obd_uuid_hash; + cfs_hash_getref(uuid_hash); + spin_unlock(&obd->obd_dev_lock); + + obd_str2uuid(&doomed_uuid, uuid); + if (obd_uuid_equals(&doomed_uuid, &obd->obd_uuid)) { + CERROR("%s: can't evict myself\n", obd->obd_name); + cfs_hash_putref(uuid_hash); + return exports_evicted; + } + + doomed_exp = cfs_hash_lookup(uuid_hash, &doomed_uuid); + + if (doomed_exp == NULL) { + CERROR("%s: can't disconnect %s: no exports found\n", + obd->obd_name, uuid); + } else { + CWARN("%s: evicting %s at adminstrative request\n", + obd->obd_name, doomed_exp->exp_client_uuid.uuid); + class_fail_export(doomed_exp); + class_export_put(doomed_exp); + exports_evicted++; + } + cfs_hash_putref(uuid_hash); + + return exports_evicted; +} +EXPORT_SYMBOL(obd_export_evict_by_uuid); + +#if LUSTRE_TRACKS_LOCK_EXP_REFS +void (*class_export_dump_hook)(struct obd_export*) = NULL; +EXPORT_SYMBOL(class_export_dump_hook); +#endif + +static void print_export_data(struct obd_export *exp, const char *status, + int locks) +{ + struct ptlrpc_reply_state *rs; + struct ptlrpc_reply_state *first_reply = NULL; + int nreplies = 0; + + spin_lock(&exp->exp_lock); + list_for_each_entry(rs, &exp->exp_outstanding_replies, + rs_exp_list) { + if (nreplies == 0) + first_reply = rs; + nreplies++; + } + spin_unlock(&exp->exp_lock); + + CDEBUG(D_HA, "%s: %s %p %s %s %d (%d %d %d) %d %d %d %d: %p %s "LPU64"\n", + exp->exp_obd->obd_name, status, exp, exp->exp_client_uuid.uuid, + obd_export_nid2str(exp), atomic_read(&exp->exp_refcount), + atomic_read(&exp->exp_rpc_count), + atomic_read(&exp->exp_cb_count), + atomic_read(&exp->exp_locks_count), 
+ exp->exp_disconnected, exp->exp_delayed, exp->exp_failed, + nreplies, first_reply, nreplies > 3 ? "..." : "", + exp->exp_last_committed); +#if LUSTRE_TRACKS_LOCK_EXP_REFS + if (locks && class_export_dump_hook != NULL) + class_export_dump_hook(exp); +#endif +} + +void dump_exports(struct obd_device *obd, int locks) +{ + struct obd_export *exp; + + spin_lock(&obd->obd_dev_lock); + list_for_each_entry(exp, &obd->obd_exports, exp_obd_chain) + print_export_data(exp, "ACTIVE", locks); + list_for_each_entry(exp, &obd->obd_unlinked_exports, exp_obd_chain) + print_export_data(exp, "UNLINKED", locks); + list_for_each_entry(exp, &obd->obd_delayed_exports, exp_obd_chain) + print_export_data(exp, "DELAYED", locks); + spin_unlock(&obd->obd_dev_lock); + spin_lock(&obd_zombie_impexp_lock); + list_for_each_entry(exp, &obd_zombie_exports, exp_obd_chain) + print_export_data(exp, "ZOMBIE", locks); + spin_unlock(&obd_zombie_impexp_lock); +} +EXPORT_SYMBOL(dump_exports); + +void obd_exports_barrier(struct obd_device *obd) +{ + int waited = 2; + LASSERT(list_empty(&obd->obd_exports)); + spin_lock(&obd->obd_dev_lock); + while (!list_empty(&obd->obd_unlinked_exports)) { + spin_unlock(&obd->obd_dev_lock); + schedule_timeout_and_set_state(TASK_UNINTERRUPTIBLE, + cfs_time_seconds(waited)); + if (waited > 5 && IS_PO2(waited)) { + LCONSOLE_WARN("%s is waiting for obd_unlinked_exports " + "more than %d seconds. " + "The obd refcount = %d. 
Is it stuck?\n", + obd->obd_name, waited, + atomic_read(&obd->obd_refcount)); + dump_exports(obd, 1); + } + waited *= 2; + spin_lock(&obd->obd_dev_lock); + } + spin_unlock(&obd->obd_dev_lock); +} +EXPORT_SYMBOL(obd_exports_barrier); + +/* Total amount of zombies to be destroyed */ +static int zombies_count = 0; + +/** + * kill zombie imports and exports + */ +void obd_zombie_impexp_cull(void) +{ + struct obd_import *import; + struct obd_export *export; + ENTRY; + + do { + spin_lock(&obd_zombie_impexp_lock); + + import = NULL; + if (!list_empty(&obd_zombie_imports)) { + import = list_entry(obd_zombie_imports.next, + struct obd_import, + imp_zombie_chain); + list_del_init(&import->imp_zombie_chain); + } + + export = NULL; + if (!list_empty(&obd_zombie_exports)) { + export = list_entry(obd_zombie_exports.next, + struct obd_export, + exp_obd_chain); + list_del_init(&export->exp_obd_chain); + } + + spin_unlock(&obd_zombie_impexp_lock); + + if (import != NULL) { + class_import_destroy(import); + spin_lock(&obd_zombie_impexp_lock); + zombies_count--; + spin_unlock(&obd_zombie_impexp_lock); + } + + if (export != NULL) { + class_export_destroy(export); + spin_lock(&obd_zombie_impexp_lock); + zombies_count--; + spin_unlock(&obd_zombie_impexp_lock); + } + + cond_resched(); + } while (import != NULL || export != NULL); + EXIT; +} + +static struct completion obd_zombie_start; +static struct completion obd_zombie_stop; +static unsigned long obd_zombie_flags; +static wait_queue_head_t obd_zombie_waitq; +static pid_t obd_zombie_pid; + +enum { + OBD_ZOMBIE_STOP = 0x0001, +}; + +/** + * check for work for kill zombie import/export thread. + */ +static int obd_zombie_impexp_check(void *arg) +{ + int rc; + + spin_lock(&obd_zombie_impexp_lock); + rc = (zombies_count == 0) && + !test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + spin_unlock(&obd_zombie_impexp_lock); + + RETURN(rc); +} + +/** + * Add export to the obd_zombe thread and notify it. 
+ */ +static void obd_zombie_export_add(struct obd_export *exp) { + spin_lock(&exp->exp_obd->obd_dev_lock); + LASSERT(!list_empty(&exp->exp_obd_chain)); + list_del_init(&exp->exp_obd_chain); + spin_unlock(&exp->exp_obd->obd_dev_lock); + spin_lock(&obd_zombie_impexp_lock); + zombies_count++; + list_add(&exp->exp_obd_chain, &obd_zombie_exports); + spin_unlock(&obd_zombie_impexp_lock); + + obd_zombie_impexp_notify(); +} + +/** + * Add import to the obd_zombe thread and notify it. + */ +static void obd_zombie_import_add(struct obd_import *imp) { + LASSERT(imp->imp_sec == NULL); + LASSERT(imp->imp_rq_pool == NULL); + spin_lock(&obd_zombie_impexp_lock); + LASSERT(list_empty(&imp->imp_zombie_chain)); + zombies_count++; + list_add(&imp->imp_zombie_chain, &obd_zombie_imports); + spin_unlock(&obd_zombie_impexp_lock); + + obd_zombie_impexp_notify(); +} + +/** + * notify import/export destroy thread about new zombie. + */ +static void obd_zombie_impexp_notify(void) +{ + /* + * Make sure obd_zomebie_impexp_thread get this notification. + * It is possible this signal only get by obd_zombie_barrier, and + * barrier gulps this notification and sleeps away and hangs ensues + */ + wake_up_all(&obd_zombie_waitq); +} + +/** + * check whether obd_zombie is idle + */ +static int obd_zombie_is_idle(void) +{ + int rc; + + LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)); + spin_lock(&obd_zombie_impexp_lock); + rc = (zombies_count == 0); + spin_unlock(&obd_zombie_impexp_lock); + return rc; +} + +/** + * wait when obd_zombie import/export queues become empty + */ +void obd_zombie_barrier(void) +{ + struct l_wait_info lwi = { 0 }; + + if (obd_zombie_pid == current_pid()) + /* don't wait for myself */ + return; + l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi); +} +EXPORT_SYMBOL(obd_zombie_barrier); + + +/** + * destroy zombie export/import thread. 
+ */ +static int obd_zombie_impexp_thread(void *unused) +{ + unshare_fs_struct(); + complete(&obd_zombie_start); + + obd_zombie_pid = current_pid(); + + while (!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags)) { + struct l_wait_info lwi = { 0 }; + + l_wait_event(obd_zombie_waitq, + !obd_zombie_impexp_check(NULL), &lwi); + obd_zombie_impexp_cull(); + + /* + * Notify obd_zombie_barrier callers that queues + * may be empty. + */ + wake_up(&obd_zombie_waitq); + } + + complete(&obd_zombie_stop); + + RETURN(0); +} + + +/** + * start destroy zombie import/export thread + */ +int obd_zombie_impexp_init(void) +{ + task_t *task; + + INIT_LIST_HEAD(&obd_zombie_imports); + INIT_LIST_HEAD(&obd_zombie_exports); + spin_lock_init(&obd_zombie_impexp_lock); + init_completion(&obd_zombie_start); + init_completion(&obd_zombie_stop); + init_waitqueue_head(&obd_zombie_waitq); + obd_zombie_pid = 0; + + task = kthread_run(obd_zombie_impexp_thread, NULL, "obd_zombid"); + if (IS_ERR(task)) + RETURN(PTR_ERR(task)); + + wait_for_completion(&obd_zombie_start); + RETURN(0); +} +/** + * stop destroy zombie import/export thread + */ +void obd_zombie_impexp_stop(void) +{ + set_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags); + obd_zombie_impexp_notify(); + wait_for_completion(&obd_zombie_stop); +} + +/***** Kernel-userspace comm helpers *******/ + +/* Get length of entire message, including header */ +int kuc_len(int payload_len) +{ + return sizeof(struct kuc_hdr) + payload_len; +} +EXPORT_SYMBOL(kuc_len); + +/* Get a pointer to kuc header, given a ptr to the payload + * @param p Pointer to payload area + * @returns Pointer to kuc header + */ +struct kuc_hdr * kuc_ptr(void *p) +{ + struct kuc_hdr *lh = ((struct kuc_hdr *)p) - 1; + LASSERT(lh->kuc_magic == KUC_MAGIC); + return lh; +} +EXPORT_SYMBOL(kuc_ptr); + +/* Test if payload is part of kuc message + * @param p Pointer to payload area + * @returns boolean + */ +int kuc_ispayload(void *p) +{ + struct kuc_hdr *kh = ((struct kuc_hdr *)p) - 1; + + if 
(kh->kuc_magic == KUC_MAGIC) + return 1; + else + return 0; +} +EXPORT_SYMBOL(kuc_ispayload); + +/* Alloc space for a message, and fill in header + * @return Pointer to payload area + */ +void *kuc_alloc(int payload_len, int transport, int type) +{ + struct kuc_hdr *lh; + int len = kuc_len(payload_len); + + OBD_ALLOC(lh, len); + if (lh == NULL) + return ERR_PTR(-ENOMEM); + + lh->kuc_magic = KUC_MAGIC; + lh->kuc_transport = transport; + lh->kuc_msgtype = type; + lh->kuc_msglen = len; + + return (void *)(lh + 1); +} +EXPORT_SYMBOL(kuc_alloc); + +/* Takes pointer to payload area */ +inline void kuc_free(void *p, int payload_len) +{ + struct kuc_hdr *lh = kuc_ptr(p); + OBD_FREE(lh, kuc_len(payload_len)); +} +EXPORT_SYMBOL(kuc_free); diff --git a/drivers/staging/lustre/lustre/obdclass/idmap.c b/drivers/staging/lustre/lustre/obdclass/idmap.c new file mode 100644 index 000000000000..622f8d165275 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/idmap.c @@ -0,0 +1,474 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/idmap.c + * + * Lustre user identity mapping. + * + * Author: Fan Yong <fanyong@clusterfs.com> + */ + +#define DEBUG_SUBSYSTEM S_SEC + +#include <lustre_idmap.h> +#include <md_object.h> +#include <obd_support.h> + +#define lustre_get_group_info(group_info) do { \ + atomic_inc(&(group_info)->usage); \ +} while (0) + +#define lustre_put_group_info(group_info) do { \ + if (atomic_dec_and_test(&(group_info)->usage)) \ + groups_free(group_info); \ +} while (0) + +/* + * groups_search() is copied from linux kernel! + * A simple bsearch. + */ +static int lustre_groups_search(group_info_t *group_info, + gid_t grp) +{ + int left, right; + + if (!group_info) + return 0; + + left = 0; + right = group_info->ngroups; + while (left < right) { + int mid = (left + right) / 2; + int cmp = grp - CFS_GROUP_AT(group_info, mid); + + if (cmp > 0) + left = mid + 1; + else if (cmp < 0) + right = mid; + else + return 1; + } + return 0; +} + +void lustre_groups_from_list(group_info_t *ginfo, gid_t *glist) +{ + int i; + int count = ginfo->ngroups; + + /* fill group_info from gid array */ + for (i = 0; i < ginfo->nblocks && count > 0; i++) { + int cp_count = min(CFS_NGROUPS_PER_BLOCK, count); + int off = i * CFS_NGROUPS_PER_BLOCK; + int len = cp_count * sizeof(*glist); + + memcpy(ginfo->blocks[i], glist + off, len); + count -= cp_count; + } +} +EXPORT_SYMBOL(lustre_groups_from_list); + +/* groups_sort() is copied from linux kernel! 
*/ +/* a simple shell-metzner sort */ +void lustre_groups_sort(group_info_t *group_info) +{ + int base, max, stride; + int gidsetsize = group_info->ngroups; + + for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) + ; /* nothing */ + stride /= 3; + + while (stride) { + max = gidsetsize - stride; + for (base = 0; base < max; base++) { + int left = base; + int right = left + stride; + gid_t tmp = CFS_GROUP_AT(group_info, right); + + while (left >= 0 && + CFS_GROUP_AT(group_info, left) > tmp) { + CFS_GROUP_AT(group_info, right) = + CFS_GROUP_AT(group_info, left); + right = left; + left -= stride; + } + CFS_GROUP_AT(group_info, right) = tmp; + } + stride /= 3; + } +} +EXPORT_SYMBOL(lustre_groups_sort); + +int lustre_in_group_p(struct lu_ucred *mu, gid_t grp) +{ + int rc = 1; + + if (grp != mu->uc_fsgid) { + group_info_t *group_info = NULL; + + if (mu->uc_ginfo || !mu->uc_identity || + mu->uc_valid == UCRED_OLD) + if (grp == mu->uc_suppgids[0] || + grp == mu->uc_suppgids[1]) + return 1; + + if (mu->uc_ginfo) + group_info = mu->uc_ginfo; + else if (mu->uc_identity) + group_info = mu->uc_identity->mi_ginfo; + + if (!group_info) + return 0; + + lustre_get_group_info(group_info); + rc = lustre_groups_search(group_info, grp); + lustre_put_group_info(group_info); + } + return rc; +} +EXPORT_SYMBOL(lustre_in_group_p); + +struct lustre_idmap_entry { + struct list_head lie_rmt_uid_hash; /* hashed as lie_rmt_uid; */ + struct list_head lie_lcl_uid_hash; /* hashed as lie_lcl_uid; */ + struct list_head lie_rmt_gid_hash; /* hashed as lie_rmt_gid; */ + struct list_head lie_lcl_gid_hash; /* hashed as lie_lcl_gid; */ + uid_t lie_rmt_uid; /* remote uid */ + uid_t lie_lcl_uid; /* local uid */ + gid_t lie_rmt_gid; /* remote gid */ + gid_t lie_lcl_gid; /* local gid */ +}; + +static inline __u32 lustre_idmap_hashfunc(__u32 id) +{ + return id & (CFS_IDMAP_HASHSIZE - 1); +} + +static +struct lustre_idmap_entry *idmap_entry_alloc(uid_t rmt_uid, uid_t lcl_uid, + gid_t rmt_gid, gid_t 
lcl_gid) +{ + struct lustre_idmap_entry *e; + + OBD_ALLOC_PTR(e); + if (e == NULL) + return NULL; + + INIT_LIST_HEAD(&e->lie_rmt_uid_hash); + INIT_LIST_HEAD(&e->lie_lcl_uid_hash); + INIT_LIST_HEAD(&e->lie_rmt_gid_hash); + INIT_LIST_HEAD(&e->lie_lcl_gid_hash); + e->lie_rmt_uid = rmt_uid; + e->lie_lcl_uid = lcl_uid; + e->lie_rmt_gid = rmt_gid; + e->lie_lcl_gid = lcl_gid; + + return e; +} + +static void idmap_entry_free(struct lustre_idmap_entry *e) +{ + if (!list_empty(&e->lie_rmt_uid_hash)) + list_del(&e->lie_rmt_uid_hash); + if (!list_empty(&e->lie_lcl_uid_hash)) + list_del(&e->lie_lcl_uid_hash); + if (!list_empty(&e->lie_rmt_gid_hash)) + list_del(&e->lie_rmt_gid_hash); + if (!list_empty(&e->lie_lcl_gid_hash)) + list_del(&e->lie_lcl_gid_hash); + OBD_FREE_PTR(e); +} + +/* + * return value + * NULL: not found entry + * ERR_PTR(-EACCES): found 1(remote):N(local) mapped entry + * others: found normal entry + */ +static +struct lustre_idmap_entry *idmap_search_entry(struct lustre_idmap_table *t, + uid_t rmt_uid, uid_t lcl_uid, + gid_t rmt_gid, gid_t lcl_gid) +{ + struct list_head *head; + struct lustre_idmap_entry *e; + + head = &t->lit_idmaps[RMT_UIDMAP_IDX][lustre_idmap_hashfunc(rmt_uid)]; + list_for_each_entry(e, head, lie_rmt_uid_hash) + if (e->lie_rmt_uid == rmt_uid) { + if (e->lie_lcl_uid == lcl_uid) { + if (e->lie_rmt_gid == rmt_gid && + e->lie_lcl_gid == lcl_gid) + /* must be quaternion match */ + return e; + } else { + /* 1:N uid mapping */ + CERROR("rmt uid %u already be mapped to %u" + " (new %u)\n", e->lie_rmt_uid, + e->lie_lcl_uid, lcl_uid); + return ERR_PTR(-EACCES); + } + } + + head = &t->lit_idmaps[RMT_GIDMAP_IDX][lustre_idmap_hashfunc(rmt_gid)]; + list_for_each_entry(e, head, lie_rmt_gid_hash) + if (e->lie_rmt_gid == rmt_gid) { + if (e->lie_lcl_gid == lcl_gid) { + if (unlikely(e->lie_rmt_uid == rmt_uid && + e->lie_lcl_uid == lcl_uid)) + /* after uid mapping search above, + * we should never come here */ + LBUG(); + } else { + /* 1:N gid mapping */ + 
CERROR("rmt gid %u already be mapped to %u" + " (new %u)\n", e->lie_rmt_gid, + e->lie_lcl_gid, lcl_gid); + return ERR_PTR(-EACCES); + } + } + + return NULL; +} + +static __u32 idmap_lookup_uid(struct list_head *hash, int reverse, + __u32 uid) +{ + struct list_head *head = &hash[lustre_idmap_hashfunc(uid)]; + struct lustre_idmap_entry *e; + + if (!reverse) { + list_for_each_entry(e, head, lie_rmt_uid_hash) + if (e->lie_rmt_uid == uid) + return e->lie_lcl_uid; + } else { + list_for_each_entry(e, head, lie_lcl_uid_hash) + if (e->lie_lcl_uid == uid) + return e->lie_rmt_uid; + } + + return CFS_IDMAP_NOTFOUND; +} + +static __u32 idmap_lookup_gid(struct list_head *hash, int reverse, __u32 gid) +{ + struct list_head *head = &hash[lustre_idmap_hashfunc(gid)]; + struct lustre_idmap_entry *e; + + if (!reverse) { + list_for_each_entry(e, head, lie_rmt_gid_hash) + if (e->lie_rmt_gid == gid) + return e->lie_lcl_gid; + } else { + list_for_each_entry(e, head, lie_lcl_gid_hash) + if (e->lie_lcl_gid == gid) + return e->lie_rmt_gid; + } + + return CFS_IDMAP_NOTFOUND; +} + +int lustre_idmap_add(struct lustre_idmap_table *t, + uid_t ruid, uid_t luid, + gid_t rgid, gid_t lgid) +{ + struct lustre_idmap_entry *e0, *e1; + + LASSERT(t); + + spin_lock(&t->lit_lock); + e0 = idmap_search_entry(t, ruid, luid, rgid, lgid); + spin_unlock(&t->lit_lock); + if (!e0) { + e0 = idmap_entry_alloc(ruid, luid, rgid, lgid); + if (!e0) + return -ENOMEM; + + spin_lock(&t->lit_lock); + e1 = idmap_search_entry(t, ruid, luid, rgid, lgid); + if (e1 == NULL) { + list_add_tail(&e0->lie_rmt_uid_hash, + &t->lit_idmaps[RMT_UIDMAP_IDX] + [lustre_idmap_hashfunc(ruid)]); + list_add_tail(&e0->lie_lcl_uid_hash, + &t->lit_idmaps[LCL_UIDMAP_IDX] + [lustre_idmap_hashfunc(luid)]); + list_add_tail(&e0->lie_rmt_gid_hash, + &t->lit_idmaps[RMT_GIDMAP_IDX] + [lustre_idmap_hashfunc(rgid)]); + list_add_tail(&e0->lie_lcl_gid_hash, + &t->lit_idmaps[LCL_GIDMAP_IDX] + [lustre_idmap_hashfunc(lgid)]); + } + spin_unlock(&t->lit_lock); + if 
(e1 != NULL) { + idmap_entry_free(e0); + if (IS_ERR(e1)) + return PTR_ERR(e1); + } + } else if (IS_ERR(e0)) { + return PTR_ERR(e0); + } + + return 0; +} +EXPORT_SYMBOL(lustre_idmap_add); + +int lustre_idmap_del(struct lustre_idmap_table *t, + uid_t ruid, uid_t luid, + gid_t rgid, gid_t lgid) +{ + struct lustre_idmap_entry *e; + int rc = 0; + + LASSERT(t); + + spin_lock(&t->lit_lock); + e = idmap_search_entry(t, ruid, luid, rgid, lgid); + if (IS_ERR(e)) + rc = PTR_ERR(e); + else if (e) + idmap_entry_free(e); + spin_unlock(&t->lit_lock); + + return rc; +} +EXPORT_SYMBOL(lustre_idmap_del); + +int lustre_idmap_lookup_uid(struct lu_ucred *mu, + struct lustre_idmap_table *t, + int reverse, uid_t uid) +{ + struct list_head *hash; + + if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) { + if (!reverse) { + if (uid == mu->uc_o_uid) + return mu->uc_uid; + else if (uid == mu->uc_o_fsuid) + return mu->uc_fsuid; + } else { + if (uid == mu->uc_uid) + return mu->uc_o_uid; + else if (uid == mu->uc_fsuid) + return mu->uc_o_fsuid; + } + } + + if (t == NULL) + return CFS_IDMAP_NOTFOUND; + + hash = t->lit_idmaps[reverse ? LCL_UIDMAP_IDX : RMT_UIDMAP_IDX]; + + spin_lock(&t->lit_lock); + uid = idmap_lookup_uid(hash, reverse, uid); + spin_unlock(&t->lit_lock); + + return uid; +} +EXPORT_SYMBOL(lustre_idmap_lookup_uid); + +int lustre_idmap_lookup_gid(struct lu_ucred *mu, struct lustre_idmap_table *t, + int reverse, gid_t gid) +{ + struct list_head *hash; + + if (mu && (mu->uc_valid == UCRED_OLD || mu->uc_valid == UCRED_NEW)) { + if (!reverse) { + if (gid == mu->uc_o_gid) + return mu->uc_gid; + else if (gid == mu->uc_o_fsgid) + return mu->uc_fsgid; + } else { + if (gid == mu->uc_gid) + return mu->uc_o_gid; + else if (gid == mu->uc_fsgid) + return mu->uc_o_fsgid; + } + } + + if (t == NULL) + return CFS_IDMAP_NOTFOUND; + + hash = t->lit_idmaps[reverse ? 
LCL_GIDMAP_IDX : RMT_GIDMAP_IDX]; + + spin_lock(&t->lit_lock); + gid = idmap_lookup_gid(hash, reverse, gid); + spin_unlock(&t->lit_lock); + + return gid; +} +EXPORT_SYMBOL(lustre_idmap_lookup_gid); + +struct lustre_idmap_table *lustre_idmap_init(void) +{ + struct lustre_idmap_table *t; + int i, j; + + OBD_ALLOC_PTR(t); + if(unlikely(t == NULL)) + return (ERR_PTR(-ENOMEM)); + + spin_lock_init(&t->lit_lock); + for (i = 0; i < ARRAY_SIZE(t->lit_idmaps); i++) + for (j = 0; j < ARRAY_SIZE(t->lit_idmaps[i]); j++) + INIT_LIST_HEAD(&t->lit_idmaps[i][j]); + + return t; +} +EXPORT_SYMBOL(lustre_idmap_init); + +void lustre_idmap_fini(struct lustre_idmap_table *t) +{ + struct list_head *list; + struct lustre_idmap_entry *e; + int i; + LASSERT(t); + + list = t->lit_idmaps[RMT_UIDMAP_IDX]; + spin_lock(&t->lit_lock); + for (i = 0; i < CFS_IDMAP_HASHSIZE; i++) + while (!list_empty(&list[i])) { + e = list_entry(list[i].next, + struct lustre_idmap_entry, + lie_rmt_uid_hash); + idmap_entry_free(e); + } + spin_unlock(&t->lit_lock); + + OBD_FREE_PTR(t); +} +EXPORT_SYMBOL(lustre_idmap_fini); diff --git a/drivers/staging/lustre/lustre/obdclass/linkea.c b/drivers/staging/lustre/lustre/obdclass/linkea.c new file mode 100644 index 000000000000..b5c19ac1470f --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/linkea.c @@ -0,0 +1,194 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2013, Intel Corporation. + * Use is subject to license terms. + * + * Author: Di Wang <di.wang@intel.com> + */ + +#include <lustre/lustre_idl.h> +#include <obd.h> +#include <lustre_linkea.h> + +int linkea_data_new(struct linkea_data *ldata, struct lu_buf *buf) +{ + ldata->ld_buf = lu_buf_check_and_alloc(buf, PAGE_CACHE_SIZE); + if (ldata->ld_buf->lb_buf == NULL) + return -ENOMEM; + ldata->ld_leh = ldata->ld_buf->lb_buf; + ldata->ld_leh->leh_magic = LINK_EA_MAGIC; + ldata->ld_leh->leh_len = sizeof(struct link_ea_header); + ldata->ld_leh->leh_reccount = 0; + return 0; +} +EXPORT_SYMBOL(linkea_data_new); + +int linkea_init(struct linkea_data *ldata) +{ + struct link_ea_header *leh; + + LASSERT(ldata->ld_buf != NULL); + leh = ldata->ld_buf->lb_buf; + if (leh->leh_magic == __swab32(LINK_EA_MAGIC)) { + leh->leh_magic = LINK_EA_MAGIC; + leh->leh_reccount = __swab32(leh->leh_reccount); + leh->leh_len = __swab64(leh->leh_len); + /* entries are swabbed by linkea_entry_unpack */ + } + if (leh->leh_magic != LINK_EA_MAGIC) + return -EINVAL; + if (leh->leh_reccount == 0) + return -ENODATA; + + ldata->ld_leh = leh; + return 0; +} +EXPORT_SYMBOL(linkea_init); + +/** + * Pack a link_ea_entry. + * All elements are stored as chars to avoid alignment issues. 
+ * Numbers are always big-endian + * \retval record length + */ +static int linkea_entry_pack(struct link_ea_entry *lee, + const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_fid tmpfid; + int reclen; + + fid_cpu_to_be(&tmpfid, pfid); + if (OBD_FAIL_CHECK(OBD_FAIL_LFSCK_LINKEA_CRASH)) + tmpfid.f_ver = ~0; + memcpy(&lee->lee_parent_fid, &tmpfid, sizeof(tmpfid)); + memcpy(lee->lee_name, lname->ln_name, lname->ln_namelen); + reclen = sizeof(struct link_ea_entry) + lname->ln_namelen; + + lee->lee_reclen[0] = (reclen >> 8) & 0xff; + lee->lee_reclen[1] = reclen & 0xff; + return reclen; +} + +void linkea_entry_unpack(const struct link_ea_entry *lee, int *reclen, + struct lu_name *lname, struct lu_fid *pfid) +{ + *reclen = (lee->lee_reclen[0] << 8) | lee->lee_reclen[1]; + memcpy(pfid, &lee->lee_parent_fid, sizeof(*pfid)); + fid_be_to_cpu(pfid, pfid); + lname->ln_name = lee->lee_name; + lname->ln_namelen = *reclen - sizeof(struct link_ea_entry); +} +EXPORT_SYMBOL(linkea_entry_unpack); + +/** + * Add a record to the end of link ea buf + **/ +int linkea_add_buf(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + LASSERT(ldata->ld_leh != NULL); + + if (lname == NULL || pfid == NULL) + return -EINVAL; + + ldata->ld_reclen = lname->ln_namelen + sizeof(struct link_ea_entry); + if (ldata->ld_leh->leh_len + ldata->ld_reclen > + ldata->ld_buf->lb_len) { + if (lu_buf_check_and_grow(ldata->ld_buf, + ldata->ld_leh->leh_len + + ldata->ld_reclen) < 0) + return -ENOMEM; + } + + ldata->ld_leh = ldata->ld_buf->lb_buf; + ldata->ld_lee = ldata->ld_buf->lb_buf + ldata->ld_leh->leh_len; + ldata->ld_reclen = linkea_entry_pack(ldata->ld_lee, lname, pfid); + ldata->ld_leh->leh_len += ldata->ld_reclen; + ldata->ld_leh->leh_reccount++; + CDEBUG(D_INODE, "New link_ea name '%.*s' is added\n", + lname->ln_namelen, lname->ln_name); + return 0; +} +EXPORT_SYMBOL(linkea_add_buf); + +/** Del the current record from the link ea buf */ +void 
linkea_del_buf(struct linkea_data *ldata, const struct lu_name *lname) +{ + LASSERT(ldata->ld_leh != NULL && ldata->ld_lee != NULL); + + ldata->ld_leh->leh_reccount--; + ldata->ld_leh->leh_len -= ldata->ld_reclen; + memmove(ldata->ld_lee, (char *)ldata->ld_lee + ldata->ld_reclen, + (char *)ldata->ld_leh + ldata->ld_leh->leh_len - + (char *)ldata->ld_lee); + CDEBUG(D_INODE, "Old link_ea name '%.*s' is removed\n", + lname->ln_namelen, lname->ln_name); +} +EXPORT_SYMBOL(linkea_del_buf); + +/** + * Check if such a link exists in linkEA. + * + * \param ldata link data the search to be done on + * \param lname name in the parent's directory entry pointing to this object + * \param pfid parent fid the link to be found for + * + * \retval 0 success + * \retval -ENOENT link does not exist + * \retval -ve on error + */ +int linkea_links_find(struct linkea_data *ldata, const struct lu_name *lname, + const struct lu_fid *pfid) +{ + struct lu_name tmpname; + struct lu_fid tmpfid; + int count; + + LASSERT(ldata->ld_leh != NULL); + + /* link #0 */ + ldata->ld_lee = (struct link_ea_entry *)(ldata->ld_leh + 1); + + for (count = 0; count < ldata->ld_leh->leh_reccount; count++) { + linkea_entry_unpack(ldata->ld_lee, &ldata->ld_reclen, + &tmpname, &tmpfid); + if (tmpname.ln_namelen == lname->ln_namelen && + lu_fid_eq(&tmpfid, pfid) && + (strncmp(tmpname.ln_name, lname->ln_name, + tmpname.ln_namelen) == 0)) + break; + ldata->ld_lee = (struct link_ea_entry *)((char *)ldata->ld_lee + + ldata->ld_reclen); + } + + if (count == ldata->ld_leh->leh_reccount) { + CDEBUG(D_INODE, "Old link_ea name '%.*s' not found\n", + lname->ln_namelen, lname->ln_name); + ldata->ld_lee = NULL; + return -ENOENT; + } + return 0; +} +EXPORT_SYMBOL(linkea_links_find); diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c new file mode 100644 index 000000000000..d2c3072541d1 --- /dev/null +++ 
b/drivers/staging/lustre/lustre/obdclass/linux/linux-module.c @@ -0,0 +1,408 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/linux/linux-module.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <linux/module.h> +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/major.h> +#include <linux/sched.h> +#include <linux/lp.h> +#include <linux/slab.h> +#include <linux/ioport.h> +#include <linux/fcntl.h> +#include <linux/delay.h> +#include <linux/skbuff.h> +#include <linux/proc_fs.h> +#include <linux/fs.h> +#include <linux/poll.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/highmem.h> +#include <asm/io.h> +#include <asm/ioctls.h> +#include <asm/poll.h> +#include <asm/uaccess.h> +#include <linux/miscdevice.h> +#include <linux/seq_file.h> + +#include <linux/libcfs/libcfs.h> +#include <obd_support.h> +#include <obd_class.h> +#include <linux/lnet/lnetctl.h> +#include <lprocfs_status.h> +#include <lustre_ver.h> +#include <lustre/lustre_build_version.h> + +int proc_version; + +/* buffer MUST be at least the size of obd_ioctl_hdr */ +int obd_ioctl_getdata(char **buf, int *len, void *arg) +{ + struct obd_ioctl_hdr hdr; + struct obd_ioctl_data *data; + int err; + int offset = 0; + ENTRY; + + err = copy_from_user(&hdr, (void *)arg, sizeof(hdr)); + if ( err ) + RETURN(err); + + if (hdr.ioc_version != OBD_IOCTL_VERSION) { + CERROR("Version mismatch kernel (%x) vs application (%x)\n", + OBD_IOCTL_VERSION, hdr.ioc_version); + RETURN(-EINVAL); + } + + if (hdr.ioc_len > OBD_MAX_IOCTL_BUFFER) { + CERROR("User buffer len %d exceeds %d max buffer\n", + hdr.ioc_len, OBD_MAX_IOCTL_BUFFER); + RETURN(-EINVAL); + } + + if (hdr.ioc_len < sizeof(struct obd_ioctl_data)) { + CERROR("User buffer too small for ioctl (%d)\n", hdr.ioc_len); + RETURN(-EINVAL); + } + + /* When there are lots of processes calling vmalloc on multi-core + * system, the high lock contention will hurt performance badly, + * 
obdfilter-survey is an example, which relies on ioctl. So we'd + * better avoid vmalloc on ioctl path. LU-66 */ + OBD_ALLOC_LARGE(*buf, hdr.ioc_len); + if (*buf == NULL) { + CERROR("Cannot allocate control buffer of len %d\n", + hdr.ioc_len); + RETURN(-EINVAL); + } + *len = hdr.ioc_len; + data = (struct obd_ioctl_data *)*buf; + + err = copy_from_user(*buf, (void *)arg, hdr.ioc_len); + if ( err ) { + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(err); + } + + if (obd_ioctl_is_invalid(data)) { + CERROR("ioctl not correctly formatted\n"); + OBD_FREE_LARGE(*buf, hdr.ioc_len); + RETURN(-EINVAL); + } + + if (data->ioc_inllen1) { + data->ioc_inlbuf1 = &data->ioc_bulk[0]; + offset += cfs_size_round(data->ioc_inllen1); + } + + if (data->ioc_inllen2) { + data->ioc_inlbuf2 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen2); + } + + if (data->ioc_inllen3) { + data->ioc_inlbuf3 = &data->ioc_bulk[0] + offset; + offset += cfs_size_round(data->ioc_inllen3); + } + + if (data->ioc_inllen4) { + data->ioc_inlbuf4 = &data->ioc_bulk[0] + offset; + } + + EXIT; + return 0; +} +EXPORT_SYMBOL(obd_ioctl_getdata); + +int obd_ioctl_popdata(void *arg, void *data, int len) +{ + int err; + + err = copy_to_user(arg, data, len); + if (err) + err = -EFAULT; + return err; +} +EXPORT_SYMBOL(obd_ioctl_popdata); + +/* opening /dev/obd */ +static int obd_class_open(struct inode * inode, struct file * file) +{ + ENTRY; + + try_module_get(THIS_MODULE); + RETURN(0); +} + +/* closing /dev/obd */ +static int obd_class_release(struct inode * inode, struct file * file) +{ + ENTRY; + + module_put(THIS_MODULE); + RETURN(0); +} + +/* to control /dev/obd */ +static long obd_class_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int err = 0; + ENTRY; + + /* Allow non-root access for OBD_IOC_PING_TARGET - used by lfs check */ + if (!cfs_capable(CFS_CAP_SYS_ADMIN) && (cmd != OBD_IOC_PING_TARGET)) + RETURN(err = -EACCES); + if ((cmd & 0xffffff00) == ((int)'T') << 8) /* ignore 
all tty ioctls */ + RETURN(err = -ENOTTY); + + err = class_handle_ioctl(cmd, (unsigned long)arg); + + RETURN(err); +} + +/* declare character device */ +static struct file_operations obd_psdev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = obd_class_ioctl, /* unlocked_ioctl */ + .open = obd_class_open, /* open */ + .release = obd_class_release, /* release */ +}; + +/* modules setup */ +psdev_t obd_psdev = { + .minor = OBD_DEV_MINOR, + .name = OBD_DEV_NAME, + .fops = &obd_psdev_fops, +}; + + +#ifdef LPROCFS +int obd_proc_version_seq_show(struct seq_file *m, void *v) +{ + return seq_printf(m, "lustre: %s\nkernel: %s\nbuild: %s\n", + LUSTRE_VERSION_STRING, "patchless_client", + BUILD_VERSION); +} +LPROC_SEQ_FOPS_RO(obd_proc_version); + +int obd_proc_pinger_seq_show(struct seq_file *m, void *v) +{ + return seq_printf(m, "%s\n", "on"); +} +LPROC_SEQ_FOPS_RO(obd_proc_pinger); + +static int obd_proc_health_seq_show(struct seq_file *m, void *v) +{ + int rc = 0, i; + + if (libcfs_catastrophe) + seq_printf(m, "LBUG\n"); + + read_lock(&obd_dev_lock); + for (i = 0; i < class_devno_max(); i++) { + struct obd_device *obd; + + obd = class_num2obd(i); + if (obd == NULL || !obd->obd_attached || !obd->obd_set_up) + continue; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + continue; + + class_incref(obd, __FUNCTION__, current); + read_unlock(&obd_dev_lock); + + if (obd_health_check(NULL, obd)) { + seq_printf(m, "device %s reported unhealthy\n", + obd->obd_name); + rc++; + } + class_decref(obd, __FUNCTION__, current); + read_lock(&obd_dev_lock); + } + read_unlock(&obd_dev_lock); + + if (rc == 0) + return seq_printf(m, "healthy\n"); + + seq_printf(m, "NOT HEALTHY\n"); + return 0; +} +LPROC_SEQ_FOPS_RO(obd_proc_health); + +static int obd_proc_jobid_var_seq_show(struct seq_file *m, void *v) +{ + return seq_printf(m, "%s\n", obd_jobid_var); +} + +static ssize_t obd_proc_jobid_var_seq_write(struct file *file, const char *buffer, + size_t count, loff_t 
*off) +{ + if (!count || count > JOBSTATS_JOBID_VAR_MAX_LEN) + return -EINVAL; + + memset(obd_jobid_var, 0, JOBSTATS_JOBID_VAR_MAX_LEN + 1); + /* Trim the trailing '\n' if any */ + memcpy(obd_jobid_var, buffer, count - (buffer[count - 1] == '\n')); + return count; +} +LPROC_SEQ_FOPS(obd_proc_jobid_var); + +/* Root for /proc/fs/lustre */ +struct proc_dir_entry *proc_lustre_root = NULL; +EXPORT_SYMBOL(proc_lustre_root); + +struct lprocfs_vars lprocfs_base[] = { + { "version", &obd_proc_version_fops }, + { "pinger", &obd_proc_pinger_fops }, + { "health_check", &obd_proc_health_fops }, + { "jobid_var", &obd_proc_jobid_var_fops }, + { 0 } +}; +#else +#define lprocfs_base NULL +#endif /* LPROCFS */ + +static void *obd_device_list_seq_start(struct seq_file *p, loff_t *pos) +{ + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static void obd_device_list_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + ++*pos; + if (*pos >= class_devno_max()) + return NULL; + + return pos; +} + +static int obd_device_list_seq_show(struct seq_file *p, void *v) +{ + loff_t index = *(loff_t *)v; + struct obd_device *obd = class_num2obd((int)index); + char *status; + + if (obd == NULL) + return 0; + + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + if (obd->obd_stopping) + status = "ST"; + else if (obd->obd_inactive) + status = "IN"; + else if (obd->obd_set_up) + status = "UP"; + else if (obd->obd_attached) + status = "AT"; + else + status = "--"; + + return seq_printf(p, "%3d %s %s %s %s %d\n", + (int)index, status, obd->obd_type->typ_name, + obd->obd_name, obd->obd_uuid.uuid, + atomic_read(&obd->obd_refcount)); +} + +struct seq_operations obd_device_list_sops = { + .start = obd_device_list_seq_start, + .stop = obd_device_list_seq_stop, + .next = obd_device_list_seq_next, + .show = obd_device_list_seq_show, +}; + +static int obd_device_list_open(struct inode *inode, struct file *file) +{ + 
struct seq_file *seq; + int rc = seq_open(file, &obd_device_list_sops); + + if (rc) + return rc; + + seq = file->private_data; + seq->private = PDE_DATA(inode); + + return 0; +} + +struct file_operations obd_device_list_fops = { + .owner = THIS_MODULE, + .open = obd_device_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +int class_procfs_init(void) +{ + int rc; + ENTRY; + + obd_sysctl_init(); + proc_lustre_root = lprocfs_register("fs/lustre", NULL, + lprocfs_base, NULL); + rc = lprocfs_seq_create(proc_lustre_root, "devices", 0444, + &obd_device_list_fops, NULL); + if (rc) + CERROR("error adding /proc/fs/lustre/devices file\n"); + RETURN(0); +} + +int class_procfs_clean(void) +{ + ENTRY; + if (proc_lustre_root) { + lprocfs_remove(&proc_lustre_root); + } + RETURN(0); +} diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c new file mode 100644 index 000000000000..6ee347153a16 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-obdo.c @@ -0,0 +1,222 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/linux/linux-obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <linux/module.h> +#include <obd_class.h> +#include <lustre/lustre_idl.h> + +#include <linux/fs.h> +#include <linux/pagemap.h> /* for PAGE_CACHE_SIZE */ + +/*FIXME: Just copy from obdo_from_inode*/ +void obdo_from_la(struct obdo *dst, struct lu_attr *la, __u64 valid) +{ + obd_flag newvalid = 0; + + if (valid & LA_ATIME) { + dst->o_atime = la->la_atime; + newvalid |= OBD_MD_FLATIME; + } + if (valid & LA_MTIME) { + dst->o_mtime = la->la_mtime; + newvalid |= OBD_MD_FLMTIME; + } + if (valid & LA_CTIME) { + dst->o_ctime = la->la_ctime; + newvalid |= OBD_MD_FLCTIME; + } + if (valid & LA_SIZE) { + dst->o_size = la->la_size; + newvalid |= OBD_MD_FLSIZE; + } + if (valid & LA_BLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = la->la_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & LA_TYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (la->la_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & LA_MODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (la->la_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & 
LA_UID) { + dst->o_uid = la->la_uid; + newvalid |= OBD_MD_FLUID; + } + if (valid & LA_GID) { + dst->o_gid = la->la_gid; + newvalid |= OBD_MD_FLGID; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_la); + +/*FIXME: Just copy from obdo_from_inode*/ +void la_from_obdo(struct lu_attr *dst, struct obdo *obdo, obd_flag valid) +{ + __u64 newvalid = 0; + + valid &= obdo->o_valid; + + if (valid & OBD_MD_FLATIME) { + dst->la_atime = obdo->o_atime; + newvalid |= LA_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->la_mtime = obdo->o_mtime; + newvalid |= LA_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->la_ctime = obdo->o_ctime; + newvalid |= LA_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->la_size = obdo->o_size; + newvalid |= LA_SIZE; + } + if (valid & OBD_MD_FLBLOCKS) { + dst->la_blocks = obdo->o_blocks; + newvalid |= LA_BLOCKS; + } + if (valid & OBD_MD_FLTYPE) { + dst->la_mode = (dst->la_mode & S_IALLUGO) | + (obdo->o_mode & S_IFMT); + newvalid |= LA_TYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->la_mode = (dst->la_mode & S_IFMT) | + (obdo->o_mode & S_IALLUGO); + newvalid |= LA_MODE; + } + if (valid & OBD_MD_FLUID) { + dst->la_uid = obdo->o_uid; + newvalid |= LA_UID; + } + if (valid & OBD_MD_FLGID) { + dst->la_gid = obdo->o_gid; + newvalid |= LA_GID; + } + dst->la_valid = newvalid; +} +EXPORT_SYMBOL(la_from_obdo); + +void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid) +{ + valid &= src->o_valid; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, + "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n", + src->o_valid, LTIME_S(dst->i_mtime), + LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime); + + if (valid & OBD_MD_FLATIME && src->o_atime > LTIME_S(dst->i_atime)) + LTIME_S(dst->i_atime) = src->o_atime; + if (valid & OBD_MD_FLMTIME && src->o_mtime > LTIME_S(dst->i_mtime)) + LTIME_S(dst->i_mtime) = src->o_mtime; + if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime)) + LTIME_S(dst->i_ctime) = 
src->o_ctime; + if (valid & OBD_MD_FLSIZE) + i_size_write(dst, src->o_size); + /* optimum IO size */ + if (valid & OBD_MD_FLBLKSZ && src->o_blksize > (1 << dst->i_blkbits)) + dst->i_blkbits = ffs(src->o_blksize) - 1; + + if (dst->i_blkbits < PAGE_CACHE_SHIFT) + dst->i_blkbits = PAGE_CACHE_SHIFT; + + /* allocation of space */ + if (valid & OBD_MD_FLBLOCKS && src->o_blocks > dst->i_blocks) + /* + * XXX shouldn't overflow be checked here like in + * obdo_to_inode(). + */ + dst->i_blocks = src->o_blocks; +} +EXPORT_SYMBOL(obdo_refresh_inode); + +void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid) +{ + valid &= src->o_valid; + + LASSERTF(!(valid & (OBD_MD_FLTYPE | OBD_MD_FLGENER | OBD_MD_FLFID | + OBD_MD_FLID | OBD_MD_FLGROUP)), + "object "DOSTID", valid %x\n", POSTID(&src->o_oi), valid); + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, + "valid "LPX64", cur time %lu/%lu, new "LPU64"/"LPU64"\n", + src->o_valid, LTIME_S(dst->i_mtime), + LTIME_S(dst->i_ctime), src->o_mtime, src->o_ctime); + + if (valid & OBD_MD_FLATIME) + LTIME_S(dst->i_atime) = src->o_atime; + if (valid & OBD_MD_FLMTIME) + LTIME_S(dst->i_mtime) = src->o_mtime; + if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime)) + LTIME_S(dst->i_ctime) = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + i_size_write(dst, src->o_size); + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space */ + dst->i_blocks = src->o_blocks; + if (dst->i_blocks < src->o_blocks) /* overflow */ + dst->i_blocks = -1; + + } + if (valid & OBD_MD_FLBLKSZ) + dst->i_blkbits = ffs(src->o_blksize)-1; + if (valid & OBD_MD_FLMODE) + dst->i_mode = (dst->i_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->i_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + dst->i_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + dst->i_flags = src->o_flags; +} +EXPORT_SYMBOL(obdo_to_inode); diff --git a/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c 
b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c new file mode 100644 index 000000000000..46aad6813cab --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/linux/linux-sysctl.c @@ -0,0 +1,445 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#include <linux/module.h> +#include <linux/sysctl.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <linux/sysctl.h> +#include <linux/version.h> +#include <linux/proc_fs.h> +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/ctype.h> +#include <asm/bitops.h> +#include <asm/uaccess.h> +#include <linux/utsname.h> + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <obd_support.h> +#include <lprocfs_status.h> + +#ifdef CONFIG_SYSCTL +ctl_table_header_t *obd_table_header = NULL; +#endif + + +#define OBD_SYSCTL 300 + +enum { + OBD_TIMEOUT = 3, /* RPC timeout before recovery/intr */ + OBD_DUMP_ON_TIMEOUT, /* dump kernel debug log upon eviction */ + OBD_MEMUSED, /* bytes currently OBD_ALLOCated */ + OBD_PAGESUSED, /* pages currently OBD_PAGE_ALLOCated */ + OBD_MAXMEMUSED, /* maximum bytes OBD_ALLOCated concurrently */ + OBD_MAXPAGESUSED, /* maximum pages OBD_PAGE_ALLOCated concurrently */ + OBD_SYNCFILTER, /* XXX temporary, as we play with sync osts.. */ + OBD_LDLM_TIMEOUT, /* LDLM timeout for ASTs before client eviction */ + OBD_DUMP_ON_EVICTION, /* dump kernel debug log upon eviction */ + OBD_DEBUG_PEER_ON_TIMEOUT, /* dump peer debug when RPC times out */ + OBD_ALLOC_FAIL_RATE, /* memory allocation random failure rate */ + OBD_MAX_DIRTY_PAGES, /* maximum dirty pages */ + OBD_AT_MIN, /* Adaptive timeouts params */ + OBD_AT_MAX, + OBD_AT_EXTRA, + OBD_AT_EARLY_MARGIN, + OBD_AT_HISTORY, +}; + + +int LL_PROC_PROTO(proc_set_timeout) +{ + int rc; + + rc = ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (ldlm_timeout >= obd_timeout) + ldlm_timeout = max(obd_timeout / 3, 1U); + return rc; +} + +int LL_PROC_PROTO(proc_memory_alloc) +{ + char buf[22]; + int len; + DECLARE_LL_PROC_PPOS_DECL; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), LPU64"\n", obd_memory_sum()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, 
len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +int LL_PROC_PROTO(proc_pages_alloc) +{ + char buf[22]; + int len; + DECLARE_LL_PROC_PPOS_DECL; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), LPU64"\n", obd_pages_sum()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +int LL_PROC_PROTO(proc_mem_max) +{ + char buf[22]; + int len; + DECLARE_LL_PROC_PPOS_DECL; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), LPU64"\n", obd_memory_max()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +int LL_PROC_PROTO(proc_pages_max) +{ + char buf[22]; + int len; + DECLARE_LL_PROC_PPOS_DECL; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) + return -EINVAL; + + len = snprintf(buf, sizeof(buf), LPU64"\n", obd_pages_max()); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + *ppos += *lenp; + return 0; +} + +int LL_PROC_PROTO(proc_max_dirty_pages_in_mb) +{ + int rc = 0; + DECLARE_LL_PROC_PPOS_DECL; + + if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) { + rc = lprocfs_write_frac_helper(buffer, *lenp, + (unsigned int*)table->data, + 1 << (20 - PAGE_CACHE_SHIFT)); + /* Don't allow them to let dirty pages exceed 90% of system + * memory and set a hard minimum of 4MB. 
*/ + if (obd_max_dirty_pages > ((num_physpages / 10) * 9)) { + CERROR("Refusing to set max dirty pages to %u, which " + "is more than 90%% of available RAM; setting " + "to %lu\n", obd_max_dirty_pages, + ((num_physpages / 10) * 9)); + obd_max_dirty_pages = ((num_physpages / 10) * 9); + } else if (obd_max_dirty_pages < 4 << (20 - PAGE_CACHE_SHIFT)) { + obd_max_dirty_pages = 4 << (20 - PAGE_CACHE_SHIFT); + } + } else { + char buf[21]; + int len; + + len = lprocfs_read_frac_helper(buf, sizeof(buf), + *(unsigned int*)table->data, + 1 << (20 - PAGE_CACHE_SHIFT)); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + } + *ppos += *lenp; + return rc; +} + +int LL_PROC_PROTO(proc_alloc_fail_rate) +{ + int rc = 0; + DECLARE_LL_PROC_PPOS_DECL; + + if (!table->data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + if (write) { + rc = lprocfs_write_frac_helper(buffer, *lenp, + (unsigned int*)table->data, + OBD_ALLOC_FAIL_MULT); + } else { + char buf[21]; + int len; + + len = lprocfs_read_frac_helper(buf, 21, + *(unsigned int*)table->data, + OBD_ALLOC_FAIL_MULT); + if (len > *lenp) + len = *lenp; + buf[len] = '\0'; + if (copy_to_user(buffer, buf, len)) + return -EFAULT; + *lenp = len; + } + *ppos += *lenp; + return rc; +} + +int LL_PROC_PROTO(proc_at_min) +{ + return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +int LL_PROC_PROTO(proc_at_max) +{ + return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +int LL_PROC_PROTO(proc_at_extra) +{ + return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +int LL_PROC_PROTO(proc_at_early_margin) +{ + return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); +} +int LL_PROC_PROTO(proc_at_history) +{ + return ll_proc_dointvec(table, write, filp, buffer, lenp, ppos); +} + +#ifdef CONFIG_SYSCTL +static ctl_table_t obd_table[] = { + { + INIT_CTL_NAME(OBD_TIMEOUT) + .procname = "timeout", + 
.data = &obd_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_set_timeout + }, + { + INIT_CTL_NAME(OBD_DEBUG_PEER_ON_TIMEOUT) + .procname = "debug_peer_on_timeout", + .data = &obd_debug_peer_on_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(OBD_DUMP_ON_TIMEOUT) + .procname = "dump_on_timeout", + .data = &obd_dump_on_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(OBD_DUMP_ON_EVICTION) + .procname = "dump_on_eviction", + .data = &obd_dump_on_eviction, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + INIT_CTL_NAME(OBD_MEMUSED) + .procname = "memused", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_memory_alloc + }, + { + INIT_CTL_NAME(OBD_PAGESUSED) + .procname = "pagesused", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_pages_alloc + }, + { + INIT_CTL_NAME(OBD_MAXMEMUSED) + .procname = "memused_max", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_mem_max + }, + { + INIT_CTL_NAME(OBD_MAXPAGESUSED) + .procname = "pagesused_max", + .data = NULL, + .maxlen = 0, + .mode = 0444, + .proc_handler = &proc_pages_max + }, + { + INIT_CTL_NAME(OBD_LDLM_TIMEOUT) + .procname = "ldlm_timeout", + .data = &ldlm_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_set_timeout + }, + { + INIT_CTL_NAME(OBD_ALLOC_FAIL_RATE) + .procname = "alloc_fail_rate", + .data = &obd_alloc_fail_rate, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_alloc_fail_rate + }, + { + INIT_CTL_NAME(OBD_MAX_DIRTY_PAGES) + .procname = "max_dirty_mb", + .data = &obd_max_dirty_pages, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_max_dirty_pages_in_mb + }, + { + INIT_CTL_NAME(OBD_AT_MIN) + .procname = "at_min", + .data = &at_min, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_at_min + }, + { 
+ INIT_CTL_NAME(OBD_AT_MAX) + .procname = "at_max", + .data = &at_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_at_max + }, + { + INIT_CTL_NAME(OBD_AT_EXTRA) + .procname = "at_extra", + .data = &at_extra, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_at_extra + }, + { + INIT_CTL_NAME(OBD_AT_EARLY_MARGIN) + .procname = "at_early_margin", + .data = &at_early_margin, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_at_early_margin + }, + { + INIT_CTL_NAME(OBD_AT_HISTORY) + .procname = "at_history", + .data = &at_history, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_at_history + }, + { INIT_CTL_NAME(0) } +}; + +static ctl_table_t parent_table[] = { + { + INIT_CTL_NAME(OBD_SYSCTL) + .procname = "lustre", + .data = NULL, + .maxlen = 0, + .mode = 0555, + .child = obd_table + }, + { INIT_CTL_NAME(0) } +}; +#endif + +void obd_sysctl_init (void) +{ +#ifdef CONFIG_SYSCTL + if ( !obd_table_header ) + obd_table_header = cfs_register_sysctl_table(parent_table, 0); +#endif +} + +void obd_sysctl_clean (void) +{ +#ifdef CONFIG_SYSCTL + if ( obd_table_header ) + unregister_sysctl_table(obd_table_header); + obd_table_header = NULL; +#endif +} diff --git a/drivers/staging/lustre/lustre/obdclass/llog.c b/drivers/staging/lustre/lustre/obdclass/llog.c new file mode 100644 index 000000000000..b1d215e56991 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog.c @@ -0,0 +1,966 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog.c + * + * OST<->MDS recovery logging infrastructure. + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger <adilger@clusterfs.com> + * Author: Alex Zhuravlev <bzzz@whamcloud.com> + * Author: Mikhail Pershin <tappro@whamcloud.com> + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include <obd_class.h> +#include <lustre_log.h> +#include "llog_internal.h" + +/* + * Allocate a new log or catalog handle + * Used inside llog_open(). + */ +struct llog_handle *llog_alloc_handle(void) +{ + struct llog_handle *loghandle; + + OBD_ALLOC_PTR(loghandle); + if (loghandle == NULL) + return ERR_PTR(-ENOMEM); + + init_rwsem(&loghandle->lgh_lock); + spin_lock_init(&loghandle->lgh_hdr_lock); + INIT_LIST_HEAD(&loghandle->u.phd.phd_entry); + atomic_set(&loghandle->lgh_refcount, 1); + + return loghandle; +} + +/* + * Free llog handle and header data if exists. 
Used in llog_close() only + */ +void llog_free_handle(struct llog_handle *loghandle) +{ + LASSERT(loghandle != NULL); + + /* failed llog_init_handle */ + if (!loghandle->lgh_hdr) + goto out; + + if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) + LASSERT(list_empty(&loghandle->u.phd.phd_entry)); + else if (loghandle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) + LASSERT(list_empty(&loghandle->u.chd.chd_head)); + LASSERT(sizeof(*(loghandle->lgh_hdr)) == LLOG_CHUNK_SIZE); + OBD_FREE(loghandle->lgh_hdr, LLOG_CHUNK_SIZE); +out: + OBD_FREE_PTR(loghandle); +} + +void llog_handle_get(struct llog_handle *loghandle) +{ + atomic_inc(&loghandle->lgh_refcount); +} + +void llog_handle_put(struct llog_handle *loghandle) +{ + LASSERT(atomic_read(&loghandle->lgh_refcount) > 0); + if (atomic_dec_and_test(&loghandle->lgh_refcount)) + llog_free_handle(loghandle); +} + +/* returns negative on error; 0 if success; 1 if success & log destroyed */ +int llog_cancel_rec(const struct lu_env *env, struct llog_handle *loghandle, + int index) +{ + struct llog_log_hdr *llh = loghandle->lgh_hdr; + int rc = 0; + ENTRY; + + CDEBUG(D_RPCTRACE, "Canceling %d in log "DOSTID"\n", + index, POSTID(&loghandle->lgh_id.lgl_oi)); + + if (index == 0) { + CERROR("Can't cancel index 0 which is header\n"); + RETURN(-EINVAL); + } + + spin_lock(&loghandle->lgh_hdr_lock); + if (!ext2_clear_bit(index, llh->llh_bitmap)) { + spin_unlock(&loghandle->lgh_hdr_lock); + CDEBUG(D_RPCTRACE, "Catalog index %u already clear?\n", index); + RETURN(-ENOENT); + } + + llh->llh_count--; + + if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1) && + (loghandle->lgh_last_idx == (LLOG_BITMAP_BYTES * 8) - 1)) { + spin_unlock(&loghandle->lgh_hdr_lock); + rc = llog_destroy(env, loghandle); + if (rc < 0) { + CERROR("%s: can't destroy empty llog #"DOSTID + "#%08x: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, rc); + GOTO(out_err, rc); + } + RETURN(1); + 
} + spin_unlock(&loghandle->lgh_hdr_lock); + + rc = llog_write(env, loghandle, &llh->llh_hdr, NULL, 0, NULL, 0); + if (rc < 0) { + CERROR("%s: fail to write header for llog #"DOSTID + "#%08x: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, rc); + GOTO(out_err, rc); + } + RETURN(0); +out_err: + spin_lock(&loghandle->lgh_hdr_lock); + ext2_set_bit(index, llh->llh_bitmap); + llh->llh_count++; + spin_unlock(&loghandle->lgh_hdr_lock); + return rc; +} +EXPORT_SYMBOL(llog_cancel_rec); + +static int llog_read_header(const struct lu_env *env, + struct llog_handle *handle, + struct obd_uuid *uuid) +{ + struct llog_operations *lop; + int rc; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + + if (lop->lop_read_header == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_read_header(env, handle); + if (rc == LLOG_EEMPTY) { + struct llog_log_hdr *llh = handle->lgh_hdr; + + handle->lgh_last_idx = 0; /* header is record with index 0 */ + llh->llh_count = 1; /* for the header record */ + llh->llh_hdr.lrh_type = LLOG_HDR_MAGIC; + llh->llh_hdr.lrh_len = llh->llh_tail.lrt_len = LLOG_CHUNK_SIZE; + llh->llh_hdr.lrh_index = llh->llh_tail.lrt_index = 0; + llh->llh_timestamp = cfs_time_current_sec(); + if (uuid) + memcpy(&llh->llh_tgtuuid, uuid, + sizeof(llh->llh_tgtuuid)); + llh->llh_bitmap_offset = offsetof(typeof(*llh), llh_bitmap); + ext2_set_bit(0, llh->llh_bitmap); + rc = 0; + } + return rc; +} + +int llog_init_handle(const struct lu_env *env, struct llog_handle *handle, + int flags, struct obd_uuid *uuid) +{ + struct llog_log_hdr *llh; + int rc; + + ENTRY; + LASSERT(handle->lgh_hdr == NULL); + + OBD_ALLOC_PTR(llh); + if (llh == NULL) + RETURN(-ENOMEM); + handle->lgh_hdr = llh; + /* first assign flags to use llog_client_ops */ + llh->llh_flags = flags; + rc = llog_read_header(env, handle, uuid); + if (rc == 0) { + if (unlikely((llh->llh_flags & LLOG_F_IS_PLAIN && + flags & LLOG_F_IS_CAT) || + 
(llh->llh_flags & LLOG_F_IS_CAT && + flags & LLOG_F_IS_PLAIN))) { + CERROR("%s: llog type is %s but initializing %s\n", + handle->lgh_ctxt->loc_obd->obd_name, + llh->llh_flags & LLOG_F_IS_CAT ? + "catalog" : "plain", + flags & LLOG_F_IS_CAT ? "catalog" : "plain"); + GOTO(out, rc = -EINVAL); + } else if (llh->llh_flags & + (LLOG_F_IS_PLAIN | LLOG_F_IS_CAT)) { + /* + * it is possible to open llog without specifying llog + * type so it is taken from llh_flags + */ + flags = llh->llh_flags; + } else { + /* for some reason the llh_flags has no type set */ + CERROR("llog type is not specified!\n"); + GOTO(out, rc = -EINVAL); + } + if (unlikely(uuid && + !obd_uuid_equals(uuid, &llh->llh_tgtuuid))) { + CERROR("%s: llog uuid mismatch: %s/%s\n", + handle->lgh_ctxt->loc_obd->obd_name, + (char *)uuid->uuid, + (char *)llh->llh_tgtuuid.uuid); + GOTO(out, rc = -EEXIST); + } + } + if (flags & LLOG_F_IS_CAT) { + LASSERT(list_empty(&handle->u.chd.chd_head)); + INIT_LIST_HEAD(&handle->u.chd.chd_head); + llh->llh_size = sizeof(struct llog_logid_rec); + } else if (!(flags & LLOG_F_IS_PLAIN)) { + CERROR("%s: unknown flags: %#x (expected %#x or %#x)\n", + handle->lgh_ctxt->loc_obd->obd_name, + flags, LLOG_F_IS_CAT, LLOG_F_IS_PLAIN); + rc = -EINVAL; + } +out: + if (rc) { + OBD_FREE_PTR(llh); + handle->lgh_hdr = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_init_handle); + +int llog_copy_handler(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *rec, + void *data) +{ + struct llog_rec_hdr local_rec = *rec; + struct llog_handle *local_llh = (struct llog_handle *)data; + char *cfg_buf = (char*) (rec + 1); + struct lustre_cfg *lcfg; + int rc = 0; + ENTRY; + + /* Append all records */ + local_rec.lrh_len -= sizeof(*rec) + sizeof(struct llog_rec_tail); + rc = llog_write(env, local_llh, &local_rec, NULL, 0, + (void *)cfg_buf, -1); + + lcfg = (struct lustre_cfg *)cfg_buf; + CDEBUG(D_INFO, "idx=%d, rc=%d, len=%d, cmd %x %s %s\n", + rec->lrh_index, rc, rec->lrh_len, 
lcfg->lcfg_command, + lustre_cfg_string(lcfg, 0), lustre_cfg_string(lcfg, 1)); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_copy_handler); + +static int llog_process_thread(void *arg) +{ + struct llog_process_info *lpi = arg; + struct llog_handle *loghandle = lpi->lpi_loghandle; + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = lpi->lpi_catdata; + char *buf; + __u64 cur_offset = LLOG_CHUNK_SIZE; + __u64 last_offset; + int rc = 0, index = 1, last_index; + int saved_index = 0; + int last_called_index = 0; + + ENTRY; + + LASSERT(llh); + + OBD_ALLOC(buf, LLOG_CHUNK_SIZE); + if (!buf) { + lpi->lpi_rc = -ENOMEM; + RETURN(0); + } + + if (cd != NULL) { + last_called_index = cd->lpcd_first_idx; + index = cd->lpcd_first_idx + 1; + } + if (cd != NULL && cd->lpcd_last_idx) + last_index = cd->lpcd_last_idx; + else + last_index = LLOG_BITMAP_BYTES * 8 - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + + /* skip records not set in bitmap */ + while (index <= last_index && + !ext2_test_bit(index, llh->llh_bitmap)) + ++index; + + LASSERT(index <= last_index + 1); + if (index == last_index + 1) + break; +repeat: + CDEBUG(D_OTHER, "index: %d last_index %d\n", + index, last_index); + + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, LLOG_CHUNK_SIZE); + last_offset = cur_offset; + rc = llog_next_block(lpi->lpi_env, loghandle, &saved_index, + index, &cur_offset, buf, LLOG_CHUNK_SIZE); + if (rc) + GOTO(out, rc); + + /* NB: when rec->lrh_len is accessed it is already swabbed + * since it is used at the "end" of the loop and the rec + * swabbing is done at the beginning of the loop. 
*/ + for (rec = (struct llog_rec_hdr *)buf; + (char *)rec < buf + LLOG_CHUNK_SIZE; + rec = (struct llog_rec_hdr *)((char *)rec + rec->lrh_len)){ + + CDEBUG(D_OTHER, "processing rec 0x%p type %#x\n", + rec, rec->lrh_type); + + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + CDEBUG(D_OTHER, "after swabbing, type=%#x idx=%d\n", + rec->lrh_type, rec->lrh_index); + + if (rec->lrh_index == 0) { + /* probably another rec just got added? */ + if (index <= loghandle->lgh_last_idx) + GOTO(repeat, rc = 0); + GOTO(out, rc = 0); /* no more records */ + } + if (rec->lrh_len == 0 || + rec->lrh_len > LLOG_CHUNK_SIZE) { + CWARN("invalid length %d in llog record for " + "index %d/%d\n", rec->lrh_len, + rec->lrh_index, index); + GOTO(out, rc = -EINVAL); + } + + if (rec->lrh_index < index) { + CDEBUG(D_OTHER, "skipping lrh_index %d\n", + rec->lrh_index); + continue; + } + + CDEBUG(D_OTHER, + "lrh_index: %d lrh_len: %d (%d remains)\n", + rec->lrh_index, rec->lrh_len, + (int)(buf + LLOG_CHUNK_SIZE - (char *)rec)); + + loghandle->lgh_cur_idx = rec->lrh_index; + loghandle->lgh_cur_offset = (char *)rec - (char *)buf + + last_offset; + + /* if set, process the callback on this record */ + if (ext2_test_bit(index, llh->llh_bitmap)) { + rc = lpi->lpi_cb(lpi->lpi_env, loghandle, rec, + lpi->lpi_cbdata); + last_called_index = index; + if (rc == LLOG_PROC_BREAK) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + llog_cancel_rec(lpi->lpi_env, + loghandle, + rec->lrh_index); + rc = 0; + } + if (rc) + GOTO(out, rc); + } else { + CDEBUG(D_OTHER, "Skipped index %d\n", index); + } + + /* next record, still in buffer? 
*/ + ++index; + if (index > last_index) + GOTO(out, rc = 0); + } + } + +out: + if (cd != NULL) + cd->lpcd_last_idx = last_called_index; + + OBD_FREE(buf, LLOG_CHUNK_SIZE); + lpi->lpi_rc = rc; + return 0; +} + +static int llog_process_thread_daemonize(void *arg) +{ + struct llog_process_info *lpi = arg; + struct lu_env env; + int rc; + + unshare_fs_struct(); + + /* client env has no keys, tags is just 0 */ + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + goto out; + lpi->lpi_env = &env; + + rc = llog_process_thread(arg); + + lu_env_fini(&env); +out: + complete(&lpi->lpi_completion); + return rc; +} + +int llog_process_or_fork(const struct lu_env *env, + struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata, bool fork) +{ + struct llog_process_info *lpi; + int rc; + + ENTRY; + + OBD_ALLOC_PTR(lpi); + if (lpi == NULL) { + CERROR("cannot alloc pointer\n"); + RETURN(-ENOMEM); + } + lpi->lpi_loghandle = loghandle; + lpi->lpi_cb = cb; + lpi->lpi_cbdata = data; + lpi->lpi_catdata = catdata; + + if (fork) { + /* The new thread can't use parent env, + * init the new one in llog_process_thread_daemonize. 
*/ + lpi->lpi_env = NULL; + init_completion(&lpi->lpi_completion); + rc = PTR_ERR(kthread_run(llog_process_thread_daemonize, lpi, + "llog_process_thread")); + if (IS_ERR_VALUE(rc)) { + CERROR("%s: cannot start thread: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, rc); + OBD_FREE_PTR(lpi); + RETURN(rc); + } + wait_for_completion(&lpi->lpi_completion); + } else { + lpi->lpi_env = env; + llog_process_thread(lpi); + } + rc = lpi->lpi_rc; + OBD_FREE_PTR(lpi); + RETURN(rc); +} +EXPORT_SYMBOL(llog_process_or_fork); + +int llog_process(const struct lu_env *env, struct llog_handle *loghandle, + llog_cb_t cb, void *data, void *catdata) +{ + return llog_process_or_fork(env, loghandle, cb, data, catdata, true); +} +EXPORT_SYMBOL(llog_process); + +inline int llog_get_size(struct llog_handle *loghandle) +{ + if (loghandle && loghandle->lgh_hdr) + return loghandle->lgh_hdr->llh_count; + return 0; +} +EXPORT_SYMBOL(llog_get_size); + +int llog_reverse_process(const struct lu_env *env, + struct llog_handle *loghandle, llog_cb_t cb, + void *data, void *catdata) +{ + struct llog_log_hdr *llh = loghandle->lgh_hdr; + struct llog_process_cat_data *cd = catdata; + void *buf; + int rc = 0, first_index = 1, index, idx; + ENTRY; + + OBD_ALLOC(buf, LLOG_CHUNK_SIZE); + if (!buf) + RETURN(-ENOMEM); + + if (cd != NULL) + first_index = cd->lpcd_first_idx + 1; + if (cd != NULL && cd->lpcd_last_idx) + index = cd->lpcd_last_idx; + else + index = LLOG_BITMAP_BYTES * 8 - 1; + + while (rc == 0) { + struct llog_rec_hdr *rec; + struct llog_rec_tail *tail; + + /* skip records not set in bitmap */ + while (index >= first_index && + !ext2_test_bit(index, llh->llh_bitmap)) + --index; + + LASSERT(index >= first_index - 1); + if (index == first_index - 1) + break; + + /* get the buf with our target record; avoid old garbage */ + memset(buf, 0, LLOG_CHUNK_SIZE); + rc = llog_prev_block(env, loghandle, index, buf, + LLOG_CHUNK_SIZE); + if (rc) + GOTO(out, rc); + + rec = buf; + idx = rec->lrh_index; + 
CDEBUG(D_RPCTRACE, "index %u : idx %u\n", index, idx); + while (idx < index) { + rec = (void *)rec + rec->lrh_len; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + idx ++; + } + LASSERT(idx == index); + tail = (void *)rec + rec->lrh_len - sizeof(*tail); + + /* process records in buffer, starting where we found one */ + while ((void *)tail > buf) { + if (tail->lrt_index == 0) + GOTO(out, rc = 0); /* no more records */ + + /* if set, process the callback on this record */ + if (ext2_test_bit(index, llh->llh_bitmap)) { + rec = (void *)tail - tail->lrt_len + + sizeof(*tail); + + rc = cb(env, loghandle, rec, data); + if (rc == LLOG_PROC_BREAK) { + GOTO(out, rc); + } else if (rc == LLOG_DEL_RECORD) { + llog_cancel_rec(env, loghandle, + tail->lrt_index); + rc = 0; + } + if (rc) + GOTO(out, rc); + } + + /* previous record, still in buffer? */ + --index; + if (index < first_index) + GOTO(out, rc = 0); + tail = (void *)tail - tail->lrt_len; + } + } + +out: + if (buf) + OBD_FREE(buf, LLOG_CHUNK_SIZE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_reverse_process); + +/** + * new llog API + * + * API functions: + * llog_open - open llog, may not exist + * llog_exist - check if llog exists + * llog_close - close opened llog, pair for open, frees llog_handle + * llog_declare_create - declare llog creation + * llog_create - create new llog on disk, need transaction handle + * llog_declare_write_rec - declaration of llog write + * llog_write_rec - write llog record on disk, need transaction handle + * llog_declare_add - declare llog catalog record addition + * llog_add - add llog record in catalog, need transaction handle + */ +int llog_exist(struct llog_handle *loghandle) +{ + struct llog_operations *lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_exist == NULL) + RETURN(-EOPNOTSUPP); + + rc = lop->lop_exist(loghandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_exist); + +int llog_declare_create(const struct 
lu_env *env, + struct llog_handle *loghandle, struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_declare_create == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_declare_create(env, loghandle, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_declare_create); + +int llog_create(const struct lu_env *env, struct llog_handle *handle, + struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + if (lop->lop_create == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_create(env, handle, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_create); + +int llog_declare_write_rec(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, int idx, + struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + RETURN(rc); + LASSERT(lop); + if (lop->lop_declare_write_rec == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_declare_write_rec(env, handle, rec, idx, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_declare_write_rec); + +int llog_write_rec(const struct lu_env *env, struct llog_handle *handle, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + int numcookies, void *buf, int idx, struct thandle *th) +{ + struct llog_operations *lop; + int raised, rc, buflen; + + ENTRY; + + rc = llog_handle2ops(handle, &lop); + if (rc) + 
RETURN(rc); + + LASSERT(lop); + if (lop->lop_write_rec == NULL) + RETURN(-EOPNOTSUPP); + + if (buf) + buflen = rec->lrh_len + sizeof(struct llog_rec_hdr) + + sizeof(struct llog_rec_tail); + else + buflen = rec->lrh_len; + LASSERT(cfs_size_round(buflen) == buflen); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lop->lop_write_rec(env, handle, rec, logcookies, numcookies, + buf, idx, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_write_rec); + +int llog_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct llog_cookie *logcookies, + void *buf, struct thandle *th) +{ + int raised, rc; + + ENTRY; + + if (lgh->lgh_logops->lop_add == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lgh->lgh_logops->lop_add(env, lgh, rec, logcookies, buf, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_add); + +int llog_declare_add(const struct lu_env *env, struct llog_handle *lgh, + struct llog_rec_hdr *rec, struct thandle *th) +{ + int raised, rc; + + ENTRY; + + if (lgh->lgh_logops->lop_declare_add == NULL) + RETURN(-EOPNOTSUPP); + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = lgh->lgh_logops->lop_declare_add(env, lgh, rec, th); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_declare_add); + +/** + * Helper function to open llog or create it if doesn't exist. + * It hides all transaction handling from caller. 
+ */ +int llog_open_create(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **res, struct llog_logid *logid, + char *name) +{ + struct thandle *th; + int rc; + + ENTRY; + + rc = llog_open(env, ctxt, res, logid, name, LLOG_OPEN_NEW); + if (rc) + RETURN(rc); + + if (llog_exist(*res)) + RETURN(0); + + if ((*res)->lgh_obj != NULL) { + struct dt_device *d; + + d = lu2dt_dev((*res)->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = llog_declare_create(env, *res, th); + if (rc == 0) { + rc = dt_trans_start_local(env, d, th); + if (rc == 0) + rc = llog_create(env, *res, th); + } + dt_trans_stop(env, d, th); + } else { + /* lvfs compat code */ + LASSERT((*res)->lgh_file == NULL); + rc = llog_create(env, *res, NULL); + } +out: + if (rc) + llog_close(env, *res); + RETURN(rc); +} +EXPORT_SYMBOL(llog_open_create); + +/** + * Helper function to delete existent llog. + */ +int llog_erase(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_logid *logid, char *name) +{ + struct llog_handle *handle; + int rc = 0, rc2; + + ENTRY; + + /* nothing to erase */ + if (name == NULL && logid == NULL) + RETURN(0); + + rc = llog_open(env, ctxt, &handle, logid, name, LLOG_OPEN_EXISTS); + if (rc < 0) + RETURN(rc); + + rc = llog_init_handle(env, handle, LLOG_F_IS_PLAIN, NULL); + if (rc == 0) + rc = llog_destroy(env, handle); + + rc2 = llog_close(env, handle); + if (rc == 0) + rc = rc2; + RETURN(rc); +} +EXPORT_SYMBOL(llog_erase); + +/* + * Helper function for write record in llog. + * It hides all transaction handling from caller. + * Valid only with local llog. 
+ */ +int llog_write(const struct lu_env *env, struct llog_handle *loghandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + int cookiecount, void *buf, int idx) +{ + int rc; + + ENTRY; + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + if (loghandle->lgh_obj != NULL) { + struct dt_device *dt; + struct thandle *th; + + dt = lu2dt_dev(loghandle->lgh_obj->do_lu.lo_dev); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = llog_declare_write_rec(env, loghandle, rec, idx, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc) + GOTO(out_trans, rc); + + down_write(&loghandle->lgh_lock); + rc = llog_write_rec(env, loghandle, rec, reccookie, + cookiecount, buf, idx, th); + up_write(&loghandle->lgh_lock); +out_trans: + dt_trans_stop(env, dt, th); + } else { /* lvfs compatibility */ + down_write(&loghandle->lgh_lock); + rc = llog_write_rec(env, loghandle, rec, reccookie, + cookiecount, buf, idx, NULL); + up_write(&loghandle->lgh_lock); + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_write); + +int llog_open(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_handle **lgh, struct llog_logid *logid, + char *name, enum llog_open_param open_param) +{ + int raised; + int rc; + + ENTRY; + + LASSERT(ctxt); + LASSERT(ctxt->loc_logops); + + if (ctxt->loc_logops->lop_open == NULL) { + *lgh = NULL; + RETURN(-EOPNOTSUPP); + } + + *lgh = llog_alloc_handle(); + if (*lgh == NULL) + RETURN(-ENOMEM); + (*lgh)->lgh_ctxt = ctxt; + (*lgh)->lgh_logops = ctxt->loc_logops; + + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = ctxt->loc_logops->lop_open(env, *lgh, logid, name, open_param); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + if (rc) { + llog_free_handle(*lgh); + *lgh = NULL; + } + RETURN(rc); +} +EXPORT_SYMBOL(llog_open); + +int llog_close(const struct lu_env *env, struct llog_handle *loghandle) +{ + struct llog_operations 
*lop; + int rc; + + ENTRY; + + rc = llog_handle2ops(loghandle, &lop); + if (rc) + GOTO(out, rc); + if (lop->lop_close == NULL) + GOTO(out, rc = -EOPNOTSUPP); + rc = lop->lop_close(env, loghandle); +out: + llog_handle_put(loghandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_close); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_cat.c b/drivers/staging/lustre/lustre/obdclass/llog_cat.c new file mode 100644 index 000000000000..cf00b2f550ac --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_cat.c @@ -0,0 +1,833 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_cat.c + * + * OST<->MDS recovery logging infrastructure. 
+ * + * Invariants in implementation: + * - we do not share logs among different OST<->MDS connections, so that + * if an OST or MDS fails it need only look at log(s) relevant to itself + * + * Author: Andreas Dilger <adilger@clusterfs.com> + * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com> + * Author: Mikhail Pershin <mike.pershin@intel.com> + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include <obd_class.h> + +#include "llog_internal.h" + +/* Create a new log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + */ +static int llog_cat_new_log(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_handle *loghandle, + struct thandle *th) +{ + + struct llog_log_hdr *llh; + struct llog_logid_rec rec = { { 0 }, }; + int rc, index, bitmap_size; + ENTRY; + + llh = cathandle->lgh_hdr; + bitmap_size = LLOG_BITMAP_SIZE(llh); + + index = (cathandle->lgh_last_idx + 1) % bitmap_size; + + /* maximum number of available slots in catlog is bitmap_size - 2 */ + if (llh->llh_cat_idx == index) { + CERROR("no free catalog slots for log...\n"); + RETURN(-ENOSPC); + } + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_LLOG_CREATE_FAILED)) + RETURN(-ENOSPC); + + rc = llog_create(env, loghandle, th); + /* if llog is already created, no need to initialize it */ + if (rc == -EEXIST) { + RETURN(0); + } else if (rc != 0) { + CERROR("%s: can't create new plain llog in catalog: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, loghandle, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &cathandle->lgh_hdr->llh_tgtuuid); + if (rc) + GOTO(out_destroy, rc); + + if (index == 0) + index = 1; + + spin_lock(&loghandle->lgh_hdr_lock); + llh->llh_count++; + if (ext2_set_bit(index, llh->llh_bitmap)) { + CERROR("argh, index %u already set in log bitmap?\n", + index); + 
spin_unlock(&loghandle->lgh_hdr_lock); + LBUG(); /* should never happen */ + } + spin_unlock(&loghandle->lgh_hdr_lock); + + cathandle->lgh_last_idx = index; + llh->llh_tail.lrt_index = index; + + CDEBUG(D_RPCTRACE,"new recovery log "DOSTID":%x for index %u of catalog" + DOSTID"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, index, + POSTID(&cathandle->lgh_id.lgl_oi)); + /* build the record for this log in the catalog */ + rec.lid_hdr.lrh_len = sizeof(rec); + rec.lid_hdr.lrh_index = index; + rec.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + rec.lid_id = loghandle->lgh_id; + rec.lid_tail.lrt_len = sizeof(rec); + rec.lid_tail.lrt_index = index; + + /* update the catalog: header and record */ + rc = llog_write_rec(env, cathandle, &rec.lid_hdr, + &loghandle->u.phd.phd_cookie, 1, NULL, index, th); + if (rc < 0) + GOTO(out_destroy, rc); + + loghandle->lgh_hdr->llh_cat_idx = index; + RETURN(0); +out_destroy: + llog_destroy(env, loghandle); + RETURN(rc); +} + +/* Open an existent log handle and add it to the open list. + * This log handle will be closed when all of the records in it are removed. + * + * Assumes caller has already pushed us into the kernel context and is locking. + * We return a lock on the handle to ensure nobody yanks it from us. 
+ * + * This takes extra reference on llog_handle via llog_handle_get() and require + * this reference to be put by caller using llog_handle_put() + */ +int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_handle **res, struct llog_logid *logid) +{ + struct llog_handle *loghandle; + int rc = 0; + + ENTRY; + + if (cathandle == NULL) + RETURN(-EBADF); + + down_write(&cathandle->lgh_lock); + list_for_each_entry(loghandle, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_logid *cgl = &loghandle->lgh_id; + + if (ostid_id(&cgl->lgl_oi) == ostid_id(&logid->lgl_oi) && + ostid_seq(&cgl->lgl_oi) == ostid_seq(&logid->lgl_oi)) { + if (cgl->lgl_ogen != logid->lgl_ogen) { + CERROR("%s: log "DOSTID" generation %x != %x\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), cgl->lgl_ogen, + logid->lgl_ogen); + continue; + } + loghandle->u.phd.phd_cat_handle = cathandle; + up_write(&cathandle->lgh_lock); + GOTO(out, rc = 0); + } + } + up_write(&cathandle->lgh_lock); + + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, logid, NULL, + LLOG_OPEN_EXISTS); + if (rc < 0) { + CERROR("%s: error opening log id "DOSTID":%x: rc = %d\n", + cathandle->lgh_ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), logid->lgl_ogen, rc); + RETURN(rc); + } + + rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, NULL); + if (rc < 0) { + llog_close(env, loghandle); + loghandle = NULL; + RETURN(rc); + } + + down_write(&cathandle->lgh_lock); + list_add(&loghandle->u.phd.phd_entry, &cathandle->u.chd.chd_head); + up_write(&cathandle->lgh_lock); + + loghandle->u.phd.phd_cat_handle = cathandle; + loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id; + loghandle->u.phd.phd_cookie.lgc_index = + loghandle->lgh_hdr->llh_cat_idx; + EXIT; +out: + llog_handle_get(loghandle); + *res = loghandle; + return 0; +} + +int llog_cat_close(const struct lu_env *env, struct llog_handle *cathandle) +{ + struct llog_handle *loghandle, *n; + int rc; + + 
ENTRY; + + list_for_each_entry_safe(loghandle, n, &cathandle->u.chd.chd_head, + u.phd.phd_entry) { + struct llog_log_hdr *llh = loghandle->lgh_hdr; + int index; + + /* unlink open-not-created llogs */ + list_del_init(&loghandle->u.phd.phd_entry); + llh = loghandle->lgh_hdr; + if (loghandle->lgh_obj != NULL && llh != NULL && + (llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) && + (llh->llh_count == 1)) { + rc = llog_destroy(env, loghandle); + if (rc) + CERROR("%s: failure destroying log during " + "cleanup: rc = %d\n", + loghandle->lgh_ctxt->loc_obd->obd_name, + rc); + + index = loghandle->u.phd.phd_cookie.lgc_index; + llog_cat_cleanup(env, cathandle, NULL, index); + } + llog_close(env, loghandle); + } + /* if handle was stored in ctxt, remove it too */ + if (cathandle->lgh_ctxt->loc_handle == cathandle) + cathandle->lgh_ctxt->loc_handle = NULL; + rc = llog_close(env, cathandle); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_close); + +/** + * lockdep markers for nested struct llog_handle::lgh_lock locking. + */ +enum { + LLOGH_CAT, + LLOGH_LOG +}; + +/** Return the currently active log handle. If the current log handle doesn't + * have enough space left for the current record, start a new one. + * + * If reclen is 0, we only want to know what the currently active log is, + * otherwise we get a lock on this log so nobody can steal our space. + * + * Assumes caller has already pushed us into the kernel context and is locking. 
+ * + * NOTE: loghandle is write-locked upon successful return + */ +static struct llog_handle *llog_cat_current_log(struct llog_handle *cathandle, + struct thandle *th) +{ + struct llog_handle *loghandle = NULL; + ENTRY; + + down_read_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + if (llh == NULL || + loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) { + up_read(&cathandle->lgh_lock); + RETURN(loghandle); + } else { + up_write(&loghandle->lgh_lock); + } + } + up_read(&cathandle->lgh_lock); + + /* time to use next log */ + + /* first, we have to make sure the state hasn't changed */ + down_write_nested(&cathandle->lgh_lock, LLOGH_CAT); + loghandle = cathandle->u.chd.chd_current_log; + if (loghandle) { + struct llog_log_hdr *llh; + + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + llh = loghandle->lgh_hdr; + LASSERT(llh); + if (loghandle->lgh_last_idx < LLOG_BITMAP_SIZE(llh) - 1) { + up_write(&cathandle->lgh_lock); + RETURN(loghandle); + } else { + up_write(&loghandle->lgh_lock); + } + } + + CDEBUG(D_INODE, "use next log\n"); + + loghandle = cathandle->u.chd.chd_next_log; + cathandle->u.chd.chd_current_log = loghandle; + cathandle->u.chd.chd_next_log = NULL; + down_write_nested(&loghandle->lgh_lock, LLOGH_LOG); + up_write(&cathandle->lgh_lock); + LASSERT(loghandle); + RETURN(loghandle); +} + +/* Add a single record to the recovery log(s) using a catalog + * Returns as llog_write_record + * + * Assumes caller has already pushed us into the kernel context. 
+ */ +int llog_cat_add_rec(const struct lu_env *env, struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct llog_cookie *reccookie, + void *buf, struct thandle *th) +{ + struct llog_handle *loghandle; + int rc; + ENTRY; + + LASSERT(rec->lrh_len <= LLOG_CHUNK_SIZE); + loghandle = llog_cat_current_log(cathandle, th); + LASSERT(!IS_ERR(loghandle)); + + /* loghandle is already locked by llog_cat_current_log() for us */ + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, th); + if (rc < 0) { + up_write(&loghandle->lgh_lock); + RETURN(rc); + } + } + /* now let's try to add the record */ + rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, -1, th); + if (rc < 0) + CDEBUG_LIMIT(rc == -ENOSPC ? D_HA : D_ERROR, + "llog_write_rec %d: lh=%p\n", rc, loghandle); + up_write(&loghandle->lgh_lock); + if (rc == -ENOSPC) { + /* try to use next log */ + loghandle = llog_cat_current_log(cathandle, th); + LASSERT(!IS_ERR(loghandle)); + /* new llog can be created concurrently */ + if (!llog_exist(loghandle)) { + rc = llog_cat_new_log(env, cathandle, loghandle, th); + if (rc < 0) { + up_write(&loghandle->lgh_lock); + RETURN(rc); + } + } + /* now let's try to add the record */ + rc = llog_write_rec(env, loghandle, rec, reccookie, 1, buf, + -1, th); + if (rc < 0) + CERROR("llog_write_rec %d: lh=%p\n", rc, loghandle); + up_write(&loghandle->lgh_lock); + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cat_add_rec); + +int llog_cat_declare_add_rec(const struct lu_env *env, + struct llog_handle *cathandle, + struct llog_rec_hdr *rec, struct thandle *th) +{ + struct llog_handle *loghandle, *next; + int rc = 0; + + ENTRY; + + if (cathandle->u.chd.chd_current_log == NULL) { + /* declare new plain llog */ + down_write(&cathandle->lgh_lock); + if (cathandle->u.chd.chd_current_log == NULL) { + rc = llog_open(env, cathandle->lgh_ctxt, &loghandle, + NULL, NULL, LLOG_OPEN_NEW); + if (rc == 0) { + cathandle->u.chd.chd_current_log = loghandle; + 
				list_add_tail(&loghandle->u.phd.phd_entry,
					      &cathandle->u.chd.chd_head);
			}
		}
		up_write(&cathandle->lgh_lock);
	} else if (cathandle->u.chd.chd_next_log == NULL) {
		/* declare next plain llog */
		down_write(&cathandle->lgh_lock);
		if (cathandle->u.chd.chd_next_log == NULL) {
			rc = llog_open(env, cathandle->lgh_ctxt, &loghandle,
				       NULL, NULL, LLOG_OPEN_NEW);
			if (rc == 0) {
				cathandle->u.chd.chd_next_log = loghandle;
				list_add_tail(&loghandle->u.phd.phd_entry,
					      &cathandle->u.chd.chd_head);
			}
		}
		up_write(&cathandle->lgh_lock);
	}
	if (rc)
		GOTO(out, rc);

	if (!llog_exist(cathandle->u.chd.chd_current_log)) {
		rc = llog_declare_create(env, cathandle->u.chd.chd_current_log,
					 th);
		if (rc)
			GOTO(out, rc);
		/* declare the catalog-record write that llog_cat_new_log()
		 * will perform; return value deliberately not checked here */
		llog_declare_write_rec(env, cathandle, NULL, -1, th);
	}
	/* declare records in the llogs */
	rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log,
				    rec, -1, th);
	if (rc)
		GOTO(out, rc);

	next = cathandle->u.chd.chd_next_log;
	if (next) {
		if (!llog_exist(next)) {
			/* NOTE(review): rc from llog_declare_create() here is
			 * overwritten by the llog_declare_write_rec() below
			 * before it is ever tested — confirm intended. */
			rc = llog_declare_create(env, next, th);
			llog_declare_write_rec(env, cathandle, NULL, -1, th);
		}
		llog_declare_write_rec(env, next, rec, -1, th);
	}
out:
	RETURN(rc);
}
EXPORT_SYMBOL(llog_cat_declare_add_rec);

/*
 * Add one record to the catalog.  On a dt-based (OSD) backend this
 * declares and runs its own local transaction; otherwise it falls back to
 * the lvfs compatibility path with th == NULL.
 */
int llog_cat_add(const struct lu_env *env, struct llog_handle *cathandle,
		 struct llog_rec_hdr *rec, struct llog_cookie *reccookie,
		 void *buf)
{
	struct llog_ctxt *ctxt;
	struct dt_device *dt;
	struct thandle *th = NULL;
	int rc;

	ctxt = cathandle->lgh_ctxt;
	LASSERT(ctxt);
	LASSERT(ctxt->loc_exp);

	if (cathandle->lgh_obj != NULL) {
		dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt;
		LASSERT(dt);

		th = dt_trans_create(env, dt);
		if (IS_ERR(th))
			RETURN(PTR_ERR(th));

		rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
		if (rc)
			GOTO(out_trans, rc);

		rc = dt_trans_start_local(env, dt, th);
		if (rc)
			GOTO(out_trans, rc);
		rc = llog_cat_add_rec(env, cathandle, rec, reccookie, buf, th);
out_trans:
		dt_trans_stop(env, dt, th);
	} else { /* lvfs compat code */
		LASSERT(cathandle->lgh_file != NULL);
		rc = llog_cat_declare_add_rec(env, cathandle, rec, th);
		if (rc == 0)
			rc = llog_cat_add_rec(env, cathandle, rec, reccookie,
					      buf, th);
	}
	RETURN(rc);
}
EXPORT_SYMBOL(llog_cat_add);

/* For each cookie in the cookie array, we clear the log in-use bit and either:
 * - the log is empty, so mark it free in the catalog header and delete it
 * - the log is not empty, just write out the log header
 *
 * The cookies may be in different log files, so we need to get new logs
 * each time.
 *
 * Assumes caller has already pushed us into the kernel context.
 */
int llog_cat_cancel_records(const struct lu_env *env,
			    struct llog_handle *cathandle, int count,
			    struct llog_cookie *cookies)
{
	int i, index, rc = 0, failed = 0;

	ENTRY;

	/* Best-effort loop: a failure on one cookie does not stop the rest;
	 * the last error (with -ENOENT given lowest priority) is returned. */
	for (i = 0; i < count; i++, cookies++) {
		struct llog_handle *loghandle;
		struct llog_logid *lgl = &cookies->lgc_lgl;
		int lrc;

		rc = llog_cat_id2handle(env, cathandle, &loghandle, lgl);
		if (rc) {
			CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
			       cathandle->lgh_ctxt->loc_obd->obd_name,
			       POSTID(&lgl->lgl_oi), rc);
			failed++;
			continue;
		}

		lrc = llog_cancel_rec(env, loghandle, cookies->lgc_index);
		if (lrc == 1) {	/* log has been destroyed */
			index = loghandle->u.phd.phd_cookie.lgc_index;
			rc = llog_cat_cleanup(env, cathandle, loghandle,
					      index);
		} else if (lrc == -ENOENT) {
			if (rc == 0) /* ENOENT shouldn't rewrite any error */
				rc = lrc;
		} else if (lrc < 0) {
			failed++;
			rc = lrc;
		}
		llog_handle_put(loghandle);
	}
	if (rc)
		CERROR("%s: fail to cancel %d of %d llog-records: rc = %d\n",
		       cathandle->lgh_ctxt->loc_obd->obd_name, failed, count,
		       rc);

	RETURN(rc);
}
EXPORT_SYMBOL(llog_cat_cancel_records);

/*
 * llog_process() callback invoked for each catalog record: opens the
 * referenced plain llog and runs the user callback over its records.
 */
int llog_cat_process_cb(const struct lu_env *env, struct llog_handle *cat_llh,
			struct llog_rec_hdr *rec, void *data)
{
	struct llog_process_data *d = data;
struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct llog_handle *llh; + int rc; + + ENTRY; + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog " + DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen, + rec->lrh_index, POSTID(&cat_llh->lgh_id.lgl_oi)); + + rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id); + if (rc) { + CERROR("%s: cannot find handle for llog "DOSTID": %d\n", + cat_llh->lgh_ctxt->loc_obd->obd_name, + POSTID(&lir->lid_id.lgl_oi), rc); + RETURN(rc); + } + + if (rec->lrh_index < d->lpd_startcat) + /* Skip processing of the logs until startcat */ + RETURN(0); + + if (d->lpd_startidx > 0) { + struct llog_process_cat_data cd; + + cd.lpcd_first_idx = d->lpd_startidx; + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + &cd, false); + /* Continue processing the next log from idx 0 */ + d->lpd_startidx = 0; + } else { + rc = llog_process_or_fork(env, llh, d->lpd_cb, d->lpd_data, + NULL, false); + } + llog_handle_put(llh); + + RETURN(rc); +} + +int llog_cat_process_or_fork(const struct lu_env *env, + struct llog_handle *cat_llh, + llog_cb_t cb, void *data, int startcat, + int startidx, bool fork) +{ + struct llog_process_data d; + struct llog_log_hdr *llh = cat_llh->lgh_hdr; + int rc; + ENTRY; + + LASSERT(llh->llh_flags & LLOG_F_IS_CAT); + d.lpd_data = data; + d.lpd_cb = cb; + d.lpd_startcat = startcat; + d.lpd_startidx = startidx; + + if (llh->llh_cat_idx > cat_llh->lgh_last_idx) { + struct llog_process_cat_data cd; + + CWARN("catlog "DOSTID" crosses index zero\n", + POSTID(&cat_llh->lgh_id.lgl_oi)); + + cd.lpcd_first_idx = llh->llh_cat_idx; + cd.lpcd_last_idx = 0; + rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb, + &d, &cd, fork); + if (rc != 0) + RETURN(rc); + + cd.lpcd_first_idx = 0; + cd.lpcd_last_idx = cat_llh->lgh_last_idx; + rc = 
		     llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
					  &d, &cd, fork);
	} else {
		rc = llog_process_or_fork(env, cat_llh, llog_cat_process_cb,
					  &d, NULL, fork);
	}

	RETURN(rc);
}
EXPORT_SYMBOL(llog_cat_process_or_fork);

/* Convenience wrapper: process the catalog in the current thread. */
int llog_cat_process(const struct lu_env *env, struct llog_handle *cat_llh,
		     llog_cb_t cb, void *data, int startcat, int startidx)
{
	return llog_cat_process_or_fork(env, cat_llh, cb, data, startcat,
					startidx, false);
}
EXPORT_SYMBOL(llog_cat_process);

/*
 * Reverse-iteration counterpart of llog_cat_process_cb(): opens the plain
 * llog named by a catalog record and reverse-processes its records.
 * Note this callback byte-swaps lrh_type/lrh_index with le32_to_cpu()
 * where the forward callback reads them natively.
 */
static int llog_cat_reverse_process_cb(const struct lu_env *env,
				       struct llog_handle *cat_llh,
				       struct llog_rec_hdr *rec, void *data)
{
	struct llog_process_data *d = data;
	struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
	struct llog_handle *llh;
	int rc;

	if (le32_to_cpu(rec->lrh_type) != LLOG_LOGID_MAGIC) {
		CERROR("invalid record in catalog\n");
		RETURN(-EINVAL);
	}
	CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
	       DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
	       le32_to_cpu(rec->lrh_index), POSTID(&cat_llh->lgh_id.lgl_oi));

	rc = llog_cat_id2handle(env, cat_llh, &llh, &lir->lid_id);
	if (rc) {
		CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
		       cat_llh->lgh_ctxt->loc_obd->obd_name,
		       POSTID(&lir->lid_id.lgl_oi), rc);
		RETURN(rc);
	}

	rc = llog_reverse_process(env, llh, d->lpd_cb, d->lpd_data, NULL);
	llog_handle_put(llh);
	RETURN(rc);
}

/*
 * Process the whole catalog in reverse record order.  Like the forward
 * walk, a catalog that wrapped past index zero is handled in two passes,
 * newest range first.
 */
int llog_cat_reverse_process(const struct lu_env *env,
			     struct llog_handle *cat_llh,
			     llog_cb_t cb, void *data)
{
	struct llog_process_data d;
	struct llog_process_cat_data cd;
	struct llog_log_hdr *llh = cat_llh->lgh_hdr;
	int rc;
	ENTRY;

	LASSERT(llh->llh_flags & LLOG_F_IS_CAT);
	d.lpd_data = data;
	d.lpd_cb = cb;

	if (llh->llh_cat_idx > cat_llh->lgh_last_idx) {
		CWARN("catalog "DOSTID" crosses index zero\n",
		      POSTID(&cat_llh->lgh_id.lgl_oi));

		cd.lpcd_first_idx = 0;
		cd.lpcd_last_idx = cat_llh->lgh_last_idx;
		rc =
		     llog_reverse_process(env, cat_llh,
					  llog_cat_reverse_process_cb,
					  &d, &cd);
		if (rc != 0)
			RETURN(rc);

		cd.lpcd_first_idx = le32_to_cpu(llh->llh_cat_idx);
		cd.lpcd_last_idx = 0;
		rc = llog_reverse_process(env, cat_llh,
					  llog_cat_reverse_process_cb,
					  &d, &cd);
	} else {
		rc = llog_reverse_process(env, cat_llh,
					  llog_cat_reverse_process_cb,
					  &d, NULL);
	}

	RETURN(rc);
}
EXPORT_SYMBOL(llog_cat_reverse_process);

/*
 * Advance llh_cat_idx (the oldest live catalog slot) after slot @index was
 * cancelled.  Only acts when @index is exactly the slot following the
 * current llh_cat_idx; it then additionally skips over any further slots
 * whose in-use bit is already clear, wrapping through the bitmap.
 * Always returns 0.
 */
int llog_cat_set_first_idx(struct llog_handle *cathandle, int index)
{
	struct llog_log_hdr *llh = cathandle->lgh_hdr;
	int i, bitmap_size, idx;
	ENTRY;

	bitmap_size = LLOG_BITMAP_SIZE(llh);
	if (llh->llh_cat_idx == (index - 1)) {
		idx = llh->llh_cat_idx + 1;
		llh->llh_cat_idx = idx;
		if (idx == cathandle->lgh_last_idx)
			goto out;
		/* Coalesce: keep advancing past already-free slots until the
		 * last used slot or a still-set bit is reached. */
		for (i = (index + 1) % bitmap_size;
		     i != cathandle->lgh_last_idx;
		     i = (i + 1) % bitmap_size) {
			if (!ext2_test_bit(i, llh->llh_bitmap)) {
				idx = llh->llh_cat_idx + 1;
				llh->llh_cat_idx = idx;
			} else if (i == 0) {
				/* wrapped around: restart counting at zero */
				llh->llh_cat_idx = 0;
			} else {
				break;
			}
		}
out:
		CDEBUG(D_RPCTRACE, "set catlog "DOSTID" first idx %u\n",
		       POSTID(&cathandle->lgh_id.lgl_oi), llh->llh_cat_idx);
	}

	RETURN(0);
}

/* Cleanup deleted plain llog traces from catalog */
int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
		     struct llog_handle *loghandle, int index)
{
	int rc;

	LASSERT(index);
	if (loghandle != NULL) {
		/* remove destroyed llog from catalog list and
		 * chd_current_log variable */
		down_write(&cathandle->lgh_lock);
		if (cathandle->u.chd.chd_current_log == loghandle)
			cathandle->u.chd.chd_current_log = NULL;
		list_del_init(&loghandle->u.phd.phd_entry);
		up_write(&cathandle->lgh_lock);
		LASSERT(index == loghandle->u.phd.phd_cookie.lgc_index);
		/* llog was opened and keep in a list, close it now */
		llog_close(env, loghandle);
	}
	/* remove plain llog entry from catalog by index */
	llog_cat_set_first_idx(cathandle, index);
	rc =
	     llog_cancel_rec(env, cathandle, index);
	if (rc == 0)
		CDEBUG(D_HA, "cancel plain log at index"
		       " %u of catalog "DOSTID"\n",
		       index, POSTID(&cathandle->lgh_id.lgl_oi));
	return rc;
}

/*
 * Catalog-processing callback used at setup time: destroys plain llogs
 * that are flagged LLOG_F_ZAP_WHEN_EMPTY and hold only their header
 * record, and prunes catalog entries pointing at vanished logs.
 */
int cat_cancel_cb(const struct lu_env *env, struct llog_handle *cathandle,
		  struct llog_rec_hdr *rec, void *data)
{
	struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
	struct llog_handle *loghandle;
	struct llog_log_hdr *llh;
	int rc;

	ENTRY;

	if (rec->lrh_type != LLOG_LOGID_MAGIC) {
		CERROR("invalid record in catalog\n");
		RETURN(-EINVAL);
	}

	CDEBUG(D_HA, "processing log "DOSTID":%x at index %u of catalog "
	       DOSTID"\n", POSTID(&lir->lid_id.lgl_oi), lir->lid_id.lgl_ogen,
	       rec->lrh_index, POSTID(&cathandle->lgh_id.lgl_oi));

	rc = llog_cat_id2handle(env, cathandle, &loghandle, &lir->lid_id);
	if (rc) {
		CERROR("%s: cannot find handle for llog "DOSTID": %d\n",
		       cathandle->lgh_ctxt->loc_obd->obd_name,
		       POSTID(&lir->lid_id.lgl_oi), rc);
		if (rc == -ENOENT || rc == -ESTALE) {
			/* remove index from catalog */
			llog_cat_cleanup(env, cathandle, NULL, rec->lrh_index);
		}
		RETURN(rc);
	}

	llh = loghandle->lgh_hdr;
	/* llh_count == 1 means only the header record remains: empty log */
	if ((llh->llh_flags & LLOG_F_ZAP_WHEN_EMPTY) &&
	    (llh->llh_count == 1)) {
		rc = llog_destroy(env, loghandle);
		if (rc)
			CERROR("%s: fail to destroy empty log: rc = %d\n",
			       loghandle->lgh_ctxt->loc_obd->obd_name, rc);

		llog_cat_cleanup(env, cathandle, loghandle,
				 loghandle->u.phd.phd_cookie.lgc_index);
	}
	llog_handle_put(loghandle);

	RETURN(rc);
}
EXPORT_SYMBOL(cat_cancel_cb);

/* helper to initialize catalog llog and process it to cancel */
int llog_cat_init_and_process(const struct lu_env *env,
			      struct llog_handle *llh)
{
	int rc;

	rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, NULL);
	if (rc)
		RETURN(rc);

	/* Processing failures here are logged but NOT propagated: the
	 * function still returns 0 below (only llog_init_handle() errors
	 * are fatal). */
	rc = llog_process_or_fork(env, llh, cat_cancel_cb, NULL, NULL, false);
	if (rc)
		CERROR("%s: llog_process() with cat_cancel_cb failed: rc = "
		       "%d\n", llh->lgh_ctxt->loc_obd->obd_name, rc);
RETURN(0); +} +EXPORT_SYMBOL(llog_cat_init_and_process); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_internal.h b/drivers/staging/lustre/lustre/obdclass/llog_internal.h new file mode 100644 index 000000000000..539e1d4f9d4c --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_internal.h @@ -0,0 +1,98 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
 */

#ifndef __LLOG_INTERNAL_H__
#define __LLOG_INTERNAL_H__

#include <lustre_log.h>

/* Arguments/results ferried to the thread spawned by llog_process_or_fork(). */
struct llog_process_info {
	struct llog_handle *lpi_loghandle;	/* log being processed */
	llog_cb_t	    lpi_cb;		/* per-record callback */
	void		   *lpi_cbdata;		/* callback private data */
	void		   *lpi_catdata;	/* catalog iteration data */
	int		    lpi_rc;		/* processing result code */
	struct completion   lpi_completion;	/* completed when done */
	const struct lu_env *lpi_env;

};

/* Per-thread scratch state for llog code, keyed by llog_thread_key. */
struct llog_thread_info {
	struct lu_attr			 lgi_attr;
	struct lu_fid			 lgi_fid;
	struct dt_object_format		 lgi_dof;
	struct lu_buf			 lgi_buf;
	loff_t				 lgi_off;
	struct llog_rec_hdr		 lgi_lrh;
	struct llog_rec_tail		 lgi_tail;
};

extern struct lu_context_key llog_thread_key;

/* Fetch this env's llog_thread_info; must have been set up via the key. */
static inline struct llog_thread_info *llog_info(const struct lu_env *env)
{
	struct llog_thread_info *lgi;

	lgi = lu_context_key_get(&env->le_ctx, &llog_thread_key);
	LASSERT(lgi);
	return lgi;
}

/* Build a llog logid from an lvfs inode number and generation. */
static inline void
lustre_build_llog_lvfs_oid(struct llog_logid *logid, __u64 ino, __u32 gen)
{
	ostid_set_seq_llog(&logid->lgl_oi);
	ostid_set_id(&logid->lgl_oi, ino);
	logid->lgl_ogen = gen;
}

int llog_info_init(void);
void llog_info_fini(void);

void llog_handle_get(struct llog_handle *loghandle);
void llog_handle_put(struct llog_handle *loghandle);
int llog_cat_id2handle(const struct lu_env *env, struct llog_handle *cathandle,
		       struct llog_handle **res, struct llog_logid *logid);
int class_config_dump_handler(const struct lu_env *env,
			      struct llog_handle *handle,
			      struct llog_rec_hdr *rec, void *data);
int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size);
int llog_process_or_fork(const struct lu_env *env,
			 struct llog_handle *loghandle,
			 llog_cb_t cb, void *data, void *catdata, bool fork);
int llog_cat_cleanup(const struct lu_env *env, struct llog_handle *cathandle,
		     struct llog_handle *loghandle, int index);
#endif
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c b/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c
new file mode 100644
index 000000000000..0732874e26c5
--- /dev/null
+++
b/drivers/staging/lustre/lustre/obdclass/llog_ioctl.c @@ -0,0 +1,427 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
 */

#define DEBUG_SUBSYSTEM S_LOG

#include <obd_class.h>
#include <lustre_log.h>
#include "llog_internal.h"

/*
 * Parse a "#<id>#<seq>#<ogen>" string into @logid; <ogen> is hex, the
 * others accept any base recognized by simple_strtoull().
 * NOTE: parses in place — the '#' separators in @str are overwritten with
 * NUL bytes, so the caller's buffer is modified.  Returns 0 or -EINVAL.
 */
static int str2logid(struct llog_logid *logid, char *str, int len)
{
	char *start, *end, *endp;
	__u64 id, seq;

	ENTRY;
	start = str;
	if (*start != '#')
		RETURN(-EINVAL);

	start++;
	if (start - str >= len - 1)
		RETURN(-EINVAL);
	end = strchr(start, '#');
	if (end == NULL || end == start)
		RETURN(-EINVAL);

	*end = '\0';
	id = simple_strtoull(start, &endp, 0);
	if (endp != end)
		RETURN(-EINVAL);

	start = ++end;
	if (start - str >= len - 1)
		RETURN(-EINVAL);
	end = strchr(start, '#');
	if (end == NULL || end == start)
		RETURN(-EINVAL);

	*end = '\0';
	seq = simple_strtoull(start, &endp, 0);
	if (endp != end)
		RETURN(-EINVAL);

	ostid_set_seq(&logid->lgl_oi, seq);
	ostid_set_id(&logid->lgl_oi, id);

	start = ++end;
	if (start - str >= len - 1)
		RETURN(-EINVAL);
	logid->lgl_ogen = simple_strtoul(start, &endp, 16);
	if (*endp != '\0')
		RETURN(-EINVAL);

	RETURN(0);
}

/*
 * llog_process() callback for OBD_IOC_LLOG_CHECK: validates record types
 * and prints a summary line per record into the ioctl bulk buffer.
 * NOTE(review): carries state (buffer cursor, index range) across
 * invocations in function-scope "static" locals — not reentrant and not
 * safe for concurrent ioctls; confirm callers serialize this path.
 */
static int llog_check_cb(const struct lu_env *env, struct llog_handle *handle,
			 struct llog_rec_hdr *rec, void *data)
{
	struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data;
	static int l, remains, from, to;
	static char *out;
	char *endp;
	int cur_index, rc = 0;

	ENTRY;

	/* First invocation (inllen1 still set): initialize the static
	 * cursor/range state from the ioctl arguments, then clear inllen1
	 * so later invocations skip this block. */
	if (ioc_data && ioc_data->ioc_inllen1 > 0) {
		l = 0;
		remains = ioc_data->ioc_inllen4 +
			  cfs_size_round(ioc_data->ioc_inllen1) +
			  cfs_size_round(ioc_data->ioc_inllen2) +
			  cfs_size_round(ioc_data->ioc_inllen3);
		from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0);
		if (*endp != '\0')
			RETURN(-EINVAL);
		to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0);
		if (*endp != '\0')
			RETURN(-EINVAL);
		ioc_data->ioc_inllen1 = 0;
		out = ioc_data->ioc_bulk;
	}

	cur_index = rec->lrh_index;
	if (cur_index < from)
		RETURN(0);
	if (to > 0 && cur_index > to)
		RETURN(-LLOG_EEMPTY);

	if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) {
		struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
		struct llog_handle *loghandle;

		if (rec->lrh_type != LLOG_LOGID_MAGIC) {
			/* NOTE(review): this only formats a "failed" line;
			 * "out"/"remains" are not advanced and execution
			 * falls through to llog_cat_id2handle() with a
			 * record that is not a logid — confirm intended. */
			l = snprintf(out, remains, "[index]: %05d [type]: "
				     "%02x [len]: %04d failed\n",
				     cur_index, rec->lrh_type,
				     rec->lrh_len);
		}
		if (handle->lgh_ctxt == NULL)
			RETURN(-EOPNOTSUPP);
		/* Recurse into the referenced plain log and check it too. */
		rc = llog_cat_id2handle(env, handle, &loghandle, &lir->lid_id);
		if (rc) {
			CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n",
			       POSTID(&lir->lid_id.lgl_oi),
			       lir->lid_id.lgl_ogen);
			RETURN(rc);
		}
		rc = llog_process(env, loghandle, llog_check_cb, NULL, NULL);
		llog_handle_put(loghandle);
	} else {
		bool ok;

		/* Plain log record: only the known record types are "ok". */
		switch (rec->lrh_type) {
		case OST_SZ_REC:
		case MDS_UNLINK_REC:
		case MDS_UNLINK64_REC:
		case MDS_SETATTR64_REC:
		case OBD_CFG_REC:
		case LLOG_GEN_REC:
		case LLOG_HDR_MAGIC:
			ok = true;
			break;
		default:
			ok = false;
		}

		l = snprintf(out, remains, "[index]: %05d [type]: "
			     "%02x [len]: %04d %s\n",
			     cur_index, rec->lrh_type, rec->lrh_len,
			     ok ?
			     "ok" : "failed");
		out += l;
		remains -= l;
		if (remains <= 0) {
			CERROR("%s: no space to print log records\n",
			       handle->lgh_ctxt->loc_obd->obd_name);
			RETURN(-LLOG_EEMPTY);
		}
	}
	RETURN(rc);
}

/*
 * llog_process() callback for OBD_IOC_LLOG_PRINT: pretty-prints each
 * record (catalog entry, config record, or generic summary) into the
 * ioctl bulk buffer.  Shares the static-local state pattern (and its
 * reentrancy caveat) with llog_check_cb() above.
 */
static int llog_print_cb(const struct lu_env *env, struct llog_handle *handle,
			 struct llog_rec_hdr *rec, void *data)
{
	struct obd_ioctl_data *ioc_data = (struct obd_ioctl_data *)data;
	static int l, remains, from, to;
	static char *out;
	char *endp;
	int cur_index;

	ENTRY;
	/* First invocation: capture cursor and [from,to] index range. */
	if (ioc_data != NULL && ioc_data->ioc_inllen1 > 0) {
		l = 0;
		remains = ioc_data->ioc_inllen4 +
			  cfs_size_round(ioc_data->ioc_inllen1) +
			  cfs_size_round(ioc_data->ioc_inllen2) +
			  cfs_size_round(ioc_data->ioc_inllen3);
		from = simple_strtol(ioc_data->ioc_inlbuf2, &endp, 0);
		if (*endp != '\0')
			RETURN(-EINVAL);
		to = simple_strtol(ioc_data->ioc_inlbuf3, &endp, 0);
		if (*endp != '\0')
			RETURN(-EINVAL);
		out = ioc_data->ioc_bulk;
		ioc_data->ioc_inllen1 = 0;
	}

	cur_index = rec->lrh_index;
	if (cur_index < from)
		RETURN(0);
	if (to > 0 && cur_index > to)
		RETURN(-LLOG_EEMPTY);

	if (handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT) {
		struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;

		if (rec->lrh_type != LLOG_LOGID_MAGIC) {
			CERROR("invalid record in catalog\n");
			RETURN(-EINVAL);
		}

		l = snprintf(out, remains,
			     "[index]: %05d [logid]: #"DOSTID"#%08x\n",
			     cur_index, POSTID(&lir->lid_id.lgl_oi),
			     lir->lid_id.lgl_ogen);
	} else if (rec->lrh_type == OBD_CFG_REC) {
		int rc;

		rc = class_config_parse_rec(rec, out, remains);
		if (rc < 0)
			RETURN(rc);
		l = rc;
	} else {
		l = snprintf(out, remains,
			     "[index]: %05d [type]: %02x [len]: %04d\n",
			     cur_index, rec->lrh_type, rec->lrh_len);
	}
	out += l;
	remains -= l;
	if (remains <= 0) {
		CERROR("not enough space for print log records\n");
		RETURN(-LLOG_EEMPTY);
	}

	RETURN(0);
}
/* Destroy the plain llog @logid and remove its entry from catalog @cat. */
static int llog_remove_log(const struct lu_env *env, struct llog_handle *cat,
			   struct llog_logid *logid)
{
	struct llog_handle *log;
	int rc;

	ENTRY;

	rc = llog_cat_id2handle(env, cat, &log, logid);
	if (rc) {
		CDEBUG(D_IOCTL, "cannot find log #"DOSTID"#%08x\n",
		       POSTID(&logid->lgl_oi), logid->lgl_ogen);
		RETURN(-ENOENT);
	}

	rc = llog_destroy(env, log);
	if (rc) {
		CDEBUG(D_IOCTL, "cannot destroy log\n");
		GOTO(out, rc);
	}
	llog_cat_cleanup(env, cat, log, log->u.phd.phd_cookie.lgc_index);
out:
	llog_handle_put(log);
	RETURN(rc);

}

/* Catalog callback for OBD_IOC_LLOG_REMOVE: destroy each referenced log. */
static int llog_delete_cb(const struct lu_env *env, struct llog_handle *handle,
			  struct llog_rec_hdr *rec, void *data)
{
	struct llog_logid_rec *lir = (struct llog_logid_rec *)rec;
	int rc;

	ENTRY;
	if (rec->lrh_type != LLOG_LOGID_MAGIC)
		RETURN(-EINVAL);
	rc = llog_remove_log(env, handle, &lir->lid_id);

	RETURN(rc);
}


/*
 * Dispatch llog ioctls (INFO/CHECK/PRINT/CANCEL/REMOVE).  The target log
 * is named in ioc_inlbuf1 either by "#id#seq#ogen" logid or by "$name".
 */
int llog_ioctl(const struct lu_env *env, struct llog_ctxt *ctxt, int cmd,
	       struct obd_ioctl_data *data)
{
	struct llog_logid logid;
	int rc = 0;
	struct llog_handle *handle = NULL;

	ENTRY;

	if (*data->ioc_inlbuf1 == '#') {
		rc = str2logid(&logid, data->ioc_inlbuf1, data->ioc_inllen1);
		if (rc)
			RETURN(rc);
		rc = llog_open(env, ctxt, &handle, &logid, NULL,
			       LLOG_OPEN_EXISTS);
		if (rc)
			RETURN(rc);
	} else if (*data->ioc_inlbuf1 == '$') {
		char *name = data->ioc_inlbuf1 + 1;

		rc = llog_open(env, ctxt, &handle, NULL, name,
			       LLOG_OPEN_EXISTS);
		if (rc)
			RETURN(rc);
	} else {
		RETURN(-EINVAL);
	}

	rc = llog_init_handle(env, handle, 0, NULL);
	if (rc)
		GOTO(out_close, rc = -ENOENT);

	switch (cmd) {
	case OBD_IOC_LLOG_INFO: {
		int l;
		int remains = data->ioc_inllen2 +
			      cfs_size_round(data->ioc_inllen1);
		char *out = data->ioc_bulk;

		l = snprintf(out, remains,
			     "logid: #"DOSTID"#%08x\n"
			     "flags: %x (%s)\n"
			     "records count: %d\n"
			     "last index: %d\n",
			     POSTID(&handle->lgh_id.lgl_oi),
			     handle->lgh_id.lgl_ogen,
			     handle->lgh_hdr->llh_flags,
			     handle->lgh_hdr->llh_flags &
			     LLOG_F_IS_CAT ?
"cat" : "plain", + handle->lgh_hdr->llh_count, + handle->lgh_last_idx); + out += l; + remains -= l; + if (remains <= 0) { + CERROR("%s: not enough space for log header info\n", + ctxt->loc_obd->obd_name); + rc = -ENOSPC; + } + break; + } + case OBD_IOC_LLOG_CHECK: + LASSERT(data->ioc_inllen1 > 0); + rc = llog_process(env, handle, llog_check_cb, data, NULL); + if (rc == -LLOG_EEMPTY) + rc = 0; + else if (rc) + GOTO(out_close, rc); + break; + case OBD_IOC_LLOG_PRINT: + LASSERT(data->ioc_inllen1 > 0); + rc = llog_process(env, handle, llog_print_cb, data, NULL); + if (rc == -LLOG_EEMPTY) + rc = 0; + else if (rc) + GOTO(out_close, rc); + break; + case OBD_IOC_LLOG_CANCEL: { + struct llog_cookie cookie; + struct llog_logid plain; + char *endp; + + cookie.lgc_index = simple_strtoul(data->ioc_inlbuf3, &endp, 0); + if (*endp != '\0') + GOTO(out_close, rc = -EINVAL); + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { + rc = llog_cancel_rec(NULL, handle, cookie.lgc_index); + GOTO(out_close, rc); + } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) { + GOTO(out_close, rc = -EINVAL); + } + + if (data->ioc_inlbuf2 == NULL) /* catalog but no logid */ + GOTO(out_close, rc = -ENOTTY); + + rc = str2logid(&plain, data->ioc_inlbuf2, data->ioc_inllen2); + if (rc) + GOTO(out_close, rc); + cookie.lgc_lgl = plain; + rc = llog_cat_cancel_records(env, handle, 1, &cookie); + if (rc) + GOTO(out_close, rc); + break; + } + case OBD_IOC_LLOG_REMOVE: { + struct llog_logid plain; + + if (handle->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN) { + rc = llog_destroy(env, handle); + GOTO(out_close, rc); + } else if (!(handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)) { + GOTO(out_close, rc = -EINVAL); + } + + if (data->ioc_inlbuf2 > 0) { + /* remove indicate log from the catalog */ + rc = str2logid(&plain, data->ioc_inlbuf2, + data->ioc_inllen2); + if (rc) + GOTO(out_close, rc); + rc = llog_remove_log(env, handle, &plain); + } else { + /* remove all the log of the catalog */ + rc = llog_process(env, 
					  handle, llog_delete_cb, NULL,
					  NULL);
			if (rc)
				GOTO(out_close, rc);
		}
		break;
	}
	default:
		CERROR("%s: Unknown ioctl cmd %#x\n",
		       ctxt->loc_obd->obd_name, cmd);
		GOTO(out_close, rc = -ENOTTY);
	}

out_close:
	/* a catalog must be closed via llog_cat_close() so its plain log
	 * handles are released too */
	if (handle->lgh_hdr &&
	    handle->lgh_hdr->llh_flags & LLOG_F_IS_CAT)
		llog_cat_close(env, handle);
	else
		llog_close(env, handle);
	RETURN(rc);
}
EXPORT_SYMBOL(llog_ioctl);
diff --git a/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c b/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c
new file mode 100644
index 000000000000..7e12dc62141f
--- /dev/null
+++ b/drivers/staging/lustre/lustre/obdclass/llog_lvfs.c
@@ -0,0 +1,862 @@
/*
 * GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see
 * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
 *
 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
 * CA 95054 USA or visit www.sun.com if you need additional information or
 * have any questions.
 *
 * GPL HEADER END
 */
/*
 * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright (c) 2011, 2012, Intel Corporation.
 */
/*
 * This file is part of Lustre, http://www.lustre.org/
 * Lustre is a trademark of Sun Microsystems, Inc.
 *
 * lustre/obdclass/llog_lvfs.c
 *
 * OST<->MDS recovery logging infrastructure.
 * Invariants in implementation:
 * - we do not share logs among different OST<->MDS connections, so that
 *  if an OST or MDS fails it need only look at log(s) relevant to itself
 *
 * Author: Andreas Dilger <adilger@clusterfs.com>
 */

#define DEBUG_SUBSYSTEM S_LOG


#include <obd.h>
#include <obd_class.h>
#include <lustre_log.h>
#include <obd_ost.h>
#include <linux/list.h>
#include <lvfs.h>
#include <lustre_fsfilt.h>
#include <lustre_disk.h>
#include "llog_internal.h"

#if defined(LLOG_LVFS)

/*
 * Write a pad record of @len bytes at the current file position so the
 * next real record starts on a fresh chunk.  The pad consists of a
 * LLOG_PAD_MAGIC header, a gap, and a matching tail; @index is recorded
 * in both, and no bitmap bit is set for it.
 */
static int llog_lvfs_pad(struct obd_device *obd, struct l_file *file,
			 int len, int index)
{
	struct llog_rec_hdr rec = { 0 };
	struct llog_rec_tail tail;
	int rc;
	ENTRY;

	/* pad length must be at least a minimal record and 8-byte aligned */
	LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0);

	tail.lrt_len = rec.lrh_len = len;
	tail.lrt_index = rec.lrh_index = index;
	rec.lrh_type = LLOG_PAD_MAGIC;

	rc = fsfilt_write_record(obd, file, &rec, sizeof(rec), &file->f_pos, 0);
	if (rc) {
		CERROR("error writing padding record: rc %d\n", rc);
		goto out;
	}

	/* skip the unwritten middle of the pad; only header+tail hit disk */
	file->f_pos += len - sizeof(rec) - sizeof(tail);
	rc = fsfilt_write_record(obd, file, &tail, sizeof(tail), &file->f_pos, 0);
	if (rc) {
		CERROR("error writing padding record: rc %d\n", rc);
		goto out;
	}

 out:
	RETURN(rc);
}

/*
 * Write one log record at offset @off.  With buf == NULL, @rec is written
 * as-is; otherwise a header / payload / tail triple is written and
 * rec->lrh_len is updated to the framed length.  f_pos is restored if the
 * write landed below the previous position (header rewrite case).
 */
static int llog_lvfs_write_blob(struct obd_device *obd, struct l_file *file,
				struct llog_rec_hdr *rec, void *buf, loff_t off)
{
	int rc;
	struct llog_rec_tail end;
	loff_t saved_off = file->f_pos;
	int buflen = rec->lrh_len;

	ENTRY;

	file->f_pos = off;

	if (buflen == 0)
		CWARN("0-length record\n");

	if (!buf) {
		rc = fsfilt_write_record(obd, file, rec, buflen, &file->f_pos, 0);
		if (rc) {
			CERROR("error writing log record: rc %d\n", rc);
			goto out;
		}
		GOTO(out, rc = 0);
	}

	/* the buf case */
	rec->lrh_len = sizeof(*rec) + buflen + sizeof(end);
	rc = fsfilt_write_record(obd, file, rec, sizeof(*rec),
				 &file->f_pos, 0);
	if (rc) {
		CERROR("error writing log hdr: rc %d\n", rc);
		goto out;
	}

	rc = fsfilt_write_record(obd, file, buf, buflen, &file->f_pos, 0);
	if (rc) {
		CERROR("error writing log buffer: rc %d\n", rc);
		goto out;
	}

	end.lrt_len = rec->lrh_len;
	end.lrt_index = rec->lrh_index;
	rc = fsfilt_write_record(obd, file, &end, sizeof(end), &file->f_pos, 0);
	if (rc) {
		CERROR("error writing log tail: rc %d\n", rc);
		goto out;
	}

	rc = 0;
 out:
	/* never move f_pos backwards: keep the high-water mark for appends */
	if (saved_off > file->f_pos)
		file->f_pos = saved_off;
	LASSERT(rc <= 0);
	RETURN(rc);
}

/* Read @size bytes at offset @off into @buf without moving file->f_pos. */
static int llog_lvfs_read_blob(struct obd_device *obd, struct l_file *file,
			       void *buf, int size, loff_t off)
{
	loff_t offset = off;
	int rc;
	ENTRY;

	rc = fsfilt_read_record(obd, file, buf, size, &offset);
	if (rc) {
		CERROR("error reading log record: rc %d\n", rc);
		RETURN(rc);
	}
	RETURN(0);
}

/*
 * Read and validate the llog header chunk, byte-swapping it if it was
 * written with the other endianness, and position lgh_last_idx / f_pos
 * for appending.  Returns LLOG_EEMPTY for a 0-byte log, -EIO on a bad
 * magic or chunk size.
 */
static int llog_lvfs_read_header(const struct lu_env *env,
				 struct llog_handle *handle)
{
	struct obd_device *obd;
	int rc;
	ENTRY;

	LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE);

	obd = handle->lgh_ctxt->loc_exp->exp_obd;

	if (i_size_read(handle->lgh_file->f_dentry->d_inode) == 0) {
		CDEBUG(D_HA, "not reading header from 0-byte log\n");
		RETURN(LLOG_EEMPTY);
	}

	rc = llog_lvfs_read_blob(obd, handle->lgh_file, handle->lgh_hdr,
				 LLOG_CHUNK_SIZE, 0);
	if (rc) {
		CERROR("error reading log header from %.*s\n",
		       handle->lgh_file->f_dentry->d_name.len,
		       handle->lgh_file->f_dentry->d_name.name);
	} else {
		struct llog_rec_hdr *llh_hdr = &handle->lgh_hdr->llh_hdr;

		if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr))
			lustre_swab_llog_hdr(handle->lgh_hdr);

		if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) {
			CERROR("bad log %.*s header magic: %#x (expected %#x)\n",
			       handle->lgh_file->f_dentry->d_name.len,
			       handle->lgh_file->f_dentry->d_name.name,
			       llh_hdr->lrh_type, LLOG_HDR_MAGIC);
			rc = -EIO;
		} else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) {
			CERROR("incorrectly
sized log %.*s header: %#x " + "(expected %#x)\n", + handle->lgh_file->f_dentry->d_name.len, + handle->lgh_file->f_dentry->d_name.name, + llh_hdr->lrh_len, LLOG_CHUNK_SIZE); + CERROR("you may need to re-run lconf --write_conf.\n"); + rc = -EIO; + } + } + + handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index; + handle->lgh_file->f_pos = i_size_read(handle->lgh_file->f_dentry->d_inode); + + RETURN(rc); +} + +/* returns negative in on error; 0 if success && reccookie == 0; 1 otherwise */ +/* appends if idx == -1, otherwise overwrites record idx. */ +static int llog_lvfs_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *reccookie, int cookiecount, + void *buf, int idx, struct thandle *th) +{ + struct llog_log_hdr *llh; + int reclen = rec->lrh_len, index, rc; + struct llog_rec_tail *lrt; + struct obd_device *obd; + struct file *file; + size_t left; + ENTRY; + + llh = loghandle->lgh_hdr; + file = loghandle->lgh_file; + obd = loghandle->lgh_ctxt->loc_exp->exp_obd; + + /* record length should not bigger than LLOG_CHUNK_SIZE */ + if (buf) + rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) - + sizeof(struct llog_rec_tail)) ? -E2BIG : 0; + else + rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0; + if (rc) + RETURN(rc); + + if (buf) + /* write_blob adds header and tail to lrh_len. 
*/ + reclen = sizeof(*rec) + rec->lrh_len + + sizeof(struct llog_rec_tail); + + if (idx != -1) { + loff_t saved_offset; + + /* no header: only allowed to insert record 1 */ + if (idx != 1 && !i_size_read(file->f_dentry->d_inode)) { + CERROR("idx != -1 in empty log\n"); + LBUG(); + } + + if (idx && llh->llh_size && llh->llh_size != rec->lrh_len) + RETURN(-EINVAL); + + if (!ext2_test_bit(idx, llh->llh_bitmap)) + CERROR("Modify unset record %u\n", idx); + if (idx != rec->lrh_index) + CERROR("Index mismatch %d %u\n", idx, rec->lrh_index); + + rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0); + /* we are done if we only write the header or on error */ + if (rc || idx == 0) + RETURN(rc); + + if (buf) { + /* We assume that caller has set lgh_cur_* */ + saved_offset = loghandle->lgh_cur_offset; + CDEBUG(D_OTHER, + "modify record "DOSTID": idx:%d/%u/%d, len:%u " + "offset %llu\n", + POSTID(&loghandle->lgh_id.lgl_oi), idx, rec->lrh_index, + loghandle->lgh_cur_idx, rec->lrh_len, + (long long)(saved_offset - sizeof(*llh))); + if (rec->lrh_index != loghandle->lgh_cur_idx) { + CERROR("modify idx mismatch %u/%d\n", + idx, loghandle->lgh_cur_idx); + RETURN(-EFAULT); + } + } else { + /* Assumes constant lrh_len */ + saved_offset = sizeof(*llh) + (idx - 1) * reclen; + } + + rc = llog_lvfs_write_blob(obd, file, rec, buf, saved_offset); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = idx; + rc = 1; + } + RETURN(rc); + } + + /* Make sure that records don't cross a chunk boundary, so we can + * process them page-at-a-time if needed. If it will cross a chunk + * boundary, write in a fake (but referenced) entry to pad the chunk. + * + * We know that llog_current_log() will return a loghandle that is + * big enough to hold reclen, so all we care about is padding here. 
+ */ + left = LLOG_CHUNK_SIZE - (file->f_pos & (LLOG_CHUNK_SIZE - 1)); + + /* NOTE: padding is a record, but no bit is set */ + if (left != 0 && left != reclen && + left < (reclen + LLOG_MIN_REC_SIZE)) { + index = loghandle->lgh_last_idx + 1; + rc = llog_lvfs_pad(obd, file, left, index); + if (rc) + RETURN(rc); + loghandle->lgh_last_idx++; /*for pad rec*/ + } + /* if it's the last idx in log file, then return -ENOSPC */ + if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1) + RETURN(-ENOSPC); + loghandle->lgh_last_idx++; + index = loghandle->lgh_last_idx; + LASSERT(index < LLOG_BITMAP_SIZE(llh)); + rec->lrh_index = index; + if (buf == NULL) { + lrt = (struct llog_rec_tail *) + ((char *)rec + rec->lrh_len - sizeof(*lrt)); + lrt->lrt_len = rec->lrh_len; + lrt->lrt_index = rec->lrh_index; + } + /*The caller should make sure only 1 process access the lgh_last_idx, + *Otherwise it might hit the assert.*/ + LASSERT(index < LLOG_BITMAP_SIZE(llh)); + spin_lock(&loghandle->lgh_hdr_lock); + if (ext2_set_bit(index, llh->llh_bitmap)) { + CERROR("argh, index %u already set in log bitmap?\n", index); + spin_unlock(&loghandle->lgh_hdr_lock); + LBUG(); /* should never happen */ + } + llh->llh_count++; + spin_unlock(&loghandle->lgh_hdr_lock); + llh->llh_tail.lrt_index = index; + + rc = llog_lvfs_write_blob(obd, file, &llh->llh_hdr, NULL, 0); + if (rc) + RETURN(rc); + + rc = llog_lvfs_write_blob(obd, file, rec, buf, file->f_pos); + if (rc) + RETURN(rc); + + CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u \n", + POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = index; + if ((rec->lrh_type == MDS_UNLINK_REC) || + (rec->lrh_type == MDS_SETATTR64_REC)) + reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT; + else if (rec->lrh_type == OST_SZ_REC) + reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT; + else + reccookie->lgc_subsys = -1; + rc = 1; + } + if (rc == 0 && rec->lrh_type == 
LLOG_GEN_REC) + rc = 1; + + RETURN(rc); +} + +/* We can skip reading at least as many log blocks as the number of +* minimum sized log records we are skipping. If it turns out +* that we are not far enough along the log (because the +* actual records are larger than minimum size) we just skip +* some more records. */ + +static void llog_skip_over(__u64 *off, int curr, int goal) +{ + if (goal <= curr) + return; + *off = (*off + (goal-curr-1) * LLOG_MIN_REC_SIZE) & + ~(LLOG_CHUNK_SIZE - 1); +} + + +/* sets: + * - cur_offset to the furthest point read in the log file + * - cur_idx to the log index preceeding cur_offset + * returns -EIO/-EINVAL on error + */ +static int llog_lvfs_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + int rc; + ENTRY; + + if (len == 0 || len & (LLOG_CHUNK_SIZE - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n", + next_idx, *cur_idx, *cur_offset); + + while (*cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + loff_t ppos; + int llen; + + llog_skip_over(cur_offset, *cur_idx, next_idx); + + /* read up to next LLOG_CHUNK_SIZE block */ + ppos = *cur_offset; + llen = LLOG_CHUNK_SIZE - (*cur_offset & (LLOG_CHUNK_SIZE - 1)); + rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd, + loghandle->lgh_file, buf, llen, + cur_offset); + if (rc < 0) { + CERROR("Cant read llog block at log id "DOSTID + "/%u offset "LPU64"\n", + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, + *cur_offset); + RETURN(rc); + } + + /* put number of bytes read into rc to make code simpler */ + rc = *cur_offset - ppos; + if (rc < len) { + /* signal the end of the valid buffer to llog_process */ + memset(buf + rc, 0, len - rc); + } + + if (rc == 0) /* end of file, nothing to do */ + RETURN(0); + + if (rc < sizeof(*tail)) { + 
CERROR("Invalid llog block at log id "DOSTID"/%u offset" + LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, *cur_offset); + RETURN(-EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)(buf + rc - + sizeof(struct llog_rec_tail)); + + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)(buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + *cur_idx = tail->lrt_index; + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("Invalid llog tail at log id "DOSTID"/%u offset " + LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, *cur_offset); + RETURN(-EINVAL); + } + if (tail->lrt_index < next_idx) + continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > next_idx) { + CERROR("missed desired record? 
%u > %u\n", + rec->lrh_index, next_idx); + RETURN(-ENOENT); + } + RETURN(0); + } + RETURN(-EIO); +} + +static int llog_lvfs_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + __u64 cur_offset; + int rc; + ENTRY; + + if (len == 0 || len & (LLOG_CHUNK_SIZE - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx); + + cur_offset = LLOG_CHUNK_SIZE; + llog_skip_over(&cur_offset, 0, prev_idx); + + while (cur_offset < i_size_read(loghandle->lgh_file->f_dentry->d_inode)) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + loff_t ppos = cur_offset; + + rc = fsfilt_read_record(loghandle->lgh_ctxt->loc_exp->exp_obd, + loghandle->lgh_file, buf, len, + &cur_offset); + if (rc < 0) { + CERROR("Cant read llog block at log id "DOSTID + "/%u offset "LPU64"\n", + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, + cur_offset); + RETURN(rc); + } + + /* put number of bytes read into rc to make code simpler */ + rc = cur_offset - ppos; + + if (rc == 0) /* end of file, nothing to do */ + RETURN(0); + + if (rc < sizeof(*tail)) { + CERROR("Invalid llog block at log id "DOSTID"/%u offset" + LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, cur_offset); + RETURN(-EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)(buf + rc - + sizeof(struct llog_rec_tail)); + + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)(buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("Invalid llog tail at log id "DOSTID"/%u offset" + LPU64"\n", POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, cur_offset); + RETURN(-EINVAL); + } + if (tail->lrt_index < prev_idx) 
+ continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > prev_idx) { + CERROR("missed desired record? %u > %u\n", + rec->lrh_index, prev_idx); + RETURN(-ENOENT); + } + RETURN(0); + } + RETURN(-EIO); +} + +static struct file *llog_filp_open(char *dir, char *name, int flags, int mode) +{ + char *logname; + struct file *filp; + int len; + + OBD_ALLOC(logname, PATH_MAX); + if (logname == NULL) + return ERR_PTR(-ENOMEM); + + len = snprintf(logname, PATH_MAX, "%s/%s", dir, name); + if (len >= PATH_MAX - 1) { + filp = ERR_PTR(-ENAMETOOLONG); + } else { + filp = l_filp_open(logname, flags, mode); + if (IS_ERR(filp) && PTR_ERR(filp) != -ENOENT) + CERROR("logfile creation %s: %ld\n", logname, + PTR_ERR(filp)); + } + OBD_FREE(logname, PATH_MAX); + return filp; +} + +static int llog_lvfs_open(const struct lu_env *env, struct llog_handle *handle, + struct llog_logid *logid, char *name, + enum llog_open_param open_param) +{ + struct llog_ctxt *ctxt = handle->lgh_ctxt; + struct l_dentry *dchild = NULL; + struct obd_device *obd; + int rc = 0; + + ENTRY; + + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + LASSERT(ctxt->loc_exp->exp_obd); + obd = ctxt->loc_exp->exp_obd; + + LASSERT(handle); + if (logid != NULL) { + dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &logid->lgl_oi, + logid->lgl_ogen); + if (IS_ERR(dchild)) { + rc = PTR_ERR(dchild); + CERROR("%s: error looking up logfile #"DOSTID "#%08x:" + " rc = %d\n", ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), logid->lgl_ogen, rc); + GOTO(out, rc); + } + if (dchild->d_inode == NULL) { + l_dput(dchild); + rc = -ENOENT; + CERROR("%s: nonexistent llog #"DOSTID"#%08x:" + "rc = %d\n", ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), logid->lgl_ogen, rc); + GOTO(out, rc); + } + handle->lgh_file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild, + O_RDWR | O_LARGEFILE); + l_dput(dchild); + if (IS_ERR(handle->lgh_file)) { + rc = 
PTR_ERR(handle->lgh_file); + handle->lgh_file = NULL; + CERROR("%s: error opening llog #"DOSTID"#%08x:" + "rc = %d\n", ctxt->loc_obd->obd_name, + POSTID(&logid->lgl_oi), logid->lgl_ogen, rc); + GOTO(out, rc); + } + handle->lgh_id = *logid; + } else if (name) { + handle->lgh_file = llog_filp_open(MOUNT_CONFIGS_DIR, name, + O_RDWR | O_LARGEFILE, 0644); + if (IS_ERR(handle->lgh_file)) { + rc = PTR_ERR(handle->lgh_file); + handle->lgh_file = NULL; + if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) { + OBD_ALLOC(handle->lgh_name, strlen(name) + 1); + if (handle->lgh_name) + strcpy(handle->lgh_name, name); + else + GOTO(out, rc = -ENOMEM); + rc = 0; + } else { + GOTO(out, rc); + } + } else { + lustre_build_llog_lvfs_oid(&handle->lgh_id, + handle->lgh_file->f_dentry->d_inode->i_ino, + handle->lgh_file->f_dentry->d_inode->i_generation); + } + } else { + LASSERTF(open_param == LLOG_OPEN_NEW, "%#x\n", open_param); + handle->lgh_file = NULL; + } + + /* No new llog is expected but doesn't exist */ + if (open_param != LLOG_OPEN_NEW && handle->lgh_file == NULL) + GOTO(out_name, rc = -ENOENT); + + RETURN(0); +out_name: + if (handle->lgh_name != NULL) + OBD_FREE(handle->lgh_name, strlen(name) + 1); +out: + RETURN(rc); +} + +static int llog_lvfs_exist(struct llog_handle *handle) +{ + return (handle->lgh_file != NULL); +} + +/* This is a callback from the llog_* functions. + * Assumes caller has already pushed us into the kernel context. 
*/ +static int llog_lvfs_create(const struct lu_env *env, + struct llog_handle *handle, + struct thandle *th) +{ + struct llog_ctxt *ctxt = handle->lgh_ctxt; + struct obd_device *obd; + struct l_dentry *dchild = NULL; + struct file *file; + struct obdo *oa = NULL; + int rc = 0; + int open_flags = O_RDWR | O_CREAT | O_LARGEFILE; + + ENTRY; + + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + obd = ctxt->loc_exp->exp_obd; + LASSERT(handle->lgh_file == NULL); + + if (handle->lgh_name) { + file = llog_filp_open(MOUNT_CONFIGS_DIR, handle->lgh_name, + open_flags, 0644); + if (IS_ERR(file)) + RETURN(PTR_ERR(file)); + + lustre_build_llog_lvfs_oid(&handle->lgh_id, + file->f_dentry->d_inode->i_ino, + file->f_dentry->d_inode->i_generation); + handle->lgh_file = file; + } else { + OBDO_ALLOC(oa); + if (oa == NULL) + RETURN(-ENOMEM); + + ostid_set_seq_llog(&oa->o_oi); + oa->o_valid = OBD_MD_FLGENER | OBD_MD_FLGROUP; + + rc = obd_create(NULL, ctxt->loc_exp, oa, NULL, NULL); + if (rc) + GOTO(out, rc); + + /* FIXME: rationalize the misuse of o_generation in + * this API along with mds_obd_{create,destroy}. + * Hopefully it is only an internal API issue. 
*/ +#define o_generation o_parent_oid + dchild = obd_lvfs_fid2dentry(ctxt->loc_exp, &oa->o_oi, + oa->o_generation); + if (IS_ERR(dchild)) + GOTO(out, rc = PTR_ERR(dchild)); + + file = l_dentry_open(&obd->obd_lvfs_ctxt, dchild, open_flags); + l_dput(dchild); + if (IS_ERR(file)) + GOTO(out, rc = PTR_ERR(file)); + handle->lgh_id.lgl_oi = oa->o_oi; + handle->lgh_id.lgl_ogen = oa->o_generation; + handle->lgh_file = file; +out: + OBDO_FREE(oa); + } + RETURN(rc); +} + +static int llog_lvfs_close(const struct lu_env *env, + struct llog_handle *handle) +{ + int rc; + + ENTRY; + + if (handle->lgh_file == NULL) + RETURN(0); + rc = filp_close(handle->lgh_file, 0); + if (rc) + CERROR("%s: error closing llog #"DOSTID"#%08x: " + "rc = %d\n", handle->lgh_ctxt->loc_obd->obd_name, + POSTID(&handle->lgh_id.lgl_oi), + handle->lgh_id.lgl_ogen, rc); + handle->lgh_file = NULL; + if (handle->lgh_name) { + OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1); + handle->lgh_name = NULL; + } + RETURN(rc); +} + +static int llog_lvfs_destroy(const struct lu_env *env, + struct llog_handle *handle) +{ + struct dentry *fdentry; + struct obdo *oa; + struct obd_device *obd = handle->lgh_ctxt->loc_exp->exp_obd; + char *dir; + void *th; + struct inode *inode; + int rc, rc1; + ENTRY; + + dir = MOUNT_CONFIGS_DIR; + + LASSERT(handle->lgh_file); + fdentry = handle->lgh_file->f_dentry; + inode = fdentry->d_parent->d_inode; + if (strcmp(fdentry->d_parent->d_name.name, dir) == 0) { + struct lvfs_run_ctxt saved; + struct vfsmount *mnt = mntget(handle->lgh_file->f_vfsmnt); + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + dget(fdentry); + rc = llog_lvfs_close(env, handle); + if (rc == 0) { + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + rc = ll_vfs_unlink(inode, fdentry, mnt); + mutex_unlock(&inode->i_mutex); + } + mntput(mnt); + + dput(fdentry); + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + RETURN(rc); + } + + OBDO_ALLOC(oa); + if (oa == NULL) + RETURN(-ENOMEM); + + oa->o_oi = 
handle->lgh_id.lgl_oi; + oa->o_generation = handle->lgh_id.lgl_ogen; +#undef o_generation + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLGENER; + + rc = llog_lvfs_close(env, handle); + if (rc) + GOTO(out, rc); + + th = fsfilt_start_log(obd, inode, FSFILT_OP_UNLINK, NULL, 1); + if (IS_ERR(th)) { + CERROR("fsfilt_start failed: %ld\n", PTR_ERR(th)); + GOTO(out, rc = PTR_ERR(th)); + } + + rc = obd_destroy(NULL, handle->lgh_ctxt->loc_exp, oa, + NULL, NULL, NULL, NULL); + + rc1 = fsfilt_commit(obd, inode, th, 0); + if (rc == 0 && rc1 != 0) + rc = rc1; + out: + OBDO_FREE(oa); + RETURN(rc); +} + +static int llog_lvfs_declare_create(const struct lu_env *env, + struct llog_handle *res, + struct thandle *th) +{ + return 0; +} + +static int llog_lvfs_declare_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + int idx, struct thandle *th) +{ + return 0; +} + +struct llog_operations llog_lvfs_ops = { + .lop_write_rec = llog_lvfs_write_rec, + .lop_next_block = llog_lvfs_next_block, + .lop_prev_block = llog_lvfs_prev_block, + .lop_read_header = llog_lvfs_read_header, + .lop_create = llog_lvfs_create, + .lop_destroy = llog_lvfs_destroy, + .lop_close = llog_lvfs_close, + .lop_open = llog_lvfs_open, + .lop_exist = llog_lvfs_exist, + .lop_declare_create = llog_lvfs_declare_create, + .lop_declare_write_rec = llog_lvfs_declare_write_rec, +}; +EXPORT_SYMBOL(llog_lvfs_ops); +#else /* !__KERNEL__ */ +struct llog_operations llog_lvfs_ops = {}; +#endif diff --git a/drivers/staging/lustre/lustre/obdclass/llog_obd.c b/drivers/staging/lustre/lustre/obdclass/llog_obd.c new file mode 100644 index 000000000000..7e2290796315 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_obd.c @@ -0,0 +1,319 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include <obd_class.h> +#include <lustre_log.h> +#include "llog_internal.h" + +/* helper functions for calling the llog obd methods */ +static struct llog_ctxt* llog_new_ctxt(struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + OBD_ALLOC_PTR(ctxt); + if (!ctxt) + return NULL; + + ctxt->loc_obd = obd; + atomic_set(&ctxt->loc_refcount, 1); + + return ctxt; +} + +static void llog_ctxt_destroy(struct llog_ctxt *ctxt) +{ + if (ctxt->loc_exp) { + class_export_put(ctxt->loc_exp); + ctxt->loc_exp = NULL; + } + if (ctxt->loc_imp) { + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = NULL; + } + OBD_FREE_PTR(ctxt); +} + +int __llog_ctxt_put(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct obd_llog_group *olg = ctxt->loc_olg; + struct obd_device *obd; + int rc = 0; + + spin_lock(&olg->olg_lock); + if (!atomic_dec_and_test(&ctxt->loc_refcount)) { + spin_unlock(&olg->olg_lock); + return rc; + } + olg->olg_ctxts[ctxt->loc_idx] = NULL; + spin_unlock(&olg->olg_lock); + + obd = ctxt->loc_obd; + spin_lock(&obd->obd_dev_lock); + /* sync with llog ctxt user thread */ + spin_unlock(&obd->obd_dev_lock); + + /* obd->obd_starting is needed for the case of cleanup + * in error case while obd is starting up. 
*/ + LASSERTF(obd->obd_starting == 1 || + obd->obd_stopping == 1 || obd->obd_set_up == 0, + "wrong obd state: %d/%d/%d\n", !!obd->obd_starting, + !!obd->obd_stopping, !!obd->obd_set_up); + + /* cleanup the llog ctxt here */ + if (CTXTP(ctxt, cleanup)) + rc = CTXTP(ctxt, cleanup)(env, ctxt); + + llog_ctxt_destroy(ctxt); + wake_up(&olg->olg_waitq); + return rc; +} +EXPORT_SYMBOL(__llog_ctxt_put); + +int llog_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); + struct obd_llog_group *olg; + int rc, idx; + ENTRY; + + LASSERT(ctxt != NULL); + LASSERT(ctxt != LP_POISON); + + olg = ctxt->loc_olg; + LASSERT(olg != NULL); + LASSERT(olg != LP_POISON); + + idx = ctxt->loc_idx; + + /* + * Banlance the ctxt get when calling llog_cleanup() + */ + LASSERT(atomic_read(&ctxt->loc_refcount) < LI_POISON); + LASSERT(atomic_read(&ctxt->loc_refcount) > 1); + llog_ctxt_put(ctxt); + + /* + * Try to free the ctxt. + */ + rc = __llog_ctxt_put(env, ctxt); + if (rc) + CERROR("Error %d while cleaning up ctxt %p\n", + rc, ctxt); + + l_wait_event(olg->olg_waitq, + llog_group_ctxt_null(olg, idx), &lwi); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_cleanup); + +int llog_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int index, + struct obd_device *disk_obd, struct llog_operations *op) +{ + struct llog_ctxt *ctxt; + int rc = 0; + ENTRY; + + if (index < 0 || index >= LLOG_MAX_CTXTS) + RETURN(-EINVAL); + + LASSERT(olg != NULL); + + ctxt = llog_new_ctxt(obd); + if (!ctxt) + RETURN(-ENOMEM); + + ctxt->loc_obd = obd; + ctxt->loc_olg = olg; + ctxt->loc_idx = index; + ctxt->loc_logops = op; + mutex_init(&ctxt->loc_mutex); + ctxt->loc_exp = class_export_get(disk_obd->obd_self_export); + ctxt->loc_flags = LLOG_CTXT_FLAG_UNINITIALIZED; + + rc = llog_group_set_ctxt(olg, ctxt, index); + if (rc) { + llog_ctxt_destroy(ctxt); + if (rc == -EEXIST) { + ctxt = llog_group_get_ctxt(olg, index); + if (ctxt) { + /* 
+ * mds_lov_update_desc() might call here multiple + * times. So if the llog is already set up then + * don't to do it again. + */ + CDEBUG(D_CONFIG, "obd %s ctxt %d already set up\n", + obd->obd_name, index); + LASSERT(ctxt->loc_olg == olg); + LASSERT(ctxt->loc_obd == obd); + LASSERT(ctxt->loc_exp == disk_obd->obd_self_export); + LASSERT(ctxt->loc_logops == op); + llog_ctxt_put(ctxt); + } + rc = 0; + } + RETURN(rc); + } + + if (op->lop_setup) { + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_LLOG_SETUP)) + rc = -EOPNOTSUPP; + else + rc = op->lop_setup(env, obd, olg, index, disk_obd); + } + + if (rc) { + CERROR("%s: ctxt %d lop_setup=%p failed: rc = %d\n", + obd->obd_name, index, op->lop_setup, rc); + llog_group_clear_ctxt(olg, index); + llog_ctxt_destroy(ctxt); + } else { + CDEBUG(D_CONFIG, "obd %s ctxt %d is initialized\n", + obd->obd_name, index); + ctxt->loc_flags &= ~LLOG_CTXT_FLAG_UNINITIALIZED; + } + + RETURN(rc); +} +EXPORT_SYMBOL(llog_setup); + +int llog_sync(struct llog_ctxt *ctxt, struct obd_export *exp, int flags) +{ + int rc = 0; + ENTRY; + + if (!ctxt) + RETURN(0); + + if (CTXTP(ctxt, sync)) + rc = CTXTP(ctxt, sync)(ctxt, exp, flags); + + RETURN(rc); +} +EXPORT_SYMBOL(llog_sync); + +int llog_obd_add(const struct lu_env *env, struct llog_ctxt *ctxt, + struct llog_rec_hdr *rec, struct lov_stripe_md *lsm, + struct llog_cookie *logcookies, int numcookies) +{ + int raised, rc; + ENTRY; + + if (!ctxt) { + CERROR("No ctxt\n"); + RETURN(-ENODEV); + } + + if (ctxt->loc_flags & LLOG_CTXT_FLAG_UNINITIALIZED) + RETURN(-ENXIO); + + CTXT_CHECK_OP(ctxt, obd_add, -EOPNOTSUPP); + raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); + if (!raised) + cfs_cap_raise(CFS_CAP_SYS_RESOURCE); + rc = CTXTP(ctxt, obd_add)(env, ctxt, rec, lsm, logcookies, + numcookies); + if (!raised) + cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + RETURN(rc); +} +EXPORT_SYMBOL(llog_obd_add); + +int llog_cancel(const struct lu_env *env, struct llog_ctxt *ctxt, + struct lov_stripe_md *lsm, int count, + struct llog_cookie 
*cookies, int flags) +{ + int rc; + ENTRY; + + if (!ctxt) { + CERROR("No ctxt\n"); + RETURN(-ENODEV); + } + + CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP); + rc = CTXTP(ctxt, cancel)(env, ctxt, lsm, count, cookies, flags); + RETURN(rc); +} +EXPORT_SYMBOL(llog_cancel); + +int obd_llog_init(struct obd_device *obd, struct obd_llog_group *olg, + struct obd_device *disk_obd, int *index) +{ + int rc; + ENTRY; + OBD_CHECK_DT_OP(obd, llog_init, 0); + OBD_COUNTER_INCREMENT(obd, llog_init); + + rc = OBP(obd, llog_init)(obd, olg, disk_obd, index); + RETURN(rc); +} +EXPORT_SYMBOL(obd_llog_init); + +int obd_llog_finish(struct obd_device *obd, int count) +{ + int rc; + ENTRY; + OBD_CHECK_DT_OP(obd, llog_finish, 0); + OBD_COUNTER_INCREMENT(obd, llog_finish); + + rc = OBP(obd, llog_finish)(obd, count); + RETURN(rc); +} +EXPORT_SYMBOL(obd_llog_finish); + +/* context key constructor/destructor: llog_key_init, llog_key_fini */ +LU_KEY_INIT_FINI(llog, struct llog_thread_info); +/* context key: llog_thread_key */ +LU_CONTEXT_KEY_DEFINE(llog, LCT_MD_THREAD | LCT_MG_THREAD | LCT_LOCAL); +LU_KEY_INIT_GENERIC(llog); +EXPORT_SYMBOL(llog_thread_key); + +int llog_info_init(void) +{ + llog_key_init_generic(&llog_thread_key, NULL); + lu_context_key_register(&llog_thread_key); + return 0; +} + +void llog_info_fini(void) +{ + lu_context_key_degister(&llog_thread_key); +} diff --git a/drivers/staging/lustre/lustre/obdclass/llog_osd.c b/drivers/staging/lustre/lustre/obdclass/llog_osd.c new file mode 100644 index 000000000000..6dbd21a863c2 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_osd.c @@ -0,0 +1,1323 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/llog_osd.c - low level llog routines on top of OSD API + * + * Author: Alexey Zhuravlev <alexey.zhuravlev@intel.com> + * Author: Mikhail Pershin <mike.pershin@intel.com> + */ + +#define DEBUG_SUBSYSTEM S_LOG + +#ifndef EXPORT_SYMTAB +#define EXPORT_SYMTAB +#endif + +#include <obd.h> +#include <obd_class.h> +#include <lustre_fid.h> +#include <dt_object.h> + +#include "llog_internal.h" +#include "local_storage.h" + +/* + * - multi-chunks or big-declaration approach + * - use unique sequence instead of llog sb tracking unique ids + * - re-use existing environment + * - named llog support (can be used for testing only at the present) + * - llog_origin_connect() work with OSD API + */ + +static int llog_osd_declare_new_object(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + return local_object_declare_create(env, los, o, &lgi->lgi_attr, + &lgi->lgi_dof, th); +} + +static int llog_osd_create_new_object(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + + lgi->lgi_attr.la_valid = LA_MODE; + lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG); + + return local_object_create(env, los, o, &lgi->lgi_attr, + &lgi->lgi_dof, th); +} + +static int llog_osd_pad(const struct lu_env *env, struct dt_object *o, + loff_t *off, int len, int index, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + int rc; + + ENTRY; + + LASSERT(th); + LASSERT(off); + LASSERT(len >= LLOG_MIN_REC_SIZE && (len & 0x7) == 0); + + lgi->lgi_tail.lrt_len = lgi->lgi_lrh.lrh_len = len; + lgi->lgi_tail.lrt_index = lgi->lgi_lrh.lrh_index = index; + 
lgi->lgi_lrh.lrh_type = LLOG_PAD_MAGIC; + + lgi->lgi_buf.lb_buf = &lgi->lgi_lrh; + lgi->lgi_buf.lb_len = sizeof(lgi->lgi_lrh); + dt_write_lock(env, o, 0); + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) { + CERROR("%s: error writing padding record: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out, rc); + } + + lgi->lgi_buf.lb_buf = &lgi->lgi_tail; + lgi->lgi_buf.lb_len = sizeof(lgi->lgi_tail); + *off += len - sizeof(lgi->lgi_lrh) - sizeof(lgi->lgi_tail); + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) + CERROR("%s: error writing padding record: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); +out: + dt_write_unlock(env, o); + RETURN(rc); +} + +static int llog_osd_write_blob(const struct lu_env *env, struct dt_object *o, + struct llog_rec_hdr *rec, void *buf, + loff_t *off, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + int buflen = rec->lrh_len; + int rc; + + ENTRY; + + LASSERT(env); + LASSERT(o); + + if (buflen == 0) + CWARN("0-length record\n"); + + CDEBUG(D_OTHER, "write blob with type %x, buf %p/%u at off %llu\n", + rec->lrh_type, buf, buflen, *off); + + lgi->lgi_attr.la_valid = LA_SIZE; + lgi->lgi_attr.la_size = *off; + + if (!buf) { + lgi->lgi_buf.lb_len = buflen; + lgi->lgi_buf.lb_buf = rec; + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) + CERROR("%s: error writing log record: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out, rc); + } + + /* the buf case */ + /* protect the following 3 writes from concurrent read */ + dt_write_lock(env, o, 0); + rec->lrh_len = sizeof(*rec) + buflen + sizeof(lgi->lgi_tail); + lgi->lgi_buf.lb_len = sizeof(*rec); + lgi->lgi_buf.lb_buf = rec; + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) { + CERROR("%s: error writing log hdr: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out_unlock, rc); + } + + lgi->lgi_buf.lb_len = buflen; + lgi->lgi_buf.lb_buf = buf; + rc = 
dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) { + CERROR("%s: error writing log buffer: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + GOTO(out_unlock, rc); + } + + lgi->lgi_tail.lrt_len = rec->lrh_len; + lgi->lgi_tail.lrt_index = rec->lrh_index; + lgi->lgi_buf.lb_len = sizeof(lgi->lgi_tail); + lgi->lgi_buf.lb_buf = &lgi->lgi_tail; + rc = dt_record_write(env, o, &lgi->lgi_buf, off, th); + if (rc) + CERROR("%s: error writing log tail: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, rc); + +out_unlock: + dt_write_unlock(env, o); + +out: + /* cleanup the content written above */ + if (rc) { + dt_punch(env, o, lgi->lgi_attr.la_size, OBD_OBJECT_EOF, th, + BYPASS_CAPA); + dt_attr_set(env, o, &lgi->lgi_attr, th, BYPASS_CAPA); + } + + RETURN(rc); +} + +static int llog_osd_read_header(const struct lu_env *env, + struct llog_handle *handle) +{ + struct llog_rec_hdr *llh_hdr; + struct dt_object *o; + struct llog_thread_info *lgi; + int rc; + + ENTRY; + + LASSERT(sizeof(*handle->lgh_hdr) == LLOG_CHUNK_SIZE); + + o = handle->lgh_obj; + LASSERT(o); + + lgi = llog_info(env); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL); + if (rc) + RETURN(rc); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + + if (lgi->lgi_attr.la_size == 0) { + CDEBUG(D_HA, "not reading header from 0-byte log\n"); + RETURN(LLOG_EEMPTY); + } + + lgi->lgi_off = 0; + lgi->lgi_buf.lb_buf = handle->lgh_hdr; + lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE; + + rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off); + if (rc) { + CERROR("%s: error reading log header from "DFID": rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), rc); + RETURN(rc); + } + + llh_hdr = &handle->lgh_hdr->llh_hdr; + if (LLOG_REC_HDR_NEEDS_SWABBING(llh_hdr)) + lustre_swab_llog_hdr(handle->lgh_hdr); + + if (llh_hdr->lrh_type != LLOG_HDR_MAGIC) { + CERROR("%s: bad log %s "DFID" header magic: %#x " + "(expected %#x)\n", o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? 
handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + llh_hdr->lrh_type, LLOG_HDR_MAGIC); + RETURN(-EIO); + } else if (llh_hdr->lrh_len != LLOG_CHUNK_SIZE) { + CERROR("%s: incorrectly sized log %s "DFID" header: " + "%#x (expected %#x)\n" + "you may need to re-run lconf --write_conf.\n", + o->do_lu.lo_dev->ld_obd->obd_name, + handle->lgh_name ? handle->lgh_name : "", + PFID(lu_object_fid(&o->do_lu)), + llh_hdr->lrh_len, LLOG_CHUNK_SIZE); + RETURN(-EIO); + } + + handle->lgh_last_idx = handle->lgh_hdr->llh_tail.lrt_index; + + RETURN(0); +} + +static int llog_osd_declare_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + int idx, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + int rc; + + ENTRY; + + LASSERT(env); + LASSERT(th); + LASSERT(loghandle); + + o = loghandle->lgh_obj; + LASSERT(o); + + /* each time we update header */ + rc = dt_declare_record_write(env, o, sizeof(struct llog_log_hdr), 0, + th); + if (rc || idx == 0) /* if error or just header */ + RETURN(rc); + + if (dt_object_exists(o)) { + rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA); + lgi->lgi_off = lgi->lgi_attr.la_size; + LASSERT(ergo(rc == 0, lgi->lgi_attr.la_valid & LA_SIZE)); + if (rc) + RETURN(rc); + + rc = dt_declare_punch(env, o, lgi->lgi_off, OBD_OBJECT_EOF, th); + if (rc) + RETURN(rc); + } else { + lgi->lgi_off = 0; + } + + /* XXX: implement declared window or multi-chunks approach */ + rc = dt_declare_record_write(env, o, 32 * 1024, lgi->lgi_off, th); + + RETURN(rc); +} + +/* returns negative in on error; 0 if success && reccookie == 0; 1 otherwise */ +/* appends if idx == -1, otherwise overwrites record idx. 
*/ +static int llog_osd_write_rec(const struct lu_env *env, + struct llog_handle *loghandle, + struct llog_rec_hdr *rec, + struct llog_cookie *reccookie, int cookiecount, + void *buf, int idx, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_log_hdr *llh; + int reclen = rec->lrh_len; + int index, rc, old_tail_idx; + struct llog_rec_tail *lrt; + struct dt_object *o; + size_t left; + + ENTRY; + + LASSERT(env); + llh = loghandle->lgh_hdr; + LASSERT(llh); + o = loghandle->lgh_obj; + LASSERT(o); + LASSERT(th); + + CDEBUG(D_OTHER, "new record %x to "DFID"\n", + rec->lrh_type, PFID(lu_object_fid(&o->do_lu))); + + /* record length should not bigger than LLOG_CHUNK_SIZE */ + if (buf) + rc = (reclen > LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) - + sizeof(struct llog_rec_tail)) ? -E2BIG : 0; + else + rc = (reclen > LLOG_CHUNK_SIZE) ? -E2BIG : 0; + if (rc) + RETURN(rc); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL); + if (rc) + RETURN(rc); + + if (buf) + /* write_blob adds header and tail to lrh_len. 
*/ + reclen = sizeof(*rec) + rec->lrh_len + + sizeof(struct llog_rec_tail); + + if (idx != -1) { + /* no header: only allowed to insert record 1 */ + if (idx != 1 && lgi->lgi_attr.la_size == 0) + LBUG(); + + if (idx && llh->llh_size && llh->llh_size != rec->lrh_len) + RETURN(-EINVAL); + + if (!ext2_test_bit(idx, llh->llh_bitmap)) + CERROR("%s: modify unset record %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx); + if (idx != rec->lrh_index) + CERROR("%s: index mismatch %d %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx, + rec->lrh_index); + + lgi->lgi_off = 0; + rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL, + &lgi->lgi_off, th); + /* we are done if we only write the header or on error */ + if (rc || idx == 0) + RETURN(rc); + + if (buf) { + /* We assume that caller has set lgh_cur_* */ + lgi->lgi_off = loghandle->lgh_cur_offset; + CDEBUG(D_OTHER, + "modify record "DOSTID": idx:%d/%u/%d, len:%u " + "offset %llu\n", + POSTID(&loghandle->lgh_id.lgl_oi), idx, + rec->lrh_index, + loghandle->lgh_cur_idx, rec->lrh_len, + (long long)(lgi->lgi_off - sizeof(*llh))); + if (rec->lrh_index != loghandle->lgh_cur_idx) { + CERROR("%s: modify idx mismatch %u/%d\n", + o->do_lu.lo_dev->ld_obd->obd_name, idx, + loghandle->lgh_cur_idx); + RETURN(-EFAULT); + } + } else { + /* Assumes constant lrh_len */ + lgi->lgi_off = sizeof(*llh) + (idx - 1) * reclen; + } + + rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = idx; + rc = 1; + } + RETURN(rc); + } + + /* Make sure that records don't cross a chunk boundary, so we can + * process them page-at-a-time if needed. If it will cross a chunk + * boundary, write in a fake (but referenced) entry to pad the chunk. + * + * We know that llog_current_log() will return a loghandle that is + * big enough to hold reclen, so all we care about is padding here. 
+ */ + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + lgi->lgi_off = lgi->lgi_attr.la_size; + left = LLOG_CHUNK_SIZE - (lgi->lgi_off & (LLOG_CHUNK_SIZE - 1)); + /* NOTE: padding is a record, but no bit is set */ + if (left != 0 && left != reclen && + left < (reclen + LLOG_MIN_REC_SIZE)) { + index = loghandle->lgh_last_idx + 1; + rc = llog_osd_pad(env, o, &lgi->lgi_off, left, index, th); + if (rc) + RETURN(rc); + loghandle->lgh_last_idx++; /*for pad rec*/ + } + /* if it's the last idx in log file, then return -ENOSPC */ + if (loghandle->lgh_last_idx >= LLOG_BITMAP_SIZE(llh) - 1) + RETURN(-ENOSPC); + + loghandle->lgh_last_idx++; + index = loghandle->lgh_last_idx; + LASSERT(index < LLOG_BITMAP_SIZE(llh)); + rec->lrh_index = index; + if (buf == NULL) { + lrt = (struct llog_rec_tail *)((char *)rec + rec->lrh_len - + sizeof(*lrt)); + lrt->lrt_len = rec->lrh_len; + lrt->lrt_index = rec->lrh_index; + } + /* The caller should make sure only 1 process access the lgh_last_idx, + * Otherwise it might hit the assert.*/ + LASSERT(index < LLOG_BITMAP_SIZE(llh)); + spin_lock(&loghandle->lgh_hdr_lock); + if (ext2_set_bit(index, llh->llh_bitmap)) { + CERROR("%s: index %u already set in log bitmap\n", + o->do_lu.lo_dev->ld_obd->obd_name, index); + spin_unlock(&loghandle->lgh_hdr_lock); + LBUG(); /* should never happen */ + } + llh->llh_count++; + spin_unlock(&loghandle->lgh_hdr_lock); + old_tail_idx = llh->llh_tail.lrt_index; + llh->llh_tail.lrt_index = index; + + lgi->lgi_off = 0; + rc = llog_osd_write_blob(env, o, &llh->llh_hdr, NULL, &lgi->lgi_off, + th); + if (rc) + GOTO(out, rc); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, NULL); + if (rc) + GOTO(out, rc); + + LASSERT(lgi->lgi_attr.la_valid & LA_SIZE); + lgi->lgi_off = lgi->lgi_attr.la_size; + + rc = llog_osd_write_blob(env, o, rec, buf, &lgi->lgi_off, th); + +out: + /* cleanup llog for error case */ + if (rc) { + spin_lock(&loghandle->lgh_hdr_lock); + ext2_clear_bit(index, llh->llh_bitmap); + llh->llh_count--; + 
spin_unlock(&loghandle->lgh_hdr_lock); + + /* restore the header */ + loghandle->lgh_last_idx--; + llh->llh_tail.lrt_index = old_tail_idx; + lgi->lgi_off = 0; + llog_osd_write_blob(env, o, &llh->llh_hdr, NULL, + &lgi->lgi_off, th); + } + + CDEBUG(D_RPCTRACE, "added record "DOSTID": idx: %u, %u\n", + POSTID(&loghandle->lgh_id.lgl_oi), index, rec->lrh_len); + if (rc == 0 && reccookie) { + reccookie->lgc_lgl = loghandle->lgh_id; + reccookie->lgc_index = index; + if ((rec->lrh_type == MDS_UNLINK_REC) || + (rec->lrh_type == MDS_SETATTR64_REC)) + reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT; + else if (rec->lrh_type == OST_SZ_REC) + reccookie->lgc_subsys = LLOG_SIZE_ORIG_CTXT; + else + reccookie->lgc_subsys = -1; + rc = 1; + } + RETURN(rc); +} + +/* We can skip reading at least as many log blocks as the number of + * minimum sized log records we are skipping. If it turns out + * that we are not far enough along the log (because the + * actual records are larger than minimum size) we just skip + * some more records. 
+ */ +static void llog_skip_over(__u64 *off, int curr, int goal) +{ + if (goal <= curr) + return; + *off = (*off + (goal - curr - 1) * LLOG_MIN_REC_SIZE) & + ~(LLOG_CHUNK_SIZE - 1); +} + +/* sets: + * - cur_offset to the furthest point read in the log file + * - cur_idx to the log index preceeding cur_offset + * returns -EIO/-EINVAL on error + */ +static int llog_osd_next_block(const struct lu_env *env, + struct llog_handle *loghandle, int *cur_idx, + int next_idx, __u64 *cur_offset, void *buf, + int len) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + struct dt_device *dt; + int rc; + + ENTRY; + + LASSERT(env); + LASSERT(lgi); + + if (len == 0 || len & (LLOG_CHUNK_SIZE - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u (cur idx %u off "LPU64")\n", + next_idx, *cur_idx, *cur_offset); + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + LASSERT(dt_object_exists(o)); + dt = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(dt); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA); + if (rc) + GOTO(out, rc); + + while (*cur_offset < lgi->lgi_attr.la_size) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + + llog_skip_over(cur_offset, *cur_idx, next_idx); + + /* read up to next LLOG_CHUNK_SIZE block */ + lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE - + (*cur_offset & (LLOG_CHUNK_SIZE - 1)); + lgi->lgi_buf.lb_buf = buf; + + /* Note: read lock is not needed around la_size get above at + * the time of dt_attr_get(). There are only two cases that + * matter. Either la_size == cur_offset, in which case the + * entire read is skipped, or la_size > cur_offset and the loop + * is entered and this thread is blocked at dt_read_lock() + * until the write is completed. When the write completes, then + * the dt_read() will be done with the full length, and will + * get the full data. 
+ */ + dt_read_lock(env, o, 0); + rc = dt_read(env, o, &lgi->lgi_buf, cur_offset); + dt_read_unlock(env, o); + if (rc < 0) { + CERROR("%s: can't read llog block from log "DFID + " offset "LPU64": rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), *cur_offset, + rc); + GOTO(out, rc); + } + + if (rc < len) { + /* signal the end of the valid buffer to + * llog_process */ + memset(buf + rc, 0, len - rc); + } + + if (rc == 0) /* end of file, nothing to do */ + GOTO(out, rc); + + if (rc < sizeof(*tail)) { + CERROR("%s: invalid llog block at log id "DOSTID"/%u " + "offset "LPU64"\n", + o->do_lu.lo_dev->ld_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, *cur_offset); + GOTO(out, rc = -EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)((char *)buf + rc - + sizeof(struct llog_rec_tail)); + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)((char *)buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + *cur_idx = tail->lrt_index; + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("%s: invalid llog tail at log id "DOSTID"/%u " + "offset "LPU64"\n", + o->do_lu.lo_dev->ld_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, *cur_offset); + GOTO(out, rc = -EINVAL); + } + if (tail->lrt_index < next_idx) + continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > next_idx) { + CERROR("%s: missed desired record? 
%u > %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, + rec->lrh_index, next_idx); + GOTO(out, rc = -ENOENT); + } + GOTO(out, rc = 0); + } + GOTO(out, rc = -EIO); +out: + return rc; +} + +static int llog_osd_prev_block(const struct lu_env *env, + struct llog_handle *loghandle, + int prev_idx, void *buf, int len) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o; + struct dt_device *dt; + loff_t cur_offset; + int rc; + + ENTRY; + + if (len == 0 || len & (LLOG_CHUNK_SIZE - 1)) + RETURN(-EINVAL); + + CDEBUG(D_OTHER, "looking for log index %u\n", prev_idx); + + LASSERT(loghandle); + LASSERT(loghandle->lgh_ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + LASSERT(dt_object_exists(o)); + dt = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(dt); + + cur_offset = LLOG_CHUNK_SIZE; + llog_skip_over(&cur_offset, 0, prev_idx); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA); + if (rc) + GOTO(out, rc); + + while (cur_offset < lgi->lgi_attr.la_size) { + struct llog_rec_hdr *rec, *last_rec; + struct llog_rec_tail *tail; + + lgi->lgi_buf.lb_len = len; + lgi->lgi_buf.lb_buf = buf; + /* It is OK to have locking around dt_read() only, see + * comment in llog_osd_next_block for details + */ + dt_read_lock(env, o, 0); + rc = dt_read(env, o, &lgi->lgi_buf, &cur_offset); + dt_read_unlock(env, o); + if (rc < 0) { + CERROR("%s: can't read llog block from log "DFID + " offset "LPU64": rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + PFID(lu_object_fid(&o->do_lu)), cur_offset, rc); + GOTO(out, rc); + } + + if (rc == 0) /* end of file, nothing to do */ + GOTO(out, rc); + + if (rc < sizeof(*tail)) { + CERROR("%s: invalid llog block at log id "DOSTID"/%u " + "offset "LPU64"\n", + o->do_lu.lo_dev->ld_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, cur_offset); + GOTO(out, rc = -EINVAL); + } + + rec = buf; + if (LLOG_REC_HDR_NEEDS_SWABBING(rec)) + lustre_swab_llog_rec(rec); + + tail = (struct llog_rec_tail *)((char *)buf + rc - + 
sizeof(struct llog_rec_tail)); + /* get the last record in block */ + last_rec = (struct llog_rec_hdr *)((char *)buf + rc - + le32_to_cpu(tail->lrt_len)); + + if (LLOG_REC_HDR_NEEDS_SWABBING(last_rec)) + lustre_swab_llog_rec(last_rec); + LASSERT(last_rec->lrh_index == tail->lrt_index); + + /* this shouldn't happen */ + if (tail->lrt_index == 0) { + CERROR("%s: invalid llog tail at log id "DOSTID"/%u " + "offset "LPU64"\n", + o->do_lu.lo_dev->ld_obd->obd_name, + POSTID(&loghandle->lgh_id.lgl_oi), + loghandle->lgh_id.lgl_ogen, cur_offset); + GOTO(out, rc = -EINVAL); + } + if (tail->lrt_index < prev_idx) + continue; + + /* sanity check that the start of the new buffer is no farther + * than the record that we wanted. This shouldn't happen. */ + if (rec->lrh_index > prev_idx) { + CERROR("%s: missed desired record? %u > %u\n", + o->do_lu.lo_dev->ld_obd->obd_name, + rec->lrh_index, prev_idx); + GOTO(out, rc = -ENOENT); + } + GOTO(out, rc = 0); + } + GOTO(out, rc = -EIO); +out: + return rc; +} + +struct dt_object *llog_osd_dir_get(const struct lu_env *env, + struct llog_ctxt *ctxt) +{ + struct dt_device *dt; + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dir; + int rc; + + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + if (ctxt->loc_dir == NULL) { + rc = dt_root_get(env, dt, &dti->dti_fid); + if (rc) + return ERR_PTR(rc); + dir = dt_locate(env, dt, &dti->dti_fid); + } else { + lu_object_get(&ctxt->loc_dir->do_lu); + dir = ctxt->loc_dir; + } + + return dir; +} + +static int llog_osd_open(const struct lu_env *env, struct llog_handle *handle, + struct llog_logid *logid, char *name, + enum llog_open_param open_param) +{ + struct llog_thread_info *lgi = llog_info(env); + struct llog_ctxt *ctxt = handle->lgh_ctxt; + struct dt_object *o; + struct dt_device *dt; + struct ls_device *ls; + struct local_oid_storage *los; + int rc = 0; + + ENTRY; + + LASSERT(env); + LASSERT(ctxt); + LASSERT(ctxt->loc_exp); + LASSERT(ctxt->loc_exp->exp_obd); + dt = 
ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + LASSERT(dt); + + ls = ls_device_get(dt); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + los = dt_los_find(ls, name != NULL ? FID_SEQ_LLOG_NAME : FID_SEQ_LLOG); + mutex_unlock(&ls->ls_los_mutex); + LASSERT(los); + ls_device_put(env, ls); + + LASSERT(handle); + + if (logid != NULL) { + logid_to_fid(logid, &lgi->lgi_fid); + } else if (name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + GOTO(out, rc = PTR_ERR(llog_dir)); + dt_read_lock(env, llog_dir, 0); + rc = dt_lookup_dir(env, llog_dir, name, &lgi->lgi_fid); + dt_read_unlock(env, llog_dir); + lu_object_put(env, &llog_dir->do_lu); + if (rc == -ENOENT && open_param == LLOG_OPEN_NEW) { + /* generate fid for new llog */ + rc = local_object_fid_generate(env, los, + &lgi->lgi_fid); + } + if (rc < 0) + GOTO(out, rc); + OBD_ALLOC(handle->lgh_name, strlen(name) + 1); + if (handle->lgh_name) + strcpy(handle->lgh_name, name); + else + GOTO(out, rc = -ENOMEM); + } else { + LASSERTF(open_param & LLOG_OPEN_NEW, "%#x\n", open_param); + /* generate fid for new llog */ + rc = local_object_fid_generate(env, los, &lgi->lgi_fid); + if (rc < 0) + GOTO(out, rc); + } + + o = ls_locate(env, ls, &lgi->lgi_fid); + if (IS_ERR(o)) + GOTO(out_name, rc = PTR_ERR(o)); + + /* No new llog is expected but doesn't exist */ + if (open_param != LLOG_OPEN_NEW && !dt_object_exists(o)) + GOTO(out_put, rc = -ENOENT); + + fid_to_logid(&lgi->lgi_fid, &handle->lgh_id); + handle->lgh_obj = o; + handle->private_data = los; + LASSERT(handle->lgh_ctxt); + + RETURN(rc); + +out_put: + lu_object_put(env, &o->do_lu); +out_name: + if (handle->lgh_name != NULL) + OBD_FREE(handle->lgh_name, strlen(name) + 1); +out: + dt_los_put(los); + RETURN(rc); +} + +static int llog_osd_exist(struct llog_handle *handle) +{ + LASSERT(handle->lgh_obj); + return (dt_object_exists(handle->lgh_obj) && + !lu_object_is_dying(handle->lgh_obj->do_lu.lo_header)); 
+} + +static int llog_osd_declare_create(const struct lu_env *env, + struct llog_handle *res, struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct local_oid_storage *los; + struct dt_object *o; + int rc; + + ENTRY; + + LASSERT(res->lgh_obj); + LASSERT(th); + + /* object can be created by another thread */ + o = res->lgh_obj; + if (dt_object_exists(o)) + RETURN(0); + + los = res->private_data; + LASSERT(los); + + rc = llog_osd_declare_new_object(env, los, o, th); + if (rc) + RETURN(rc); + + rc = dt_declare_record_write(env, o, LLOG_CHUNK_SIZE, 0, th); + if (rc) + RETURN(rc); + + if (res->lgh_name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, res->lgh_ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + logid_to_fid(&res->lgh_id, &lgi->lgi_fid); + rc = dt_declare_insert(env, llog_dir, + (struct dt_rec *)&lgi->lgi_fid, + (struct dt_key *)res->lgh_name, th); + lu_object_put(env, &llog_dir->do_lu); + if (rc) + CERROR("%s: can't declare named llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + res->lgh_name, rc); + } + RETURN(rc); +} + +/* This is a callback from the llog_* functions. + * Assumes caller has already pushed us into the kernel context. 
*/ +static int llog_osd_create(const struct lu_env *env, struct llog_handle *res, + struct thandle *th) +{ + struct llog_thread_info *lgi = llog_info(env); + struct local_oid_storage *los; + struct dt_object *o; + int rc = 0; + + ENTRY; + + LASSERT(env); + o = res->lgh_obj; + LASSERT(o); + + /* llog can be already created */ + if (dt_object_exists(o)) + RETURN(-EEXIST); + + los = res->private_data; + LASSERT(los); + + dt_write_lock(env, o, 0); + if (!dt_object_exists(o)) + rc = llog_osd_create_new_object(env, los, o, th); + else + rc = -EEXIST; + + dt_write_unlock(env, o); + if (rc) + RETURN(rc); + + if (res->lgh_name) { + struct dt_object *llog_dir; + + llog_dir = llog_osd_dir_get(env, res->lgh_ctxt); + if (IS_ERR(llog_dir)) + RETURN(PTR_ERR(llog_dir)); + + logid_to_fid(&res->lgh_id, &lgi->lgi_fid); + dt_read_lock(env, llog_dir, 0); + rc = dt_insert(env, llog_dir, + (struct dt_rec *)&lgi->lgi_fid, + (struct dt_key *)res->lgh_name, + th, BYPASS_CAPA, 1); + dt_read_unlock(env, llog_dir); + lu_object_put(env, &llog_dir->do_lu); + if (rc) + CERROR("%s: can't create named llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + res->lgh_name, rc); + } + RETURN(rc); +} + +static int llog_osd_close(const struct lu_env *env, struct llog_handle *handle) +{ + struct local_oid_storage *los; + int rc = 0; + + ENTRY; + + LASSERT(handle->lgh_obj); + + lu_object_put(env, &handle->lgh_obj->do_lu); + + los = handle->private_data; + LASSERT(los); + dt_los_put(los); + + if (handle->lgh_name) + OBD_FREE(handle->lgh_name, strlen(handle->lgh_name) + 1); + + RETURN(rc); +} + +static int llog_osd_destroy(const struct lu_env *env, + struct llog_handle *loghandle) +{ + struct llog_ctxt *ctxt; + struct dt_object *o, *llog_dir = NULL; + struct dt_device *d; + struct thandle *th; + char *name = NULL; + int rc; + + ENTRY; + + ctxt = loghandle->lgh_ctxt; + LASSERT(ctxt); + + o = loghandle->lgh_obj; + LASSERT(o); + + d = lu2dt_dev(o->do_lu.lo_dev); + LASSERT(d); + LASSERT(d == 
ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt); + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + if (loghandle->lgh_name) { + llog_dir = llog_osd_dir_get(env, ctxt); + if (IS_ERR(llog_dir)) + GOTO(out_trans, rc = PTR_ERR(llog_dir)); + + name = loghandle->lgh_name; + rc = dt_declare_delete(env, llog_dir, + (struct dt_key *)name, th); + if (rc) + GOTO(out_trans, rc); + } + + dt_declare_ref_del(env, o, th); + + rc = dt_declare_destroy(env, o, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(out_trans, rc); + + dt_write_lock(env, o, 0); + if (dt_object_exists(o)) { + if (name) { + dt_read_lock(env, llog_dir, 0); + rc = dt_delete(env, llog_dir, + (struct dt_key *) name, + th, BYPASS_CAPA); + dt_read_unlock(env, llog_dir); + if (rc) { + CERROR("%s: can't remove llog %s: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, + name, rc); + GOTO(out_unlock, rc); + } + } + dt_ref_del(env, o, th); + rc = dt_destroy(env, o, th); + if (rc) + GOTO(out_unlock, rc); + } +out_unlock: + dt_write_unlock(env, o); +out_trans: + dt_trans_stop(env, d, th); + if (llog_dir != NULL) + lu_object_put(env, &llog_dir->do_lu); + RETURN(rc); +} + +static int llog_osd_setup(const struct lu_env *env, struct obd_device *obd, + struct obd_llog_group *olg, int ctxt_idx, + struct obd_device *disk_obd) +{ + struct local_oid_storage *los; + struct llog_thread_info *lgi = llog_info(env); + struct llog_ctxt *ctxt; + int rc = 0; + + ENTRY; + + LASSERT(obd); + LASSERT(olg->olg_ctxts[ctxt_idx]); + + ctxt = llog_ctxt_get(olg->olg_ctxts[ctxt_idx]); + LASSERT(ctxt); + + /* initialize data allowing to generate new fids, + * literally we need a sequece */ + lgi->lgi_fid.f_seq = FID_SEQ_LLOG; + lgi->lgi_fid.f_oid = 1; + lgi->lgi_fid.f_ver = 0; + rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt, + &lgi->lgi_fid, &los); + if (rc < 0) + return rc; + + lgi->lgi_fid.f_seq = FID_SEQ_LLOG_NAME; + lgi->lgi_fid.f_oid = 1; + lgi->lgi_fid.f_ver = 
0; + rc = local_oid_storage_init(env, disk_obd->obd_lvfs_ctxt.dt, + &lgi->lgi_fid, &los); + llog_ctxt_put(ctxt); + return rc; +} + +static int llog_osd_cleanup(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct dt_device *dt; + struct ls_device *ls; + struct local_oid_storage *los, *nlos; + + LASSERT(ctxt->loc_exp->exp_obd); + dt = ctxt->loc_exp->exp_obd->obd_lvfs_ctxt.dt; + ls = ls_device_get(dt); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + los = dt_los_find(ls, FID_SEQ_LLOG); + nlos = dt_los_find(ls, FID_SEQ_LLOG_NAME); + mutex_unlock(&ls->ls_los_mutex); + if (los != NULL) { + dt_los_put(los); + local_oid_storage_fini(env, los); + } + if (nlos != NULL) { + dt_los_put(nlos); + local_oid_storage_fini(env, nlos); + } + ls_device_put(env, ls); + return 0; +} + +struct llog_operations llog_osd_ops = { + .lop_next_block = llog_osd_next_block, + .lop_prev_block = llog_osd_prev_block, + .lop_read_header = llog_osd_read_header, + .lop_destroy = llog_osd_destroy, + .lop_setup = llog_osd_setup, + .lop_cleanup = llog_osd_cleanup, + .lop_open = llog_osd_open, + .lop_exist = llog_osd_exist, + .lop_declare_create = llog_osd_declare_create, + .lop_create = llog_osd_create, + .lop_declare_write_rec = llog_osd_declare_write_rec, + .lop_write_rec = llog_osd_write_rec, + .lop_close = llog_osd_close, +}; +EXPORT_SYMBOL(llog_osd_ops); + +/* reads the catalog list */ +int llog_osd_get_cat_list(const struct lu_env *env, struct dt_device *d, + int idx, int count, struct llog_catid *idarray) +{ + struct llog_thread_info *lgi = llog_info(env); + struct dt_object *o = NULL; + struct thandle *th; + int rc, size; + + ENTRY; + + LASSERT(d); + + size = sizeof(*idarray) * count; + lgi->lgi_off = idx * sizeof(*idarray); + + lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID); + + o = dt_locate(env, d, &lgi->lgi_fid); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + if (!dt_object_exists(o)) { + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = 
PTR_ERR(th));
+
+		lgi->lgi_attr.la_valid = LA_MODE;
+		lgi->lgi_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR;
+		lgi->lgi_dof.dof_type = dt_mode_to_dft(S_IFREG);
+
+		rc = dt_declare_create(env, o, &lgi->lgi_attr, NULL,
+				       &lgi->lgi_dof, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		rc = dt_trans_start_local(env, d, th);
+		if (rc)
+			GOTO(out_trans, rc);
+
+		dt_write_lock(env, o, 0);
+		if (!dt_object_exists(o))
+			rc = dt_create(env, o, &lgi->lgi_attr, NULL,
+				       &lgi->lgi_dof, th);
+		dt_write_unlock(env, o);
+out_trans:
+		dt_trans_stop(env, d, th);
+		if (rc)
+			GOTO(out, rc);
+	}
+
+	rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA);
+	if (rc)
+		GOTO(out, rc);
+
+	if (!S_ISREG(lgi->lgi_attr.la_mode)) {
+		CERROR("%s: CATALOGS is not a regular file!: mode = %o\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name,
+		       lgi->lgi_attr.la_mode);
+		GOTO(out, rc = -ENOENT);
+	}
+
+	CDEBUG(D_CONFIG, "cat list: disk size=%d, read=%d\n",
+	       (int)lgi->lgi_attr.la_size, size);
+
+	/* return just number of llogs */
+	if (idarray == NULL) {
+		rc = lgi->lgi_attr.la_size / sizeof(*idarray);
+		GOTO(out, rc);
+	}
+
+	/* read for new ost index or for empty file */
+	memset(idarray, 0, size);
+	if (lgi->lgi_attr.la_size <= lgi->lgi_off) /* nothing stored yet */
+		GOTO(out, rc = 0);
+	if (lgi->lgi_attr.la_size < lgi->lgi_off + size) /* partial read */
+		size = lgi->lgi_attr.la_size - lgi->lgi_off;
+
+	lgi->lgi_buf.lb_buf = idarray;
+	lgi->lgi_buf.lb_len = size;
+	rc = dt_record_read(env, o, &lgi->lgi_buf, &lgi->lgi_off);
+	if (rc) {
+		CERROR("%s: error reading CATALOGS: rc = %d\n",
+		       o->do_lu.lo_dev->ld_obd->obd_name, rc);
+		GOTO(out, rc);
+	}
+
+	EXIT;
+out:
+	lu_object_put(env, &o->do_lu);
+	RETURN(rc);
+}
+EXPORT_SYMBOL(llog_osd_get_cat_list);
+
+/* writes the cat list */
+int llog_osd_put_cat_list(const struct lu_env *env, struct dt_device *d,
+			  int idx, int count, struct llog_catid *idarray)
+{
+	struct llog_thread_info	*lgi = llog_info(env);
+	struct dt_object	*o = NULL;
+	struct thandle		*th;
+	int			 rc, size;
+
+	if (!count)
+		RETURN(0);
+
+
LASSERT(d); + + size = sizeof(*idarray) * count; + lgi->lgi_off = idx * sizeof(*idarray); + + lu_local_obj_fid(&lgi->lgi_fid, LLOG_CATALOGS_OID); + + o = dt_locate(env, d, &lgi->lgi_fid); + if (IS_ERR(o)) + RETURN(PTR_ERR(o)); + + if (!dt_object_exists(o)) + GOTO(out, rc = -ENOENT); + + rc = dt_attr_get(env, o, &lgi->lgi_attr, BYPASS_CAPA); + if (rc) + GOTO(out, rc); + + if (!S_ISREG(lgi->lgi_attr.la_mode)) { + CERROR("%s: CATALOGS is not a regular file!: mode = %o\n", + o->do_lu.lo_dev->ld_obd->obd_name, + lgi->lgi_attr.la_mode); + GOTO(out, rc = -ENOENT); + } + + th = dt_trans_create(env, d); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = dt_declare_record_write(env, o, size, lgi->lgi_off, th); + if (rc) + GOTO(out, rc); + + rc = dt_trans_start_local(env, d, th); + if (rc) + GOTO(out_trans, rc); + + lgi->lgi_buf.lb_buf = idarray; + lgi->lgi_buf.lb_len = size; + rc = dt_record_write(env, o, &lgi->lgi_buf, &lgi->lgi_off, th); + if (rc) + CDEBUG(D_INODE, "error writeing CATALOGS: rc = %d\n", rc); +out_trans: + dt_trans_stop(env, d, th); +out: + lu_object_put(env, &o->do_lu); + RETURN(rc); +} +EXPORT_SYMBOL(llog_osd_put_cat_list); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_swab.c b/drivers/staging/lustre/lustre/obdclass/llog_swab.c new file mode 100644 index 000000000000..dedfecff95bc --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_swab.c @@ -0,0 +1,407 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_swab.c + * + * Swabbing of llog datatypes (from disk or over the wire). + * + * Author: jacob berkman <jacob@clusterfs.com> + */ + +#define DEBUG_SUBSYSTEM S_LOG + + +#include <lustre_log.h> + +static void print_llogd_body(struct llogd_body *d) +{ + CDEBUG(D_OTHER, "llogd body: %p\n", d); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_oi: "DOSTID"\n", + POSTID(&d->lgd_logid.lgl_oi)); + CDEBUG(D_OTHER, "\tlgd_logid.lgl_ogen: %#x\n", d->lgd_logid.lgl_ogen); + CDEBUG(D_OTHER, "\tlgd_ctxt_idx: %#x\n", d->lgd_ctxt_idx); + CDEBUG(D_OTHER, "\tlgd_llh_flags: %#x\n", d->lgd_llh_flags); + CDEBUG(D_OTHER, "\tlgd_index: %#x\n", d->lgd_index); + CDEBUG(D_OTHER, "\tlgd_saved_index: %#x\n", d->lgd_saved_index); + CDEBUG(D_OTHER, "\tlgd_len: %#x\n", d->lgd_len); + CDEBUG(D_OTHER, "\tlgd_cur_offset: "LPX64"\n", d->lgd_cur_offset); +} + +void lustre_swab_lu_fid(struct lu_fid *fid) +{ + __swab64s (&fid->f_seq); + __swab32s (&fid->f_oid); + __swab32s (&fid->f_ver); +} +EXPORT_SYMBOL(lustre_swab_lu_fid); + +void lustre_swab_ost_id(struct ost_id *oid) +{ + if (fid_seq_is_mdt0(oid->oi.oi_seq)) { + __swab64s(&oid->oi.oi_id); + __swab64s(&oid->oi.oi_seq); + } else { 
+ lustre_swab_lu_fid(&oid->oi_fid); + } +} +EXPORT_SYMBOL(lustre_swab_ost_id); + +void lustre_swab_llog_id(struct llog_logid *log_id) +{ + __swab64s(&log_id->lgl_oi.oi.oi_id); + __swab64s(&log_id->lgl_oi.oi.oi_seq); + __swab32s(&log_id->lgl_ogen); +} +EXPORT_SYMBOL(lustre_swab_llog_id); + +void lustre_swab_llogd_body (struct llogd_body *d) +{ + ENTRY; + print_llogd_body(d); + lustre_swab_llog_id(&d->lgd_logid); + __swab32s (&d->lgd_ctxt_idx); + __swab32s (&d->lgd_llh_flags); + __swab32s (&d->lgd_index); + __swab32s (&d->lgd_saved_index); + __swab32s (&d->lgd_len); + __swab64s (&d->lgd_cur_offset); + print_llogd_body(d); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_llogd_body); + +void lustre_swab_llogd_conn_body (struct llogd_conn_body *d) +{ + __swab64s (&d->lgdc_gen.mnt_cnt); + __swab64s (&d->lgdc_gen.conn_cnt); + lustre_swab_llog_id(&d->lgdc_logid); + __swab32s (&d->lgdc_ctxt_idx); +} +EXPORT_SYMBOL(lustre_swab_llogd_conn_body); + +void lustre_swab_ll_fid(struct ll_fid *fid) +{ + __swab64s (&fid->id); + __swab32s (&fid->generation); + __swab32s (&fid->f_type); +} +EXPORT_SYMBOL(lustre_swab_ll_fid); + +void lustre_swab_lu_seq_range(struct lu_seq_range *range) +{ + __swab64s (&range->lsr_start); + __swab64s (&range->lsr_end); + __swab32s (&range->lsr_index); + __swab32s (&range->lsr_flags); +} +EXPORT_SYMBOL(lustre_swab_lu_seq_range); + +void lustre_swab_llog_rec(struct llog_rec_hdr *rec) +{ + struct llog_rec_tail *tail = NULL; + + __swab32s(&rec->lrh_len); + __swab32s(&rec->lrh_index); + __swab32s(&rec->lrh_type); + __swab32s(&rec->lrh_id); + + switch (rec->lrh_type) { + case OST_SZ_REC: + { + struct llog_size_change_rec *lsc = + (struct llog_size_change_rec *)rec; + + lustre_swab_ll_fid(&lsc->lsc_fid); + __swab32s(&lsc->lsc_ioepoch); + tail = &lsc->lsc_tail; + break; + } + case MDS_UNLINK_REC: + { + struct llog_unlink_rec *lur = (struct llog_unlink_rec *)rec; + + __swab64s(&lur->lur_oid); + __swab32s(&lur->lur_oseq); + __swab32s(&lur->lur_count); + tail = 
&lur->lur_tail; + break; + } + case MDS_UNLINK64_REC: + { + struct llog_unlink64_rec *lur = + (struct llog_unlink64_rec *)rec; + + lustre_swab_lu_fid(&lur->lur_fid); + __swab32s(&lur->lur_count); + tail = &lur->lur_tail; + break; + } + case CHANGELOG_REC: + { + struct llog_changelog_rec *cr = (struct llog_changelog_rec*)rec; + + __swab16s(&cr->cr.cr_namelen); + __swab16s(&cr->cr.cr_flags); + __swab32s(&cr->cr.cr_type); + __swab64s(&cr->cr.cr_index); + __swab64s(&cr->cr.cr_prev); + __swab64s(&cr->cr.cr_time); + lustre_swab_lu_fid(&cr->cr.cr_tfid); + lustre_swab_lu_fid(&cr->cr.cr_pfid); + if (CHANGELOG_REC_EXTENDED(&cr->cr)) { + struct llog_changelog_ext_rec *ext = + (struct llog_changelog_ext_rec *)rec; + + lustre_swab_lu_fid(&ext->cr.cr_sfid); + lustre_swab_lu_fid(&ext->cr.cr_spfid); + tail = &ext->cr_tail; + } else { + tail = &cr->cr_tail; + } + break; + } + case CHANGELOG_USER_REC: + { + struct llog_changelog_user_rec *cur = + (struct llog_changelog_user_rec*)rec; + + __swab32s(&cur->cur_id); + __swab64s(&cur->cur_endrec); + tail = &cur->cur_tail; + break; + } + + case MDS_SETATTR64_REC: + { + struct llog_setattr64_rec *lsr = + (struct llog_setattr64_rec *)rec; + + lustre_swab_ost_id(&lsr->lsr_oi); + __swab32s(&lsr->lsr_uid); + __swab32s(&lsr->lsr_uid_h); + __swab32s(&lsr->lsr_gid); + __swab32s(&lsr->lsr_gid_h); + tail = &lsr->lsr_tail; + break; + } + case OBD_CFG_REC: + /* these are swabbed as they are consumed */ + break; + case LLOG_HDR_MAGIC: + { + struct llog_log_hdr *llh = (struct llog_log_hdr *)rec; + + __swab64s(&llh->llh_timestamp); + __swab32s(&llh->llh_count); + __swab32s(&llh->llh_bitmap_offset); + __swab32s(&llh->llh_flags); + __swab32s(&llh->llh_size); + __swab32s(&llh->llh_cat_idx); + tail = &llh->llh_tail; + break; + } + case LLOG_LOGID_MAGIC: + { + struct llog_logid_rec *lid = (struct llog_logid_rec *)rec; + + lustre_swab_llog_id(&lid->lid_id); + tail = &lid->lid_tail; + break; + } + case LLOG_GEN_REC: + { + struct llog_gen_rec *lgr = (struct 
llog_gen_rec *)rec; + + __swab64s(&lgr->lgr_gen.mnt_cnt); + __swab64s(&lgr->lgr_gen.conn_cnt); + tail = &lgr->lgr_tail; + break; + } + case LLOG_PAD_MAGIC: + break; + default: + CERROR("Unknown llog rec type %#x swabbing rec %p\n", + rec->lrh_type, rec); + } + + if (tail) { + __swab32s(&tail->lrt_len); + __swab32s(&tail->lrt_index); + } +} +EXPORT_SYMBOL(lustre_swab_llog_rec); + +static void print_llog_hdr(struct llog_log_hdr *h) +{ + CDEBUG(D_OTHER, "llog header: %p\n", h); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_index: %#x\n", h->llh_hdr.lrh_index); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_len: %#x\n", h->llh_hdr.lrh_len); + CDEBUG(D_OTHER, "\tllh_hdr.lrh_type: %#x\n", h->llh_hdr.lrh_type); + CDEBUG(D_OTHER, "\tllh_timestamp: "LPX64"\n", h->llh_timestamp); + CDEBUG(D_OTHER, "\tllh_count: %#x\n", h->llh_count); + CDEBUG(D_OTHER, "\tllh_bitmap_offset: %#x\n", h->llh_bitmap_offset); + CDEBUG(D_OTHER, "\tllh_flags: %#x\n", h->llh_flags); + CDEBUG(D_OTHER, "\tllh_size: %#x\n", h->llh_size); + CDEBUG(D_OTHER, "\tllh_cat_idx: %#x\n", h->llh_cat_idx); + CDEBUG(D_OTHER, "\tllh_tail.lrt_index: %#x\n", h->llh_tail.lrt_index); + CDEBUG(D_OTHER, "\tllh_tail.lrt_len: %#x\n", h->llh_tail.lrt_len); +} + +void lustre_swab_llog_hdr (struct llog_log_hdr *h) +{ + ENTRY; + print_llog_hdr(h); + + lustre_swab_llog_rec(&h->llh_hdr); + + print_llog_hdr(h); + EXIT; +} +EXPORT_SYMBOL(lustre_swab_llog_hdr); + +static void print_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + ENTRY; + + if (!(libcfs_debug & D_OTHER)) /* don't loop on nothing */ + return; + CDEBUG(D_OTHER, "lustre_cfg: %p\n", lcfg); + CDEBUG(D_OTHER, "\tlcfg->lcfg_version: %#x\n", lcfg->lcfg_version); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_command: %#x\n", lcfg->lcfg_command); + CDEBUG(D_OTHER, "\tlcfg->lcfg_num: %#x\n", lcfg->lcfg_num); + CDEBUG(D_OTHER, "\tlcfg->lcfg_flags: %#x\n", lcfg->lcfg_flags); + CDEBUG(D_OTHER, "\tlcfg->lcfg_nid: %s\n", libcfs_nid2str(lcfg->lcfg_nid)); + + CDEBUG(D_OTHER, "\tlcfg->lcfg_bufcount: %d\n", 
lcfg->lcfg_bufcount); + if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT) + for (i = 0; i < lcfg->lcfg_bufcount; i++) + CDEBUG(D_OTHER, "\tlcfg->lcfg_buflens[%d]: %d\n", + i, lcfg->lcfg_buflens[i]); + EXIT; +} + +void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg) +{ + int i; + ENTRY; + + __swab32s(&lcfg->lcfg_version); + + if (lcfg->lcfg_version != LUSTRE_CFG_VERSION) { + CERROR("not swabbing lustre_cfg version %#x (expecting %#x)\n", + lcfg->lcfg_version, LUSTRE_CFG_VERSION); + EXIT; + return; + } + + __swab32s(&lcfg->lcfg_command); + __swab32s(&lcfg->lcfg_num); + __swab32s(&lcfg->lcfg_flags); + __swab64s(&lcfg->lcfg_nid); + __swab32s(&lcfg->lcfg_bufcount); + for (i = 0; i < lcfg->lcfg_bufcount && i < LUSTRE_CFG_MAX_BUFCOUNT; i++) + __swab32s(&lcfg->lcfg_buflens[i]); + + print_lustre_cfg(lcfg); + EXIT; + return; +} +EXPORT_SYMBOL(lustre_swab_lustre_cfg); + +/* used only for compatibility with old on-disk cfg_marker data */ +struct cfg_marker32 { + __u32 cm_step; + __u32 cm_flags; + __u32 cm_vers; + __u32 padding; + __u32 cm_createtime; + __u32 cm_canceltime; + char cm_tgtname[MTI_NAME_MAXLEN]; + char cm_comment[MTI_NAME_MAXLEN]; +}; + +#define MTI_NAMELEN32 (MTI_NAME_MAXLEN - \ + (sizeof(struct cfg_marker) - sizeof(struct cfg_marker32))) + +void lustre_swab_cfg_marker(struct cfg_marker *marker, int swab, int size) +{ + struct cfg_marker32 *cm32 = (struct cfg_marker32*)marker; + ENTRY; + + if (swab) { + __swab32s(&marker->cm_step); + __swab32s(&marker->cm_flags); + __swab32s(&marker->cm_vers); + } + if (size == sizeof(*cm32)) { + __u32 createtime, canceltime; + /* There was a problem with the original declaration of + * cfg_marker on 32-bit systems because it used time_t as + * a wire protocol structure, and didn't verify this in + * wirecheck. We now have to convert the offsets of the + * later fields in order to work on 32- and 64-bit systems. 
+ * + * Fortunately, the cm_comment field has no functional use + * so can be sacrificed when converting the timestamp size. + * + * Overwrite fields from the end first, so they are not + * clobbered, and use memmove() instead of memcpy() because + * the source and target buffers overlap. bug 16771 */ + createtime = cm32->cm_createtime; + canceltime = cm32->cm_canceltime; + memmove(marker->cm_comment, cm32->cm_comment, MTI_NAMELEN32); + marker->cm_comment[MTI_NAMELEN32 - 1] = '\0'; + memmove(marker->cm_tgtname, cm32->cm_tgtname, + sizeof(marker->cm_tgtname)); + if (swab) { + __swab32s(&createtime); + __swab32s(&canceltime); + } + marker->cm_createtime = createtime; + marker->cm_canceltime = canceltime; + CDEBUG(D_CONFIG, "Find old cfg_marker(Srv32b,Clt64b) " + "for target %s, converting\n", + marker->cm_tgtname); + } else if (swab) { + __swab64s(&marker->cm_createtime); + __swab64s(&marker->cm_canceltime); + } + + EXIT; + return; +} +EXPORT_SYMBOL(lustre_swab_cfg_marker); diff --git a/drivers/staging/lustre/lustre/obdclass/llog_test.c b/drivers/staging/lustre/lustre/obdclass/llog_test.c new file mode 100644 index 000000000000..d397f781ec43 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/llog_test.c @@ -0,0 +1,1087 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/llog_test.c + * + * Author: Phil Schwan <phil@clusterfs.com> + * Author: Mikhail Pershin <mike.pershin@intel.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <linux/module.h> +#include <linux/init.h> + +#include <obd_class.h> +#include <lustre_fid.h> +#include <lustre_log.h> + +/* This is slightly more than the number of records that can fit into a + * single llog file, because the llog_log_header takes up some of the + * space in the first block that cannot be used for the bitmap. 
*/ +#define LLOG_TEST_RECNUM (LLOG_CHUNK_SIZE * 8) + +static int llog_test_rand; +static struct obd_uuid uuid = { .uuid = "test_uuid" }; +static struct llog_logid cat_logid; + +struct llog_mini_rec { + struct llog_rec_hdr lmr_hdr; + struct llog_rec_tail lmr_tail; +} __attribute__((packed)); + +static int verify_handle(char *test, struct llog_handle *llh, int num_recs) +{ + int i; + int last_idx = 0; + int active_recs = 0; + + for (i = 0; i < LLOG_BITMAP_BYTES * 8; i++) { + if (ext2_test_bit(i, llh->lgh_hdr->llh_bitmap)) { + last_idx = i; + active_recs++; + } + } + + if (active_recs != num_recs) { + CERROR("%s: expected %d active recs after write, found %d\n", + test, num_recs, active_recs); + RETURN(-ERANGE); + } + + if (llh->lgh_hdr->llh_count != num_recs) { + CERROR("%s: handle->count is %d, expected %d after write\n", + test, llh->lgh_hdr->llh_count, num_recs); + RETURN(-ERANGE); + } + + if (llh->lgh_last_idx < last_idx) { + CERROR("%s: handle->last_idx is %d, expected %d after write\n", + test, llh->lgh_last_idx, last_idx); + RETURN(-ERANGE); + } + + RETURN(0); +} + +/* Test named-log create/open, close */ +static int llog_test_1(const struct lu_env *env, + struct obd_device *obd, char *name) +{ + struct llog_handle *llh; + struct llog_ctxt *ctxt; + int rc; + int rc2; + + ENTRY; + + CWARN("1a: create a log with name: %s\n", name); + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + rc = llog_open_create(env, ctxt, &llh, NULL, name); + if (rc) { + CERROR("1a: llog_create with name %s failed: %d\n", name, rc); + GOTO(out, rc); + } + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("1a: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + rc = verify_handle("1", llh, 1); + + CWARN("1b: close newly-created log\n"); +out_close: + rc2 = llog_close(env, llh); + if (rc2) { + CERROR("1b: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + 
+/* Test named-log reopen; returns opened log on success */ +static int llog_test_2(const struct lu_env *env, struct obd_device *obd, + char *name, struct llog_handle **llh) +{ + struct llog_ctxt *ctxt; + struct llog_handle *loghandle; + struct llog_logid logid; + int rc; + + ENTRY; + + CWARN("2a: re-open a log with name: %s\n", name); + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + rc = llog_open(env, ctxt, llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("2a: re-open log with name %s failed: %d\n", name, rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, *llh, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2a: can't init llog handle: %d\n", rc); + GOTO(out_close_llh, rc); + } + + rc = verify_handle("2", *llh, 1); + if (rc) + GOTO(out_close_llh, rc); + + /* XXX: there is known issue with tests 2b, MGS is not able to create + * anonymous llog, exit now to allow following tests run. + * It is fixed in upcoming llog over OSD code */ + GOTO(out_put, rc); + + CWARN("2b: create a log without specified NAME & LOGID\n"); + rc = llog_open_create(env, ctxt, &loghandle, NULL, NULL); + if (rc) { + CERROR("2b: create log failed\n"); + GOTO(out_close_llh, rc); + } + rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2b: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + logid = loghandle->lgh_id; + llog_close(env, loghandle); + + CWARN("2c: re-open the log by LOGID\n"); + rc = llog_open(env, ctxt, &loghandle, &logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("2c: re-open log by LOGID failed\n"); + GOTO(out_close_llh, rc); + } + + rc = llog_init_handle(env, loghandle, LLOG_F_IS_PLAIN, &uuid); + if (rc) { + CERROR("2c: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + + CWARN("2b: destroy this log\n"); + rc = llog_destroy(env, loghandle); + if (rc) + CERROR("2d: destroy log failed\n"); +out_close: + llog_close(env, loghandle); +out_close_llh: + if (rc) + 
llog_close(env, *llh); +out_put: + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +/* Test record writing, single and in bulk */ +static int llog_test_3(const struct lu_env *env, struct obd_device *obd, + struct llog_handle *llh) +{ + struct llog_gen_rec lgr; + int rc, i; + int num_recs = 1; /* 1 for the header */ + + ENTRY; + + lgr.lgr_hdr.lrh_len = lgr.lgr_tail.lrt_len = sizeof(lgr); + lgr.lgr_hdr.lrh_type = LLOG_GEN_REC; + + CWARN("3a: write one create_rec\n"); + rc = llog_write(env, llh, &lgr.lgr_hdr, NULL, 0, NULL, -1); + num_recs++; + if (rc < 0) { + CERROR("3a: write one log record failed: %d\n", rc); + RETURN(rc); + } + + rc = verify_handle("3a", llh, num_recs); + if (rc) + RETURN(rc); + + CWARN("3b: write 10 cfg log records with 8 bytes bufs\n"); + for (i = 0; i < 10; i++) { + struct llog_rec_hdr hdr; + char buf[8]; + + hdr.lrh_len = 8; + hdr.lrh_type = OBD_CFG_REC; + memset(buf, 0, sizeof buf); + rc = llog_write(env, llh, &hdr, NULL, 0, buf, -1); + if (rc < 0) { + CERROR("3b: write 10 records failed at #%d: %d\n", + i + 1, rc); + RETURN(rc); + } + num_recs++; + } + + rc = verify_handle("3b", llh, num_recs); + if (rc) + RETURN(rc); + + CWARN("3c: write 1000 more log records\n"); + for (i = 0; i < 1000; i++) { + rc = llog_write(env, llh, &lgr.lgr_hdr, NULL, 0, NULL, -1); + if (rc < 0) { + CERROR("3c: write 1000 records failed at #%d: %d\n", + i + 1, rc); + RETURN(rc); + } + num_recs++; + } + + rc = verify_handle("3c", llh, num_recs); + if (rc) + RETURN(rc); + + CWARN("3d: write log more than BITMAP_SIZE, return -ENOSPC\n"); + for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr) + 1; i++) { + struct llog_rec_hdr hdr; + char buf_even[24]; + char buf_odd[32]; + + memset(buf_odd, 0, sizeof buf_odd); + memset(buf_even, 0, sizeof buf_even); + if ((i % 2) == 0) { + hdr.lrh_len = 24; + hdr.lrh_type = OBD_CFG_REC; + rc = llog_write(env, llh, &hdr, NULL, 0, buf_even, -1); + } else { + hdr.lrh_len = 32; + hdr.lrh_type = OBD_CFG_REC; + rc = llog_write(env, llh, &hdr, NULL, 0, 
buf_odd, -1); + } + if (rc == -ENOSPC) { + break; + } else if (rc < 0) { + CERROR("3d: write recs failed at #%d: %d\n", + i + 1, rc); + RETURN(rc); + } + num_recs++; + } + if (rc != -ENOSPC) { + CWARN("3d: write record more than BITMAP size!\n"); + RETURN(-EINVAL); + } + CWARN("3d: wrote %d more records before end of llog is reached\n", + num_recs); + + rc = verify_handle("3d", llh, num_recs); + + RETURN(rc); +} + +/* Test catalogue additions */ +static int llog_test_4(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *cath; + char name[10]; + int rc, rc2, i, buflen; + struct llog_mini_rec lmr; + struct llog_cookie cookie; + struct llog_ctxt *ctxt; + int num_recs = 0; + char *buf; + struct llog_rec_hdr rec; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = 0xf00f00; + + sprintf(name, "%x", llog_test_rand + 1); + CWARN("4a: create a catalog log with name: %s\n", name); + rc = llog_open_create(env, ctxt, &cath, NULL, name); + if (rc) { + CERROR("4a: llog_create with name %s failed: %d\n", name, rc); + GOTO(ctxt_release, rc); + } + rc = llog_init_handle(env, cath, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("4a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + num_recs++; + cat_logid = cath->lgh_id; + + CWARN("4b: write 1 record into the catalog\n"); + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, &cookie, NULL); + if (rc != 1) { + CERROR("4b: write 1 catalog record failed at: %d\n", rc); + GOTO(out, rc); + } + num_recs++; + rc = verify_handle("4b", cath, 2); + if (rc) + GOTO(out, rc); + + rc = verify_handle("4b", cath->u.chd.chd_current_log, num_recs); + if (rc) + GOTO(out, rc); + + CWARN("4c: cancel 1 log record\n"); + rc = llog_cat_cancel_records(env, cath, 1, &cookie); + if (rc) { + CERROR("4c: cancel 1 catalog based record failed: %d\n", rc); + GOTO(out, rc); + } + num_recs--; + + rc = verify_handle("4c", 
cath->u.chd.chd_current_log, num_recs); + if (rc) + GOTO(out, rc); + + CWARN("4d: write %d more log records\n", LLOG_TEST_RECNUM); + for (i = 0; i < LLOG_TEST_RECNUM; i++) { + rc = llog_cat_add(env, cath, &lmr.lmr_hdr, NULL, NULL); + if (rc) { + CERROR("4d: write %d records failed at #%d: %d\n", + LLOG_TEST_RECNUM, i + 1, rc); + GOTO(out, rc); + } + num_recs++; + } + + /* make sure new plain llog appears */ + rc = verify_handle("4d", cath, 3); + if (rc) + GOTO(out, rc); + + CWARN("4e: add 5 large records, one record per block\n"); + buflen = LLOG_CHUNK_SIZE - sizeof(struct llog_rec_hdr) - + sizeof(struct llog_rec_tail); + OBD_ALLOC(buf, buflen); + if (buf == NULL) + GOTO(out, rc = -ENOMEM); + for (i = 0; i < 5; i++) { + rec.lrh_len = buflen; + rec.lrh_type = OBD_CFG_REC; + rc = llog_cat_add(env, cath, &rec, NULL, buf); + if (rc) { + CERROR("4e: write 5 records failed at #%d: %d\n", + i + 1, rc); + GOTO(out_free, rc); + } + num_recs++; + } +out_free: + OBD_FREE(buf, buflen); +out: + CWARN("4f: put newly-created catalog\n"); + rc2 = llog_cat_close(env, cath); + if (rc2) { + CERROR("4: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static int cat_counter; + +static int cat_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_logid_rec *lir = (struct llog_logid_rec *)rec; + struct lu_fid fid = {0}; + + if (rec->lrh_type != LLOG_LOGID_MAGIC) { + CERROR("invalid record in catalog\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&lir->lid_id, &fid); + + CWARN("seeing record at index %d - "DFID" in log "DFID"\n", + rec->lrh_index, PFID(&fid), + PFID(lu_object_fid(&llh->lgh_obj->do_lu))); + + cat_counter++; + + RETURN(0); +} + +static int plain_counter; + +static int plain_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct lu_fid fid = {0}; + + if (!(llh->lgh_hdr->llh_flags 
& LLOG_F_IS_PLAIN)) { + CERROR("log is not plain\n"); + RETURN(-EINVAL); + } + + logid_to_fid(&llh->lgh_id, &fid); + + CDEBUG(D_INFO, "seeing record at index %d in log "DFID"\n", + rec->lrh_index, PFID(&fid)); + + plain_counter++; + + RETURN(0); +} + +static int cancel_count; + +static int llog_cancel_rec_cb(const struct lu_env *env, + struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct llog_cookie cookie; + + if (!(llh->lgh_hdr->llh_flags & LLOG_F_IS_PLAIN)) { + CERROR("log is not plain\n"); + RETURN(-EINVAL); + } + + cookie.lgc_lgl = llh->lgh_id; + cookie.lgc_index = rec->lrh_index; + + llog_cat_cancel_records(env, llh->u.phd.phd_cat_handle, 1, &cookie); + cancel_count++; + if (cancel_count == LLOG_TEST_RECNUM) + RETURN(-LLOG_EEMPTY); + RETURN(0); +} + +/* Test log and catalogue processing */ +static int llog_test_5(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + char name[10]; + int rc, rc2; + struct llog_mini_rec lmr; + struct llog_ctxt *ctxt; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + lmr.lmr_hdr.lrh_len = lmr.lmr_tail.lrt_len = LLOG_MIN_REC_SIZE; + lmr.lmr_hdr.lrh_type = 0xf00f00; + + CWARN("5a: re-open catalog by id\n"); + rc = llog_open(env, ctxt, &llh, &cat_logid, NULL, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("5a: llog_create with logid failed: %d\n", rc); + GOTO(out_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_CAT, &uuid); + if (rc) { + CERROR("5a: can't init llog handle: %d\n", rc); + GOTO(out, rc); + } + + CWARN("5b: print the catalog entries.. 
we expect 2\n"); + cat_counter = 0; + rc = llog_process(env, llh, cat_print_cb, "test 5", NULL); + if (rc) { + CERROR("5b: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 2) { + CERROR("5b: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5c: Cancel %d records, see one log zapped\n", LLOG_TEST_RECNUM); + cancel_count = 0; + rc = llog_cat_process(env, llh, llog_cancel_rec_cb, "foobar", 0, 0); + if (rc != -LLOG_EEMPTY) { + CERROR("5c: process with cat_cancel_cb failed: %d\n", rc); + GOTO(out, rc); + } + + CWARN("5c: print the catalog entries.. we expect 1\n"); + cat_counter = 0; + rc = llog_process(env, llh, cat_print_cb, "test 5", NULL); + if (rc) { + CERROR("5c: process with cat_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (cat_counter != 1) { + CERROR("5c: %d entries in catalog\n", cat_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5d: add 1 record to the log with many canceled empty pages\n"); + rc = llog_cat_add(env, llh, &lmr.lmr_hdr, NULL, NULL); + if (rc) { + CERROR("5d: add record to the log with many canceled empty " + "pages failed\n"); + GOTO(out, rc); + } + + CWARN("5e: print plain log entries.. expect 6\n"); + plain_counter = 0; + rc = llog_cat_process(env, llh, plain_print_cb, "foobar", 0, 0); + if (rc) { + CERROR("5e: process with plain_print_cb failed: %d\n", rc); + GOTO(out, rc); + } + if (plain_counter != 6) { + CERROR("5e: found %d records\n", plain_counter); + GOTO(out, rc = -EINVAL); + } + + CWARN("5f: print plain log entries reversely.. 
expect 6\n"); + plain_counter = 0; + rc = llog_cat_reverse_process(env, llh, plain_print_cb, "foobar"); + if (rc) { + CERROR("5f: reversely process with plain_print_cb failed:" + "%d\n", rc); + GOTO(out, rc); + } + if (plain_counter != 6) { + CERROR("5f: found %d records\n", plain_counter); + GOTO(out, rc = -EINVAL); + } + +out: + CWARN("5g: close re-opened catalog\n"); + rc2 = llog_cat_close(env, llh); + if (rc2) { + CERROR("5g: close log %s failed: %d\n", name, rc2); + if (rc == 0) + rc = rc2; + } +out_put: + llog_ctxt_put(ctxt); + + RETURN(rc); +} + +/* Test client api; open log by name and process */ +static int llog_test_6(const struct lu_env *env, struct obd_device *obd, + char *name) +{ + struct obd_device *mgc_obd; + struct llog_ctxt *ctxt; + struct obd_uuid *mgs_uuid; + struct obd_export *exp; + struct obd_uuid uuid = { "LLOG_TEST6_UUID" }; + struct llog_handle *llh = NULL; + struct llog_ctxt *nctxt; + int rc, rc2; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + mgs_uuid = &ctxt->loc_exp->exp_obd->obd_uuid; + + CWARN("6a: re-open log %s using client API\n", name); + mgc_obd = class_find_client_obd(mgs_uuid, LUSTRE_MGC_NAME, NULL); + if (mgc_obd == NULL) { + CERROR("6a: no MGC devices connected to %s found.\n", + mgs_uuid->uuid); + GOTO(ctxt_release, rc = -ENOENT); + } + + rc = obd_connect(NULL, &exp, mgc_obd, &uuid, + NULL /* obd_connect_data */, NULL); + if (rc != -EALREADY) { + CERROR("6a: connect on connected MGC (%s) failed to return" + " -EALREADY", mgc_obd->obd_name); + if (rc == 0) + obd_disconnect(exp); + GOTO(ctxt_release, rc = -EINVAL); + } + + nctxt = llog_get_context(mgc_obd, LLOG_CONFIG_REPL_CTXT); + rc = llog_open(env, nctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) { + CERROR("6a: llog_open failed %d\n", rc); + GOTO(nctxt_put, rc); + } + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) { + CERROR("6a: llog_init_handle failed %d\n", rc); + GOTO(parse_out, rc); + } + + plain_counter = 1; /* 
llog header is first record */ + CWARN("6b: process log %s using client API\n", name); + rc = llog_process(env, llh, plain_print_cb, NULL, NULL); + if (rc) + CERROR("6b: llog_process failed %d\n", rc); + CWARN("6b: processed %d records\n", plain_counter); + + rc = verify_handle("6b", llh, plain_counter); + if (rc) + GOTO(parse_out, rc); + + plain_counter = 1; /* llog header is first record */ + CWARN("6c: process log %s reversely using client API\n", name); + rc = llog_reverse_process(env, llh, plain_print_cb, NULL, NULL); + if (rc) + CERROR("6c: llog_reverse_process failed %d\n", rc); + CWARN("6c: processed %d records\n", plain_counter); + + rc = verify_handle("6c", llh, plain_counter); + if (rc) + GOTO(parse_out, rc); + +parse_out: + rc2 = llog_close(env, llh); + if (rc2) { + CERROR("6: llog_close failed: rc = %d\n", rc2); + if (rc == 0) + rc = rc2; + } +nctxt_put: + llog_ctxt_put(nctxt); +ctxt_release: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +static union { + struct llog_rec_hdr lrh; /* common header */ + struct llog_logid_rec llr; /* LLOG_LOGID_MAGIC */ + struct llog_unlink64_rec lur; /* MDS_UNLINK64_REC */ + struct llog_setattr64_rec lsr64; /* MDS_SETATTR64_REC */ + struct llog_size_change_rec lscr; /* OST_SZ_REC */ + struct llog_changelog_rec lcr; /* CHANGELOG_REC */ + struct llog_changelog_user_rec lcur; /* CHANGELOG_USER_REC */ + struct llog_gen_rec lgr; /* LLOG_GEN_REC */ +} llog_records; + +static int test_7_print_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + struct lu_fid fid = {0}; + + logid_to_fid(&llh->lgh_id, &fid); + + CDEBUG(D_OTHER, "record type %#x at index %d in log "DFID"\n", + rec->lrh_type, rec->lrh_index, PFID(&fid)); + + plain_counter++; + return 0; +} + +static int test_7_cancel_cb(const struct lu_env *env, struct llog_handle *llh, + struct llog_rec_hdr *rec, void *data) +{ + plain_counter++; + /* test LLOG_DEL_RECORD is working */ + return LLOG_DEL_RECORD; +} + +static int 
llog_test_7_sub(const struct lu_env *env, struct llog_ctxt *ctxt) +{ + struct llog_handle *llh; + int rc = 0, i, process_count; + int num_recs = 0; + + ENTRY; + + rc = llog_open_create(env, ctxt, &llh, NULL, NULL); + if (rc) { + CERROR("7_sub: create log failed\n"); + RETURN(rc); + } + + rc = llog_init_handle(env, llh, + LLOG_F_IS_PLAIN | LLOG_F_ZAP_WHEN_EMPTY, + &uuid); + if (rc) { + CERROR("7_sub: can't init llog handle: %d\n", rc); + GOTO(out_close, rc); + } + for (i = 0; i < LLOG_BITMAP_SIZE(llh->lgh_hdr); i++) { + rc = llog_write(env, llh, &llog_records.lrh, NULL, 0, + NULL, -1); + if (rc == -ENOSPC) { + break; + } else if (rc < 0) { + CERROR("7_sub: write recs failed at #%d: %d\n", + i + 1, rc); + GOTO(out_close, rc); + } + num_recs++; + } + if (rc != -ENOSPC) { + CWARN("7_sub: write record more than BITMAP size!\n"); + GOTO(out_close, rc = -EINVAL); + } + + rc = verify_handle("7_sub", llh, num_recs + 1); + if (rc) { + CERROR("7_sub: verify handle failed: %d\n", rc); + GOTO(out_close, rc); + } + if (num_recs < LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1) + CWARN("7_sub: records are not aligned, written %d from %u\n", + num_recs, LLOG_BITMAP_SIZE(llh->lgh_hdr) - 1); + + plain_counter = 0; + rc = llog_process(env, llh, test_7_print_cb, "test 7", NULL); + if (rc) { + CERROR("7_sub: llog process failed: %d\n", rc); + GOTO(out_close, rc); + } + process_count = plain_counter; + if (process_count != num_recs) { + CERROR("7_sub: processed %d records from %d total\n", + process_count, num_recs); + GOTO(out_close, rc = -EINVAL); + } + + plain_counter = 0; + rc = llog_reverse_process(env, llh, test_7_cancel_cb, "test 7", NULL); + if (rc) { + CERROR("7_sub: reverse llog process failed: %d\n", rc); + GOTO(out_close, rc); + } + if (process_count != plain_counter) { + CERROR("7_sub: Reverse/direct processing found different" + "number of records: %d/%d\n", + plain_counter, process_count); + GOTO(out_close, rc = -EINVAL); + } + if (llog_exist(llh)) { + CERROR("7_sub: llog exists but 
should be zapped\n"); + GOTO(out_close, rc = -EEXIST); + } + + rc = verify_handle("7_sub", llh, 1); +out_close: + if (rc) + llog_destroy(env, llh); + llog_close(env, llh); + RETURN(rc); +} + +/* Test all llog records writing and processing */ +static int llog_test_7(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + int rc; + + ENTRY; + + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + + CWARN("7a: test llog_logid_rec\n"); + llog_records.llr.lid_hdr.lrh_len = sizeof(llog_records.llr); + llog_records.llr.lid_tail.lrt_len = sizeof(llog_records.llr); + llog_records.llr.lid_hdr.lrh_type = LLOG_LOGID_MAGIC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7a: llog_logid_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7b: test llog_unlink64_rec\n"); + llog_records.lur.lur_hdr.lrh_len = sizeof(llog_records.lur); + llog_records.lur.lur_tail.lrt_len = sizeof(llog_records.lur); + llog_records.lur.lur_hdr.lrh_type = MDS_UNLINK64_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7b: llog_unlink_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7c: test llog_setattr64_rec\n"); + llog_records.lsr64.lsr_hdr.lrh_len = sizeof(llog_records.lsr64); + llog_records.lsr64.lsr_tail.lrt_len = sizeof(llog_records.lsr64); + llog_records.lsr64.lsr_hdr.lrh_type = MDS_SETATTR64_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7c: llog_setattr64_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7d: test llog_size_change_rec\n"); + llog_records.lscr.lsc_hdr.lrh_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_tail.lrt_len = sizeof(llog_records.lscr); + llog_records.lscr.lsc_hdr.lrh_type = OST_SZ_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7d: llog_size_change_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7e: test llog_changelog_rec\n"); + llog_records.lcr.cr_hdr.lrh_len = sizeof(llog_records.lcr); + llog_records.lcr.cr_tail.lrt_len = sizeof(llog_records.lcr); + 
llog_records.lcr.cr_hdr.lrh_type = CHANGELOG_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7e: llog_changelog_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7f: test llog_changelog_user_rec\n"); + llog_records.lcur.cur_hdr.lrh_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_tail.lrt_len = sizeof(llog_records.lcur); + llog_records.lcur.cur_hdr.lrh_type = CHANGELOG_USER_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7f: llog_changelog_user_rec test failed\n"); + GOTO(out, rc); + } + + CWARN("7g: test llog_gen_rec\n"); + llog_records.lgr.lgr_hdr.lrh_len = sizeof(llog_records.lgr); + llog_records.lgr.lgr_tail.lrt_len = sizeof(llog_records.lgr); + llog_records.lgr.lgr_hdr.lrh_type = LLOG_GEN_REC; + + rc = llog_test_7_sub(env, ctxt); + if (rc) { + CERROR("7g: llog_size_change_rec test failed\n"); + GOTO(out, rc); + } +out: + llog_ctxt_put(ctxt); + RETURN(rc); +} + +/* ------------------------------------------------------------------------- + * Tests above, boring obd functions below + * ------------------------------------------------------------------------- */ +static int llog_run_tests(const struct lu_env *env, struct obd_device *obd) +{ + struct llog_handle *llh = NULL; + struct llog_ctxt *ctxt; + int rc, err; + char name[10]; + + ENTRY; + ctxt = llog_get_context(obd, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + + sprintf(name, "%x", llog_test_rand); + + rc = llog_test_1(env, obd, name); + if (rc) + GOTO(cleanup_ctxt, rc); + + rc = llog_test_2(env, obd, name, &llh); + if (rc) + GOTO(cleanup_ctxt, rc); + + rc = llog_test_3(env, obd, llh); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_4(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_5(env, obd); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_6(env, obd, name); + if (rc) + GOTO(cleanup, rc); + + rc = llog_test_7(env, obd); + if (rc) + GOTO(cleanup, rc); + +cleanup: + err = llog_destroy(env, llh); + if (err) + CERROR("cleanup: llog_destroy failed: %d\n", 
err); + llog_close(env, llh); + if (rc == 0) + rc = err; +cleanup_ctxt: + llog_ctxt_put(ctxt); + return rc; +} + +#ifdef LPROCFS +static struct lprocfs_vars lprocfs_llog_test_obd_vars[] = { {0} }; +static struct lprocfs_vars lprocfs_llog_test_module_vars[] = { {0} }; +static void lprocfs_llog_test_init_vars(struct lprocfs_static_vars *lvars) +{ + lvars->module_vars = lprocfs_llog_test_module_vars; + lvars->obd_vars = lprocfs_llog_test_obd_vars; +} +#endif + +static int llog_test_cleanup(struct obd_device *obd) +{ + struct obd_device *tgt; + struct lu_env env; + int rc; + + ENTRY; + + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + RETURN(rc); + + tgt = obd->obd_lvfs_ctxt.dt->dd_lu_dev.ld_obd; + rc = llog_cleanup(&env, llog_get_context(tgt, LLOG_TEST_ORIG_CTXT)); + if (rc) + CERROR("failed to llog_test_llog_finish: %d\n", rc); + lu_env_fini(&env); + RETURN(rc); +} + +static int llog_test_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_device *tgt; + struct llog_ctxt *ctxt; + struct dt_object *o; + struct lu_env env; + struct lu_context test_session; + int rc; + + ENTRY; + + if (lcfg->lcfg_bufcount < 2) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + if (lcfg->lcfg_buflens[1] < 1) { + CERROR("requires a TARGET OBD name\n"); + RETURN(-EINVAL); + } + + /* disk obd */ + tgt = class_name2obd(lustre_cfg_string(lcfg, 1)); + if (!tgt || !tgt->obd_attached || !tgt->obd_set_up) { + CERROR("target device not attached or not set up (%s)\n", + lustre_cfg_string(lcfg, 1)); + RETURN(-EINVAL); + } + + rc = lu_env_init(&env, LCT_LOCAL | LCT_MG_THREAD); + if (rc) + RETURN(rc); + + rc = lu_context_init(&test_session, LCT_SESSION); + if (rc) + GOTO(cleanup_env, rc); + test_session.lc_thread = (struct ptlrpc_thread *)current; + lu_context_enter(&test_session); + env.le_ses = &test_session; + + CWARN("Setup llog-test device over %s device\n", + lustre_cfg_string(lcfg, 1)); + + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + 
obd->obd_lvfs_ctxt.dt = lu2dt_dev(tgt->obd_lu_dev); + + rc = llog_setup(&env, tgt, &tgt->obd_olg, LLOG_TEST_ORIG_CTXT, tgt, + &llog_osd_ops); + if (rc) + GOTO(cleanup_session, rc); + + /* use MGS llog dir for tests */ + ctxt = llog_get_context(tgt, LLOG_CONFIG_ORIG_CTXT); + LASSERT(ctxt); + o = ctxt->loc_dir; + llog_ctxt_put(ctxt); + + ctxt = llog_get_context(tgt, LLOG_TEST_ORIG_CTXT); + LASSERT(ctxt); + ctxt->loc_dir = o; + llog_ctxt_put(ctxt); + + llog_test_rand = cfs_rand(); + + rc = llog_run_tests(&env, tgt); + if (rc) + llog_test_cleanup(obd); +cleanup_session: + lu_context_exit(&test_session); + lu_context_fini(&test_session); +cleanup_env: + lu_env_fini(&env); + RETURN(rc); +} + +static struct obd_ops llog_obd_ops = { + .o_owner = THIS_MODULE, + .o_setup = llog_test_setup, + .o_cleanup = llog_test_cleanup, +}; + +static int __init llog_test_init(void) +{ + struct lprocfs_static_vars lvars; + + lprocfs_llog_test_init_vars(&lvars); + return class_register_type(&llog_obd_ops, NULL, + lvars.module_vars, "llog_test", NULL); +} + +static void __exit llog_test_exit(void) +{ + class_unregister_type("llog_test"); +} + +MODULE_AUTHOR("Sun Microsystems, Inc. <http://www.lustre.org/>"); +MODULE_DESCRIPTION("llog test module"); +MODULE_LICENSE("GPL"); + +module_init(llog_test_init); +module_exit(llog_test_exit); diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.c b/drivers/staging/lustre/lustre/obdclass/local_storage.c new file mode 100644 index 000000000000..3be35a83a495 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/local_storage.c @@ -0,0 +1,903 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, Intel Corporation. + */ +/* + * lustre/obdclass/local_storage.c + * + * Local storage for file/objects with fid generation. Works on top of OSD. + * + * Author: Mikhail Pershin <mike.pershin@intel.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include "local_storage.h" + +/* all initialized local storages on this node are linked on this */ +static LIST_HEAD(ls_list_head); +static DEFINE_MUTEX(ls_list_mutex); + +static int ls_object_init(const struct lu_env *env, struct lu_object *o, + const struct lu_object_conf *unused) +{ + struct ls_device *ls; + struct lu_object *below; + struct lu_device *under; + + ENTRY; + + ls = container_of0(o->lo_dev, struct ls_device, ls_top_dev.dd_lu_dev); + under = &ls->ls_osd->dd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under); + if (below == NULL) + RETURN(-ENOMEM); + + lu_object_add(o, below); + + RETURN(0); +} + +static void ls_object_free(const struct lu_env *env, struct lu_object *o) +{ + struct ls_object *obj = lu2ls_obj(o); + struct lu_object_header *h = o->lo_header; + + dt_object_fini(&obj->ls_obj); + lu_object_header_fini(h); + OBD_FREE_PTR(obj); +} + +struct lu_object_operations ls_lu_obj_ops = { + .loo_object_init = ls_object_init, + .loo_object_free = ls_object_free, +}; + +struct lu_object *ls_object_alloc(const struct lu_env *env, + const struct lu_object_header *_h, + struct lu_device *d) +{ + struct 
lu_object_header *h;
+	struct ls_object	*o;
+	struct lu_object	*l;
+
+	/* local-storage objects are always allocated as top objects,
+	 * so no pre-existing header may be passed in */
+	LASSERT(_h == NULL);
+
+	OBD_ALLOC_PTR(o);
+	if (o != NULL) {
+		l = &o->ls_obj.do_lu;
+		h = &o->ls_header;
+
+		lu_object_header_init(h);
+		dt_object_init(&o->ls_obj, h, d);
+		lu_object_add_top(h, l);
+
+		l->lo_ops = &ls_lu_obj_ops;
+
+		return l;
+	} else {
+		return NULL;
+	}
+}
+
+static struct lu_device_operations ls_lu_dev_ops = {
+	.ldo_object_alloc = ls_object_alloc
+};
+
+/* Find an already-initialized ls_device wrapping @dev on the global
+ * ls_list_head list.  Caller must hold ls_list_mutex.  On success a
+ * reference is taken (ls_refcount) which the caller must drop with
+ * ls_device_put(); returns NULL if no matching device is found. */
+static struct ls_device *__ls_find_dev(struct dt_device *dev)
+{
+	struct ls_device *ls, *ret = NULL;
+
+	list_for_each_entry(ls, &ls_list_head, ls_linkage) {
+		if (ls->ls_osd == dev) {
+			atomic_inc(&ls->ls_refcount);
+			ret = ls;
+			break;
+		}
+	}
+	return ret;
+}
+
+/* Locked wrapper around __ls_find_dev(): look up (and reference) the
+ * ls_device for @dev without creating one if it does not exist. */
+struct ls_device *ls_find_dev(struct dt_device *dev)
+{
+	struct ls_device *ls;
+
+	mutex_lock(&ls_list_mutex);
+	ls = __ls_find_dev(dev);
+	mutex_unlock(&ls_list_mutex);
+
+	return ls;
+}
+
+static struct lu_device_type_operations ls_device_type_ops = {
+	.ldto_start = NULL,
+	.ldto_stop = NULL,
+};
+
+static struct lu_device_type ls_lu_type = {
+	.ldt_name = "local_storage",
+	.ldt_ops = &ls_device_type_ops,
+};
+
+/* Get (find or create) the ls_device stacked on top of OSD @dev.
+ *
+ * Returns the referenced ls_device, or ERR_PTR(-ENOMEM) if a new one
+ * could not be allocated.  The reference is released by
+ * ls_device_put(). */
+struct ls_device *ls_device_get(struct dt_device *dev)
+{
+	struct ls_device *ls;
+
+	ENTRY;
+
+	mutex_lock(&ls_list_mutex);
+	ls = __ls_find_dev(dev);
+	if (ls)
+		GOTO(out_ls, ls);
+
+	/* not found, then create */
+	OBD_ALLOC_PTR(ls);
+	if (ls == NULL)
+		GOTO(out_ls, ls = ERR_PTR(-ENOMEM));
+
+	atomic_set(&ls->ls_refcount, 1);
+	INIT_LIST_HEAD(&ls->ls_los_list);
+	mutex_init(&ls->ls_los_mutex);
+
+	ls->ls_osd = dev;
+
+	/* the top device shares the site of the underlying OSD */
+	LASSERT(dev->dd_lu_dev.ld_site);
+	lu_device_init(&ls->ls_top_dev.dd_lu_dev, &ls_lu_type);
+	ls->ls_top_dev.dd_lu_dev.ld_ops = &ls_lu_dev_ops;
+	ls->ls_top_dev.dd_lu_dev.ld_site = dev->dd_lu_dev.ld_site;
+
+	/* finally add ls to the list */
+	list_add(&ls->ls_linkage, &ls_list_head);
+out_ls:
+	mutex_unlock(&ls_list_mutex);
+	RETURN(ls);
+}
+
+/* Drop a reference on @ls; on the last put the device is unlinked from
+ * the global list, its site is purged and the memory is freed. */
+void ls_device_put(const struct lu_env *env, struct ls_device *ls)
+{
+	LASSERT(env);
+	
if (!atomic_dec_and_test(&ls->ls_refcount)) + return; + + mutex_lock(&ls_list_mutex); + if (atomic_read(&ls->ls_refcount) == 0) { + LASSERT(list_empty(&ls->ls_los_list)); + list_del(&ls->ls_linkage); + lu_site_purge(env, ls->ls_top_dev.dd_lu_dev.ld_site, ~0); + lu_device_fini(&ls->ls_top_dev.dd_lu_dev); + OBD_FREE_PTR(ls); + } + mutex_unlock(&ls_list_mutex); +} + +/** + * local file fid generation + */ +int local_object_fid_generate(const struct lu_env *env, + struct local_oid_storage *los, + struct lu_fid *fid) +{ + LASSERT(los->los_dev); + LASSERT(los->los_obj); + + /* take next OID */ + + /* to make it unique after reboot we store + * the latest generated fid atomically with + * object creation see local_object_create() */ + + mutex_lock(&los->los_id_lock); + fid->f_seq = los->los_seq; + fid->f_oid = ++los->los_last_oid; + fid->f_ver = 0; + mutex_unlock(&los->los_id_lock); + + return 0; +} + +int local_object_declare_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, struct lu_attr *attr, + struct dt_object_format *dof, + struct thandle *th) +{ + struct dt_thread_info *dti = dt_info(env); + int rc; + + ENTRY; + + /* update fid generation file */ + if (los != NULL) { + LASSERT(dt_object_exists(los->los_obj)); + rc = dt_declare_record_write(env, los->los_obj, + sizeof(struct los_ondisk), 0, th); + if (rc) + RETURN(rc); + } + + rc = dt_declare_create(env, o, attr, NULL, dof, th); + if (rc) + RETURN(rc); + + dti->dti_lb.lb_buf = NULL; + dti->dti_lb.lb_len = sizeof(dti->dti_lma); + rc = dt_declare_xattr_set(env, o, &dti->dti_lb, XATTR_NAME_LMA, 0, th); + + RETURN(rc); +} + +int local_object_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *o, struct lu_attr *attr, + struct dt_object_format *dof, struct thandle *th) +{ + struct dt_thread_info *dti = dt_info(env); + obd_id lastid; + int rc; + + ENTRY; + + rc = dt_create(env, o, attr, NULL, dof, th); + if (rc) + RETURN(rc); + + if (los == NULL) + 
RETURN(rc); + + LASSERT(los->los_obj); + LASSERT(dt_object_exists(los->los_obj)); + + /* many threads can be updated this, serialize + * them here to avoid the race where one thread + * takes the value first, but writes it last */ + mutex_lock(&los->los_id_lock); + + /* update local oid number on disk so that + * we know the last one used after reboot */ + lastid = cpu_to_le64(los->los_last_oid); + + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + rc = dt_record_write(env, los->los_obj, &dti->dti_lb, &dti->dti_off, + th); + mutex_unlock(&los->los_id_lock); + + RETURN(rc); +} + +/* + * Create local named object (file, directory or index) in parent directory. + */ +struct dt_object *__local_file_create(const struct lu_env *env, + const struct lu_fid *fid, + struct local_oid_storage *los, + struct ls_device *ls, + struct dt_object *parent, + const char *name, struct lu_attr *attr, + struct dt_object_format *dof) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + struct thandle *th; + int rc; + + dto = ls_locate(env, ls, fid); + if (unlikely(IS_ERR(dto))) + RETURN(dto); + + LASSERT(dto != NULL); + if (dt_object_exists(dto)) + GOTO(out, rc = -EEXIST); + + th = dt_trans_create(env, ls->ls_osd); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = local_object_declare_create(env, los, dto, attr, dof, th); + if (rc) + GOTO(trans_stop, rc); + + if (dti->dti_dof.dof_type == DFT_DIR) { + dt_declare_ref_add(env, dto, th); + dt_declare_ref_add(env, parent, th); + } + + rc = dt_declare_insert(env, parent, (void *)fid, (void *)name, th); + if (rc) + GOTO(trans_stop, rc); + + rc = dt_trans_start_local(env, ls->ls_osd, th); + if (rc) + GOTO(trans_stop, rc); + + dt_write_lock(env, dto, 0); + if (dt_object_exists(dto)) + GOTO(unlock, rc = 0); + + CDEBUG(D_OTHER, "create new object "DFID"\n", + PFID(lu_object_fid(&dto->do_lu))); + rc = local_object_create(env, los, dto, attr, dof, th); + if (rc) + GOTO(unlock, 
rc); + LASSERT(dt_object_exists(dto)); + + if (dti->dti_dof.dof_type == DFT_DIR) { + if (!dt_try_as_dir(env, dto)) + GOTO(destroy, rc = -ENOTDIR); + /* Add "." and ".." for newly created dir */ + rc = dt_insert(env, dto, (void *)fid, (void *)".", th, + BYPASS_CAPA, 1); + if (rc) + GOTO(destroy, rc); + dt_ref_add(env, dto, th); + rc = dt_insert(env, dto, (void *)lu_object_fid(&parent->do_lu), + (void *)"..", th, BYPASS_CAPA, 1); + if (rc) + GOTO(destroy, rc); + } + + dt_write_lock(env, parent, 0); + rc = dt_insert(env, parent, (const struct dt_rec *)fid, + (const struct dt_key *)name, th, BYPASS_CAPA, 1); + if (dti->dti_dof.dof_type == DFT_DIR) + dt_ref_add(env, parent, th); + dt_write_unlock(env, parent); + if (rc) + GOTO(destroy, rc); +destroy: + if (rc) + dt_destroy(env, dto, th); +unlock: + dt_write_unlock(env, dto); +trans_stop: + dt_trans_stop(env, ls->ls_osd, th); +out: + if (rc) { + lu_object_put_nocache(env, &dto->do_lu); + dto = ERR_PTR(rc); + } + RETURN(dto); +} + +/* + * Look up and create (if it does not exist) a local named file or directory in + * parent directory. 
+ */ +struct dt_object *local_file_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) + /* name is found, get the object */ + dto = ls_locate(env, dt2ls_dev(los->los_dev), &dti->dti_fid); + else if (rc != -ENOENT) + dto = ERR_PTR(rc); + else { + rc = local_object_fid_generate(env, los, &dti->dti_fid); + if (rc < 0) { + dto = ERR_PTR(rc); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT); + dto = __local_file_create(env, &dti->dti_fid, los, + dt2ls_dev(los->los_dev), + parent, name, &dti->dti_attr, + &dti->dti_dof); + } + } + return dto; +} +EXPORT_SYMBOL(local_file_find_or_create); + +struct dt_object *local_file_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, + __u32 mode) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + dto = dt_locate(env, dt, &dti->dti_fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + struct ls_device *ls; + + ls = ls_device_get(dt); + if (IS_ERR(ls)) { + dto = ERR_PTR(PTR_ERR(ls)); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = dt_mode_to_dft(mode & S_IFMT); + dto = __local_file_create(env, fid, NULL, ls, parent, + name, &dti->dti_attr, + &dti->dti_dof); + /* ls_device_put() will finalize the ls device, we + * have to open the object in other device stack */ + if (!IS_ERR(dto)) { + dti->dti_fid = dto->do_lu.lo_header->loh_fid; + 
lu_object_put_nocache(env, &dto->do_lu); + dto = dt_locate(env, dt, &dti->dti_fid); + } + ls_device_put(env, ls); + } + } + return dto; +} +EXPORT_SYMBOL(local_file_find_or_create_with_fid); + +/* + * Look up and create (if it does not exist) a local named index file in parent + * directory. + */ +struct dt_object *local_index_find_or_create(const struct lu_env *env, + struct local_oid_storage *los, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + /* name is found, get the object */ + dto = ls_locate(env, dt2ls_dev(los->los_dev), &dti->dti_fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + rc = local_object_fid_generate(env, los, &dti->dti_fid); + if (rc < 0) { + dto = ERR_PTR(rc); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = DFT_INDEX; + dti->dti_dof.u.dof_idx.di_feat = ft; + dto = __local_file_create(env, &dti->dti_fid, los, + dt2ls_dev(los->los_dev), + parent, name, &dti->dti_attr, + &dti->dti_dof); + } + } + return dto; + +} +EXPORT_SYMBOL(local_index_find_or_create); + +struct dt_object * +local_index_find_or_create_with_fid(const struct lu_env *env, + struct dt_device *dt, + const struct lu_fid *fid, + struct dt_object *parent, + const char *name, __u32 mode, + const struct dt_index_features *ft) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + int rc; + + LASSERT(parent); + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == 0) { + /* name is found, get the object */ + if (!lu_fid_eq(fid, &dti->dti_fid)) + dto = ERR_PTR(-EINVAL); + else + dto = dt_locate(env, dt, fid); + } else if (rc != -ENOENT) { + dto = ERR_PTR(rc); + } else { + struct ls_device *ls; + + ls = ls_device_get(dt); + if 
(IS_ERR(ls)) { + dto = ERR_PTR(PTR_ERR(ls)); + } else { + /* create the object */ + dti->dti_attr.la_valid = LA_MODE; + dti->dti_attr.la_mode = mode; + dti->dti_dof.dof_type = DFT_INDEX; + dti->dti_dof.u.dof_idx.di_feat = ft; + dto = __local_file_create(env, fid, NULL, ls, parent, + name, &dti->dti_attr, + &dti->dti_dof); + /* ls_device_put() will finalize the ls device, we + * have to open the object in other device stack */ + if (!IS_ERR(dto)) { + dti->dti_fid = dto->do_lu.lo_header->loh_fid; + lu_object_put_nocache(env, &dto->do_lu); + dto = dt_locate(env, dt, &dti->dti_fid); + } + ls_device_put(env, ls); + } + } + return dto; +} +EXPORT_SYMBOL(local_index_find_or_create_with_fid); + +static int local_object_declare_unlink(const struct lu_env *env, + struct dt_device *dt, + struct dt_object *p, + struct dt_object *c, const char *name, + struct thandle *th) +{ + int rc; + + rc = dt_declare_delete(env, p, (const struct dt_key *)name, th); + if (rc < 0) + return rc; + + rc = dt_declare_ref_del(env, c, th); + if (rc < 0) + return rc; + + return dt_declare_destroy(env, c, th); +} + +int local_object_unlink(const struct lu_env *env, struct dt_device *dt, + struct dt_object *parent, const char *name) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *dto; + struct thandle *th; + int rc; + + ENTRY; + + rc = dt_lookup_dir(env, parent, name, &dti->dti_fid); + if (rc == -ENOENT) + RETURN(0); + else if (rc < 0) + RETURN(rc); + + dto = dt_locate(env, dt, &dti->dti_fid); + if (unlikely(IS_ERR(dto))) + RETURN(PTR_ERR(dto)); + + th = dt_trans_create(env, dt); + if (IS_ERR(th)) + GOTO(out, rc = PTR_ERR(th)); + + rc = local_object_declare_unlink(env, dt, parent, dto, name, th); + if (rc < 0) + GOTO(stop, rc); + + rc = dt_trans_start_local(env, dt, th); + if (rc < 0) + GOTO(stop, rc); + + dt_write_lock(env, dto, 0); + rc = dt_delete(env, parent, (struct dt_key *)name, th, BYPASS_CAPA); + if (rc < 0) + GOTO(unlock, rc); + + rc = dt_ref_del(env, dto, th); + if (rc < 
0) { + rc = dt_insert(env, parent, + (const struct dt_rec *)&dti->dti_fid, + (const struct dt_key *)name, th, BYPASS_CAPA, 1); + GOTO(unlock, rc); + } + + rc = dt_destroy(env, dto, th); +unlock: + dt_write_unlock(env, dto); +stop: + dt_trans_stop(env, dt, th); +out: + lu_object_put_nocache(env, &dto->do_lu); + return rc; +} +EXPORT_SYMBOL(local_object_unlink); + +struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq) +{ + struct local_oid_storage *los, *ret = NULL; + + list_for_each_entry(los, &ls->ls_los_list, los_list) { + if (los->los_seq == seq) { + atomic_inc(&los->los_refcount); + ret = los; + break; + } + } + return ret; +} + +void dt_los_put(struct local_oid_storage *los) +{ + if (atomic_dec_and_test(&los->los_refcount)) + /* should never happen, only local_oid_storage_fini should + * drop refcount to zero */ + LBUG(); + return; +} + +/* after Lustre 2.3 release there may be old file to store last generated FID + * If such file exists then we have to read its content + */ +int lastid_compat_check(const struct lu_env *env, struct dt_device *dev, + __u64 lastid_seq, __u32 *first_oid, struct ls_device *ls) +{ + struct dt_thread_info *dti = dt_info(env); + struct dt_object *root = NULL; + struct los_ondisk losd; + struct dt_object *o = NULL; + int rc = 0; + + rc = dt_root_get(env, dev, &dti->dti_fid); + if (rc) + return rc; + + root = ls_locate(env, ls, &dti->dti_fid); + if (IS_ERR(root)) + return PTR_ERR(root); + + /* find old last_id file */ + snprintf(dti->dti_buf, sizeof(dti->dti_buf), "seq-"LPX64"-lastid", + lastid_seq); + rc = dt_lookup_dir(env, root, dti->dti_buf, &dti->dti_fid); + lu_object_put_nocache(env, &root->do_lu); + if (rc == -ENOENT) { + /* old llog lastid accessed by FID only */ + if (lastid_seq != FID_SEQ_LLOG) + return 0; + dti->dti_fid.f_seq = FID_SEQ_LLOG; + dti->dti_fid.f_oid = 1; + dti->dti_fid.f_ver = 0; + o = ls_locate(env, ls, &dti->dti_fid); + if (IS_ERR(o)) + return PTR_ERR(o); + + if (!dt_object_exists(o)) { + 
lu_object_put_nocache(env, &o->do_lu); + return 0; + } + CDEBUG(D_INFO, "Found old llog lastid file\n"); + } else if (rc < 0) { + return rc; + } else { + CDEBUG(D_INFO, "Found old lastid file for sequence "LPX64"\n", + lastid_seq); + o = ls_locate(env, ls, &dti->dti_fid); + if (IS_ERR(o)) + return PTR_ERR(o); + } + /* let's read seq-NNNNNN-lastid file value */ + LASSERT(dt_object_exists(o)); + dti->dti_off = 0; + dti->dti_lb.lb_buf = &losd; + dti->dti_lb.lb_len = sizeof(losd); + dt_read_lock(env, o, 0); + rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off); + dt_read_unlock(env, o); + lu_object_put_nocache(env, &o->do_lu); + if (rc == 0 && le32_to_cpu(losd.lso_magic) != LOS_MAGIC) { + CERROR("%s: wrong content of seq-"LPX64"-lastid file, magic %x\n", + o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq, + le32_to_cpu(losd.lso_magic)); + return -EINVAL; + } else if (rc < 0) { + CERROR("%s: failed to read seq-"LPX64"-lastid: rc = %d\n", + o->do_lu.lo_dev->ld_obd->obd_name, lastid_seq, rc); + return rc; + } + *first_oid = le32_to_cpu(losd.lso_next_oid); + return rc; +} + +/** + * Initialize local OID storage for required sequence. + * That may be needed for services that uses local files and requires + * dynamic OID allocation for them. + * + * Per each sequence we have an object with 'first_fid' identificator + * containing the counter for OIDs of locally created files with that + * sequence. + * + * It is used now by llog subsystem and MGS for NID tables + * + * Function gets first_fid to create counter object. 
+ * All dynamic fids will be generated with the same sequence and incremented + * OIDs + * + * Returned local_oid_storage is in-memory representaion of OID storage + */ +int local_oid_storage_init(const struct lu_env *env, struct dt_device *dev, + const struct lu_fid *first_fid, + struct local_oid_storage **los) +{ + struct dt_thread_info *dti = dt_info(env); + struct ls_device *ls; + obd_id lastid; + struct dt_object *o = NULL; + struct thandle *th; + __u32 first_oid = fid_oid(first_fid); + int rc = 0; + + ENTRY; + + ls = ls_device_get(dev); + if (IS_ERR(ls)) + RETURN(PTR_ERR(ls)); + + mutex_lock(&ls->ls_los_mutex); + *los = dt_los_find(ls, fid_seq(first_fid)); + if (*los != NULL) + GOTO(out, rc = 0); + + /* not found, then create */ + OBD_ALLOC_PTR(*los); + if (*los == NULL) + GOTO(out, rc = -ENOMEM); + + atomic_set(&(*los)->los_refcount, 1); + mutex_init(&(*los)->los_id_lock); + (*los)->los_dev = &ls->ls_top_dev; + atomic_inc(&ls->ls_refcount); + list_add(&(*los)->los_list, &ls->ls_los_list); + + /* Use {seq, 0, 0} to create the LAST_ID file for every + * sequence. OIDs start at LUSTRE_FID_INIT_OID. 
+ */ + dti->dti_fid.f_seq = fid_seq(first_fid); + dti->dti_fid.f_oid = LUSTRE_FID_LASTID_OID; + dti->dti_fid.f_ver = 0; + o = ls_locate(env, ls, &dti->dti_fid); + if (IS_ERR(o)) + GOTO(out_los, rc = PTR_ERR(o)); + + if (!dt_object_exists(o)) { + rc = lastid_compat_check(env, dev, fid_seq(first_fid), + &first_oid, ls); + if (rc < 0) + GOTO(out_los, rc); + + th = dt_trans_create(env, dev); + if (IS_ERR(th)) + GOTO(out_los, rc = PTR_ERR(th)); + + dti->dti_attr.la_valid = LA_MODE | LA_TYPE; + dti->dti_attr.la_mode = S_IFREG | S_IRUGO | S_IWUSR; + dti->dti_dof.dof_type = dt_mode_to_dft(S_IFREG); + + rc = dt_declare_create(env, o, &dti->dti_attr, NULL, + &dti->dti_dof, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_declare_record_write(env, o, sizeof(lastid), 0, th); + if (rc) + GOTO(out_trans, rc); + + rc = dt_trans_start_local(env, dev, th); + if (rc) + GOTO(out_trans, rc); + + dt_write_lock(env, o, 0); + if (dt_object_exists(o)) + GOTO(out_lock, rc = 0); + + rc = dt_create(env, o, &dti->dti_attr, NULL, &dti->dti_dof, + th); + if (rc) + GOTO(out_lock, rc); + + lastid = cpu_to_le64(first_oid); + + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + rc = dt_record_write(env, o, &dti->dti_lb, &dti->dti_off, th); + if (rc) + GOTO(out_lock, rc); +out_lock: + dt_write_unlock(env, o); +out_trans: + dt_trans_stop(env, dev, th); + } else { + dti->dti_off = 0; + dti->dti_lb.lb_buf = &lastid; + dti->dti_lb.lb_len = sizeof(lastid); + dt_read_lock(env, o, 0); + rc = dt_record_read(env, o, &dti->dti_lb, &dti->dti_off); + dt_read_unlock(env, o); + if (rc == 0 && le64_to_cpu(lastid) > OBIF_MAX_OID) { + CERROR("%s: bad oid "LPU64" is read from LAST_ID\n", + o->do_lu.lo_dev->ld_obd->obd_name, + le64_to_cpu(lastid)); + rc = -EINVAL; + } + } +out_los: + if (rc != 0) { + list_del(&(*los)->los_list); + atomic_dec(&ls->ls_refcount); + OBD_FREE_PTR(*los); + *los = NULL; + if (o != NULL && !IS_ERR(o)) + lu_object_put_nocache(env, &o->do_lu); + } else { 
+ (*los)->los_seq = fid_seq(first_fid); + (*los)->los_last_oid = le64_to_cpu(lastid); + (*los)->los_obj = o; + /* read value should not be less than initial one */ + LASSERTF((*los)->los_last_oid >= first_oid, "%u < %u\n", + (*los)->los_last_oid, first_oid); + } +out: + mutex_unlock(&ls->ls_los_mutex); + ls_device_put(env, ls); + return rc; +} +EXPORT_SYMBOL(local_oid_storage_init); + +void local_oid_storage_fini(const struct lu_env *env, + struct local_oid_storage *los) +{ + struct ls_device *ls; + + if (!atomic_dec_and_test(&los->los_refcount)) + return; + + LASSERT(env); + LASSERT(los->los_dev); + ls = dt2ls_dev(los->los_dev); + + mutex_lock(&ls->ls_los_mutex); + if (atomic_read(&los->los_refcount) == 0) { + if (los->los_obj) + lu_object_put_nocache(env, &los->los_obj->do_lu); + list_del(&los->los_list); + OBD_FREE_PTR(los); + } + mutex_unlock(&ls->ls_los_mutex); + ls_device_put(env, ls); +} +EXPORT_SYMBOL(local_oid_storage_fini); diff --git a/drivers/staging/lustre/lustre/obdclass/local_storage.h b/drivers/staging/lustre/lustre/obdclass/local_storage.h new file mode 100644 index 000000000000..d553c3752703 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/local_storage.h @@ -0,0 +1,88 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License version 2 for more details. A copy is + * included in the COPYING file that accompanied this code. 
+
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Intel Corporation.
+ */
+/*
+ * lustre/obdclass/local_storage.h
+ *
+ * Local storage for file/objects with fid generation. Works on top of OSD.
+ *
+ * Author: Mikhail Pershin <mike.pershin@intel.com>
+ */
+
+/* NOTE(review): this header has no include guard; it relies on being
+ * included exactly once per translation unit -- confirm before reuse. */
+#include <dt_object.h>
+#include <obd.h>
+#include <lustre_fid.h>
+#include <lustre_disk.h>
+
+/* Per-OSD local-storage device: a thin dt_device stacked on top of an
+ * OSD so local objects can be located through ls_locate(). */
+struct ls_device {
+	struct dt_device	 ls_top_dev;
+	/* all initialized ls_devices on this node linked by this */
+	struct list_head	 ls_linkage;
+	/* how many handles reference this local storage */
+	atomic_t		 ls_refcount;
+	/* underlying OSD device */
+	struct dt_device	*ls_osd;
+	/* list of all local OID storages */
+	struct list_head	 ls_los_list;
+	struct mutex		 ls_los_mutex;
+};
+
+/* Convert a dt_device embedded as ls_top_dev back to its ls_device. */
+static inline struct ls_device *dt2ls_dev(struct dt_device *d)
+{
+	return container_of0(d, struct ls_device, ls_top_dev);
+}
+
+/* A local-storage object: lu_object_header plus the dt_object itself,
+ * allocated as a single chunk by ls_object_alloc(). */
+struct ls_object {
+	struct lu_object_header	 ls_header;
+	struct dt_object	 ls_obj;
+};
+
+/* Convert a lu_object embedded in ls_obj back to its ls_object. */
+static inline struct ls_object *lu2ls_obj(struct lu_object *o)
+{
+	return container_of0(o, struct ls_object, ls_obj.do_lu);
+}
+
+/* Locate/instantiate the object with @fid on the underlying OSD via the
+ * local-storage top device of @ls. */
+static inline struct dt_object *ls_locate(const struct lu_env *env,
+					  struct ls_device *ls,
+					  const struct lu_fid *fid)
+{
+	return dt_locate_at(env, ls->ls_osd, fid, &ls->ls_top_dev.dd_lu_dev);
+}
+
+struct ls_device *ls_device_get(struct dt_device *dev);
+void ls_device_put(const struct lu_env *env, struct ls_device *ls);
+struct local_oid_storage *dt_los_find(struct ls_device *ls, __u64 seq);
+void dt_los_put(struct local_oid_storage *los);
+
+/* Lustre 2.3 on-disk structure describing local object OIDs storage
+ * the structure to be used with any sequence managed by
+ * local object library.
+ * Obsoleted since 2.4 but is kept for compatibility reasons, + * see lastid_compat_check() in obdclass/local_storage.c */ +struct los_ondisk { + __u32 lso_magic; + __u32 lso_next_oid; +}; + +#define LOS_MAGIC 0xdecafbee diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c new file mode 100644 index 000000000000..e2d57fef0da3 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lprocfs_jobstats.c @@ -0,0 +1,562 @@ +/* GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * GPL HEADER END + */ +/* + * Copyright (c) 2011, 2012, Intel Corporation. + * Use is subject to license terms. + * + * Author: Niu Yawei <niu@whamcloud.com> + */ +/* + * lustre/obdclass/lprocfs_jobstats.c + */ + +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_CLASS + + +#include <obd_class.h> +#include <lprocfs_status.h> +#include <lustre/lustre_idl.h> + +#if defined(LPROCFS) + +/* + * JobID formats & JobID environment variable names for supported + * job schedulers: + * + * SLURM: + * JobID format: 32 bit integer. + * JobID env var: SLURM_JOB_ID. + * SGE: + * JobID format: Decimal integer range to 99999. + * JobID env var: JOB_ID. 
+ * LSF: + * JobID format: 6 digit integer by default (up to 999999), can be + * increased to 10 digit (up to 2147483646). + * JobID env var: LSB_JOBID. + * Loadleveler: + * JobID format: String of machine_name.cluster_id.process_id, for + * example: fr2n02.32.0 + * JobID env var: LOADL_STEP_ID. + * PBS: + * JobID format: String of sequence_number[.server_name][@server]. + * JobID env var: PBS_JOBID. + * Maui/MOAB: + * JobID format: Same as PBS. + * JobID env var: Same as PBS. + */ + +struct job_stat { + struct hlist_node js_hash; + struct list_head js_list; + atomic_t js_refcount; + char js_jobid[JOBSTATS_JOBID_SIZE]; + time_t js_timestamp; /* seconds */ + struct lprocfs_stats *js_stats; + struct obd_job_stats *js_jobstats; +}; + +static unsigned job_stat_hash(cfs_hash_t *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, strlen(key), mask); +} + +static void *job_stat_key(struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + return job->js_jobid; +} + +static int job_stat_keycmp(const void *key, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + return (strlen(job->js_jobid) == strlen(key)) && + !strncmp(job->js_jobid, key, strlen(key)); +} + +static void *job_stat_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct job_stat, js_hash); +} + +static void job_stat_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + atomic_inc(&job->js_refcount); +} + +static void job_free(struct job_stat *job) +{ + LASSERT(atomic_read(&job->js_refcount) == 0); + LASSERT(job->js_jobstats); + + write_lock(&job->js_jobstats->ojs_lock); + list_del_init(&job->js_list); + write_unlock(&job->js_jobstats->ojs_lock); + + lprocfs_free_stats(&job->js_stats); + OBD_FREE_PTR(job); +} + +static void job_putref(struct job_stat *job) +{ + 
LASSERT(atomic_read(&job->js_refcount) > 0); + if (atomic_dec_and_test(&job->js_refcount)) + job_free(job); +} + +static void job_stat_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct job_stat *job; + job = hlist_entry(hnode, struct job_stat, js_hash); + job_putref(job); +} + +static void job_stat_exit(cfs_hash_t *hs, struct hlist_node *hnode) +{ + CERROR("Should not have any items!"); +} + +static cfs_hash_ops_t job_stats_hash_ops = { + .hs_hash = job_stat_hash, + .hs_key = job_stat_key, + .hs_keycmp = job_stat_keycmp, + .hs_object = job_stat_object, + .hs_get = job_stat_get, + .hs_put_locked = job_stat_put_locked, + .hs_exit = job_stat_exit, +}; + +static int job_iter_callback(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *data) +{ + time_t oldest = *((time_t *)data); + struct job_stat *job; + + job = hlist_entry(hnode, struct job_stat, js_hash); + if (!oldest || job->js_timestamp < oldest) + cfs_hash_bd_del_locked(hs, bd, hnode); + + return 0; +} + +static void lprocfs_job_cleanup(struct obd_job_stats *stats, bool force) +{ + time_t oldest, now; + + if (stats->ojs_cleanup_interval == 0) + return; + + now = cfs_time_current_sec(); + if (!force && now < stats->ojs_last_cleanup + + stats->ojs_cleanup_interval) + return; + + oldest = now - stats->ojs_cleanup_interval; + cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback, + &oldest); + stats->ojs_last_cleanup = cfs_time_current_sec(); +} + +static struct job_stat *job_alloc(char *jobid, struct obd_job_stats *jobs) +{ + struct job_stat *job; + + LASSERT(jobs->ojs_cntr_num && jobs->ojs_cntr_init_fn); + + OBD_ALLOC_PTR(job); + if (job == NULL) + return NULL; + + job->js_stats = lprocfs_alloc_stats(jobs->ojs_cntr_num, 0); + if (job->js_stats == NULL) { + OBD_FREE_PTR(job); + return NULL; + } + + jobs->ojs_cntr_init_fn(job->js_stats); + + memcpy(job->js_jobid, jobid, JOBSTATS_JOBID_SIZE); + job->js_timestamp = cfs_time_current_sec(); + job->js_jobstats = jobs; + 
INIT_HLIST_NODE(&job->js_hash); + INIT_LIST_HEAD(&job->js_list); + atomic_set(&job->js_refcount, 1); + + return job; +} + +int lprocfs_job_stats_log(struct obd_device *obd, char *jobid, + int event, long amount) +{ + struct obd_job_stats *stats = &obd->u.obt.obt_jobstats; + struct job_stat *job, *job2; + ENTRY; + + LASSERT(stats && stats->ojs_hash); + + lprocfs_job_cleanup(stats, false); + + if (!jobid || !strlen(jobid)) + RETURN(-EINVAL); + + if (strlen(jobid) >= JOBSTATS_JOBID_SIZE) { + CERROR("Invalid jobid size (%lu), expect(%d)\n", + (unsigned long)strlen(jobid) + 1, JOBSTATS_JOBID_SIZE); + RETURN(-EINVAL); + } + + job = cfs_hash_lookup(stats->ojs_hash, jobid); + if (job) + goto found; + + job = job_alloc(jobid, stats); + if (job == NULL) + RETURN(-ENOMEM); + + job2 = cfs_hash_findadd_unique(stats->ojs_hash, job->js_jobid, + &job->js_hash); + if (job2 != job) { + job_putref(job); + job = job2; + /* We cannot LASSERT(!list_empty(&job->js_list)) here, + * since we just lost the race for inserting "job" into the + * ojs_list, and some other thread is doing it _right_now_. + * Instead, be content the other thread is doing this, since + * "job2" was initialized in job_alloc() already. 
LU-2163 */ + } else { + LASSERT(list_empty(&job->js_list)); + write_lock(&stats->ojs_lock); + list_add_tail(&job->js_list, &stats->ojs_list); + write_unlock(&stats->ojs_lock); + } + +found: + LASSERT(stats == job->js_jobstats); + LASSERT(stats->ojs_cntr_num > event); + job->js_timestamp = cfs_time_current_sec(); + lprocfs_counter_add(job->js_stats, event, amount); + + job_putref(job); + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_job_stats_log); + +void lprocfs_job_stats_fini(struct obd_device *obd) +{ + struct obd_job_stats *stats = &obd->u.obt.obt_jobstats; + time_t oldest = 0; + + if (stats->ojs_hash == NULL) + return; + cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback, &oldest); + cfs_hash_putref(stats->ojs_hash); + stats->ojs_hash = NULL; + LASSERT(list_empty(&stats->ojs_list)); +} +EXPORT_SYMBOL(lprocfs_job_stats_fini); + +static void *lprocfs_jobstats_seq_start(struct seq_file *p, loff_t *pos) +{ + struct obd_job_stats *stats = p->private; + loff_t off = *pos; + struct job_stat *job; + + read_lock(&stats->ojs_lock); + if (off == 0) + return SEQ_START_TOKEN; + off--; + list_for_each_entry(job, &stats->ojs_list, js_list) { + if (!off--) + return job; + } + return NULL; +} + +static void lprocfs_jobstats_seq_stop(struct seq_file *p, void *v) +{ + struct obd_job_stats *stats = p->private; + + read_unlock(&stats->ojs_lock); +} + +static void *lprocfs_jobstats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct obd_job_stats *stats = p->private; + struct job_stat *job; + struct list_head *next; + + ++*pos; + if (v == SEQ_START_TOKEN) { + next = stats->ojs_list.next; + } else { + job = (struct job_stat *)v; + next = job->js_list.next; + } + + return next == &stats->ojs_list ? 
NULL : + list_entry(next, struct job_stat, js_list); +} + +/* + * Example of output on MDT: + * + * job_stats: + * - job_id: test_id.222.25844 + * snapshot_time: 1322494486 + * open: { samples: 3, unit: reqs } + * close: { samples: 3, unit: reqs } + * mknod: { samples: 0, unit: reqs } + * link: { samples: 0, unit: reqs } + * unlink: { samples: 0, unit: reqs } + * mkdir: { samples: 0, unit: reqs } + * rmdir: { samples: 0, unit: reqs } + * rename: { samples: 1, unit: reqs } + * getattr: { samples: 7, unit: reqs } + * setattr: { samples: 0, unit: reqs } + * getxattr: { samples: 0, unit: reqs } + * setxattr: { samples: 0, unit: reqs } + * statfs: { samples: 0, unit: reqs } + * sync: { samples: 0, unit: reqs } + * + * Example of output on OST: + * + * job_stats: + * - job_id 4854 + * snapshot_time: 1322494602 + * read: { samples: 0, unit: bytes, min: 0, max: 0, sum: 0 } + * write: { samples: 1, unit: bytes, min: 10, max: 10, sum: 10 } + * setattr: { samples: 0, unit: reqs } + * punch: { samples: 0, unit: reqs } + * sync: { samples: 0, unit: reqs } + */ + +static const char spaces[] = " "; + +static int inline width(const char *str, int len) +{ + return len - min((int)strlen(str), 15); +} + +static int lprocfs_jobstats_seq_show(struct seq_file *p, void *v) +{ + struct job_stat *job = v; + struct lprocfs_stats *s; + struct lprocfs_counter ret; + struct lprocfs_counter *cntr; + struct lprocfs_counter_header *cntr_header; + int i; + + if (v == SEQ_START_TOKEN) { + seq_printf(p, "job_stats:\n"); + return 0; + } + + seq_printf(p, "- %-16s %s\n", "job_id:", job->js_jobid); + seq_printf(p, " %-16s %ld\n", "snapshot_time:", job->js_timestamp); + + s = job->js_stats; + for (i = 0; i < s->ls_num; i++) { + cntr = lprocfs_stats_counter_get(s, 0, i); + cntr_header = &s->ls_cnt_header[i]; + lprocfs_stats_collect(s, i, &ret); + + seq_printf(p, " %s:%.*s { samples: %11"LPF64"u", + cntr_header->lc_name, + width(cntr_header->lc_name, 15), spaces, + ret.lc_count); + if 
(cntr_header->lc_units[0] != '\0') + seq_printf(p, ", unit: %5s", cntr_header->lc_units); + + if (cntr_header->lc_config & LPROCFS_CNTR_AVGMINMAX) { + seq_printf(p, ", min:%8"LPF64"u, max:%8"LPF64"u," + " sum:%16"LPF64"u", + ret.lc_count ? ret.lc_min : 0, + ret.lc_count ? ret.lc_max : 0, + ret.lc_count ? ret.lc_sum : 0); + } + if (cntr_header->lc_config & LPROCFS_CNTR_STDDEV) { + seq_printf(p, ", sumsq: %18"LPF64"u", + ret.lc_count ? ret.lc_sumsquare : 0); + } + + seq_printf(p, " }\n"); + + } + return 0; +} + +struct seq_operations lprocfs_jobstats_seq_sops = { + start: lprocfs_jobstats_seq_start, + stop: lprocfs_jobstats_seq_stop, + next: lprocfs_jobstats_seq_next, + show: lprocfs_jobstats_seq_show, +}; + +static int lprocfs_jobstats_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lprocfs_jobstats_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = PDE_DATA(inode); + return 0; +} + +static ssize_t lprocfs_jobstats_seq_write(struct file *file, const char *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_job_stats *stats = seq->private; + char jobid[JOBSTATS_JOBID_SIZE]; + int all = 0; + struct job_stat *job; + + if (!memcmp(buf, "clear", strlen("clear"))) { + all = 1; + } else if (len < JOBSTATS_JOBID_SIZE) { + memset(jobid, 0, JOBSTATS_JOBID_SIZE); + /* Trim '\n' if any */ + if (buf[len - 1] == '\n') + memcpy(jobid, buf, len - 1); + else + memcpy(jobid, buf, len); + } else { + return -EINVAL; + } + + LASSERT(stats->ojs_hash); + if (all) { + time_t oldest = 0; + cfs_hash_for_each_safe(stats->ojs_hash, job_iter_callback, + &oldest); + return len; + } + + if (!strlen(jobid)) + return -EINVAL; + + job = cfs_hash_lookup(stats->ojs_hash, jobid); + if (!job) + return -EINVAL; + + cfs_hash_del_key(stats->ojs_hash, jobid); + + job_putref(job); + return len; +} + +struct file_operations lprocfs_jobstats_seq_fops = { + .owner = THIS_MODULE, + 
.open = lprocfs_jobstats_seq_open, + .read = seq_read, + .write = lprocfs_jobstats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; + +int lprocfs_job_stats_init(struct obd_device *obd, int cntr_num, + cntr_init_callback init_fn) +{ + struct proc_dir_entry *entry; + struct obd_job_stats *stats; + ENTRY; + + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_type->typ_name); + + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME)) { + CERROR("Invalid obd device type.\n"); + RETURN(-EINVAL); + } + stats = &obd->u.obt.obt_jobstats; + + LASSERT(stats->ojs_hash == NULL); + stats->ojs_hash = cfs_hash_create("JOB_STATS", + HASH_JOB_STATS_CUR_BITS, + HASH_JOB_STATS_MAX_BITS, + HASH_JOB_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &job_stats_hash_ops, + CFS_HASH_DEFAULT); + if (stats->ojs_hash == NULL) + RETURN(-ENOMEM); + + INIT_LIST_HEAD(&stats->ojs_list); + rwlock_init(&stats->ojs_lock); + stats->ojs_cntr_num = cntr_num; + stats->ojs_cntr_init_fn = init_fn; + stats->ojs_cleanup_interval = 600; /* 10 mins by default */ + stats->ojs_last_cleanup = cfs_time_current_sec(); + + entry = proc_create_data("job_stats", 0644, obd->obd_proc_entry, + &lprocfs_jobstats_seq_fops, stats); + if (entry) + RETURN(0); + else + RETURN(-ENOMEM); +} +EXPORT_SYMBOL(lprocfs_job_stats_init); + +int lprocfs_rd_job_interval(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_job_stats *stats; + + LASSERT(obd != NULL); + stats = &obd->u.obt.obt_jobstats; + return seq_printf(m, "%d\n", stats->ojs_cleanup_interval); +} +EXPORT_SYMBOL(lprocfs_rd_job_interval); + +int lprocfs_wr_job_interval(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_job_stats *stats; + int val, rc; + + LASSERT(obd != NULL); + stats = &obd->u.obt.obt_jobstats; + + rc = 
lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + stats->ojs_cleanup_interval = val; + lprocfs_job_cleanup(stats, true); + + return count; + +} +EXPORT_SYMBOL(lprocfs_wr_job_interval); + +#endif /* LPROCFS*/ diff --git a/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c new file mode 100644 index 000000000000..f7af3d6a4efc --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lprocfs_status.c @@ -0,0 +1,1985 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/lprocfs_status.c + * + * Author: Hariharan Thantry <thantry@users.sourceforge.net> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + + +#include <obd_class.h> +#include <lprocfs_status.h> +#include <lustre/lustre_idl.h> +#include <linux/seq_file.h> + +#if defined(LPROCFS) + +static int lprocfs_no_percpu_stats = 0; +CFS_MODULE_PARM(lprocfs_no_percpu_stats, "i", int, 0644, + "Do not alloc percpu data for lprocfs stats"); + +#define MAX_STRING_SIZE 128 + +int lprocfs_single_release(struct inode *inode, struct file *file) +{ + return single_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_single_release); + +int lprocfs_seq_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} +EXPORT_SYMBOL(lprocfs_seq_release); + +/* lprocfs API calls */ + +proc_dir_entry_t *lprocfs_add_simple(struct proc_dir_entry *root, + char *name, void *data, + struct file_operations *fops) +{ + proc_dir_entry_t *proc; + umode_t mode = 0; + + if (root == NULL || name == NULL || fops == NULL) + return ERR_PTR(-EINVAL); + + if (fops->read) + mode = 0444; + if (fops->write) + mode |= 0200; + proc = proc_create_data(name, mode, root, fops, data); + if (!proc) { + CERROR("LprocFS: No memory to create /proc entry %s", name); + return ERR_PTR(-ENOMEM); + } + return proc; +} +EXPORT_SYMBOL(lprocfs_add_simple); + +struct proc_dir_entry *lprocfs_add_symlink(const char *name, + struct proc_dir_entry *parent, const char *format, ...) 
+{ + struct proc_dir_entry *entry; + char *dest; + va_list ap; + + if (parent == NULL || format == NULL) + return NULL; + + OBD_ALLOC_WAIT(dest, MAX_STRING_SIZE + 1); + if (dest == NULL) + return NULL; + + va_start(ap, format); + vsnprintf(dest, MAX_STRING_SIZE, format, ap); + va_end(ap); + + entry = proc_symlink(name, parent, dest); + if (entry == NULL) + CERROR("LprocFS: Could not create symbolic link from %s to %s", + name, dest); + + OBD_FREE(dest, MAX_STRING_SIZE + 1); + return entry; +} +EXPORT_SYMBOL(lprocfs_add_symlink); + +static struct file_operations lprocfs_generic_fops = { }; + +/** + * Add /proc entries. + * + * \param root [in] The parent proc entry on which new entry will be added. + * \param list [in] Array of proc entries to be added. + * \param data [in] The argument to be passed when entries read/write routines + * are called through /proc file. + * + * \retval 0 on success + * < 0 on error + */ +int lprocfs_add_vars(struct proc_dir_entry *root, struct lprocfs_vars *list, + void *data) +{ + if (root == NULL || list == NULL) + return -EINVAL; + + while (list->name != NULL) { + struct proc_dir_entry *proc; + umode_t mode = 0; + + if (list->proc_mode != 0000) { + mode = list->proc_mode; + } else if (list->fops) { + if (list->fops->read) + mode = 0444; + if (list->fops->write) + mode |= 0200; + } + proc = proc_create_data(list->name, mode, root, + list->fops ?: &lprocfs_generic_fops, + list->data ?: data); + if (proc == NULL) + return -ENOMEM; + list++; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_add_vars); + +void lprocfs_remove(struct proc_dir_entry **rooth) +{ + proc_remove(*rooth); + *rooth = NULL; +} +EXPORT_SYMBOL(lprocfs_remove); + +void lprocfs_remove_proc_entry(const char *name, struct proc_dir_entry *parent) +{ + LASSERT(parent != NULL); + remove_proc_entry(name, parent); +} +EXPORT_SYMBOL(lprocfs_remove_proc_entry); + +struct proc_dir_entry *lprocfs_register(const char *name, + struct proc_dir_entry *parent, + struct lprocfs_vars *list, 
void *data) +{ + struct proc_dir_entry *newchild; + + newchild = proc_mkdir(name, parent); + if (newchild != NULL && list != NULL) { + int rc = lprocfs_add_vars(newchild, list, data); + if (rc) { + lprocfs_remove(&newchild); + return ERR_PTR(rc); + } + } + return newchild; +} +EXPORT_SYMBOL(lprocfs_register); + +/* Generic callbacks */ +int lprocfs_rd_uint(struct seq_file *m, void *data) +{ + return seq_printf(m, "%u\n", *(unsigned int *)data); +} +EXPORT_SYMBOL(lprocfs_rd_uint); + +int lprocfs_wr_uint(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + unsigned *p = data; + char dummy[MAX_STRING_SIZE + 1], *end; + unsigned long tmp; + + dummy[MAX_STRING_SIZE] = '\0'; + if (copy_from_user(dummy, buffer, MAX_STRING_SIZE)) + return -EFAULT; + + tmp = simple_strtoul(dummy, &end, 0); + if (dummy == end) + return -EINVAL; + + *p = (unsigned int)tmp; + return count; +} +EXPORT_SYMBOL(lprocfs_wr_uint); + +int lprocfs_rd_u64(struct seq_file *m, void *data) +{ + return seq_printf(m, LPU64"\n", *(__u64 *)data); +} +EXPORT_SYMBOL(lprocfs_rd_u64); + +int lprocfs_rd_atomic(struct seq_file *m, void *data) +{ + atomic_t *atom = data; + LASSERT(atom != NULL); + return seq_printf(m, "%d\n", atomic_read(atom)); +} +EXPORT_SYMBOL(lprocfs_rd_atomic); + +int lprocfs_wr_atomic(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + atomic_t *atm = data; + int val = 0; + int rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + + if (val <= 0) + return -ERANGE; + + atomic_set(atm, val); + return count; +} +EXPORT_SYMBOL(lprocfs_wr_atomic); + +int lprocfs_rd_uuid(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + + LASSERT(obd != NULL); + return seq_printf(m, "%s\n", obd->obd_uuid.uuid); +} +EXPORT_SYMBOL(lprocfs_rd_uuid); + +int lprocfs_rd_name(struct seq_file *m, void *data) +{ + struct obd_device *dev = data; + + LASSERT(dev != NULL); + return seq_printf(m, "%s\n", 
dev->obd_name); +} +EXPORT_SYMBOL(lprocfs_rd_name); + +int lprocfs_rd_blksize(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + rc = seq_printf(m, "%u\n", osfs.os_bsize); + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_blksize); + +int lprocfs_rd_kbytestotal(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_blocks; + + while (blk_size >>= 1) + result <<= 1; + + rc = seq_printf(m, LPU64"\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_kbytestotal); + +int lprocfs_rd_kbytesfree(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bfree; + + while (blk_size >>= 1) + result <<= 1; + + rc = seq_printf(m, LPU64"\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_kbytesfree); + +int lprocfs_rd_kbytesavail(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) { + __u32 blk_size = osfs.os_bsize >> 10; + __u64 result = osfs.os_bavail; + + while (blk_size >>= 1) + result <<= 1; + + rc = seq_printf(m, LPU64"\n", result); + } + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_kbytesavail); + +int lprocfs_rd_filestotal(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = 
obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + rc = seq_printf(m, LPU64"\n", osfs.os_files); + + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_filestotal); + +int lprocfs_rd_filesfree(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_statfs osfs; + int rc = obd_statfs(NULL, obd->obd_self_export, &osfs, + cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS), + OBD_STATFS_NODELAY); + if (!rc) + rc = seq_printf(m, LPU64"\n", osfs.os_ffree); + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_filesfree); + +int lprocfs_rd_server_uuid(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct obd_import *imp; + char *imp_state_name = NULL; + int rc = 0; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + imp_state_name = ptlrpc_import_state_name(imp->imp_state); + rc = seq_printf(m, "%s\t%s%s\n", obd2cli_tgt(obd), imp_state_name, + imp->imp_deactive ? "\tDEACTIVATED" : ""); + + LPROCFS_CLIMP_EXIT(obd); + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_server_uuid); + +int lprocfs_rd_conn_uuid(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + struct ptlrpc_connection *conn; + int rc = 0; + + LASSERT(obd != NULL); + + LPROCFS_CLIMP_CHECK(obd); + conn = obd->u.cli.cl_import->imp_connection; + if (conn && obd->u.cli.cl_import) + rc = seq_printf(m, "%s\n", conn->c_remote_uuid.uuid); + else + rc = seq_printf(m, "%s\n", "<none>"); + + LPROCFS_CLIMP_EXIT(obd); + return rc; +} +EXPORT_SYMBOL(lprocfs_rd_conn_uuid); + +/** add up per-cpu counters */ +void lprocfs_stats_collect(struct lprocfs_stats *stats, int idx, + struct lprocfs_counter *cnt) +{ + unsigned int num_entry; + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *cntr_header; + int i; + unsigned long flags = 0; + + memset(cnt, 0, sizeof(*cnt)); + + if (stats == NULL) { + /* set count to 1 to avoid divide-by-zero errs in callers */ + cnt->lc_count = 
1; + return; + } + + cnt->lc_min = LC_MIN_INIT; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + cntr_header = &stats->ls_cnt_header[idx]; + percpu_cntr = lprocfs_stats_counter_get(stats, i, idx); + + cnt->lc_count += percpu_cntr->lc_count; + cnt->lc_sum += percpu_cntr->lc_sum; + if (percpu_cntr->lc_min < cnt->lc_min) + cnt->lc_min = percpu_cntr->lc_min; + if (percpu_cntr->lc_max > cnt->lc_max) + cnt->lc_max = percpu_cntr->lc_max; + cnt->lc_sumsquare += percpu_cntr->lc_sumsquare; + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_stats_collect); + +/** + * Append a space separated list of current set flags to str. + */ +#define flag2str(flag, first) \ + do { \ + if (imp->imp_##flag) \ + seq_printf(m, "%s" #flag, first ? "" : ", "); \ + } while (0) +static int obd_import_flags2str(struct obd_import *imp, struct seq_file *m) +{ + bool first = true; + + if (imp->imp_obd->obd_no_recov) { + seq_printf(m, "no_recov"); + first = false; + } + + flag2str(invalid, first); + first = false; + flag2str(deactive, first); + flag2str(replayable, first); + flag2str(pingable, first); + return 0; +} +#undef flags2str + +static const char *obd_connect_names[] = { + "read_only", + "lov_index", + "unused", + "write_grant", + "server_lock", + "version", + "request_portal", + "acl", + "xattr", + "create_on_write", + "truncate_lock", + "initial_transno", + "inode_bit_locks", + "join_file(obsolete)", + "getattr_by_fid", + "no_oh_for_devices", + "remote_client", + "remote_client_by_force", + "max_byte_per_rpc", + "64bit_qdata", + "mds_capability", + "oss_capability", + "early_lock_cancel", + "som", + "adaptive_timeouts", + "lru_resize", + "mds_mds_connection", + "real_conn", + "change_qunit_size", + "alt_checksum_algorithm", + "fid_is_enabled", + "version_recovery", + "pools", + "grant_shrink", + "skip_orphan", + "large_ea", + "full20", + 
"layout_lock", + "64bithash", + "object_max_bytes", + "imp_recov", + "jobstats", + "umask", + "einprogress", + "grant_param", + "flock_owner", + "lvb_type", + "nanoseconds_times", + "lightweight_conn", + "short_io", + "pingless", + "unknown", + NULL +}; + +static void obd_connect_seq_flags2str(struct seq_file *m, __u64 flags, char *sep) +{ + __u64 mask = 1; + int i; + bool first = true; + + for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags & mask) { + seq_printf(m, "%s%s", + first ? sep : "", obd_connect_names[i]); + first = false; + } + } + if (flags & ~(mask - 1)) + seq_printf(m, "%sunknown flags "LPX64, + first ? sep : "", flags & ~(mask - 1)); +} + +int obd_connect_flags2str(char *page, int count, __u64 flags, char *sep) +{ + __u64 mask = 1; + int i, ret = 0; + + for (i = 0; obd_connect_names[i] != NULL; i++, mask <<= 1) { + if (flags & mask) + ret += snprintf(page + ret, count - ret, "%s%s", + ret ? sep : "", obd_connect_names[i]); + } + if (flags & ~(mask - 1)) + ret += snprintf(page + ret, count - ret, + "%sunknown flags "LPX64, + ret ? 
sep : "", flags & ~(mask - 1)); + return ret; +} +EXPORT_SYMBOL(obd_connect_flags2str); + +int lprocfs_rd_import(struct seq_file *m, void *data) +{ + struct lprocfs_counter ret; + struct lprocfs_counter_header *header; + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + struct obd_import_conn *conn; + int j; + int k; + int rw = 0; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + + seq_printf(m, + "import:\n" + " name: %s\n" + " target: %s\n" + " state: %s\n" + " instance: %u\n" + " connect_flags: [", + obd->obd_name, + obd2cli_tgt(obd), + ptlrpc_import_state_name(imp->imp_state), + imp->imp_connect_data.ocd_instance); + obd_connect_seq_flags2str(m, imp->imp_connect_data.ocd_connect_flags, ", "); + seq_printf(m, + "]\n" + " import_flags: ["); + obd_import_flags2str(imp, m); + + seq_printf(m, + "]\n" + " connection:\n" + " failover_nids: ["); + spin_lock(&imp->imp_lock); + j = 0; + list_for_each_entry(conn, &imp->imp_conn_list, oic_item) { + seq_printf(m, "%s%s", j ? ", " : "", + libcfs_nid2str(conn->oic_conn->c_peer.nid)); + j++; + } + seq_printf(m, + "]\n" + " current_connection: %s\n" + " connection_attempts: %u\n" + " generation: %u\n" + " in-progress_invalidations: %u\n", + imp->imp_connection == NULL ? 
"<none>" : + libcfs_nid2str(imp->imp_connection->c_peer.nid), + imp->imp_conn_cnt, + imp->imp_generation, + atomic_read(&imp->imp_inval_count)); + spin_unlock(&imp->imp_lock); + + if (obd->obd_svc_stats == NULL) + goto out_climp; + + header = &obd->obd_svc_stats->ls_cnt_header[PTLRPC_REQWAIT_CNTR]; + lprocfs_stats_collect(obd->obd_svc_stats, PTLRPC_REQWAIT_CNTR, &ret); + if (ret.lc_count != 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + } else + ret.lc_sum = 0; + seq_printf(m, + " rpcs:\n" + " inflight: %u\n" + " unregistering: %u\n" + " timeouts: %u\n" + " avg_waittime: "LPU64" %s\n", + atomic_read(&imp->imp_inflight), + atomic_read(&imp->imp_unregistering), + atomic_read(&imp->imp_timeouts), + ret.lc_sum, header->lc_units); + + k = 0; + for(j = 0; j < IMP_AT_MAX_PORTALS; j++) { + if (imp->imp_at.iat_portal[j] == 0) + break; + k = max_t(unsigned int, k, + at_get(&imp->imp_at.iat_service_estimate[j])); + } + seq_printf(m, + " service_estimates:\n" + " services: %u sec\n" + " network: %u sec\n", + k, + at_get(&imp->imp_at.iat_net_latency)); + + seq_printf(m, + " transactions:\n" + " last_replay: "LPU64"\n" + " peer_committed: "LPU64"\n" + " last_checked: "LPU64"\n", + imp->imp_last_replay_transno, + imp->imp_peer_committed_transno, + imp->imp_last_transno_checked); + + /* avg data rates */ + for (rw = 0; rw <= 1; rw++) { + lprocfs_stats_collect(obd->obd_svc_stats, + PTLRPC_LAST_CNTR + BRW_READ_BYTES + rw, + &ret); + if (ret.lc_sum > 0 && ret.lc_count > 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + seq_printf(m, + " %s_data_averages:\n" + " bytes_per_rpc: "LPU64"\n", + rw ? 
"write" : "read", + ret.lc_sum); + } + k = (int)ret.lc_sum; + j = opcode_offset(OST_READ + rw) + EXTRA_MAX_OPCODES; + header = &obd->obd_svc_stats->ls_cnt_header[j]; + lprocfs_stats_collect(obd->obd_svc_stats, j, &ret); + if (ret.lc_sum > 0 && ret.lc_count != 0) { + /* first argument to do_div MUST be __u64 */ + __u64 sum = ret.lc_sum; + do_div(sum, ret.lc_count); + ret.lc_sum = sum; + seq_printf(m, + " %s_per_rpc: "LPU64"\n", + header->lc_units, ret.lc_sum); + j = (int)ret.lc_sum; + if (j > 0) + seq_printf(m, + " MB_per_sec: %u.%.02u\n", + k / j, (100 * k / j) % 100); + } + } + +out_climp: + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_import); + +int lprocfs_rd_state(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + int j, k; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = obd->u.cli.cl_import; + + seq_printf(m, "current_state: %s\n", + ptlrpc_import_state_name(imp->imp_state)); + seq_printf(m, "state_history:\n"); + k = imp->imp_state_hist_idx; + for (j = 0; j < IMP_STATE_HIST_LEN; j++) { + struct import_state_hist *ish = + &imp->imp_state_hist[(k + j) % IMP_STATE_HIST_LEN]; + if (ish->ish_state == 0) + continue; + seq_printf(m, " - ["CFS_TIME_T", %s]\n", + ish->ish_time, + ptlrpc_import_state_name(ish->ish_state)); + } + + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_state); + +int lprocfs_at_hist_helper(struct seq_file *m, struct adaptive_timeout *at) +{ + int i; + for (i = 0; i < AT_BINS; i++) + seq_printf(m, "%3u ", at->at_hist[i]); + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_at_hist_helper); + +/* See also ptlrpc_lprocfs_rd_timeouts */ +int lprocfs_rd_timeouts(struct seq_file *m, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_import *imp; + unsigned int cur, worst; + time_t now, worstt; + struct dhms ts; + int i; + + LASSERT(obd != NULL); + LPROCFS_CLIMP_CHECK(obd); + imp = 
obd->u.cli.cl_import; + + now = cfs_time_current_sec(); + + /* Some network health info for kicks */ + s2dhms(&ts, now - imp->imp_last_reply_time); + seq_printf(m, "%-10s : %ld, "DHMS_FMT" ago\n", + "last reply", imp->imp_last_reply_time, DHMS_VARS(&ts)); + + cur = at_get(&imp->imp_at.iat_net_latency); + worst = imp->imp_at.iat_net_latency.at_worst_ever; + worstt = imp->imp_at.iat_net_latency.at_worst_time; + s2dhms(&ts, now - worstt); + seq_printf(m, "%-10s : cur %3u worst %3u (at %ld, "DHMS_FMT" ago) ", + "network", cur, worst, worstt, DHMS_VARS(&ts)); + lprocfs_at_hist_helper(m, &imp->imp_at.iat_net_latency); + + for(i = 0; i < IMP_AT_MAX_PORTALS; i++) { + if (imp->imp_at.iat_portal[i] == 0) + break; + cur = at_get(&imp->imp_at.iat_service_estimate[i]); + worst = imp->imp_at.iat_service_estimate[i].at_worst_ever; + worstt = imp->imp_at.iat_service_estimate[i].at_worst_time; + s2dhms(&ts, now - worstt); + seq_printf(m, "portal %-2d : cur %3u worst %3u (at %ld, " + DHMS_FMT" ago) ", imp->imp_at.iat_portal[i], + cur, worst, worstt, DHMS_VARS(&ts)); + lprocfs_at_hist_helper(m, &imp->imp_at.iat_service_estimate[i]); + } + + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_timeouts); + +int lprocfs_rd_connect_flags(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + __u64 flags; + + LPROCFS_CLIMP_CHECK(obd); + flags = obd->u.cli.cl_import->imp_connect_data.ocd_connect_flags; + seq_printf(m, "flags="LPX64"\n", flags); + obd_connect_seq_flags2str(m, flags, "\n"); + seq_printf(m, "\n"); + LPROCFS_CLIMP_EXIT(obd); + return 0; +} +EXPORT_SYMBOL(lprocfs_rd_connect_flags); + +int lprocfs_rd_num_exports(struct seq_file *m, void *data) +{ + struct obd_device *obd = data; + + LASSERT(obd != NULL); + return seq_printf(m, "%u\n", obd->obd_num_exports); +} +EXPORT_SYMBOL(lprocfs_rd_num_exports); + +int lprocfs_rd_numrefs(struct seq_file *m, void *data) +{ + struct obd_type *class = (struct obd_type*) data; + + LASSERT(class != NULL); + return 
seq_printf(m, "%d\n", class->typ_refcnt); +} +EXPORT_SYMBOL(lprocfs_rd_numrefs); + +int lprocfs_obd_setup(struct obd_device *obd, struct lprocfs_vars *list) +{ + int rc = 0; + + LASSERT(obd != NULL); + LASSERT(obd->obd_magic == OBD_DEVICE_MAGIC); + LASSERT(obd->obd_type->typ_procroot != NULL); + + obd->obd_proc_entry = lprocfs_register(obd->obd_name, + obd->obd_type->typ_procroot, + list, obd); + if (IS_ERR(obd->obd_proc_entry)) { + rc = PTR_ERR(obd->obd_proc_entry); + CERROR("error %d setting up lprocfs for %s\n",rc,obd->obd_name); + obd->obd_proc_entry = NULL; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_obd_setup); + +int lprocfs_obd_cleanup(struct obd_device *obd) +{ + if (!obd) + return -EINVAL; + if (obd->obd_proc_exports_entry) { + /* Should be no exports left */ + lprocfs_remove(&obd->obd_proc_exports_entry); + obd->obd_proc_exports_entry = NULL; + } + if (obd->obd_proc_entry) { + lprocfs_remove(&obd->obd_proc_entry); + obd->obd_proc_entry = NULL; + } + return 0; +} +EXPORT_SYMBOL(lprocfs_obd_cleanup); + +static void lprocfs_free_client_stats(struct nid_stat *client_stat) +{ + CDEBUG(D_CONFIG, "stat %p - data %p/%p\n", client_stat, + client_stat->nid_proc, client_stat->nid_stats); + + LASSERTF(atomic_read(&client_stat->nid_exp_ref_count) == 0, + "nid %s:count %d\n", libcfs_nid2str(client_stat->nid), + atomic_read(&client_stat->nid_exp_ref_count)); + + if (client_stat->nid_proc) + lprocfs_remove(&client_stat->nid_proc); + + if (client_stat->nid_stats) + lprocfs_free_stats(&client_stat->nid_stats); + + if (client_stat->nid_ldlm_stats) + lprocfs_free_stats(&client_stat->nid_ldlm_stats); + + OBD_FREE_PTR(client_stat); + return; + +} + +void lprocfs_free_per_client_stats(struct obd_device *obd) +{ + cfs_hash_t *hash = obd->obd_nid_stats_hash; + struct nid_stat *stat; + ENTRY; + + /* we need extra list - because hash_exit called to early */ + /* not need locking because all clients is died */ + while (!list_empty(&obd->obd_nid_stats)) { + stat = 
list_entry(obd->obd_nid_stats.next, + struct nid_stat, nid_list); + list_del_init(&stat->nid_list); + cfs_hash_del(hash, &stat->nid, &stat->nid_hash); + lprocfs_free_client_stats(stat); + } + EXIT; +} +EXPORT_SYMBOL(lprocfs_free_per_client_stats); + +struct lprocfs_stats *lprocfs_alloc_stats(unsigned int num, + enum lprocfs_stats_flags flags) +{ + struct lprocfs_stats *stats; + unsigned int num_entry; + unsigned int percpusize = 0; + int i; + + if (num == 0) + return NULL; + + if (lprocfs_no_percpu_stats != 0) + flags |= LPROCFS_STATS_FLAG_NOPERCPU; + + if (flags & LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + /* alloc percpu pointers for all possible cpu slots */ + LIBCFS_ALLOC(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); + if (stats == NULL) + return NULL; + + stats->ls_num = num; + stats->ls_flags = flags; + spin_lock_init(&stats->ls_lock); + + /* alloc num of counter headers */ + LIBCFS_ALLOC(stats->ls_cnt_header, + stats->ls_num * sizeof(struct lprocfs_counter_header)); + if (stats->ls_cnt_header == NULL) + goto fail; + + if ((flags & LPROCFS_STATS_FLAG_NOPERCPU) != 0) { + /* contains only one set counters */ + percpusize = lprocfs_stats_counter_size(stats); + LIBCFS_ALLOC_ATOMIC(stats->ls_percpu[0], percpusize); + if (stats->ls_percpu[0] == NULL) + goto fail; + stats->ls_biggest_alloc_num = 1; + } else if ((flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) { + /* alloc all percpu data, currently only obd_memory use this */ + for (i = 0; i < num_entry; ++i) + if (lprocfs_stats_alloc_one(stats, i) < 0) + goto fail; + } + + return stats; + +fail: + lprocfs_free_stats(&stats); + return NULL; +} +EXPORT_SYMBOL(lprocfs_alloc_stats); + +void lprocfs_free_stats(struct lprocfs_stats **statsh) +{ + struct lprocfs_stats *stats = *statsh; + unsigned int num_entry; + unsigned int percpusize; + unsigned int i; + + if (stats == NULL || stats->ls_num == 0) + return; + *statsh = NULL; + + if (stats->ls_flags & 
LPROCFS_STATS_FLAG_NOPERCPU) + num_entry = 1; + else + num_entry = num_possible_cpus(); + + percpusize = lprocfs_stats_counter_size(stats); + for (i = 0; i < num_entry; i++) + if (stats->ls_percpu[i] != NULL) + LIBCFS_FREE(stats->ls_percpu[i], percpusize); + if (stats->ls_cnt_header != NULL) + LIBCFS_FREE(stats->ls_cnt_header, stats->ls_num * + sizeof(struct lprocfs_counter_header)); + LIBCFS_FREE(stats, offsetof(typeof(*stats), ls_percpu[num_entry])); +} +EXPORT_SYMBOL(lprocfs_free_stats); + +void lprocfs_clear_stats(struct lprocfs_stats *stats) +{ + struct lprocfs_counter *percpu_cntr; + struct lprocfs_counter_header *header; + int i; + int j; + unsigned int num_entry; + unsigned long flags = 0; + + num_entry = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + + for (i = 0; i < num_entry; i++) { + if (stats->ls_percpu[i] == NULL) + continue; + for (j = 0; j < stats->ls_num; j++) { + header = &stats->ls_cnt_header[j]; + percpu_cntr = lprocfs_stats_counter_get(stats, i, j); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if (stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) + percpu_cntr->lc_sum_irq = 0; + } + } + + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_clear_stats); + +static ssize_t lprocfs_stats_seq_write(struct file *file, + const char __user *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct lprocfs_stats *stats = seq->private; + + lprocfs_clear_stats(stats); + + return len; +} + +static void *lprocfs_stats_seq_start(struct seq_file *p, loff_t *pos) +{ + struct lprocfs_stats *stats = p->private; + + return (*pos < stats->ls_num) ? 
pos : NULL; +} + +static void lprocfs_stats_seq_stop(struct seq_file *p, void *v) +{ +} + +static void *lprocfs_stats_seq_next(struct seq_file *p, void *v, loff_t *pos) +{ + (*pos)++; + return lprocfs_stats_seq_start(p, pos); +} + +/* seq file export of one lprocfs counter */ +static int lprocfs_stats_seq_show(struct seq_file *p, void *v) +{ + struct lprocfs_stats *stats = p->private; + struct lprocfs_counter_header *hdr; + struct lprocfs_counter ctr; + int idx = *(loff_t *)v; + int rc = 0; + + if (idx == 0) { + struct timeval now; + do_gettimeofday(&now); + rc = seq_printf(p, "%-25s %lu.%lu secs.usecs\n", + "snapshot_time", now.tv_sec, now.tv_usec); + if (rc < 0) + return rc; + } + hdr = &stats->ls_cnt_header[idx]; + lprocfs_stats_collect(stats, idx, &ctr); + + if (ctr.lc_count == 0) + goto out; + + rc = seq_printf(p, "%-25s "LPD64" samples [%s]", hdr->lc_name, + ctr.lc_count, hdr->lc_units); + + if (rc < 0) + goto out; + + if ((hdr->lc_config & LPROCFS_CNTR_AVGMINMAX) && (ctr.lc_count > 0)) { + rc = seq_printf(p, " "LPD64" "LPD64" "LPD64, + ctr.lc_min, ctr.lc_max, ctr.lc_sum); + if (rc < 0) + goto out; + if (hdr->lc_config & LPROCFS_CNTR_STDDEV) + rc = seq_printf(p, " "LPD64, ctr.lc_sumsquare); + if (rc < 0) + goto out; + } + rc = seq_printf(p, "\n"); +out: + return (rc < 0) ? 
rc : 0; +} + +struct seq_operations lprocfs_stats_seq_sops = { + .start = lprocfs_stats_seq_start, + .stop = lprocfs_stats_seq_stop, + .next = lprocfs_stats_seq_next, + .show = lprocfs_stats_seq_show, +}; + +static int lprocfs_stats_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc; + + rc = seq_open(file, &lprocfs_stats_seq_sops); + if (rc) + return rc; + seq = file->private_data; + seq->private = PDE_DATA(inode); + return 0; +} + +struct file_operations lprocfs_stats_seq_fops = { + .owner = THIS_MODULE, + .open = lprocfs_stats_seq_open, + .read = seq_read, + .write = lprocfs_stats_seq_write, + .llseek = seq_lseek, + .release = lprocfs_seq_release, +}; + +int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, + struct lprocfs_stats *stats) +{ + struct proc_dir_entry *entry; + LASSERT(root != NULL); + + entry = proc_create_data(name, 0644, root, + &lprocfs_stats_seq_fops, stats); + if (entry == NULL) + return -ENOMEM; + + return 0; +} +EXPORT_SYMBOL(lprocfs_register_stats); + +void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + unsigned conf, const char *name, const char *units) +{ + struct lprocfs_counter_header *header; + struct lprocfs_counter *percpu_cntr; + unsigned long flags = 0; + unsigned int i; + unsigned int num_cpu; + + LASSERT(stats != NULL); + + header = &stats->ls_cnt_header[index]; + LASSERTF(header != NULL, "Failed to allocate stats header:[%d]%s/%s\n", + index, name, units); + + header->lc_config = conf; + header->lc_name = name; + header->lc_units = units; + + num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); + for (i = 0; i < num_cpu; ++i) { + if (stats->ls_percpu[i] == NULL) + continue; + percpu_cntr = lprocfs_stats_counter_get(stats, i, index); + percpu_cntr->lc_count = 0; + percpu_cntr->lc_min = LC_MIN_INIT; + percpu_cntr->lc_max = 0; + percpu_cntr->lc_sumsquare = 0; + percpu_cntr->lc_sum = 0; + if ((stats->ls_flags & LPROCFS_STATS_FLAG_IRQ_SAFE) != 0) + 
percpu_cntr->lc_sum_irq = 0; + } + lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); +} +EXPORT_SYMBOL(lprocfs_counter_init); + +#define LPROCFS_OBD_OP_INIT(base, stats, op) \ +do { \ + unsigned int coffset = base + OBD_COUNTER_OFFSET(op); \ + LASSERT(coffset < stats->ls_num); \ + lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \ +} while (0) + +void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) +{ + LPROCFS_OBD_OP_INIT(num_private_stats, stats, iocontrol); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_info); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_info_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, attach); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, detach); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, precleanup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, cleanup); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, process_config); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, postrecov); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, add_conn); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, del_conn); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, connect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, reconnect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, disconnect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_init); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_fini); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, fid_alloc); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, statfs_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, packmd); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpackmd); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, preallocate); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, precreate); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, create); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, create_async); + 
LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, setattr_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr_async); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, merge_lvb); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, punch); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, sync); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, migrate); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, copy); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, iterate); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, enqueue); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, change_cbdata); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, find_cbdata); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel_unused); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, init_export); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, destroy_export); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, extent_calc); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_init); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_connect); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, llog_finish); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pin); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, unpin); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, import_event); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, notify); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, health_check); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck); + 
LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, ping); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_del); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, getref); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, putref); +} +EXPORT_SYMBOL(lprocfs_init_ops_stats); + +int lprocfs_alloc_obd_stats(struct obd_device *obd, unsigned num_private_stats) +{ + struct lprocfs_stats *stats; + unsigned int num_stats; + int rc, i; + + LASSERT(obd->obd_stats == NULL); + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->obd_cntr_base == 0); + + num_stats = ((int)sizeof(*obd->obd_type->typ_dt_ops) / sizeof(void *)) + + num_private_stats - 1 /* o_owner */; + stats = lprocfs_alloc_stats(num_stats, 0); + if (stats == NULL) + return -ENOMEM; + + lprocfs_init_ops_stats(num_private_stats, stats); + + for (i = num_private_stats; i < num_stats; i++) { + /* If this LBUGs, it is likely that an obd + * operation was added to struct obd_ops in + * <obd.h>, and that the corresponding line item + * LPROCFS_OBD_OP_INIT(.., .., opname) + * is missing from the list above. 
*/ + LASSERTF(stats->ls_cnt_header[i].lc_name != NULL, + "Missing obd_stat initializer obd_op " + "operation at offset %d.\n", i - num_private_stats); + } + rc = lprocfs_register_stats(obd->obd_proc_entry, "stats", stats); + if (rc < 0) { + lprocfs_free_stats(&stats); + } else { + obd->obd_stats = stats; + obd->obd_cntr_base = num_private_stats; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_obd_stats); + +void lprocfs_free_obd_stats(struct obd_device *obd) +{ + if (obd->obd_stats) + lprocfs_free_stats(&obd->obd_stats); +} +EXPORT_SYMBOL(lprocfs_free_obd_stats); + +#define LPROCFS_MD_OP_INIT(base, stats, op) \ +do { \ + unsigned int coffset = base + MD_COUNTER_OFFSET(op); \ + LASSERT(coffset < stats->ls_num); \ + lprocfs_counter_init(stats, coffset, 0, #op, "reqs"); \ +} while (0) + +void lprocfs_init_mps_stats(int num_private_stats, struct lprocfs_stats *stats) +{ + LPROCFS_MD_OP_INIT(num_private_stats, stats, getstatus); + LPROCFS_MD_OP_INIT(num_private_stats, stats, null_inode); + LPROCFS_MD_OP_INIT(num_private_stats, stats, find_cbdata); + LPROCFS_MD_OP_INIT(num_private_stats, stats, close); + LPROCFS_MD_OP_INIT(num_private_stats, stats, create); + LPROCFS_MD_OP_INIT(num_private_stats, stats, done_writing); + LPROCFS_MD_OP_INIT(num_private_stats, stats, enqueue); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getattr_name); + LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_lock); + LPROCFS_MD_OP_INIT(num_private_stats, stats, link); + LPROCFS_MD_OP_INIT(num_private_stats, stats, rename); + LPROCFS_MD_OP_INIT(num_private_stats, stats, is_subdir); + LPROCFS_MD_OP_INIT(num_private_stats, stats, setattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, sync); + LPROCFS_MD_OP_INIT(num_private_stats, stats, readpage); + LPROCFS_MD_OP_INIT(num_private_stats, stats, unlink); + LPROCFS_MD_OP_INIT(num_private_stats, stats, setxattr); + LPROCFS_MD_OP_INIT(num_private_stats, stats, getxattr); + 
LPROCFS_MD_OP_INIT(num_private_stats, stats, init_ea_size); + LPROCFS_MD_OP_INIT(num_private_stats, stats, get_lustre_md); + LPROCFS_MD_OP_INIT(num_private_stats, stats, free_lustre_md); + LPROCFS_MD_OP_INIT(num_private_stats, stats, set_open_replay_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, clear_open_replay_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, set_lock_data); + LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match); + LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused); + LPROCFS_MD_OP_INIT(num_private_stats, stats, renew_capa); + LPROCFS_MD_OP_INIT(num_private_stats, stats, unpack_capa); + LPROCFS_MD_OP_INIT(num_private_stats, stats, get_remote_perm); + LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async); + LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock); +} +EXPORT_SYMBOL(lprocfs_init_mps_stats); + +int lprocfs_alloc_md_stats(struct obd_device *obd, + unsigned num_private_stats) +{ + struct lprocfs_stats *stats; + unsigned int num_stats; + int rc, i; + + LASSERT(obd->md_stats == NULL); + LASSERT(obd->obd_proc_entry != NULL); + LASSERT(obd->md_cntr_base == 0); + + num_stats = 1 + MD_COUNTER_OFFSET(revalidate_lock) + + num_private_stats; + stats = lprocfs_alloc_stats(num_stats, 0); + if (stats == NULL) + return -ENOMEM; + + lprocfs_init_mps_stats(num_private_stats, stats); + + for (i = num_private_stats; i < num_stats; i++) { + if (stats->ls_cnt_header[i].lc_name == NULL) { + CERROR("Missing md_stat initializer md_op " + "operation at offset %d. 
Aborting.\n", + i - num_private_stats); + LBUG(); + } + } + rc = lprocfs_register_stats(obd->obd_proc_entry, "md_stats", stats); + if (rc < 0) { + lprocfs_free_stats(&stats); + } else { + obd->md_stats = stats; + obd->md_cntr_base = num_private_stats; + } + return rc; +} +EXPORT_SYMBOL(lprocfs_alloc_md_stats); + +void lprocfs_free_md_stats(struct obd_device *obd) +{ + struct lprocfs_stats *stats = obd->md_stats; + + if (stats != NULL) { + obd->md_stats = NULL; + obd->md_cntr_base = 0; + lprocfs_free_stats(&stats); + } +} +EXPORT_SYMBOL(lprocfs_free_md_stats); + +void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) +{ + lprocfs_counter_init(ldlm_stats, + LDLM_ENQUEUE - LDLM_FIRST_OPC, + 0, "ldlm_enqueue", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CONVERT - LDLM_FIRST_OPC, + 0, "ldlm_convert", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CANCEL - LDLM_FIRST_OPC, + 0, "ldlm_cancel", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_BL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_bl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_CP_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_cp_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, + LDLM_GL_CALLBACK - LDLM_FIRST_OPC, + 0, "ldlm_gl_callback", "reqs"); +} +EXPORT_SYMBOL(lprocfs_init_ldlm_stats); + +int lprocfs_exp_print_uuid(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *data) + +{ + struct obd_export *exp = cfs_hash_object(hs, hnode); + struct seq_file *m = (struct seq_file *)data; + + if (exp->exp_nid_stats) + seq_printf(m, "%s\n", obd_uuid2str(&exp->exp_client_uuid)); + + return 0; +} + +static int +lproc_exp_uuid_seq_show(struct seq_file *m, void *unused) +{ + struct nid_stat *stats = (struct nid_stat *)m->private; + struct obd_device *obd = stats->nid_obd; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_uuid, m); + return 0; +} + +LPROC_SEQ_FOPS_RO(lproc_exp_uuid); + +struct exp_hash_cb_data { + struct seq_file *m; + bool first; +}; + 
+int lprocfs_exp_print_hash(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *cb_data) + +{ + struct exp_hash_cb_data *data = (struct exp_hash_cb_data *)cb_data; + struct obd_export *exp = cfs_hash_object(hs, hnode); + + if (exp->exp_lock_hash != NULL) { + if (data->first) { + cfs_hash_debug_header(data->m); + data->first = false; + } + cfs_hash_debug_str(hs, data->m); + } + + return 0; +} + +static int +lproc_exp_hash_seq_show(struct seq_file *m, void *unused) +{ + struct nid_stat *stats = (struct nid_stat *)m->private; + struct obd_device *obd = stats->nid_obd; + struct exp_hash_cb_data cb_data = {m, true}; + + cfs_hash_for_each_key(obd->obd_nid_hash, &stats->nid, + lprocfs_exp_print_hash, &cb_data); + return 0; +} + +LPROC_SEQ_FOPS_RO(lproc_exp_hash); + +int lprocfs_nid_stats_clear_read(struct seq_file *m, void *data) +{ + return seq_printf(m, "%s\n", + "Write into this file to clear all nid stats and " + "stale nid entries"); +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_read); + +static int lprocfs_nid_stats_clear_write_cb(void *obj, void *data) +{ + struct nid_stat *stat = obj; + ENTRY; + + CDEBUG(D_INFO,"refcnt %d\n", atomic_read(&stat->nid_exp_ref_count)); + if (atomic_read(&stat->nid_exp_ref_count) == 1) { + /* object has only hash references. 
*/ + spin_lock(&stat->nid_obd->obd_nid_lock); + list_move(&stat->nid_list, data); + spin_unlock(&stat->nid_obd->obd_nid_lock); + RETURN(1); + } + /* we has reference to object - only clear data*/ + if (stat->nid_stats) + lprocfs_clear_stats(stat->nid_stats); + + RETURN(0); +} + +int lprocfs_nid_stats_clear_write(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct nid_stat *client_stat; + LIST_HEAD(free_list); + + cfs_hash_cond_del(obd->obd_nid_stats_hash, + lprocfs_nid_stats_clear_write_cb, &free_list); + + while (!list_empty(&free_list)) { + client_stat = list_entry(free_list.next, struct nid_stat, + nid_list); + list_del_init(&client_stat->nid_list); + lprocfs_free_client_stats(client_stat); + } + + return count; +} +EXPORT_SYMBOL(lprocfs_nid_stats_clear_write); + +int lprocfs_exp_setup(struct obd_export *exp, lnet_nid_t *nid, int *newnid) +{ + struct nid_stat *new_stat, *old_stat; + struct obd_device *obd = NULL; + proc_dir_entry_t *entry; + char *buffer = NULL; + int rc = 0; + ENTRY; + + *newnid = 0; + + if (!exp || !exp->exp_obd || !exp->exp_obd->obd_proc_exports_entry || + !exp->exp_obd->obd_nid_stats_hash) + RETURN(-EINVAL); + + /* not test against zero because eric say: + * You may only test nid against another nid, or LNET_NID_ANY. 
+ * Anything else is nonsense.*/ + if (!nid || *nid == LNET_NID_ANY) + RETURN(0); + + obd = exp->exp_obd; + + CDEBUG(D_CONFIG, "using hash %p\n", obd->obd_nid_stats_hash); + + OBD_ALLOC_PTR(new_stat); + if (new_stat == NULL) + RETURN(-ENOMEM); + + new_stat->nid = *nid; + new_stat->nid_obd = exp->exp_obd; + /* we need set default refcount to 1 to balance obd_disconnect */ + atomic_set(&new_stat->nid_exp_ref_count, 1); + + old_stat = cfs_hash_findadd_unique(obd->obd_nid_stats_hash, + nid, &new_stat->nid_hash); + CDEBUG(D_INFO, "Found stats %p for nid %s - ref %d\n", + old_stat, libcfs_nid2str(*nid), + atomic_read(&new_stat->nid_exp_ref_count)); + + /* We need to release old stats because lprocfs_exp_cleanup() hasn't + * been and will never be called. */ + if (exp->exp_nid_stats) { + nidstat_putref(exp->exp_nid_stats); + exp->exp_nid_stats = NULL; + } + + /* Return -EALREADY here so that we know that the /proc + * entry already has been created */ + if (old_stat != new_stat) { + exp->exp_nid_stats = old_stat; + GOTO(destroy_new, rc = -EALREADY); + } + /* not found - create */ + OBD_ALLOC(buffer, LNET_NIDSTR_SIZE); + if (buffer == NULL) + GOTO(destroy_new, rc = -ENOMEM); + + memcpy(buffer, libcfs_nid2str(*nid), LNET_NIDSTR_SIZE); + new_stat->nid_proc = lprocfs_register(buffer, + obd->obd_proc_exports_entry, + NULL, NULL); + OBD_FREE(buffer, LNET_NIDSTR_SIZE); + + if (new_stat->nid_proc == NULL) { + CERROR("Error making export directory for nid %s\n", + libcfs_nid2str(*nid)); + GOTO(destroy_new_ns, rc = -ENOMEM); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "uuid", + new_stat, &lproc_exp_uuid_fops); + if (IS_ERR(entry)) { + CWARN("Error adding the NID stats file\n"); + rc = PTR_ERR(entry); + GOTO(destroy_new_ns, rc); + } + + entry = lprocfs_add_simple(new_stat->nid_proc, "hash", + new_stat, &lproc_exp_hash_fops); + if (IS_ERR(entry)) { + CWARN("Error adding the hash file\n"); + rc = PTR_ERR(entry); + GOTO(destroy_new_ns, rc); + } + + exp->exp_nid_stats = 
new_stat; + *newnid = 1; + /* protect competitive add to list, not need locking on destroy */ + spin_lock(&obd->obd_nid_lock); + list_add(&new_stat->nid_list, &obd->obd_nid_stats); + spin_unlock(&obd->obd_nid_lock); + + RETURN(rc); + +destroy_new_ns: + if (new_stat->nid_proc != NULL) + lprocfs_remove(&new_stat->nid_proc); + cfs_hash_del(obd->obd_nid_stats_hash, nid, &new_stat->nid_hash); + +destroy_new: + nidstat_putref(new_stat); + OBD_FREE_PTR(new_stat); + RETURN(rc); +} +EXPORT_SYMBOL(lprocfs_exp_setup); + +int lprocfs_exp_cleanup(struct obd_export *exp) +{ + struct nid_stat *stat = exp->exp_nid_stats; + + if(!stat || !exp->exp_obd) + RETURN(0); + + nidstat_putref(exp->exp_nid_stats); + exp->exp_nid_stats = NULL; + + return 0; +} +EXPORT_SYMBOL(lprocfs_exp_cleanup); + +int lprocfs_write_helper(const char *buffer, unsigned long count, + int *val) +{ + return lprocfs_write_frac_helper(buffer, count, val, 1); +} +EXPORT_SYMBOL(lprocfs_write_helper); + +int lprocfs_write_frac_helper(const char *buffer, unsigned long count, + int *val, int mult) +{ + char kernbuf[20], *end, *pbuf; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + pbuf = kernbuf; + if (*pbuf == '-') { + mult = -mult; + pbuf++; + } + + *val = (int)simple_strtoul(pbuf, &end, 10) * mult; + if (pbuf == end) + return -EINVAL; + + if (end != NULL && *end == '.') { + int temp_val, pow = 1; + int i; + + pbuf = end + 1; + if (strlen(pbuf) > 5) + pbuf[5] = '\0'; /*only allow 5bits fractional*/ + + temp_val = (int)simple_strtoul(pbuf, &end, 10) * mult; + + if (pbuf < end) { + for (i = 0; i < (end - pbuf); i++) + pow *= 10; + + *val += temp_val / pow; + } + } + return 0; +} +EXPORT_SYMBOL(lprocfs_write_frac_helper); + +int lprocfs_read_frac_helper(char *buffer, unsigned long count, long val, + int mult) +{ + long decimal_val, frac_val; + int prtn; + + if (count < 10) + return -EINVAL; + + decimal_val = val / 
mult; + prtn = snprintf(buffer, count, "%ld", decimal_val); + frac_val = val % mult; + + if (prtn < (count - 4) && frac_val > 0) { + long temp_frac; + int i, temp_mult = 1, frac_bits = 0; + + temp_frac = frac_val * 10; + buffer[prtn++] = '.'; + while (frac_bits < 2 && (temp_frac / mult) < 1 ) { + /* only reserved 2 bits fraction */ + buffer[prtn++] ='0'; + temp_frac *= 10; + frac_bits++; + } + /* + * Need to think these cases : + * 1. #echo x.00 > /proc/xxx output result : x + * 2. #echo x.0x > /proc/xxx output result : x.0x + * 3. #echo x.x0 > /proc/xxx output result : x.x + * 4. #echo x.xx > /proc/xxx output result : x.xx + * Only reserved 2 bits fraction. + */ + for (i = 0; i < (5 - prtn); i++) + temp_mult *= 10; + + frac_bits = min((int)count - prtn, 3 - frac_bits); + prtn += snprintf(buffer + prtn, frac_bits, "%ld", + frac_val * temp_mult / mult); + + prtn--; + while(buffer[prtn] < '1' || buffer[prtn] > '9') { + prtn--; + if (buffer[prtn] == '.') { + prtn--; + break; + } + } + prtn++; + } + buffer[prtn++] ='\n'; + return prtn; +} +EXPORT_SYMBOL(lprocfs_read_frac_helper); + +int lprocfs_seq_read_frac_helper(struct seq_file *m, long val, int mult) +{ + long decimal_val, frac_val; + + decimal_val = val / mult; + seq_printf(m, "%ld", decimal_val); + frac_val = val % mult; + + if (frac_val > 0) { + frac_val *= 100; + frac_val /= mult; + } + if (frac_val > 0) { + /* Three cases: x0, xx, 0x */ + if ((frac_val % 10) != 0) + seq_printf(m, ".%ld", frac_val); + else + seq_printf(m, ".%ld", frac_val / 10); + } + + seq_printf(m, "\n"); + return 0; +} +EXPORT_SYMBOL(lprocfs_seq_read_frac_helper); + +int lprocfs_write_u64_helper(const char *buffer, unsigned long count,__u64 *val) +{ + return lprocfs_write_frac_u64_helper(buffer, count, val, 1); +} +EXPORT_SYMBOL(lprocfs_write_u64_helper); + +int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count, + __u64 *val, int mult) +{ + char kernbuf[22], *end, *pbuf; + __u64 whole, frac = 0, units; + unsigned frac_d = 
1; + + if (count > (sizeof(kernbuf) - 1)) + return -EINVAL; + + if (copy_from_user(kernbuf, buffer, count)) + return -EFAULT; + + kernbuf[count] = '\0'; + pbuf = kernbuf; + if (*pbuf == '-') { + mult = -mult; + pbuf++; + } + + whole = simple_strtoull(pbuf, &end, 10); + if (pbuf == end) + return -EINVAL; + + if (end != NULL && *end == '.') { + int i; + pbuf = end + 1; + + /* need to limit frac_d to a __u32 */ + if (strlen(pbuf) > 10) + pbuf[10] = '\0'; + + frac = simple_strtoull(pbuf, &end, 10); + /* count decimal places */ + for (i = 0; i < (end - pbuf); i++) + frac_d *= 10; + } + + units = 1; + switch(*end) { + case 'p': case 'P': + units <<= 10; + case 't': case 'T': + units <<= 10; + case 'g': case 'G': + units <<= 10; + case 'm': case 'M': + units <<= 10; + case 'k': case 'K': + units <<= 10; + } + /* Specified units override the multiplier */ + if (units) + mult = mult < 0 ? -units : units; + + frac *= mult; + do_div(frac, frac_d); + *val = whole * mult + frac; + return 0; +} +EXPORT_SYMBOL(lprocfs_write_frac_u64_helper); + +static char *lprocfs_strnstr(const char *s1, const char *s2, size_t len) +{ + size_t l2; + + l2 = strlen(s2); + if (!l2) + return (char *)s1; + while (len >= l2) { + len--; + if (!memcmp(s1, s2, l2)) + return (char *)s1; + s1++; + } + return NULL; +} + +/** + * Find the string \a name in the input \a buffer, and return a pointer to the + * value immediately following \a name, reducing \a count appropriately. + * If \a name is not found the original \a buffer is returned. 
+ */ +char *lprocfs_find_named_value(const char *buffer, const char *name, + unsigned long *count) +{ + char *val; + size_t buflen = *count; + + /* there is no strnstr() in rhel5 and ubuntu kernels */ + val = lprocfs_strnstr(buffer, name, buflen); + if (val == NULL) + return (char *)buffer; + + val += strlen(name); /* skip prefix */ + while (val < buffer + buflen && isspace(*val)) /* skip separator */ + val++; + + *count = 0; + while (val < buffer + buflen && isalnum(*val)) { + ++*count; + ++val; + } + + return val - *count; +} +EXPORT_SYMBOL(lprocfs_find_named_value); + +int lprocfs_seq_create(proc_dir_entry_t *parent, + const char *name, + umode_t mode, + const struct file_operations *seq_fops, + void *data) +{ + struct proc_dir_entry *entry; + ENTRY; + + /* Disallow secretly (un)writable entries. */ + LASSERT((seq_fops->write == NULL) == ((mode & 0222) == 0)); + entry = proc_create_data(name, mode, parent, seq_fops, data); + + if (entry == NULL) + RETURN(-ENOMEM); + + RETURN(0); +} +EXPORT_SYMBOL(lprocfs_seq_create); + +int lprocfs_obd_seq_create(struct obd_device *dev, + const char *name, + umode_t mode, + const struct file_operations *seq_fops, + void *data) +{ + return (lprocfs_seq_create(dev->obd_proc_entry, name, + mode, seq_fops, data)); +} +EXPORT_SYMBOL(lprocfs_obd_seq_create); + +void lprocfs_oh_tally(struct obd_histogram *oh, unsigned int value) +{ + if (value >= OBD_HIST_MAX) + value = OBD_HIST_MAX - 1; + + spin_lock(&oh->oh_lock); + oh->oh_buckets[value]++; + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_tally); + +void lprocfs_oh_tally_log2(struct obd_histogram *oh, unsigned int value) +{ + unsigned int val; + + for (val = 0; ((1 << val) < value) && (val <= OBD_HIST_MAX); val++) + ; + + lprocfs_oh_tally(oh, val); +} +EXPORT_SYMBOL(lprocfs_oh_tally_log2); + +unsigned long lprocfs_oh_sum(struct obd_histogram *oh) +{ + unsigned long ret = 0; + int i; + + for (i = 0; i < OBD_HIST_MAX; i++) + ret += oh->oh_buckets[i]; + return ret; +} 
+EXPORT_SYMBOL(lprocfs_oh_sum); + +void lprocfs_oh_clear(struct obd_histogram *oh) +{ + spin_lock(&oh->oh_lock); + memset(oh->oh_buckets, 0, sizeof(oh->oh_buckets)); + spin_unlock(&oh->oh_lock); +} +EXPORT_SYMBOL(lprocfs_oh_clear); + +int lprocfs_obd_rd_max_pages_per_rpc(struct seq_file *m, void *data) +{ + struct obd_device *dev = data; + struct client_obd *cli = &dev->u.cli; + int rc; + + client_obd_list_lock(&cli->cl_loi_list_lock); + rc = seq_printf(m, "%d\n", cli->cl_max_pages_per_rpc); + client_obd_list_unlock(&cli->cl_loi_list_lock); + return rc; +} +EXPORT_SYMBOL(lprocfs_obd_rd_max_pages_per_rpc); + +#endif /* LPROCFS*/ diff --git a/drivers/staging/lustre/lustre/obdclass/lu_object.c b/drivers/staging/lustre/lustre/obdclass/lu_object.c new file mode 100644 index 000000000000..fdf0ed367693 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lu_object.c @@ -0,0 +1,2185 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. 
All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_object.c + * + * Lustre Object. + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + * + * Author: Nikita Danilov <nikita.danilov@sun.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <linux/libcfs/libcfs.h> + +# include <linux/module.h> + +/* hash_long() */ +#include <linux/libcfs/libcfs_hash.h> +#include <obd_class.h> +#include <obd_support.h> +#include <lustre_disk.h> +#include <lustre_fid.h> +#include <lu_object.h> +#include <lu_ref.h> +#include <linux/list.h> + +static void lu_object_free(const struct lu_env *env, struct lu_object *o); + +/** + * Decrease reference counter on object. If last reference is freed, return + * object to the cache, unless lu_object_is_dying(o) holds. In the latter + * case, free object immediately. + */ +void lu_object_put(const struct lu_env *env, struct lu_object *o) +{ + struct lu_site_bkt_data *bkt; + struct lu_object_header *top; + struct lu_site *site; + struct lu_object *orig; + cfs_hash_bd_t bd; + const struct lu_fid *fid; + + top = o->lo_header; + site = o->lo_dev->ld_site; + orig = o; + + /* + * till we have full fids-on-OST implemented anonymous objects + * are possible in OSP. such an object isn't listed in the site + * so we should not remove it from the site. 
+ */ + fid = lu_object_fid(o); + if (fid_is_zero(fid)) { + LASSERT(top->loh_hash.next == NULL + && top->loh_hash.pprev == NULL); + LASSERT(list_empty(&top->loh_lru)); + if (!atomic_dec_and_test(&top->loh_ref)) + return; + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + lu_object_free(env, orig); + return; + } + + cfs_hash_bd_get(site->ls_obj_hash, &top->loh_fid, &bd); + bkt = cfs_hash_bd_extra_get(site->ls_obj_hash, &bd); + + if (!cfs_hash_bd_dec_and_lock(site->ls_obj_hash, &bd, &top->loh_ref)) { + if (lu_object_is_dying(top)) { + + /* + * somebody may be waiting for this, currently only + * used for cl_object, see cl_object_put_last(). + */ + wake_up_all(&bkt->lsb_marche_funebre); + } + return; + } + + LASSERT(bkt->lsb_busy > 0); + bkt->lsb_busy--; + /* + * When last reference is released, iterate over object + * layers, and notify them that object is no longer busy. + */ + list_for_each_entry_reverse(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_release != NULL) + o->lo_ops->loo_object_release(env, o); + } + + if (!lu_object_is_dying(top)) { + LASSERT(list_empty(&top->loh_lru)); + list_add_tail(&top->loh_lru, &bkt->lsb_lru); + cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); + return; + } + + /* + * If object is dying (will not be cached), removed it + * from hash table and LRU. + * + * This is done with hash table and LRU lists locked. As the only + * way to acquire first reference to previously unreferenced + * object is through hash-table lookup (lu_object_find()), + * or LRU scanning (lu_site_purge()), that are done under hash-table + * and LRU lock, no race with concurrent object lookup is possible + * and we can safely destroy object below. 
+ */ + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) + cfs_hash_bd_del_locked(site->ls_obj_hash, &bd, &top->loh_hash); + cfs_hash_bd_unlock(site->ls_obj_hash, &bd, 1); + /* + * Object was already removed from hash and lru above, can + * kill it. + */ + lu_object_free(env, orig); +} +EXPORT_SYMBOL(lu_object_put); + +/** + * Put object and don't keep in cache. This is temporary solution for + * multi-site objects when its layering is not constant. + */ +void lu_object_put_nocache(const struct lu_env *env, struct lu_object *o) +{ + set_bit(LU_OBJECT_HEARD_BANSHEE, &o->lo_header->loh_flags); + return lu_object_put(env, o); +} +EXPORT_SYMBOL(lu_object_put_nocache); + +/** + * Kill the object and take it out of LRU cache. + * Currently used by client code for layout change. + */ +void lu_object_unhash(const struct lu_env *env, struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + set_bit(LU_OBJECT_HEARD_BANSHEE, &top->loh_flags); + if (!test_and_set_bit(LU_OBJECT_UNHASHED, &top->loh_flags)) { + cfs_hash_t *obj_hash = o->lo_dev->ld_site->ls_obj_hash; + cfs_hash_bd_t bd; + + cfs_hash_bd_get_and_lock(obj_hash, &top->loh_fid, &bd, 1); + list_del_init(&top->loh_lru); + cfs_hash_bd_del_locked(obj_hash, &bd, &top->loh_hash); + cfs_hash_bd_unlock(obj_hash, &bd, 1); + } +} +EXPORT_SYMBOL(lu_object_unhash); + +/** + * Allocate new object. + * + * This follows object creation protocol, described in the comment within + * struct lu_device_operations definition. + */ +static struct lu_object *lu_object_alloc(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *scan; + struct lu_object *top; + struct list_head *layers; + int clean; + int result; + ENTRY; + + /* + * Create top-level object slice. This will also create + * lu_object_header. 
+ */ + top = dev->ld_ops->ldo_object_alloc(env, NULL, dev); + if (top == NULL) + RETURN(ERR_PTR(-ENOMEM)); + if (IS_ERR(top)) + RETURN(top); + /* + * This is the only place where object fid is assigned. It's constant + * after this point. + */ + top->lo_header->loh_fid = *f; + layers = &top->lo_header->loh_layers; + do { + /* + * Call ->loo_object_init() repeatedly, until no more new + * object slices are created. + */ + clean = 1; + list_for_each_entry(scan, layers, lo_linkage) { + if (scan->lo_flags & LU_OBJECT_ALLOCATED) + continue; + clean = 0; + scan->lo_header = top->lo_header; + result = scan->lo_ops->loo_object_init(env, scan, conf); + if (result != 0) { + lu_object_free(env, top); + RETURN(ERR_PTR(result)); + } + scan->lo_flags |= LU_OBJECT_ALLOCATED; + } + } while (!clean); + + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_start != NULL) { + result = scan->lo_ops->loo_object_start(env, scan); + if (result != 0) { + lu_object_free(env, top); + RETURN(ERR_PTR(result)); + } + } + } + + lprocfs_counter_incr(dev->ld_site->ls_stats, LU_SS_CREATED); + RETURN(top); +} + +/** + * Free an object. + */ +static void lu_object_free(const struct lu_env *env, struct lu_object *o) +{ + struct lu_site_bkt_data *bkt; + struct lu_site *site; + struct lu_object *scan; + struct list_head *layers; + struct list_head splice; + + site = o->lo_dev->ld_site; + layers = &o->lo_header->loh_layers; + bkt = lu_site_bkt_from_fid(site, &o->lo_header->loh_fid); + /* + * First call ->loo_object_delete() method to release all resources. + */ + list_for_each_entry_reverse(scan, layers, lo_linkage) { + if (scan->lo_ops->loo_object_delete != NULL) + scan->lo_ops->loo_object_delete(env, scan); + } + + /* + * Then, splice object layers into stand-alone list, and call + * ->loo_object_free() on all layers to free memory. Splice is + * necessary, because lu_object_header is freed together with the + * top-level slice. 
+ */ + INIT_LIST_HEAD(&splice); + list_splice_init(layers, &splice); + while (!list_empty(&splice)) { + /* + * Free layers in bottom-to-top order, so that object header + * lives as long as possible and ->loo_object_free() methods + * can look at its contents. + */ + o = container_of0(splice.prev, struct lu_object, lo_linkage); + list_del_init(&o->lo_linkage); + LASSERT(o->lo_ops->loo_object_free != NULL); + o->lo_ops->loo_object_free(env, o); + } + + if (waitqueue_active(&bkt->lsb_marche_funebre)) + wake_up_all(&bkt->lsb_marche_funebre); +} + +/** + * Free \a nr objects from the cold end of the site LRU list. + */ +int lu_site_purge(const struct lu_env *env, struct lu_site *s, int nr) +{ + struct lu_object_header *h; + struct lu_object_header *temp; + struct lu_site_bkt_data *bkt; + cfs_hash_bd_t bd; + cfs_hash_bd_t bd2; + struct list_head dispose; + int did_sth; + int start; + int count; + int bnr; + int i; + + if (OBD_FAIL_CHECK(OBD_FAIL_OBD_NO_LRU)) + RETURN(0); + + INIT_LIST_HEAD(&dispose); + /* + * Under LRU list lock, scan LRU list and move unreferenced objects to + * the dispose list, removing them from LRU and hash table. + */ + start = s->ls_purge_start; + bnr = (nr == ~0) ? 
-1 : nr / CFS_HASH_NBKT(s->ls_obj_hash) + 1; + again: + did_sth = 0; + cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { + if (i < start) + continue; + count = bnr; + cfs_hash_bd_lock(s->ls_obj_hash, &bd, 1); + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + + list_for_each_entry_safe(h, temp, &bkt->lsb_lru, loh_lru) { + LASSERT(atomic_read(&h->loh_ref) == 0); + + cfs_hash_bd_get(s->ls_obj_hash, &h->loh_fid, &bd2); + LASSERT(bd.bd_bucket == bd2.bd_bucket); + + cfs_hash_bd_del_locked(s->ls_obj_hash, + &bd2, &h->loh_hash); + list_move(&h->loh_lru, &dispose); + if (did_sth == 0) + did_sth = 1; + + if (nr != ~0 && --nr == 0) + break; + + if (count > 0 && --count == 0) + break; + + } + cfs_hash_bd_unlock(s->ls_obj_hash, &bd, 1); + cond_resched(); + /* + * Free everything on the dispose list. This is safe against + * races due to the reasons described in lu_object_put(). + */ + while (!list_empty(&dispose)) { + h = container_of0(dispose.next, + struct lu_object_header, loh_lru); + list_del_init(&h->loh_lru); + lu_object_free(env, lu_object_top(h)); + lprocfs_counter_incr(s->ls_stats, LU_SS_LRU_PURGED); + } + + if (nr == 0) + break; + } + + if (nr != 0 && did_sth && start != 0) { + start = 0; /* restart from the first bucket */ + goto again; + } + /* race on s->ls_purge_start, but nobody cares */ + s->ls_purge_start = i % CFS_HASH_NBKT(s->ls_obj_hash); + + return nr; +} +EXPORT_SYMBOL(lu_site_purge); + +/* + * Object printing. + * + * Code below has to jump through certain loops to output object description + * into libcfs_debug_msg-based log. The problem is that lu_object_print() + * composes object description from strings that are parts of _lines_ of + * output (i.e., strings that are not terminated by newline). This doesn't fit + * very well into libcfs_debug_msg() interface that assumes that each message + * supplied to it is a self-contained output line. 
+ * + * To work around this, strings are collected in a temporary buffer + * (implemented as a value of lu_cdebug_key key), until terminating newline + * character is detected. + * + */ + +enum { + /** + * Maximal line size. + * + * XXX overflow is not handled correctly. + */ + LU_CDEBUG_LINE = 512 +}; + +struct lu_cdebug_data { + /** + * Temporary buffer. + */ + char lck_area[LU_CDEBUG_LINE]; +}; + +/* context key constructor/destructor: lu_global_key_init, lu_global_key_fini */ +LU_KEY_INIT_FINI(lu_global, struct lu_cdebug_data); + +/** + * Key, holding temporary buffer. This key is registered very early by + * lu_global_init(). + */ +struct lu_context_key lu_global_key = { + .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD | + LCT_MG_THREAD | LCT_CL_THREAD, + .lct_init = lu_global_key_init, + .lct_fini = lu_global_key_fini +}; + +/** + * Printer function emitting messages through libcfs_debug_msg(). + */ +int lu_cdebug_printer(const struct lu_env *env, + void *cookie, const char *format, ...) +{ + struct libcfs_debug_msg_data *msgdata = cookie; + struct lu_cdebug_data *key; + int used; + int complete; + va_list args; + + va_start(args, format); + + key = lu_context_key_get(&env->le_ctx, &lu_global_key); + LASSERT(key != NULL); + + used = strlen(key->lck_area); + complete = format[strlen(format) - 1] == '\n'; + /* + * Append new chunk to the buffer. + */ + vsnprintf(key->lck_area + used, + ARRAY_SIZE(key->lck_area) - used, format, args); + if (complete) { + if (cfs_cdebug_show(msgdata->msg_mask, msgdata->msg_subsys)) + libcfs_debug_msg(msgdata, "%s", key->lck_area); + key->lck_area[0] = 0; + } + va_end(args); + return 0; +} +EXPORT_SYMBOL(lu_cdebug_printer); + +/** + * Print object header. 
+ */ +void lu_object_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct lu_object_header *hdr) +{ + (*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]", + hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref), + PFID(&hdr->loh_fid), + hlist_unhashed(&hdr->loh_hash) ? "" : " hash", + list_empty((struct list_head *)&hdr->loh_lru) ? \ + "" : " lru", + hdr->loh_attr & LOHA_EXISTS ? " exist":""); +} +EXPORT_SYMBOL(lu_object_header_print); + +/** + * Print human readable representation of the \a o to the \a printer. + */ +void lu_object_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct lu_object *o) +{ + static const char ruler[] = "........................................"; + struct lu_object_header *top; + int depth; + + top = o->lo_header; + lu_object_header_print(env, cookie, printer, top); + (*printer)(env, cookie, "{ \n"); + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + depth = o->lo_depth + 4; + + /* + * print `.' \a depth times followed by type name and address + */ + (*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler, + o->lo_dev->ld_type->ldt_name, o); + if (o->lo_ops->loo_object_print != NULL) + o->lo_ops->loo_object_print(env, cookie, printer, o); + (*printer)(env, cookie, "\n"); + } + (*printer)(env, cookie, "} header@%p\n", top); +} +EXPORT_SYMBOL(lu_object_print); + +/** + * Check object consistency. 
+ */ +int lu_object_invariant(const struct lu_object *o) +{ + struct lu_object_header *top; + + top = o->lo_header; + list_for_each_entry(o, &top->loh_layers, lo_linkage) { + if (o->lo_ops->loo_object_invariant != NULL && + !o->lo_ops->loo_object_invariant(o)) + return 0; + } + return 1; +} +EXPORT_SYMBOL(lu_object_invariant); + +static struct lu_object *htable_lookup(struct lu_site *s, + cfs_hash_bd_t *bd, + const struct lu_fid *f, + wait_queue_t *waiter, + __u64 *version) +{ + struct lu_site_bkt_data *bkt; + struct lu_object_header *h; + struct hlist_node *hnode; + __u64 ver = cfs_hash_bd_version_get(bd); + + if (*version == ver) + return NULL; + + *version = ver; + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, bd); + /* cfs_hash_bd_peek_locked is a somehow "internal" function + * of cfs_hash, it doesn't add refcount on object. */ + hnode = cfs_hash_bd_peek_locked(s->ls_obj_hash, bd, (void *)f); + if (hnode == NULL) { + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_MISS); + return NULL; + } + + h = container_of0(hnode, struct lu_object_header, loh_hash); + if (likely(!lu_object_is_dying(h))) { + cfs_hash_get(s->ls_obj_hash, hnode); + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_HIT); + list_del_init(&h->loh_lru); + return lu_object_top(h); + } + + /* + * Lookup found an object being destroyed this object cannot be + * returned (to assure that references to dying objects are eventually + * drained), and moreover, lookup has to wait until object is freed. + */ + + init_waitqueue_entry_current(waiter); + add_wait_queue(&bkt->lsb_marche_funebre, waiter); + set_current_state(TASK_UNINTERRUPTIBLE); + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_DEATH_RACE); + return ERR_PTR(-EAGAIN); +} + +/** + * Search cache for an object with the fid \a f. If such object is found, + * return it. Otherwise, create new object, insert it into cache and return + * it. In any case, additional reference is acquired on the returned object. 
+ */ +struct lu_object *lu_object_find(const struct lu_env *env, + struct lu_device *dev, const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + return lu_object_find_at(env, dev->ld_site->ls_top_dev, f, conf); +} +EXPORT_SYMBOL(lu_object_find); + +static struct lu_object *lu_object_new(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *o; + cfs_hash_t *hs; + cfs_hash_bd_t bd; + struct lu_site_bkt_data *bkt; + + o = lu_object_alloc(env, dev, f, conf); + if (unlikely(IS_ERR(o))) + return o; + + hs = dev->ld_site->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); + bkt = cfs_hash_bd_extra_get(hs, &bd); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + bkt->lsb_busy++; + cfs_hash_bd_unlock(hs, &bd, 1); + return o; +} + +/** + * Core logic of lu_object_find*() functions. + */ +static struct lu_object *lu_object_find_try(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf, + wait_queue_t *waiter) +{ + struct lu_object *o; + struct lu_object *shadow; + struct lu_site *s; + cfs_hash_t *hs; + cfs_hash_bd_t bd; + __u64 version = 0; + + /* + * This uses standard index maintenance protocol: + * + * - search index under lock, and return object if found; + * - otherwise, unlock index, allocate new object; + * - lock index and search again; + * - if nothing is found (usual case), insert newly created + * object into index; + * - otherwise (race: other thread inserted object), free + * object just allocated. + * - unlock index; + * - return object. + * + * For "LOC_F_NEW" case, we are sure the object is new established. + * It is unnecessary to perform lookup-alloc-lookup-insert, instead, + * just alloc and insert directly. + * + * If dying object is found during index search, add @waiter to the + * site wait-queue and return ERR_PTR(-EAGAIN). 
+ */ + if (conf != NULL && conf->loc_flags & LOC_F_NEW) + return lu_object_new(env, dev, f, conf); + + s = dev->ld_site; + hs = s->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)f, &bd, 1); + o = htable_lookup(s, &bd, f, waiter, &version); + cfs_hash_bd_unlock(hs, &bd, 1); + if (o != NULL) + return o; + + /* + * Allocate new object. This may result in rather complicated + * operations, including fld queries, inode loading, etc. + */ + o = lu_object_alloc(env, dev, f, conf); + if (unlikely(IS_ERR(o))) + return o; + + LASSERT(lu_fid_eq(lu_object_fid(o), f)); + + cfs_hash_bd_lock(hs, &bd, 1); + + shadow = htable_lookup(s, &bd, f, waiter, &version); + if (likely(shadow == NULL)) { + struct lu_site_bkt_data *bkt; + + bkt = cfs_hash_bd_extra_get(hs, &bd); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + bkt->lsb_busy++; + cfs_hash_bd_unlock(hs, &bd, 1); + return o; + } + + lprocfs_counter_incr(s->ls_stats, LU_SS_CACHE_RACE); + cfs_hash_bd_unlock(hs, &bd, 1); + lu_object_free(env, o); + return shadow; +} + +/** + * Much like lu_object_find(), but top level device of object is specifically + * \a dev rather than top level device of the site. This interface allows + * objects of different "stacking" to be created within the same site. + */ +struct lu_object *lu_object_find_at(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_site_bkt_data *bkt; + struct lu_object *obj; + wait_queue_t wait; + + while (1) { + obj = lu_object_find_try(env, dev, f, conf, &wait); + if (obj != ERR_PTR(-EAGAIN)) + return obj; + /* + * lu_object_find_try() already added waiter into the + * wait queue. + */ + waitq_wait(&wait, TASK_UNINTERRUPTIBLE); + bkt = lu_site_bkt_from_fid(dev->ld_site, (void *)f); + remove_wait_queue(&bkt->lsb_marche_funebre, &wait); + } +} +EXPORT_SYMBOL(lu_object_find_at); + +/** + * Find object with given fid, and return its slice belonging to given device. 
+ */ +struct lu_object *lu_object_find_slice(const struct lu_env *env, + struct lu_device *dev, + const struct lu_fid *f, + const struct lu_object_conf *conf) +{ + struct lu_object *top; + struct lu_object *obj; + + top = lu_object_find(env, dev, f, conf); + if (!IS_ERR(top)) { + obj = lu_object_locate(top->lo_header, dev->ld_type); + if (obj == NULL) + lu_object_put(env, top); + } else + obj = top; + return obj; +} +EXPORT_SYMBOL(lu_object_find_slice); + +/** + * Global list of all device types. + */ +static LIST_HEAD(lu_device_types); + +int lu_device_type_init(struct lu_device_type *ldt) +{ + int result = 0; + + INIT_LIST_HEAD(&ldt->ldt_linkage); + if (ldt->ldt_ops->ldto_init) + result = ldt->ldt_ops->ldto_init(ldt); + if (result == 0) + list_add(&ldt->ldt_linkage, &lu_device_types); + return result; +} +EXPORT_SYMBOL(lu_device_type_init); + +void lu_device_type_fini(struct lu_device_type *ldt) +{ + list_del_init(&ldt->ldt_linkage); + if (ldt->ldt_ops->ldto_fini) + ldt->ldt_ops->ldto_fini(ldt); +} +EXPORT_SYMBOL(lu_device_type_fini); + +void lu_types_stop(void) +{ + struct lu_device_type *ldt; + + list_for_each_entry(ldt, &lu_device_types, ldt_linkage) { + if (ldt->ldt_device_nr == 0 && ldt->ldt_ops->ldto_stop) + ldt->ldt_ops->ldto_stop(ldt); + } +} +EXPORT_SYMBOL(lu_types_stop); + +/** + * Global list of all sites on this node + */ +static LIST_HEAD(lu_sites); +static DEFINE_MUTEX(lu_sites_guard); + +/** + * Global environment used by site shrinker. 
+ */ +static struct lu_env lu_shrink_env; + +struct lu_site_print_arg { + struct lu_env *lsp_env; + void *lsp_cookie; + lu_printer_t lsp_printer; +}; + +static int +lu_site_obj_print(cfs_hash_t *hs, cfs_hash_bd_t *bd, + struct hlist_node *hnode, void *data) +{ + struct lu_site_print_arg *arg = (struct lu_site_print_arg *)data; + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + if (!list_empty(&h->loh_layers)) { + const struct lu_object *o; + + o = lu_object_top(h); + lu_object_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, o); + } else { + lu_object_header_print(arg->lsp_env, arg->lsp_cookie, + arg->lsp_printer, h); + } + return 0; +} + +/** + * Print all objects in \a s. + */ +void lu_site_print(const struct lu_env *env, struct lu_site *s, void *cookie, + lu_printer_t printer) +{ + struct lu_site_print_arg arg = { + .lsp_env = (struct lu_env *)env, + .lsp_cookie = cookie, + .lsp_printer = printer, + }; + + cfs_hash_for_each(s->ls_obj_hash, lu_site_obj_print, &arg); +} +EXPORT_SYMBOL(lu_site_print); + +enum { + LU_CACHE_PERCENT_MAX = 50, + LU_CACHE_PERCENT_DEFAULT = 20 +}; + +static unsigned int lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; +CFS_MODULE_PARM(lu_cache_percent, "i", int, 0644, + "Percentage of memory to be used as lu_object cache"); + +/** + * Return desired hash table order. + */ +static int lu_htable_order(void) +{ + unsigned long cache_size; + int bits; + + /* + * Calculate hash table size, assuming that we want reasonable + * performance when 20% of total memory is occupied by cache of + * lu_objects. + * + * Size of lu_object is (arbitrary) taken as 1K (together with inode). + */ + cache_size = num_physpages; + +#if BITS_PER_LONG == 32 + /* limit hashtable size for lowmem systems to low RAM */ + if (cache_size > 1 << (30 - PAGE_CACHE_SHIFT)) + cache_size = 1 << (30 - PAGE_CACHE_SHIFT) * 3 / 4; +#endif + + /* clear off unreasonable cache setting. 
*/ + if (lu_cache_percent == 0 || lu_cache_percent > LU_CACHE_PERCENT_MAX) { + CWARN("obdclass: invalid lu_cache_percent: %u, it must be in" + " the range of (0, %u]. Will use default value: %u.\n", + lu_cache_percent, LU_CACHE_PERCENT_MAX, + LU_CACHE_PERCENT_DEFAULT); + + lu_cache_percent = LU_CACHE_PERCENT_DEFAULT; + } + cache_size = cache_size / 100 * lu_cache_percent * + (PAGE_CACHE_SIZE / 1024); + + for (bits = 1; (1 << bits) < cache_size; ++bits) { + ; + } + return bits; +} + +static unsigned lu_obj_hop_hash(cfs_hash_t *hs, + const void *key, unsigned mask) +{ + struct lu_fid *fid = (struct lu_fid *)key; + __u32 hash; + + hash = fid_flatten32(fid); + hash += (hash >> 4) + (hash << 12); /* mixing oid and seq */ + hash = cfs_hash_long(hash, hs->hs_bkt_bits); + + /* give me another random factor */ + hash -= cfs_hash_long((unsigned long)hs, fid_oid(fid) % 11 + 3); + + hash <<= hs->hs_cur_bits - hs->hs_bkt_bits; + hash |= (fid_seq(fid) + fid_oid(fid)) & (CFS_HASH_NBKT(hs) - 1); + + return hash & mask; +} + +static void *lu_obj_hop_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct lu_object_header, loh_hash); +} + +static void *lu_obj_hop_key(struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + return &h->loh_fid; +} + +static int lu_obj_hop_keycmp(const void *key, struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + return lu_fid_eq(&h->loh_fid, (struct lu_fid *)key); +} + +static void lu_obj_hop_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct lu_object_header *h; + + h = hlist_entry(hnode, struct lu_object_header, loh_hash); + if (atomic_add_return(1, &h->loh_ref) == 1) { + struct lu_site_bkt_data *bkt; + cfs_hash_bd_t bd; + + cfs_hash_bd_get(hs, &h->loh_fid, &bd); + bkt = cfs_hash_bd_extra_get(hs, &bd); + bkt->lsb_busy++; + } +} + +static void lu_obj_hop_put_locked(cfs_hash_t *hs, struct 
hlist_node *hnode) +{ + LBUG(); /* we should never called it */ +} + +cfs_hash_ops_t lu_site_hash_ops = { + .hs_hash = lu_obj_hop_hash, + .hs_key = lu_obj_hop_key, + .hs_keycmp = lu_obj_hop_keycmp, + .hs_object = lu_obj_hop_object, + .hs_get = lu_obj_hop_get, + .hs_put_locked = lu_obj_hop_put_locked, +}; + +void lu_dev_add_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + if (list_empty(&d->ld_linkage)) + list_add(&d->ld_linkage, &s->ls_ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_add_linkage); + +void lu_dev_del_linkage(struct lu_site *s, struct lu_device *d) +{ + spin_lock(&s->ls_ld_lock); + list_del_init(&d->ld_linkage); + spin_unlock(&s->ls_ld_lock); +} +EXPORT_SYMBOL(lu_dev_del_linkage); + +/** + * Initialize site \a s, with \a d as the top level device. + */ +#define LU_SITE_BITS_MIN 12 +#define LU_SITE_BITS_MAX 24 +/** + * total 256 buckets, we don't want too many buckets because: + * - consume too much memory + * - avoid unbalanced LRU list + */ +#define LU_SITE_BKT_BITS 8 + +int lu_site_init(struct lu_site *s, struct lu_device *top) +{ + struct lu_site_bkt_data *bkt; + cfs_hash_bd_t bd; + char name[16]; + int bits; + int i; + ENTRY; + + memset(s, 0, sizeof *s); + bits = lu_htable_order(); + snprintf(name, 16, "lu_site_%s", top->ld_type->ldt_name); + for (bits = min(max(LU_SITE_BITS_MIN, bits), LU_SITE_BITS_MAX); + bits >= LU_SITE_BITS_MIN; bits--) { + s->ls_obj_hash = cfs_hash_create(name, bits, bits, + bits - LU_SITE_BKT_BITS, + sizeof(*bkt), 0, 0, + &lu_site_hash_ops, + CFS_HASH_SPIN_BKTLOCK | + CFS_HASH_NO_ITEMREF | + CFS_HASH_DEPTH | + CFS_HASH_ASSERT_EMPTY); + if (s->ls_obj_hash != NULL) + break; + } + + if (s->ls_obj_hash == NULL) { + CERROR("failed to create lu_site hash with bits: %d\n", bits); + return -ENOMEM; + } + + cfs_hash_for_each_bucket(s->ls_obj_hash, &bd, i) { + bkt = cfs_hash_bd_extra_get(s->ls_obj_hash, &bd); + INIT_LIST_HEAD(&bkt->lsb_lru); + 
init_waitqueue_head(&bkt->lsb_marche_funebre); + } + + s->ls_stats = lprocfs_alloc_stats(LU_SS_LAST_STAT, 0); + if (s->ls_stats == NULL) { + cfs_hash_putref(s->ls_obj_hash); + s->ls_obj_hash = NULL; + return -ENOMEM; + } + + lprocfs_counter_init(s->ls_stats, LU_SS_CREATED, + 0, "created", "created"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT, + 0, "cache_hit", "cache_hit"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS, + 0, "cache_miss", "cache_miss"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE, + 0, "cache_race", "cache_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE, + 0, "cache_death_race", "cache_death_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED, + 0, "lru_purged", "lru_purged"); + + INIT_LIST_HEAD(&s->ls_linkage); + s->ls_top_dev = top; + top->ld_site = s; + lu_device_get(top); + lu_ref_add(&top->ld_reference, "site-top", s); + + INIT_LIST_HEAD(&s->ls_ld_linkage); + spin_lock_init(&s->ls_ld_lock); + + lu_dev_add_linkage(s, top); + + RETURN(0); +} +EXPORT_SYMBOL(lu_site_init); + +/** + * Finalize \a s and release its resources. + */ +void lu_site_fini(struct lu_site *s) +{ + mutex_lock(&lu_sites_guard); + list_del_init(&s->ls_linkage); + mutex_unlock(&lu_sites_guard); + + if (s->ls_obj_hash != NULL) { + cfs_hash_putref(s->ls_obj_hash); + s->ls_obj_hash = NULL; + } + + if (s->ls_top_dev != NULL) { + s->ls_top_dev->ld_site = NULL; + lu_ref_del(&s->ls_top_dev->ld_reference, "site-top", s); + lu_device_put(s->ls_top_dev); + s->ls_top_dev = NULL; + } + + if (s->ls_stats != NULL) + lprocfs_free_stats(&s->ls_stats); +} +EXPORT_SYMBOL(lu_site_fini); + +/** + * Called when initialization of stack for this site is completed. 
+ */ +int lu_site_init_finish(struct lu_site *s) +{ + int result; + mutex_lock(&lu_sites_guard); + result = lu_context_refill(&lu_shrink_env.le_ctx); + if (result == 0) + list_add(&s->ls_linkage, &lu_sites); + mutex_unlock(&lu_sites_guard); + return result; +} +EXPORT_SYMBOL(lu_site_init_finish); + +/** + * Acquire additional reference on device \a d + */ +void lu_device_get(struct lu_device *d) +{ + atomic_inc(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_get); + +/** + * Release reference on device \a d. + */ +void lu_device_put(struct lu_device *d) +{ + LASSERT(atomic_read(&d->ld_ref) > 0); + atomic_dec(&d->ld_ref); +} +EXPORT_SYMBOL(lu_device_put); + +/** + * Initialize device \a d of type \a t. + */ +int lu_device_init(struct lu_device *d, struct lu_device_type *t) +{ + if (t->ldt_device_nr++ == 0 && t->ldt_ops->ldto_start != NULL) + t->ldt_ops->ldto_start(t); + memset(d, 0, sizeof *d); + atomic_set(&d->ld_ref, 0); + d->ld_type = t; + lu_ref_init(&d->ld_reference); + INIT_LIST_HEAD(&d->ld_linkage); + return 0; +} +EXPORT_SYMBOL(lu_device_init); + +/** + * Finalize device \a d. + */ +void lu_device_fini(struct lu_device *d) +{ + struct lu_device_type *t; + + t = d->ld_type; + if (d->ld_obd != NULL) { + d->ld_obd->obd_lu_dev = NULL; + d->ld_obd = NULL; + } + + lu_ref_fini(&d->ld_reference); + LASSERTF(atomic_read(&d->ld_ref) == 0, + "Refcount is %u\n", atomic_read(&d->ld_ref)); + LASSERT(t->ldt_device_nr > 0); + if (--t->ldt_device_nr == 0 && t->ldt_ops->ldto_stop != NULL) + t->ldt_ops->ldto_stop(t); +} +EXPORT_SYMBOL(lu_device_fini); + +/** + * Initialize object \a o that is part of compound object \a h and was created + * by device \a d. 
+ */ +int lu_object_init(struct lu_object *o, + struct lu_object_header *h, struct lu_device *d) +{ + memset(o, 0, sizeof *o); + o->lo_header = h; + o->lo_dev = d; + lu_device_get(d); + o->lo_dev_ref = lu_ref_add(&d->ld_reference, "lu_object", o); + INIT_LIST_HEAD(&o->lo_linkage); + return 0; +} +EXPORT_SYMBOL(lu_object_init); + +/** + * Finalize object and release its resources. + */ +void lu_object_fini(struct lu_object *o) +{ + struct lu_device *dev = o->lo_dev; + + LASSERT(list_empty(&o->lo_linkage)); + + if (dev != NULL) { + lu_ref_del_at(&dev->ld_reference, + o->lo_dev_ref , "lu_object", o); + lu_device_put(dev); + o->lo_dev = NULL; + } +} +EXPORT_SYMBOL(lu_object_fini); + +/** + * Add object \a o as first layer of compound object \a h + * + * This is typically called by the ->ldo_object_alloc() method of top-level + * device. + */ +void lu_object_add_top(struct lu_object_header *h, struct lu_object *o) +{ + list_move(&o->lo_linkage, &h->loh_layers); +} +EXPORT_SYMBOL(lu_object_add_top); + +/** + * Add object \a o as a layer of compound object, going after \a before. + * + * This is typically called by the ->ldo_object_alloc() method of \a + * before->lo_dev. + */ +void lu_object_add(struct lu_object *before, struct lu_object *o) +{ + list_move(&o->lo_linkage, &before->lo_linkage); +} +EXPORT_SYMBOL(lu_object_add); + +/** + * Initialize compound object. + */ +int lu_object_header_init(struct lu_object_header *h) +{ + memset(h, 0, sizeof *h); + atomic_set(&h->loh_ref, 1); + INIT_HLIST_NODE(&h->loh_hash); + INIT_LIST_HEAD(&h->loh_lru); + INIT_LIST_HEAD(&h->loh_layers); + lu_ref_init(&h->loh_reference); + return 0; +} +EXPORT_SYMBOL(lu_object_header_init); + +/** + * Finalize compound object. 
+ */ +void lu_object_header_fini(struct lu_object_header *h) +{ + LASSERT(list_empty(&h->loh_layers)); + LASSERT(list_empty(&h->loh_lru)); + LASSERT(hlist_unhashed(&h->loh_hash)); + lu_ref_fini(&h->loh_reference); +} +EXPORT_SYMBOL(lu_object_header_fini); + +/** + * Given a compound object, find its slice, corresponding to the device type + * \a dtype. + */ +struct lu_object *lu_object_locate(struct lu_object_header *h, + const struct lu_device_type *dtype) +{ + struct lu_object *o; + + list_for_each_entry(o, &h->loh_layers, lo_linkage) { + if (o->lo_dev->ld_type == dtype) + return o; + } + return NULL; +} +EXPORT_SYMBOL(lu_object_locate); + + + +/** + * Finalize and free devices in the device stack. + * + * Finalize device stack by purging object cache, and calling + * lu_device_type_operations::ldto_device_fini() and + * lu_device_type_operations::ldto_device_free() on all devices in the stack. + */ +void lu_stack_fini(const struct lu_env *env, struct lu_device *top) +{ + struct lu_site *site = top->ld_site; + struct lu_device *scan; + struct lu_device *next; + + lu_site_purge(env, site, ~0); + for (scan = top; scan != NULL; scan = next) { + next = scan->ld_type->ldt_ops->ldto_device_fini(env, scan); + lu_ref_del(&scan->ld_reference, "lu-stack", &lu_site_init); + lu_device_put(scan); + } + + /* purge again. */ + lu_site_purge(env, site, ~0); + + for (scan = top; scan != NULL; scan = next) { + const struct lu_device_type *ldt = scan->ld_type; + struct obd_type *type; + + next = ldt->ldt_ops->ldto_device_free(env, scan); + type = ldt->ldt_obd_type; + if (type != NULL) { + type->typ_refcnt--; + class_put_type(type); + } + } +} +EXPORT_SYMBOL(lu_stack_fini); + +enum { + /** + * Maximal number of tld slots. + */ + LU_CONTEXT_KEY_NR = 40 +}; + +static struct lu_context_key *lu_keys[LU_CONTEXT_KEY_NR] = { NULL, }; + +static DEFINE_SPINLOCK(lu_keys_guard); + +/** + * Global counter incremented whenever key is registered, unregistered, + * revived or quiesced. 
This is used to void unnecessary calls to + * lu_context_refill(). No locking is provided, as initialization and shutdown + * are supposed to be externally serialized. + */ +static unsigned key_set_version = 0; + +/** + * Register new key. + */ +int lu_context_key_register(struct lu_context_key *key) +{ + int result; + int i; + + LASSERT(key->lct_init != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(key->lct_tags != 0); + LASSERT(key->lct_owner != NULL); + + result = -ENFILE; + spin_lock(&lu_keys_guard); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (lu_keys[i] == NULL) { + key->lct_index = i; + atomic_set(&key->lct_used, 1); + lu_keys[i] = key; + lu_ref_init(&key->lct_reference); + result = 0; + ++key_set_version; + break; + } + } + spin_unlock(&lu_keys_guard); + return result; +} +EXPORT_SYMBOL(lu_context_key_register); + +static void key_fini(struct lu_context *ctx, int index) +{ + if (ctx->lc_value != NULL && ctx->lc_value[index] != NULL) { + struct lu_context_key *key; + + key = lu_keys[index]; + LASSERT(key != NULL); + LASSERT(key->lct_fini != NULL); + LASSERT(atomic_read(&key->lct_used) > 1); + + key->lct_fini(ctx, key, ctx->lc_value[index]); + lu_ref_del(&key->lct_reference, "ctx", ctx); + atomic_dec(&key->lct_used); + + LASSERT(key->lct_owner != NULL); + if ((ctx->lc_tags & LCT_NOREF) == 0) { +#ifdef CONFIG_MODULE_UNLOAD + LINVRNT(module_refcount(key->lct_owner) > 0); +#endif + module_put(key->lct_owner); + } + ctx->lc_value[index] = NULL; + } +} + +/** + * Deregister key. 
+ */ +void lu_context_key_degister(struct lu_context_key *key) +{ + LASSERT(atomic_read(&key->lct_used) >= 1); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + + lu_context_key_quiesce(key); + + ++key_set_version; + spin_lock(&lu_keys_guard); + key_fini(&lu_shrink_env.le_ctx, key->lct_index); + if (lu_keys[key->lct_index]) { + lu_keys[key->lct_index] = NULL; + lu_ref_fini(&key->lct_reference); + } + spin_unlock(&lu_keys_guard); + + LASSERTF(atomic_read(&key->lct_used) == 1, + "key has instances: %d\n", + atomic_read(&key->lct_used)); +} +EXPORT_SYMBOL(lu_context_key_degister); + +/** + * Register a number of keys. This has to be called after all keys have been + * initialized by a call to LU_CONTEXT_KEY_INIT(). + */ +int lu_context_key_register_many(struct lu_context_key *k, ...) +{ + struct lu_context_key *key = k; + va_list args; + int result; + + va_start(args, k); + do { + result = lu_context_key_register(key); + if (result) + break; + key = va_arg(args, struct lu_context_key *); + } while (key != NULL); + va_end(args); + + if (result != 0) { + va_start(args, k); + while (k != key) { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key *); + } + va_end(args); + } + + return result; +} +EXPORT_SYMBOL(lu_context_key_register_many); + +/** + * De-register a number of keys. This is a dual to + * lu_context_key_register_many(). + */ +void lu_context_key_degister_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_degister(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_degister_many); + +/** + * Revive a number of keys. + */ +void lu_context_key_revive_many(struct lu_context_key *k, ...) 
+{ + va_list args; + + va_start(args, k); + do { + lu_context_key_revive(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_revive_many); + +/** + * Quiescent a number of keys. + */ +void lu_context_key_quiesce_many(struct lu_context_key *k, ...) +{ + va_list args; + + va_start(args, k); + do { + lu_context_key_quiesce(k); + k = va_arg(args, struct lu_context_key*); + } while (k != NULL); + va_end(args); +} +EXPORT_SYMBOL(lu_context_key_quiesce_many); + +/** + * Return value associated with key \a key in context \a ctx. + */ +void *lu_context_key_get(const struct lu_context *ctx, + const struct lu_context_key *key) +{ + LINVRNT(ctx->lc_state == LCS_ENTERED); + LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + LASSERT(lu_keys[key->lct_index] == key); + return ctx->lc_value[key->lct_index]; +} +EXPORT_SYMBOL(lu_context_key_get); + +/** + * List of remembered contexts. XXX document me. + */ +static LIST_HEAD(lu_context_remembered); + +/** + * Destroy \a key in all remembered contexts. This is used to destroy key + * values in "shared" contexts (like service threads), when a module owning + * the key is about to be unloaded. + */ +void lu_context_key_quiesce(struct lu_context_key *key) +{ + struct lu_context *ctx; + + if (!(key->lct_tags & LCT_QUIESCENT)) { + /* + * XXX layering violation. + */ + key->lct_tags |= LCT_QUIESCENT; + /* + * XXX memory barrier has to go here. 
+ */ + spin_lock(&lu_keys_guard); + list_for_each_entry(ctx, &lu_context_remembered, + lc_remember) + key_fini(ctx, key->lct_index); + spin_unlock(&lu_keys_guard); + ++key_set_version; + } +} +EXPORT_SYMBOL(lu_context_key_quiesce); + +void lu_context_key_revive(struct lu_context_key *key) +{ + key->lct_tags &= ~LCT_QUIESCENT; + ++key_set_version; +} +EXPORT_SYMBOL(lu_context_key_revive); + +static void keys_fini(struct lu_context *ctx) +{ + int i; + + if (ctx->lc_value == NULL) + return; + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) + key_fini(ctx, i); + + OBD_FREE(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]); + ctx->lc_value = NULL; +} + +static int keys_fill(struct lu_context *ctx) +{ + int i; + + LINVRNT(ctx->lc_value != NULL); + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (ctx->lc_value[i] == NULL && key != NULL && + (key->lct_tags & ctx->lc_tags) && + /* + * Don't create values for a LCT_QUIESCENT key, as this + * will pin module owning a key. + */ + !(key->lct_tags & LCT_QUIESCENT)) { + void *value; + + LINVRNT(key->lct_init != NULL); + LINVRNT(key->lct_index == i); + + value = key->lct_init(ctx, key); + if (unlikely(IS_ERR(value))) + return PTR_ERR(value); + + LASSERT(key->lct_owner != NULL); + if (!(ctx->lc_tags & LCT_NOREF)) + try_module_get(key->lct_owner); + lu_ref_add_atomic(&key->lct_reference, "ctx", ctx); + atomic_inc(&key->lct_used); + /* + * This is the only place in the code, where an + * element of ctx->lc_value[] array is set to non-NULL + * value. + */ + ctx->lc_value[i] = value; + if (key->lct_exit != NULL) + ctx->lc_tags |= LCT_HAS_EXIT; + } + ctx->lc_version = key_set_version; + } + return 0; +} + +static int keys_init(struct lu_context *ctx) +{ + OBD_ALLOC(ctx->lc_value, ARRAY_SIZE(lu_keys) * sizeof ctx->lc_value[0]); + if (likely(ctx->lc_value != NULL)) + return keys_fill(ctx); + + return -ENOMEM; +} + +/** + * Initialize context data-structure. 
Create values for all keys. + */ +int lu_context_init(struct lu_context *ctx, __u32 tags) +{ + int rc; + + memset(ctx, 0, sizeof *ctx); + ctx->lc_state = LCS_INITIALIZED; + ctx->lc_tags = tags; + if (tags & LCT_REMEMBER) { + spin_lock(&lu_keys_guard); + list_add(&ctx->lc_remember, &lu_context_remembered); + spin_unlock(&lu_keys_guard); + } else { + INIT_LIST_HEAD(&ctx->lc_remember); + } + + rc = keys_init(ctx); + if (rc != 0) + lu_context_fini(ctx); + + return rc; +} +EXPORT_SYMBOL(lu_context_init); + +/** + * Finalize context data-structure. Destroy key values. + */ +void lu_context_fini(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_FINALIZED; + + if ((ctx->lc_tags & LCT_REMEMBER) == 0) { + LASSERT(list_empty(&ctx->lc_remember)); + keys_fini(ctx); + + } else { /* could race with key degister */ + spin_lock(&lu_keys_guard); + keys_fini(ctx); + list_del_init(&ctx->lc_remember); + spin_unlock(&lu_keys_guard); + } +} +EXPORT_SYMBOL(lu_context_fini); + +/** + * Called before entering context. + */ +void lu_context_enter(struct lu_context *ctx) +{ + LINVRNT(ctx->lc_state == LCS_INITIALIZED || ctx->lc_state == LCS_LEFT); + ctx->lc_state = LCS_ENTERED; +} +EXPORT_SYMBOL(lu_context_enter); + +/** + * Called after exiting from \a ctx + */ +void lu_context_exit(struct lu_context *ctx) +{ + int i; + + LINVRNT(ctx->lc_state == LCS_ENTERED); + ctx->lc_state = LCS_LEFT; + if (ctx->lc_tags & LCT_HAS_EXIT && ctx->lc_value != NULL) { + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + if (ctx->lc_value[i] != NULL) { + struct lu_context_key *key; + + key = lu_keys[i]; + LASSERT(key != NULL); + if (key->lct_exit != NULL) + key->lct_exit(ctx, + key, ctx->lc_value[i]); + } + } + } +} +EXPORT_SYMBOL(lu_context_exit); + +/** + * Allocate for context all missing keys that were registered after context + * creation. key_set_version is only changed in rare cases when modules + * are loaded and removed. 
+ */ +int lu_context_refill(struct lu_context *ctx) +{ + return likely(ctx->lc_version == key_set_version) ? 0 : keys_fill(ctx); +} +EXPORT_SYMBOL(lu_context_refill); + +/** + * lu_ctx_tags/lu_ses_tags will be updated if there are new types of + * obd being added. Currently, this is only used on client side, specifically + * for echo device client, for other stack (like ptlrpc threads), context are + * predefined when the lu_device type are registered, during the module probe + * phase. + */ +__u32 lu_context_tags_default = 0; +__u32 lu_session_tags_default = 0; + +void lu_context_tags_update(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_context_tags_default |= tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_context_tags_update); + +void lu_context_tags_clear(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_context_tags_default &= ~tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_context_tags_clear); + +void lu_session_tags_update(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_session_tags_default |= tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_session_tags_update); + +void lu_session_tags_clear(__u32 tags) +{ + spin_lock(&lu_keys_guard); + lu_session_tags_default &= ~tags; + key_set_version++; + spin_unlock(&lu_keys_guard); +} +EXPORT_SYMBOL(lu_session_tags_clear); + +int lu_env_init(struct lu_env *env, __u32 tags) +{ + int result; + + env->le_ses = NULL; + result = lu_context_init(&env->le_ctx, tags); + if (likely(result == 0)) + lu_context_enter(&env->le_ctx); + return result; +} +EXPORT_SYMBOL(lu_env_init); + +void lu_env_fini(struct lu_env *env) +{ + lu_context_exit(&env->le_ctx); + lu_context_fini(&env->le_ctx); + env->le_ses = NULL; +} +EXPORT_SYMBOL(lu_env_fini); + +int lu_env_refill(struct lu_env *env) +{ + int result; + + result = lu_context_refill(&env->le_ctx); + if (result == 0 && env->le_ses != NULL) + result = lu_context_refill(env->le_ses); + return 
result; +} +EXPORT_SYMBOL(lu_env_refill); + +/** + * Currently, this API will only be used by echo client. + * Because echo client and normal lustre client will share + * same cl_env cache. So echo client needs to refresh + * the env context after it get one from the cache, especially + * when normal client and echo client co-exist in the same client. + */ +int lu_env_refill_by_tags(struct lu_env *env, __u32 ctags, + __u32 stags) +{ + int result; + + if ((env->le_ctx.lc_tags & ctags) != ctags) { + env->le_ctx.lc_version = 0; + env->le_ctx.lc_tags |= ctags; + } + + if (env->le_ses && (env->le_ses->lc_tags & stags) != stags) { + env->le_ses->lc_version = 0; + env->le_ses->lc_tags |= stags; + } + + result = lu_env_refill(env); + + return result; +} +EXPORT_SYMBOL(lu_env_refill_by_tags); + +static struct shrinker *lu_site_shrinker = NULL; + +typedef struct lu_site_stats{ + unsigned lss_populated; + unsigned lss_max_search; + unsigned lss_total; + unsigned lss_busy; +} lu_site_stats_t; + +static void lu_site_stats_get(cfs_hash_t *hs, + lu_site_stats_t *stats, int populated) +{ + cfs_hash_bd_t bd; + int i; + + cfs_hash_for_each_bucket(hs, &bd, i) { + struct lu_site_bkt_data *bkt = cfs_hash_bd_extra_get(hs, &bd); + struct hlist_head *hhead; + + cfs_hash_bd_lock(hs, &bd, 1); + stats->lss_busy += bkt->lsb_busy; + stats->lss_total += cfs_hash_bd_count_get(&bd); + stats->lss_max_search = max((int)stats->lss_max_search, + cfs_hash_bd_depmax_get(&bd)); + if (!populated) { + cfs_hash_bd_unlock(hs, &bd, 1); + continue; + } + + cfs_hash_bd_for_each_hlist(hs, &bd, hhead) { + if (!hlist_empty(hhead)) + stats->lss_populated++; + } + cfs_hash_bd_unlock(hs, &bd, 1); + } +} + + +/* + * There exists a potential lock inversion deadlock scenario when using + * Lustre on top of ZFS. This occurs between one of ZFS's + * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. 
Essentially, + * thread A will take the lu_sites_guard lock and sleep on the ht_lock, + * while thread B will take the ht_lock and sleep on the lu_sites_guard + * lock. Obviously neither thread will wake and drop their respective hold + * on their lock. + * + * To prevent this from happening we must ensure the lu_sites_guard lock is + * not taken while down this code path. ZFS reliably does not set the + * __GFP_FS bit in its code paths, so this can be used to determine if it + * is safe to take the lu_sites_guard lock. + * + * Ideally we should accurately return the remaining number of cached + * objects without taking the lu_sites_guard lock, but this is not + * possible in the current implementation. + */ +static int lu_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask)) +{ + lu_site_stats_t stats; + struct lu_site *s; + struct lu_site *tmp; + int cached = 0; + int remain = shrink_param(sc, nr_to_scan); + LIST_HEAD(splice); + + if (!(shrink_param(sc, gfp_mask) & __GFP_FS)) { + if (remain != 0) + return -1; + else + /* We must not take the lu_sites_guard lock when + * __GFP_FS is *not* set because of the deadlock + * possibility detailed above. Additionally, + * since we cannot determine the number of + * objects in the cache without taking this + * lock, we're in a particularly tough spot. As + * a result, we'll just lie and say our cache is + * empty. This _should_ be ok, as we can't + * reclaim objects when __GFP_FS is *not* set + * anyways. + */ + return 0; + } + + CDEBUG(D_INODE, "Shrink %d objects\n", remain); + + mutex_lock(&lu_sites_guard); + list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) { + if (shrink_param(sc, nr_to_scan) != 0) { + remain = lu_site_purge(&lu_shrink_env, s, remain); + /* + * Move just shrunk site to the tail of site list to + * assure shrinking fairness. 
+ */ + list_move_tail(&s->ls_linkage, &splice); + } + + memset(&stats, 0, sizeof(stats)); + lu_site_stats_get(s->ls_obj_hash, &stats, 0); + cached += stats.lss_total - stats.lss_busy; + if (shrink_param(sc, nr_to_scan) && remain <= 0) + break; + } + list_splice(&splice, lu_sites.prev); + mutex_unlock(&lu_sites_guard); + + cached = (cached / 100) * sysctl_vfs_cache_pressure; + if (shrink_param(sc, nr_to_scan) == 0) + CDEBUG(D_INODE, "%d objects cached\n", cached); + return cached; +} + +/* + * Debugging stuff. + */ + +/** + * Environment to be used in debugger, contains all tags. + */ +struct lu_env lu_debugging_env; + +/** + * Debugging printer function using printk(). + */ +int lu_printk_printer(const struct lu_env *env, + void *unused, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vprintk(format, args); + va_end(args); + return 0; +} + +/** + * Initialization of global lu_* data. + */ +int lu_global_init(void) +{ + int result; + + CDEBUG(D_INFO, "Lustre LU module (%p).\n", &lu_keys); + + result = lu_ref_global_init(); + if (result != 0) + return result; + + LU_CONTEXT_KEY_INIT(&lu_global_key); + result = lu_context_key_register(&lu_global_key); + if (result != 0) + return result; + + /* + * At this level, we don't know what tags are needed, so allocate them + * conservatively. This should not be too bad, because this + * environment is global. + */ + mutex_lock(&lu_sites_guard); + result = lu_env_init(&lu_shrink_env, LCT_SHRINKER); + mutex_unlock(&lu_sites_guard); + if (result != 0) + return result; + + /* + * seeks estimation: 3 seeks to read a record from oi, one to read + * inode, one for ea. Unfortunately setting this high value results in + * lu_object/inode cache consuming all the memory. + */ + lu_site_shrinker = set_shrinker(DEFAULT_SEEKS, lu_cache_shrink); + if (lu_site_shrinker == NULL) + return -ENOMEM; + + return result; +} + +/** + * Dual to lu_global_init(). 
+ */ +void lu_global_fini(void) +{ + if (lu_site_shrinker != NULL) { + remove_shrinker(lu_site_shrinker); + lu_site_shrinker = NULL; + } + + lu_context_key_degister(&lu_global_key); + + /* + * Tear shrinker environment down _after_ de-registering + * lu_global_key, because the latter has a value in the former. + */ + mutex_lock(&lu_sites_guard); + lu_env_fini(&lu_shrink_env); + mutex_unlock(&lu_sites_guard); + + lu_ref_global_fini(); +} + +static __u32 ls_stats_read(struct lprocfs_stats *stats, int idx) +{ +#ifdef LPROCFS + struct lprocfs_counter ret; + + lprocfs_stats_collect(stats, idx, &ret); + return (__u32)ret.lc_count; +#else + return 0; +#endif +} + +/** + * Output site statistical counters into a buffer. Suitable for + * lprocfs_rd_*()-style functions. + */ +int lu_site_stats_print(const struct lu_site *s, struct seq_file *m) +{ + lu_site_stats_t stats; + + memset(&stats, 0, sizeof(stats)); + lu_site_stats_get(s->ls_obj_hash, &stats, 1); + + return seq_printf(m, "%d/%d %d/%d %d %d %d %d %d %d %d\n", + stats.lss_busy, + stats.lss_total, + stats.lss_populated, + CFS_HASH_NHLIST(s->ls_obj_hash), + stats.lss_max_search, + ls_stats_read(s->ls_stats, LU_SS_CREATED), + ls_stats_read(s->ls_stats, LU_SS_CACHE_HIT), + ls_stats_read(s->ls_stats, LU_SS_CACHE_MISS), + ls_stats_read(s->ls_stats, LU_SS_CACHE_RACE), + ls_stats_read(s->ls_stats, LU_SS_CACHE_DEATH_RACE), + ls_stats_read(s->ls_stats, LU_SS_LRU_PURGED)); +} +EXPORT_SYMBOL(lu_site_stats_print); + +/** + * Helper function to initialize a number of kmem slab caches at once. 
+ */ +int lu_kmem_init(struct lu_kmem_descr *caches) +{ + int result; + struct lu_kmem_descr *iter = caches; + + for (result = 0; iter->ckd_cache != NULL; ++iter) { + *iter->ckd_cache = kmem_cache_create(iter->ckd_name, + iter->ckd_size, + 0, 0, NULL); + if (*iter->ckd_cache == NULL) { + result = -ENOMEM; + /* free all previously allocated caches */ + lu_kmem_fini(caches); + break; + } + } + return result; +} +EXPORT_SYMBOL(lu_kmem_init); + +/** + * Helper function to finalize a number of kmem slab cached at once. Dual to + * lu_kmem_init(). + */ +void lu_kmem_fini(struct lu_kmem_descr *caches) +{ + for (; caches->ckd_cache != NULL; ++caches) { + if (*caches->ckd_cache != NULL) { + kmem_cache_destroy(*caches->ckd_cache); + *caches->ckd_cache = NULL; + } + } +} +EXPORT_SYMBOL(lu_kmem_fini); + +/** + * Temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +void lu_object_assign_fid(const struct lu_env *env, struct lu_object *o, + const struct lu_fid *fid) +{ + struct lu_site *s = o->lo_dev->ld_site; + struct lu_fid *old = &o->lo_header->loh_fid; + struct lu_site_bkt_data *bkt; + struct lu_object *shadow; + wait_queue_t waiter; + cfs_hash_t *hs; + cfs_hash_bd_t bd; + __u64 version = 0; + + LASSERT(fid_is_zero(old)); + + hs = s->ls_obj_hash; + cfs_hash_bd_get_and_lock(hs, (void *)fid, &bd, 1); + shadow = htable_lookup(s, &bd, fid, &waiter, &version); + /* supposed to be unique */ + LASSERT(shadow == NULL); + *old = *fid; + bkt = cfs_hash_bd_extra_get(hs, &bd); + cfs_hash_bd_add_locked(hs, &bd, &o->lo_header->loh_hash); + bkt->lsb_busy++; + cfs_hash_bd_unlock(hs, &bd, 1); +} +EXPORT_SYMBOL(lu_object_assign_fid); + +/** + * allocates object with 0 (non-assiged) fid + * XXX: temporary solution to be able to assign fid in ->do_create() + * till we have fully-functional OST fids + */ +struct lu_object *lu_object_anon(const struct lu_env *env, + struct lu_device *dev, + const struct lu_object_conf *conf) +{ + struct 
lu_fid fid; + struct lu_object *o; + + fid_zero(&fid); + o = lu_object_alloc(env, dev, &fid, conf); + + return o; +} +EXPORT_SYMBOL(lu_object_anon); + +struct lu_buf LU_BUF_NULL = { + .lb_buf = NULL, + .lb_len = 0 +}; +EXPORT_SYMBOL(LU_BUF_NULL); + +void lu_buf_free(struct lu_buf *buf) +{ + LASSERT(buf); + if (buf->lb_buf) { + LASSERT(buf->lb_len > 0); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + buf->lb_buf = NULL; + buf->lb_len = 0; + } +} +EXPORT_SYMBOL(lu_buf_free); + +void lu_buf_alloc(struct lu_buf *buf, int size) +{ + LASSERT(buf); + LASSERT(buf->lb_buf == NULL); + LASSERT(buf->lb_len == 0); + OBD_ALLOC_LARGE(buf->lb_buf, size); + if (likely(buf->lb_buf)) + buf->lb_len = size; +} +EXPORT_SYMBOL(lu_buf_alloc); + +void lu_buf_realloc(struct lu_buf *buf, int size) +{ + lu_buf_free(buf); + lu_buf_alloc(buf, size); +} +EXPORT_SYMBOL(lu_buf_realloc); + +struct lu_buf *lu_buf_check_and_alloc(struct lu_buf *buf, int len) +{ + if (buf->lb_buf == NULL && buf->lb_len == 0) + lu_buf_alloc(buf, len); + + if ((len > buf->lb_len) && (buf->lb_buf != NULL)) + lu_buf_realloc(buf, len); + + return buf; +} +EXPORT_SYMBOL(lu_buf_check_and_alloc); + +/** + * Increase the size of the \a buf. 
+ * preserves old data in buffer + * old buffer remains unchanged on error + * \retval 0 or -ENOMEM + */ +int lu_buf_check_and_grow(struct lu_buf *buf, int len) +{ + char *ptr; + + if (len <= buf->lb_len) + return 0; + + OBD_ALLOC_LARGE(ptr, len); + if (ptr == NULL) + return -ENOMEM; + + /* Free the old buf */ + if (buf->lb_buf != NULL) { + memcpy(ptr, buf->lb_buf, buf->lb_len); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + } + + buf->lb_buf = ptr; + buf->lb_len = len; + return 0; +} +EXPORT_SYMBOL(lu_buf_check_and_grow); diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ref.c b/drivers/staging/lustre/lustre/obdclass/lu_ref.c new file mode 100644 index 000000000000..23a76f158356 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lu_ref.c @@ -0,0 +1,50 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2012, Intel Corporation. 
+ */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/lu_ref.c + * + * Lustre reference. + * + * Author: Nikita Danilov <nikita.danilov@sun.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +# include <linux/libcfs/libcfs.h> + +#include <obd.h> +#include <obd_class.h> +#include <obd_support.h> +#include <lu_ref.h> diff --git a/drivers/staging/lustre/lustre/obdclass/lu_ucred.c b/drivers/staging/lustre/lustre/obdclass/lu_ucred.c new file mode 100644 index 000000000000..229db6c39b78 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lu_ucred.c @@ -0,0 +1,107 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/lu_object.c + * + * Lustre Object. + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + * + * Author: Nikita Danilov <nikita.danilov@sun.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <linux/libcfs/libcfs.h> +#include <obd_support.h> +#include <lu_object.h> +#include <md_object.h> + +/* context key constructor/destructor: lu_ucred_key_init, lu_ucred_key_fini */ +LU_KEY_INIT_FINI(lu_ucred, struct lu_ucred); + +static struct lu_context_key lu_ucred_key = { + .lct_tags = LCT_SESSION, + .lct_init = lu_ucred_key_init, + .lct_fini = lu_ucred_key_fini +}; + +/** + * Get ucred key if session exists and ucred key is allocated on it. + * Return NULL otherwise. + */ +struct lu_ucred *lu_ucred(const struct lu_env *env) +{ + if (!env->le_ses) + return NULL; + return lu_context_key_get(env->le_ses, &lu_ucred_key); +} +EXPORT_SYMBOL(lu_ucred); + +/** + * Get ucred key and check if it is properly initialized. + * Return NULL otherwise. + */ +struct lu_ucred *lu_ucred_check(const struct lu_env *env) +{ + struct lu_ucred *uc = lu_ucred(env); + if (uc && uc->uc_valid != UCRED_OLD && uc->uc_valid != UCRED_NEW) + return NULL; + return uc; +} +EXPORT_SYMBOL(lu_ucred_check); + +/** + * Get ucred key, which must exist and must be properly initialized. + * Assert otherwise. 
+ */ +struct lu_ucred *lu_ucred_assert(const struct lu_env *env) +{ + struct lu_ucred *uc = lu_ucred_check(env); + LASSERT(uc != NULL); + return uc; +} +EXPORT_SYMBOL(lu_ucred_assert); + +int lu_ucred_global_init(void) +{ + LU_CONTEXT_KEY_INIT(&lu_ucred_key); + return lu_context_key_register(&lu_ucred_key); +} + +void lu_ucred_global_fini(void) +{ + lu_context_key_degister(&lu_ucred_key); +} diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_handles.c b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c new file mode 100644 index 000000000000..69d6499ef731 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lustre_handles.c @@ -0,0 +1,263 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/lustre_handles.c + * + * Author: Phil Schwan <phil@clusterfs.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <obd_support.h> +#include <lustre_handles.h> +#include <lustre_lib.h> + + +static __u64 handle_base; +#define HANDLE_INCR 7 +static spinlock_t handle_base_lock; + +static struct handle_bucket { + spinlock_t lock; + struct list_head head; +} *handle_hash; + +#define HANDLE_HASH_SIZE (1 << 16) +#define HANDLE_HASH_MASK (HANDLE_HASH_SIZE - 1) + +/* + * Generate a unique 64bit cookie (hash) for a handle and insert it into + * global (per-node) hash-table. + */ +void class_handle_hash(struct portals_handle *h, + struct portals_handle_ops *ops) +{ + struct handle_bucket *bucket; + ENTRY; + + LASSERT(h != NULL); + LASSERT(list_empty(&h->h_link)); + + /* + * This is fast, but simplistic cookie generation algorithm, it will + * need a re-do at some point in the future for security. + */ + spin_lock(&handle_base_lock); + handle_base += HANDLE_INCR; + + if (unlikely(handle_base == 0)) { + /* + * Cookie of zero is "dangerous", because in many places it's + * assumed that 0 means "unassigned" handle, not bound to any + * object. 
+ */ + CWARN("The universe has been exhausted: cookie wrap-around.\n"); + handle_base += HANDLE_INCR; + } + h->h_cookie = handle_base; + spin_unlock(&handle_base_lock); + + h->h_ops = ops; + spin_lock_init(&h->h_lock); + + bucket = &handle_hash[h->h_cookie & HANDLE_HASH_MASK]; + spin_lock(&bucket->lock); + list_add_rcu(&h->h_link, &bucket->head); + h->h_in = 1; + spin_unlock(&bucket->lock); + + CDEBUG(D_INFO, "added object %p with handle "LPX64" to hash\n", + h, h->h_cookie); + EXIT; +} +EXPORT_SYMBOL(class_handle_hash); + +static void class_handle_unhash_nolock(struct portals_handle *h) +{ + if (list_empty(&h->h_link)) { + CERROR("removing an already-removed handle ("LPX64")\n", + h->h_cookie); + return; + } + + CDEBUG(D_INFO, "removing object %p with handle "LPX64" from hash\n", + h, h->h_cookie); + + spin_lock(&h->h_lock); + if (h->h_in == 0) { + spin_unlock(&h->h_lock); + return; + } + h->h_in = 0; + spin_unlock(&h->h_lock); + list_del_rcu(&h->h_link); +} + +void class_handle_unhash(struct portals_handle *h) +{ + struct handle_bucket *bucket; + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + class_handle_unhash_nolock(h); + spin_unlock(&bucket->lock); +} +EXPORT_SYMBOL(class_handle_unhash); + +void class_handle_hash_back(struct portals_handle *h) +{ + struct handle_bucket *bucket; + ENTRY; + + bucket = handle_hash + (h->h_cookie & HANDLE_HASH_MASK); + + spin_lock(&bucket->lock); + list_add_rcu(&h->h_link, &bucket->head); + h->h_in = 1; + spin_unlock(&bucket->lock); + + EXIT; +} +EXPORT_SYMBOL(class_handle_hash_back); + +void *class_handle2object(__u64 cookie) +{ + struct handle_bucket *bucket; + struct portals_handle *h; + void *retval = NULL; + ENTRY; + + LASSERT(handle_hash != NULL); + + /* Be careful when you want to change this code. See the + * rcu_read_lock() definition on top this file. 
- jxiong */ + bucket = handle_hash + (cookie & HANDLE_HASH_MASK); + + rcu_read_lock(); + list_for_each_entry_rcu(h, &bucket->head, h_link) { + if (h->h_cookie != cookie) + continue; + + spin_lock(&h->h_lock); + if (likely(h->h_in != 0)) { + h->h_ops->hop_addref(h); + retval = h; + } + spin_unlock(&h->h_lock); + break; + } + rcu_read_unlock(); + + RETURN(retval); +} +EXPORT_SYMBOL(class_handle2object); + +void class_handle_free_cb(cfs_rcu_head_t *rcu) +{ + struct portals_handle *h = RCU2HANDLE(rcu); + void *ptr = (void *)(unsigned long)h->h_cookie; + + if (h->h_ops->hop_free != NULL) + h->h_ops->hop_free(ptr, h->h_size); + else + OBD_FREE(ptr, h->h_size); +} +EXPORT_SYMBOL(class_handle_free_cb); + +int class_handle_init(void) +{ + struct handle_bucket *bucket; + struct timeval tv; + int seed[2]; + + LASSERT(handle_hash == NULL); + + OBD_ALLOC_LARGE(handle_hash, sizeof(*bucket) * HANDLE_HASH_SIZE); + if (handle_hash == NULL) + return -ENOMEM; + + spin_lock_init(&handle_base_lock); + for (bucket = handle_hash + HANDLE_HASH_SIZE - 1; bucket >= handle_hash; + bucket--) { + INIT_LIST_HEAD(&bucket->head); + spin_lock_init(&bucket->lock); + } + + /** bug 21430: add randomness to the initial base */ + cfs_get_random_bytes(seed, sizeof(seed)); + do_gettimeofday(&tv); + cfs_srand(tv.tv_sec ^ seed[0], tv.tv_usec ^ seed[1]); + + cfs_get_random_bytes(&handle_base, sizeof(handle_base)); + LASSERT(handle_base != 0ULL); + + return 0; +} + +static int cleanup_all_handles(void) +{ + int rc; + int i; + + for (rc = i = 0; i < HANDLE_HASH_SIZE; i++) { + struct portals_handle *h; + + spin_lock(&handle_hash[i].lock); + list_for_each_entry_rcu(h, &(handle_hash[i].head), h_link) { + CERROR("force clean handle "LPX64" addr %p ops %p\n", + h->h_cookie, h, h->h_ops); + + class_handle_unhash_nolock(h); + rc++; + } + spin_unlock(&handle_hash[i].lock); + } + + return rc; +} + +void class_handle_cleanup(void) +{ + int count; + LASSERT(handle_hash != NULL); + + count = cleanup_all_handles(); + + 
OBD_FREE_LARGE(handle_hash, sizeof(*handle_hash) * HANDLE_HASH_SIZE); + handle_hash = NULL; + + if (count != 0) + CERROR("handle_count at cleanup: %d\n", count); +} diff --git a/drivers/staging/lustre/lustre/obdclass/lustre_peer.c b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c new file mode 100644 index 000000000000..2fa2589dc8eb --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/lustre_peer.c @@ -0,0 +1,218 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */
+
+#define DEBUG_SUBSYSTEM S_RPC
+
+#include <obd.h>
+#include <obd_support.h>
+#include <obd_class.h>
+#include <lustre_lib.h>
+#include <lustre_ha.h>
+#include <lustre_net.h>
+#include <lprocfs_status.h>
+
+/* Upper bound on the number of NIDs remembered per UUID; adding more
+ * than this trips an LASSERT in class_add_uuid(). */
+#define NIDS_MAX	32
+
+/* One entry in the global uuid->nid translation list: maps a single
+ * obd_uuid to up to NIDS_MAX LNET NIDs. */
+struct uuid_nid_data {
+	struct list_head	un_list;		/* linkage on g_uuid_list */
+	struct obd_uuid		un_uuid;		/* the UUID being mapped */
+	int			un_nid_count;		/* number of valid slots in un_nids[] */
+	lnet_nid_t		un_nids[NIDS_MAX];	/* NIDs registered for this UUID */
+};
+
+/* FIXME: This should probably become more elegant than a global linked list */
+static struct list_head	g_uuid_list;	/* all uuid_nid_data entries */
+static spinlock_t	g_uuid_lock;	/* protects g_uuid_list and entry contents */
+
+/* One-time initialization of the global uuid->nid list and its lock. */
+void class_init_uuidlist(void)
+{
+	INIT_LIST_HEAD(&g_uuid_list);
+	spin_lock_init(&g_uuid_lock);
+}
+
+/* Tear down the global uuid->nid list by deleting every entry. */
+void class_exit_uuidlist(void)
+{
+	/* delete all */
+	class_del_uuid(NULL);
+}
+
+/* Look up the \a index-th NID registered for \a uuid.
+ *
+ * \retval 0 and *peer_nid set on success
+ * \retval -ENOENT if the uuid is unknown or \a index is past the last
+ *	   registered NID for it
+ */
+int lustre_uuid_to_peer(const char *uuid, lnet_nid_t *peer_nid, int index)
+{
+	struct uuid_nid_data *data;
+	struct obd_uuid tmp;
+	int rc = -ENOENT;
+
+	obd_str2uuid(&tmp, uuid);
+	spin_lock(&g_uuid_lock);
+	list_for_each_entry(data, &g_uuid_list, un_list) {
+		if (obd_uuid_equals(&data->un_uuid, &tmp)) {
+			/* uuid found; fail with -ENOENT if index is
+			 * beyond the registered NID count */
+			if (index >= data->un_nid_count)
+				break;
+
+			rc = 0;
+			*peer_nid = data->un_nids[index];
+			break;
+		}
+	}
+	spin_unlock(&g_uuid_lock);
+	return rc;
+}
+EXPORT_SYMBOL(lustre_uuid_to_peer);
+
+/* Add a nid to a niduuid. Multiple nids can be added to a single uuid;
+   LNET will choose the best one.
 */
+int class_add_uuid(const char *uuid, __u64 nid)
+{
+	struct uuid_nid_data *data, *entry;
+	int found = 0;
+
+	LASSERT(nid != 0);  /* valid newconfig NID is never zero */
+
+	if (strlen(uuid) > UUID_MAX - 1)
+		return -EOVERFLOW;
+
+	/* Allocate the candidate entry before taking the spinlock, so no
+	 * allocation happens under the lock; it is freed below if the
+	 * uuid turns out to exist already. */
+	OBD_ALLOC_PTR(data);
+	if (data == NULL)
+		return -ENOMEM;
+
+	obd_str2uuid(&data->un_uuid, uuid);
+	data->un_nids[0] = nid;
+	data->un_nid_count = 1;
+
+	spin_lock(&g_uuid_lock);
+	list_for_each_entry(entry, &g_uuid_list, un_list) {
+		if (obd_uuid_equals(&entry->un_uuid, &data->un_uuid)) {
+			int i;
+
+			found = 1;
+			/* check whether this nid is already registered */
+			for (i = 0; i < entry->un_nid_count; i++)
+				if (nid == entry->un_nids[i])
+					break;
+
+			if (i == entry->un_nid_count) {
+				/* NOTE(review): asserts instead of failing
+				 * gracefully when a uuid accumulates more
+				 * than NIDS_MAX nids */
+				LASSERT(entry->un_nid_count < NIDS_MAX);
+				entry->un_nids[entry->un_nid_count++] = nid;
+			}
+			break;
+		}
+	}
+	if (!found)
+		list_add(&data->un_list, &g_uuid_list);
+	spin_unlock(&g_uuid_lock);
+
+	if (found) {
+		/* NOTE(review): entry->un_nid_count is read here after the
+		 * lock was dropped, so the printed count may be stale;
+		 * debug-only output, but worth confirming upstream */
+		CDEBUG(D_INFO, "found uuid %s %s cnt=%d\n", uuid,
+		       libcfs_nid2str(nid), entry->un_nid_count);
+		OBD_FREE(data, sizeof(*data));
+	} else {
+		CDEBUG(D_INFO, "add uuid %s %s\n", uuid, libcfs_nid2str(nid));
+	}
+	return 0;
+}
+EXPORT_SYMBOL(class_add_uuid);
+
+/* Delete the nids for one uuid if specified, otherwise delete all */
+int class_del_uuid(const char *uuid)
+{
+	LIST_HEAD(deathrow);
+	struct uuid_nid_data *data;
+
+	/* Move victims onto a private list under the lock, then free
+	 * them after dropping it. */
+	spin_lock(&g_uuid_lock);
+	if (uuid != NULL) {
+		struct obd_uuid tmp;
+
+		obd_str2uuid(&tmp, uuid);
+		list_for_each_entry(data, &g_uuid_list, un_list) {
+			if (obd_uuid_equals(&data->un_uuid, &tmp)) {
+				list_move(&data->un_list, &deathrow);
+				break;
+			}
+		}
+	} else
+		list_splice_init(&g_uuid_list, &deathrow);
+	spin_unlock(&g_uuid_lock);
+
+	if (uuid != NULL && list_empty(&deathrow)) {
+		CDEBUG(D_INFO, "Try to delete a non-existent uuid %s\n", uuid);
+		return -EINVAL;
+	}
+
+	while (!list_empty(&deathrow)) {
+		data = list_entry(deathrow.next, struct uuid_nid_data,
+				  un_list);
+		list_del(&data->un_list);
+
+		CDEBUG(D_INFO, "del uuid %s %s/%d\n",
+		       obd_uuid2str(&data->un_uuid),
+
libcfs_nid2str(data->un_nids[0]), + data->un_nid_count); + + OBD_FREE(data, sizeof(*data)); + } + + return 0; +} + +/* check if @nid exists in nid list of @uuid */ +int class_check_uuid(struct obd_uuid *uuid, __u64 nid) +{ + struct uuid_nid_data *entry; + int found = 0; + ENTRY; + + CDEBUG(D_INFO, "check if uuid %s has %s.\n", + obd_uuid2str(uuid), libcfs_nid2str(nid)); + + spin_lock(&g_uuid_lock); + list_for_each_entry(entry, &g_uuid_list, un_list) { + int i; + + if (!obd_uuid_equals(&entry->un_uuid, uuid)) + continue; + + /* found the uuid, check if it has @nid */ + for (i = 0; i < entry->un_nid_count; i++) { + if (entry->un_nids[i] == nid) { + found = 1; + break; + } + } + break; + } + spin_unlock(&g_uuid_lock); + RETURN(found); +} +EXPORT_SYMBOL(class_check_uuid); diff --git a/drivers/staging/lustre/lustre/obdclass/md_attrs.c b/drivers/staging/lustre/lustre/obdclass/md_attrs.c new file mode 100644 index 000000000000..b71344a04c7e --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/md_attrs.c @@ -0,0 +1,202 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA + * + * GPL HEADER END + */ +/* + * Copyright (c) 2012, Intel Corporation. 
+ * Use is subject to license terms.
+ *
+ * Author: Johann Lombardi <johann.lombardi@intel.com>
+ */
+
+#include <lustre/lustre_idl.h>
+#include <obd.h>
+#include <md_object.h>
+
+/**
+ * Initialize new \a lma. Only fid is stored.
+ *
+ * \param lma - is the new LMA structure to be initialized
+ * \param fid - is the FID of the object this LMA belongs to
+ * \param incompat - features that MDS must understand to access object
+ */
+void lustre_lma_init(struct lustre_mdt_attrs *lma, const struct lu_fid *fid,
+		     __u32 incompat)
+{
+	lma->lma_compat = 0;
+	lma->lma_incompat = incompat;
+	lma->lma_self_fid = *fid;
+
+	/* If a field is added in struct lustre_mdt_attrs, zero it explicitly
+	 * and change the test below.  The compile-time size check guarantees
+	 * lma_self_fid is the last member, i.e. every member was set above. */
+	LASSERT(sizeof(*lma) ==
+		(offsetof(struct lustre_mdt_attrs, lma_self_fid) +
+		 sizeof(lma->lma_self_fid)));
+};	/* NOTE(review): stray ';' after function body -- harmless */
+EXPORT_SYMBOL(lustre_lma_init);
+
+/**
+ * Swab, if needed, LMA structure which is stored on-disk in little-endian order.
+ *
+ * \param lma - is a pointer to the LMA structure to be swabbed.
+ */
+void lustre_lma_swab(struct lustre_mdt_attrs *lma)
+{
+	/* Use LUSTRE_MSG_MAGIC to detect local endianness; the branch is
+	 * compile-time false on little-endian hosts. */
+	if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) {
+		__swab32s(&lma->lma_compat);
+		__swab32s(&lma->lma_incompat);
+		lustre_swab_lu_fid(&lma->lma_self_fid);
+	}
+};	/* NOTE(review): stray ';' after function body -- harmless */
+EXPORT_SYMBOL(lustre_lma_swab);
+
+/**
+ * Swab, if needed, SOM structure which is stored on-disk in little-endian
+ * order.
+ *
+ * \param attrs - is a pointer to the SOM structure to be swabbed.
+ */
+void lustre_som_swab(struct som_attrs *attrs)
+{
+	/* Use LUSTRE_MSG_MAGIC to detect local endianness. */
+	if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) {
+		__swab32s(&attrs->som_compat);
+		__swab32s(&attrs->som_incompat);
+		__swab64s(&attrs->som_ioepoch);
+		__swab64s(&attrs->som_size);
+		__swab64s(&attrs->som_blocks);
+		__swab64s(&attrs->som_mountid);
+	}
+};	/* NOTE(review): stray ';' after function body -- harmless */
+EXPORT_SYMBOL(lustre_som_swab);
+
+/*
+ * Swab and extract SOM attributes from on-disk xattr.
+ * + * \param buf - is a buffer containing the on-disk SOM extended attribute. + * \param rc - is the SOM xattr stored in \a buf + * \param msd - is the md_som_data structure where to extract SOM attributes. + */ +int lustre_buf2som(void *buf, int rc, struct md_som_data *msd) +{ + struct som_attrs *attrs = (struct som_attrs *)buf; + ENTRY; + + if (rc == 0 || rc == -ENODATA) + /* no SOM attributes */ + RETURN(-ENODATA); + + if (rc < 0) + /* error hit while fetching xattr */ + RETURN(rc); + + /* check SOM compatibility */ + if (attrs->som_incompat & ~cpu_to_le32(SOM_INCOMPAT_SUPP)) + RETURN(-ENODATA); + + /* unpack SOM attributes */ + lustre_som_swab(attrs); + + /* fill in-memory msd structure */ + msd->msd_compat = attrs->som_compat; + msd->msd_incompat = attrs->som_incompat; + msd->msd_ioepoch = attrs->som_ioepoch; + msd->msd_size = attrs->som_size; + msd->msd_blocks = attrs->som_blocks; + msd->msd_mountid = attrs->som_mountid; + + RETURN(0); +} +EXPORT_SYMBOL(lustre_buf2som); + +/** + * Swab, if needed, HSM structure which is stored on-disk in little-endian + * order. + * + * \param attrs - is a pointer to the HSM structure to be swabbed. + */ +void lustre_hsm_swab(struct hsm_attrs *attrs) +{ + /* Use LUSTRE_MSG_MAGIC to detect local endianess. */ + if (LUSTRE_MSG_MAGIC != cpu_to_le32(LUSTRE_MSG_MAGIC)) { + __swab32s(&attrs->hsm_compat); + __swab32s(&attrs->hsm_flags); + __swab64s(&attrs->hsm_arch_id); + __swab64s(&attrs->hsm_arch_ver); + } +}; +EXPORT_SYMBOL(lustre_hsm_swab); + +/* + * Swab and extract HSM attributes from on-disk xattr. + * + * \param buf - is a buffer containing the on-disk HSM extended attribute. + * \param rc - is the HSM xattr stored in \a buf + * \param mh - is the md_hsm structure where to extract HSM attributes. 
+ */ +int lustre_buf2hsm(void *buf, int rc, struct md_hsm *mh) +{ + struct hsm_attrs *attrs = (struct hsm_attrs *)buf; + ENTRY; + + if (rc == 0 || rc == -ENODATA) + /* no HSM attributes */ + RETURN(-ENODATA); + + if (rc < 0) + /* error hit while fetching xattr */ + RETURN(rc); + + /* unpack HSM attributes */ + lustre_hsm_swab(attrs); + + /* fill md_hsm structure */ + mh->mh_compat = attrs->hsm_compat; + mh->mh_flags = attrs->hsm_flags; + mh->mh_arch_id = attrs->hsm_arch_id; + mh->mh_arch_ver = attrs->hsm_arch_ver; + + RETURN(0); +} +EXPORT_SYMBOL(lustre_buf2hsm); + +/* + * Pack HSM attributes. + * + * \param buf - is the output buffer where to pack the on-disk HSM xattr. + * \param mh - is the md_hsm structure to pack. + */ +void lustre_hsm2buf(void *buf, struct md_hsm *mh) +{ + struct hsm_attrs *attrs = (struct hsm_attrs *)buf; + ENTRY; + + /* copy HSM attributes */ + attrs->hsm_compat = mh->mh_compat; + attrs->hsm_flags = mh->mh_flags; + attrs->hsm_arch_id = mh->mh_arch_id; + attrs->hsm_arch_ver = mh->mh_arch_ver; + + /* pack xattr */ + lustre_hsm_swab(attrs); +} +EXPORT_SYMBOL(lustre_hsm2buf); diff --git a/drivers/staging/lustre/lustre/obdclass/mea.c b/drivers/staging/lustre/lustre/obdclass/mea.c new file mode 100644 index 000000000000..c4f0dbc23611 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/mea.c @@ -0,0 +1,112 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#include <obd_class.h>
+#include <linux/kmod.h>   /* for request_module() */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/pagemap.h>
+#include <lprocfs_status.h>
+#include <lustre/lustre_idl.h>
+
+/* Stripe selection hash: use only the last character of the name,
+ * modulo the stripe count.  A NUL last character suggests the caller
+ * passed a wrong length, hence the warning.
+ * NOTE(review): name[namelen - 1] is a plain char; a high-bit byte
+ * sign-extends before the unsigned conversion -- presumably intended
+ * to match the on-disk hash, confirm against the MDT implementation. */
+static int mea_last_char_hash(int count, char *name, int namelen)
+{
+	unsigned int c;
+
+	c = name[namelen - 1];
+	if (c == 0)
+		CWARN("looks like wrong len is passed\n");
+	c = c % count;
+	return c;
+}
+
+/* Stripe selection hash: sum of all bytes of the name, modulo the
+ * stripe count. */
+static int mea_all_chars_hash(int count, char *name, int namelen)
+{
+	unsigned int c = 0;
+
+	while (--namelen >= 0)
+		c += name[namelen];
+	c = c % count;
+	return c;
+}
+
+/* Map a file name to a stripe index in [0, count) using \a hashtype.
+ * Volatile file names carry an explicit stripe index which overrides
+ * hashing when it is in range; unknown/unsupported hash types fall
+ * back to index 0 (after logging an error).
+ * NOTE(review): assumes count > 0 whenever the hash path is taken;
+ * LASSERT(c < count) compares unsigned c against (promoted) count. */
+int raw_name2idx(int hashtype, int count, const char *name, int namelen)
+{
+	unsigned int c = 0;
+	int idx;
+
+	LASSERT(namelen > 0);
+
+	/* volatile files embed a target stripe index in the name */
+	if (filename_is_volatile(name, namelen, &idx)) {
+		if ((idx >= 0) && (idx < count))
+			return idx;
+		goto hashchoice;
+	}
+
+	/* single stripe: only one possible answer */
+	if (count <= 1)
+		return 0;
+
+hashchoice:
+	switch (hashtype) {
+	case MEA_MAGIC_LAST_CHAR:
+		c = mea_last_char_hash(count, (char *)name, namelen);
+		break;
+	case MEA_MAGIC_ALL_CHARS:
+		c = mea_all_chars_hash(count, (char *)name, namelen);
+		break;
+	case MEA_MAGIC_HASH_SEGMENT:
+		CERROR("Unsupported hash type MEA_MAGIC_HASH_SEGMENT\n");
+		break;
+	default:
+		CERROR("Unknown hash type 0x%x\n", hashtype);
+	}
+
+	LASSERT(c < count);
+	return c;
+}
+EXPORT_SYMBOL(raw_name2idx); + +int mea_name2idx(struct lmv_stripe_md *mea, const char *name, int namelen) +{ + unsigned int c; + + LASSERT(mea && mea->mea_count); + + c = raw_name2idx(mea->mea_magic, mea->mea_count, name, namelen); + + LASSERT(c < mea->mea_count); + return c; +} +EXPORT_SYMBOL(mea_name2idx); diff --git a/drivers/staging/lustre/lustre/obdclass/obd_config.c b/drivers/staging/lustre/lustre/obdclass/obd_config.c new file mode 100644 index 000000000000..bbf06d009fd0 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/obd_config.c @@ -0,0 +1,1904 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/obd_config.c + * + * Config API + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#include <obd_class.h> +#include <linux/string.h> +#include <lustre_log.h> +#include <lprocfs_status.h> +#include <lustre_param.h> + +#include "llog_internal.h" + +static cfs_hash_ops_t uuid_hash_ops; +static cfs_hash_ops_t nid_hash_ops; +static cfs_hash_ops_t nid_stat_hash_ops; + +/*********** string parsing utils *********/ + +/* returns 0 if we find this key in the buffer, else 1 */ +int class_find_param(char *buf, char *key, char **valp) +{ + char *ptr; + + if (!buf) + return 1; + + if ((ptr = strstr(buf, key)) == NULL) + return 1; + + if (valp) + *valp = ptr + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_find_param); + +/** + * Check whether the proc parameter \a param is an old parameter or not from + * the array \a ptr which contains the mapping from old parameters to new ones. + * If it's an old one, then return the pointer to the cfg_interop_param struc- + * ture which contains both the old and new parameters. + * + * \param param proc parameter + * \param ptr an array which contains the mapping from + * old parameters to new ones + * + * \retval valid-pointer pointer to the cfg_interop_param structure + * which contains the old and new parameters + * \retval NULL \a param or \a ptr is NULL, + * or \a param is not an old parameter + */ +struct cfg_interop_param *class_find_old_param(const char *param, + struct cfg_interop_param *ptr) +{ + char *value = NULL; + int name_len = 0; + + if (param == NULL || ptr == NULL) + RETURN(NULL); + + value = strchr(param, '='); + if (value == NULL) + name_len = strlen(param); + else + name_len = value - param; + + while (ptr->old_param != NULL) { + if (strncmp(param, ptr->old_param, name_len) == 0 && + name_len == strlen(ptr->old_param)) + RETURN(ptr); + ptr++; + } + + RETURN(NULL); +} +EXPORT_SYMBOL(class_find_old_param); + +/** + * Finds a parameter in \a params and copies it to \a copy. 
+ * + * Leading spaces are skipped. Next space or end of string is the + * parameter terminator with the exception that spaces inside single or double + * quotes get included into a parameter. The parameter is copied into \a copy + * which has to be allocated big enough by a caller, quotes are stripped in + * the copy and the copy is terminated by 0. + * + * On return \a params is set to next parameter or to NULL if last + * parameter is returned. + * + * \retval 0 if parameter is returned in \a copy + * \retval 1 otherwise + * \retval -EINVAL if unbalanced quota is found + */ +int class_get_next_param(char **params, char *copy) +{ + char *q1, *q2, *str; + int len; + + str = *params; + while (*str == ' ') + str++; + + if (*str == '\0') { + *params = NULL; + return 1; + } + + while (1) { + q1 = strpbrk(str, " '\""); + if (q1 == NULL) { + len = strlen(str); + memcpy(copy, str, len); + copy[len] = '\0'; + *params = NULL; + return 0; + } + len = q1 - str; + if (*q1 == ' ') { + memcpy(copy, str, len); + copy[len] = '\0'; + *params = str + len; + return 0; + } + + memcpy(copy, str, len); + copy += len; + + /* search for the matching closing quote */ + str = q1 + 1; + q2 = strchr(str, *q1); + if (q2 == NULL) { + CERROR("Unbalanced quota in parameters: \"%s\"\n", + *params); + return -EINVAL; + } + len = q2 - str; + memcpy(copy, str, len); + copy += len; + str = q2 + 1; + } + return 1; +} +EXPORT_SYMBOL(class_get_next_param); + +/* returns 0 if this is the first key in the buffer, else 1. + valp points to first char after key. 
*/ +int class_match_param(char *buf, char *key, char **valp) +{ + if (!buf) + return 1; + + if (memcmp(buf, key, strlen(key)) != 0) + return 1; + + if (valp) + *valp = buf + strlen(key); + + return 0; +} +EXPORT_SYMBOL(class_match_param); + +static int parse_nid(char *buf, void *value, int quiet) +{ + lnet_nid_t *nid = (lnet_nid_t *)value; + + *nid = libcfs_str2nid(buf); + if (*nid != LNET_NID_ANY) + return 0; + + if (!quiet) + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", buf); + return -EINVAL; +} + +static int parse_net(char *buf, void *value) +{ + __u32 *net = (__u32 *)value; + + *net = libcfs_str2net(buf); + CDEBUG(D_INFO, "Net %s\n", libcfs_net2str(*net)); + return 0; +} + +enum { + CLASS_PARSE_NID = 1, + CLASS_PARSE_NET, +}; + +/* 0 is good nid, + 1 not found + < 0 error + endh is set to next separator */ +static int class_parse_value(char *buf, int opc, void *value, char **endh, + int quiet) +{ + char *endp; + char tmp; + int rc = 0; + + if (!buf) + return 1; + while (*buf == ',' || *buf == ':') + buf++; + if (*buf == ' ' || *buf == '/' || *buf == '\0') + return 1; + + /* nid separators or end of nids */ + endp = strpbrk(buf, ",: /"); + if (endp == NULL) + endp = buf + strlen(buf); + + tmp = *endp; + *endp = '\0'; + switch (opc) { + default: + LBUG(); + case CLASS_PARSE_NID: + rc = parse_nid(buf, value, quiet); + break; + case CLASS_PARSE_NET: + rc = parse_net(buf, value); + break; + } + *endp = tmp; + if (rc != 0) + return rc; + if (endh) + *endh = endp; + return 0; +} + +int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 0); +} +EXPORT_SYMBOL(class_parse_nid); + +int class_parse_nid_quiet(char *buf, lnet_nid_t *nid, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NID, (void *)nid, endh, 1); +} +EXPORT_SYMBOL(class_parse_nid_quiet); + +int class_parse_net(char *buf, __u32 *net, char **endh) +{ + return class_parse_value(buf, CLASS_PARSE_NET, (void *)net, 
			       endh, 0);
+}
+EXPORT_SYMBOL(class_parse_net);
+
+/* Check whether any NID listed after \a key in \a buf equals \a nid.
+ *
+ * 1 param contains key and match
+ * 0 param contains key and not match
+ * -1 param does not contain key
+ */
+int class_match_nid(char *buf, char *key, lnet_nid_t nid)
+{
+	lnet_nid_t tmp;
+	int rc = -1;
+
+	/* scan every occurrence of key in the buffer */
+	while (class_find_param(buf, key, &buf) == 0) {
+		/* please restrict to the nids pertaining to
+		 * the specified nids */
+		while (class_parse_nid(buf, &tmp, &buf) == 0) {
+			if (tmp == nid)
+				return 1;
+		}
+		/* key was present but this nid was not listed */
+		rc = 0;
+	}
+	return rc;
+}
+EXPORT_SYMBOL(class_match_nid);
+
+/* Same contract as class_match_nid(), but matching an LNET network
+ * number instead of a full NID. */
+int class_match_net(char *buf, char *key, __u32 net)
+{
+	__u32 tmp;
+	int rc = -1;
+
+	while (class_find_param(buf, key, &buf) == 0) {
+		/* please restrict to the nids pertaining to
+		 * the specified networks */
+		while (class_parse_net(buf, &tmp, &buf) == 0) {
+			if (tmp == net)
+				return 1;
+		}
+		rc = 0;
+	}
+	return rc;
+}
+EXPORT_SYMBOL(class_match_net);
+
+/********************** class fns **********************/
+
+/**
+ * Create a new obd device and set the type, name and uuid. If successful,
+ * the new device can be accessed by either name or uuid.
+ */ +int class_attach(struct lustre_cfg *lcfg) +{ + struct obd_device *obd = NULL; + char *typename, *name, *uuid; + int rc, len; + ENTRY; + + if (!LUSTRE_CFG_BUFLEN(lcfg, 1)) { + CERROR("No type passed!\n"); + RETURN(-EINVAL); + } + typename = lustre_cfg_string(lcfg, 1); + + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) { + CERROR("No name passed!\n"); + RETURN(-EINVAL); + } + name = lustre_cfg_string(lcfg, 0); + + if (!LUSTRE_CFG_BUFLEN(lcfg, 2)) { + CERROR("No UUID passed!\n"); + RETURN(-EINVAL); + } + uuid = lustre_cfg_string(lcfg, 2); + + CDEBUG(D_IOCTL, "attach type %s name: %s uuid: %s\n", + MKSTR(typename), MKSTR(name), MKSTR(uuid)); + + obd = class_newdev(typename, name); + if (IS_ERR(obd)) { + /* Already exists or out of obds */ + rc = PTR_ERR(obd); + obd = NULL; + CERROR("Cannot create device %s of type %s : %d\n", + name, typename, rc); + GOTO(out, rc); + } + LASSERTF(obd != NULL, "Cannot get obd device %s of type %s\n", + name, typename); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08X != %08X\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + LASSERTF(strncmp(obd->obd_name, name, strlen(name)) == 0, + "%p obd_name %s != %s\n", obd, obd->obd_name, name); + + rwlock_init(&obd->obd_pool_lock); + obd->obd_pool_limit = 0; + obd->obd_pool_slv = 0; + + INIT_LIST_HEAD(&obd->obd_exports); + INIT_LIST_HEAD(&obd->obd_unlinked_exports); + INIT_LIST_HEAD(&obd->obd_delayed_exports); + INIT_LIST_HEAD(&obd->obd_exports_timed); + INIT_LIST_HEAD(&obd->obd_nid_stats); + spin_lock_init(&obd->obd_nid_lock); + spin_lock_init(&obd->obd_dev_lock); + mutex_init(&obd->obd_dev_mutex); + spin_lock_init(&obd->obd_osfs_lock); + /* obd->obd_osfs_age must be set to a value in the distant + * past to guarantee a fresh statfs is fetched on mount. 
*/ + obd->obd_osfs_age = cfs_time_shift_64(-1000); + + /* XXX belongs in setup not attach */ + init_rwsem(&obd->obd_observer_link_sem); + /* recovery data */ + cfs_init_timer(&obd->obd_recovery_timer); + spin_lock_init(&obd->obd_recovery_task_lock); + init_waitqueue_head(&obd->obd_next_transno_waitq); + init_waitqueue_head(&obd->obd_evict_inprogress_waitq); + INIT_LIST_HEAD(&obd->obd_req_replay_queue); + INIT_LIST_HEAD(&obd->obd_lock_replay_queue); + INIT_LIST_HEAD(&obd->obd_final_req_queue); + INIT_LIST_HEAD(&obd->obd_evict_list); + + llog_group_init(&obd->obd_olg, FID_SEQ_LLOG); + + obd->obd_conn_inprogress = 0; + + len = strlen(uuid); + if (len >= sizeof(obd->obd_uuid)) { + CERROR("uuid must be < %d bytes long\n", + (int)sizeof(obd->obd_uuid)); + GOTO(out, rc = -EINVAL); + } + memcpy(obd->obd_uuid.uuid, uuid, len); + + /* do the attach */ + if (OBP(obd, attach)) { + rc = OBP(obd,attach)(obd, sizeof *lcfg, lcfg); + if (rc) + GOTO(out, rc = -EINVAL); + } + + /* Detach drops this */ + spin_lock(&obd->obd_dev_lock); + atomic_set(&obd->obd_refcount, 1); + spin_unlock(&obd->obd_dev_lock); + lu_ref_init(&obd->obd_reference); + lu_ref_add(&obd->obd_reference, "attach", obd); + + obd->obd_attached = 1; + CDEBUG(D_IOCTL, "OBD: dev %d attached type %s with refcount %d\n", + obd->obd_minor, typename, atomic_read(&obd->obd_refcount)); + RETURN(0); + out: + if (obd != NULL) { + class_release_dev(obd); + } + return rc; +} +EXPORT_SYMBOL(class_attach); + +/** Create hashes, self-export, and call type-specific setup. + * Setup is effectively the "start this obd" call. 
+ */ +int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + struct obd_export *exp; + ENTRY; + + LASSERT(obd != NULL); + LASSERTF(obd == class_num2obd(obd->obd_minor), + "obd %p != obd_devs[%d] %p\n", + obd, obd->obd_minor, class_num2obd(obd->obd_minor)); + LASSERTF(obd->obd_magic == OBD_DEVICE_MAGIC, + "obd %p obd_magic %08x != %08x\n", + obd, obd->obd_magic, OBD_DEVICE_MAGIC); + + /* have we attached a type to this device? */ + if (!obd->obd_attached) { + CERROR("Device %d not attached\n", obd->obd_minor); + RETURN(-ENODEV); + } + + if (obd->obd_set_up) { + CERROR("Device %d already setup (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + RETURN(-EEXIST); + } + + /* is someone else setting us up right now? (attach inits spinlock) */ + spin_lock(&obd->obd_dev_lock); + if (obd->obd_starting) { + spin_unlock(&obd->obd_dev_lock); + CERROR("Device %d setup in progress (type %s)\n", + obd->obd_minor, obd->obd_type->typ_name); + RETURN(-EEXIST); + } + /* just leave this on forever. I can't use obd_set_up here because + other fns check that status, and we're not actually set up yet. 
*/ + obd->obd_starting = 1; + obd->obd_uuid_hash = NULL; + obd->obd_nid_hash = NULL; + obd->obd_nid_stats_hash = NULL; + spin_unlock(&obd->obd_dev_lock); + + /* create an uuid-export lustre hash */ + obd->obd_uuid_hash = cfs_hash_create("UUID_HASH", + HASH_UUID_CUR_BITS, + HASH_UUID_MAX_BITS, + HASH_UUID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &uuid_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_uuid_hash) + GOTO(err_hash, err = -ENOMEM); + + /* create a nid-export lustre hash */ + obd->obd_nid_hash = cfs_hash_create("NID_HASH", + HASH_NID_CUR_BITS, + HASH_NID_MAX_BITS, + HASH_NID_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_nid_hash) + GOTO(err_hash, err = -ENOMEM); + + /* create a nid-stats lustre hash */ + obd->obd_nid_stats_hash = cfs_hash_create("NID_STATS", + HASH_NID_STATS_CUR_BITS, + HASH_NID_STATS_MAX_BITS, + HASH_NID_STATS_BKT_BITS, 0, + CFS_HASH_MIN_THETA, + CFS_HASH_MAX_THETA, + &nid_stat_hash_ops, CFS_HASH_DEFAULT); + if (!obd->obd_nid_stats_hash) + GOTO(err_hash, err = -ENOMEM); + + exp = class_new_export(obd, &obd->obd_uuid); + if (IS_ERR(exp)) + GOTO(err_hash, err = PTR_ERR(exp)); + + obd->obd_self_export = exp; + list_del_init(&exp->exp_obd_chain_timed); + class_export_put(exp); + + err = obd_setup(obd, lcfg); + if (err) + GOTO(err_exp, err); + + obd->obd_set_up = 1; + + spin_lock(&obd->obd_dev_lock); + /* cleanup drops this */ + class_incref(obd, "setup", obd); + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_IOCTL, "finished setup of obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + RETURN(0); +err_exp: + if (obd->obd_self_export) { + class_unlink_export(obd->obd_self_export); + obd->obd_self_export = NULL; + } +err_hash: + if (obd->obd_uuid_hash) { + cfs_hash_putref(obd->obd_uuid_hash); + obd->obd_uuid_hash = NULL; + } + if (obd->obd_nid_hash) { + cfs_hash_putref(obd->obd_nid_hash); + obd->obd_nid_hash = NULL; + } + if (obd->obd_nid_stats_hash) { + 
cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + obd->obd_starting = 0; + CERROR("setup %s failed (%d)\n", obd->obd_name, err); + return err; +} +EXPORT_SYMBOL(class_setup); + +/** We have finished using this obd and are ready to destroy it. + * There can be no more references to this obd. + */ +int class_detach(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + ENTRY; + + if (obd->obd_set_up) { + CERROR("OBD device %d still set up\n", obd->obd_minor); + RETURN(-EBUSY); + } + + spin_lock(&obd->obd_dev_lock); + if (!obd->obd_attached) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD device %d not attached\n", obd->obd_minor); + RETURN(-ENODEV); + } + obd->obd_attached = 0; + spin_unlock(&obd->obd_dev_lock); + + CDEBUG(D_IOCTL, "detach on obd %s (uuid %s)\n", + obd->obd_name, obd->obd_uuid.uuid); + + class_decref(obd, "attach", obd); + RETURN(0); +} +EXPORT_SYMBOL(class_detach); + +/** Start shutting down the obd. There may be in-progess ops when + * this is called. We tell them to start shutting down with a call + * to class_disconnect_exports(). + */ +int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + int err = 0; + char *flag; + ENTRY; + + OBD_RACE(OBD_FAIL_LDLM_RECOV_CLIENTS); + + if (!obd->obd_set_up) { + CERROR("Device %d not setup\n", obd->obd_minor); + RETURN(-ENODEV); + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + spin_unlock(&obd->obd_dev_lock); + CERROR("OBD %d already stopping\n", obd->obd_minor); + RETURN(-ENODEV); + } + /* Leave this on forever */ + obd->obd_stopping = 1; + + /* wait for already-arrived-connections to finish. 
*/ + while (obd->obd_conn_inprogress > 0) { + spin_unlock(&obd->obd_dev_lock); + + cond_resched(); + + spin_lock(&obd->obd_dev_lock); + } + spin_unlock(&obd->obd_dev_lock); + + if (lcfg->lcfg_bufcount >= 2 && LUSTRE_CFG_BUFLEN(lcfg, 1) > 0) { + for (flag = lustre_cfg_string(lcfg, 1); *flag != 0; flag++) + switch (*flag) { + case 'F': + obd->obd_force = 1; + break; + case 'A': + LCONSOLE_WARN("Failing over %s\n", + obd->obd_name); + obd->obd_fail = 1; + obd->obd_no_transno = 1; + obd->obd_no_recov = 1; + if (OBP(obd, iocontrol)) { + obd_iocontrol(OBD_IOC_SYNC, + obd->obd_self_export, + 0, NULL, NULL); + } + break; + default: + CERROR("Unrecognised flag '%c'\n", *flag); + } + } + + LASSERT(obd->obd_self_export); + + /* The three references that should be remaining are the + * obd_self_export and the attach and setup references. */ + if (atomic_read(&obd->obd_refcount) > 3) { + /* refcounf - 3 might be the number of real exports + (excluding self export). But class_incref is called + by other things as well, so don't count on it. */ + CDEBUG(D_IOCTL, "%s: forcing exports to disconnect: %d\n", + obd->obd_name, atomic_read(&obd->obd_refcount) - 3); + dump_exports(obd, 0); + class_disconnect_exports(obd); + } + + /* Precleanup, we must make sure all exports get destroyed. 
*/ + err = obd_precleanup(obd, OBD_CLEANUP_EXPORTS); + if (err) + CERROR("Precleanup %s returned %d\n", + obd->obd_name, err); + + /* destroy an uuid-export hash body */ + if (obd->obd_uuid_hash) { + cfs_hash_putref(obd->obd_uuid_hash); + obd->obd_uuid_hash = NULL; + } + + /* destroy a nid-export hash body */ + if (obd->obd_nid_hash) { + cfs_hash_putref(obd->obd_nid_hash); + obd->obd_nid_hash = NULL; + } + + /* destroy a nid-stats hash body */ + if (obd->obd_nid_stats_hash) { + cfs_hash_putref(obd->obd_nid_stats_hash); + obd->obd_nid_stats_hash = NULL; + } + + class_decref(obd, "setup", obd); + obd->obd_set_up = 0; + + RETURN(0); +} +EXPORT_SYMBOL(class_cleanup); + +struct obd_device *class_incref(struct obd_device *obd, + const char *scope, const void *source) +{ + lu_ref_add_atomic(&obd->obd_reference, scope, source); + atomic_inc(&obd->obd_refcount); + CDEBUG(D_INFO, "incref %s (%p) now %d\n", obd->obd_name, obd, + atomic_read(&obd->obd_refcount)); + + return obd; +} +EXPORT_SYMBOL(class_incref); + +void class_decref(struct obd_device *obd, const char *scope, const void *source) +{ + int err; + int refs; + + spin_lock(&obd->obd_dev_lock); + atomic_dec(&obd->obd_refcount); + refs = atomic_read(&obd->obd_refcount); + spin_unlock(&obd->obd_dev_lock); + lu_ref_del(&obd->obd_reference, scope, source); + + CDEBUG(D_INFO, "Decref %s (%p) now %d\n", obd->obd_name, obd, refs); + + if ((refs == 1) && obd->obd_stopping) { + /* All exports have been destroyed; there should + be no more in-progress ops by this point.*/ + + spin_lock(&obd->obd_self_export->exp_lock); + obd->obd_self_export->exp_flags |= exp_flags_from_obd(obd); + spin_unlock(&obd->obd_self_export->exp_lock); + + /* note that we'll recurse into class_decref again */ + class_unlink_export(obd->obd_self_export); + return; + } + + if (refs == 0) { + CDEBUG(D_CONFIG, "finishing cleanup of obd %s (%s)\n", + obd->obd_name, obd->obd_uuid.uuid); + LASSERT(!obd->obd_attached); + if (obd->obd_stopping) { + /* If we're 
not stopping, we were never set up */ + err = obd_cleanup(obd); + if (err) + CERROR("Cleanup %s returned %d\n", + obd->obd_name, err); + } + if (OBP(obd, detach)) { + err = OBP(obd, detach)(obd); + if (err) + CERROR("Detach returned %d\n", err); + } + class_release_dev(obd); + } +} +EXPORT_SYMBOL(class_decref); + +/** Add a failover nid location. + * Client obd types contact server obd types using this nid list. + */ +int class_add_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + RETURN(-EINVAL); + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_LWP_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) { + CERROR("can't add connection on non-client dev\n"); + RETURN(-EINVAL); + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to add conn on immature client dev\n"); + RETURN(-EINVAL); + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_add_conn(imp, &uuid, lcfg->lcfg_num); + + RETURN(rc); +} +EXPORT_SYMBOL(class_add_conn); + +/** Remove a failover nid location. 
+ */ +int class_del_conn(struct obd_device *obd, struct lustre_cfg *lcfg) +{ + struct obd_import *imp; + struct obd_uuid uuid; + int rc; + ENTRY; + + if (LUSTRE_CFG_BUFLEN(lcfg, 1) < 1 || + LUSTRE_CFG_BUFLEN(lcfg, 1) > sizeof(struct obd_uuid)) { + CERROR("invalid conn_uuid\n"); + RETURN(-EINVAL); + } + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { + CERROR("can't del connection on non-client dev\n"); + RETURN(-EINVAL); + } + + imp = obd->u.cli.cl_import; + if (!imp) { + CERROR("try to del conn on immature client dev\n"); + RETURN(-EINVAL); + } + + obd_str2uuid(&uuid, lustre_cfg_string(lcfg, 1)); + rc = obd_del_conn(imp, &uuid); + + RETURN(rc); +} + +LIST_HEAD(lustre_profile_list); + +struct lustre_profile *class_get_profile(const char * prof) +{ + struct lustre_profile *lprof; + + ENTRY; + list_for_each_entry(lprof, &lustre_profile_list, lp_list) { + if (!strcmp(lprof->lp_profile, prof)) { + RETURN(lprof); + } + } + RETURN(NULL); +} +EXPORT_SYMBOL(class_get_profile); + +/** Create a named "profile". + * This defines the mdc and osc names to use for a client. + * This also is used to define the lov to be used by a mdt. 
+ */ +int class_add_profile(int proflen, char *prof, int osclen, char *osc, + int mdclen, char *mdc) +{ + struct lustre_profile *lprof; + int err = 0; + ENTRY; + + CDEBUG(D_CONFIG, "Add profile %s\n", prof); + + OBD_ALLOC(lprof, sizeof(*lprof)); + if (lprof == NULL) + RETURN(-ENOMEM); + INIT_LIST_HEAD(&lprof->lp_list); + + LASSERT(proflen == (strlen(prof) + 1)); + OBD_ALLOC(lprof->lp_profile, proflen); + if (lprof->lp_profile == NULL) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_profile, prof, proflen); + + LASSERT(osclen == (strlen(osc) + 1)); + OBD_ALLOC(lprof->lp_dt, osclen); + if (lprof->lp_dt == NULL) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_dt, osc, osclen); + + if (mdclen > 0) { + LASSERT(mdclen == (strlen(mdc) + 1)); + OBD_ALLOC(lprof->lp_md, mdclen); + if (lprof->lp_md == NULL) + GOTO(out, err = -ENOMEM); + memcpy(lprof->lp_md, mdc, mdclen); + } + + list_add(&lprof->lp_list, &lustre_profile_list); + RETURN(err); + +out: + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, mdclen); + if (lprof->lp_dt) + OBD_FREE(lprof->lp_dt, osclen); + if (lprof->lp_profile) + OBD_FREE(lprof->lp_profile, proflen); + OBD_FREE(lprof, sizeof(*lprof)); + RETURN(err); +} + +void class_del_profile(const char *prof) +{ + struct lustre_profile *lprof; + ENTRY; + + CDEBUG(D_CONFIG, "Del profile %s\n", prof); + + lprof = class_get_profile(prof); + if (lprof) { + list_del(&lprof->lp_list); + OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1); + OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1); + if (lprof->lp_md) + OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1); + OBD_FREE(lprof, sizeof *lprof); + } + EXIT; +} +EXPORT_SYMBOL(class_del_profile); + +/* COMPAT_146 */ +void class_del_profiles(void) +{ + struct lustre_profile *lprof, *n; + ENTRY; + + list_for_each_entry_safe(lprof, n, &lustre_profile_list, lp_list) { + list_del(&lprof->lp_list); + OBD_FREE(lprof->lp_profile, strlen(lprof->lp_profile) + 1); + OBD_FREE(lprof->lp_dt, strlen(lprof->lp_dt) + 1); + if (lprof->lp_md) + 
OBD_FREE(lprof->lp_md, strlen(lprof->lp_md) + 1); + OBD_FREE(lprof, sizeof *lprof); + } + EXIT; +} +EXPORT_SYMBOL(class_del_profiles); + +static int class_set_global(char *ptr, int val, struct lustre_cfg *lcfg) +{ + ENTRY; + if (class_match_param(ptr, PARAM_AT_MIN, NULL) == 0) + at_min = val; + else if (class_match_param(ptr, PARAM_AT_MAX, NULL) == 0) + at_max = val; + else if (class_match_param(ptr, PARAM_AT_EXTRA, NULL) == 0) + at_extra = val; + else if (class_match_param(ptr, PARAM_AT_EARLY_MARGIN, NULL) == 0) + at_early_margin = val; + else if (class_match_param(ptr, PARAM_AT_HISTORY, NULL) == 0) + at_history = val; + else if (class_match_param(ptr, PARAM_JOBID_VAR, NULL) == 0) + strlcpy(obd_jobid_var, lustre_cfg_string(lcfg, 2), + JOBSTATS_JOBID_VAR_MAX_LEN + 1); + else + RETURN(-EINVAL); + + CDEBUG(D_IOCTL, "global %s = %d\n", ptr, val); + RETURN(0); +} + + +/* We can't call ll_process_config or lquota_process_config directly because + * it lives in a module that must be loaded after this one. */ +static int (*client_process_config)(struct lustre_cfg *lcfg) = NULL; +static int (*quota_process_config)(struct lustre_cfg *lcfg) = NULL; + +void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)) +{ + client_process_config = cpc; +} +EXPORT_SYMBOL(lustre_register_client_process_config); + +/** + * Rename the proc parameter in \a cfg with a new name \a new_name. 
+ * + * \param cfg config structure which contains the proc parameter + * \param new_name new name of the proc parameter + * + * \retval valid-pointer pointer to the newly-allocated config structure + * which contains the renamed proc parameter + * \retval ERR_PTR(-EINVAL) if \a cfg or \a new_name is NULL, or \a cfg does + * not contain a proc parameter + * \retval ERR_PTR(-ENOMEM) if memory allocation failure occurs + */ +struct lustre_cfg *lustre_cfg_rename(struct lustre_cfg *cfg, + const char *new_name) +{ + struct lustre_cfg_bufs *bufs = NULL; + struct lustre_cfg *new_cfg = NULL; + char *param = NULL; + char *new_param = NULL; + char *value = NULL; + int name_len = 0; + int new_len = 0; + ENTRY; + + if (cfg == NULL || new_name == NULL) + RETURN(ERR_PTR(-EINVAL)); + + param = lustre_cfg_string(cfg, 1); + if (param == NULL) + RETURN(ERR_PTR(-EINVAL)); + + value = strchr(param, '='); + if (value == NULL) + name_len = strlen(param); + else + name_len = value - param; + + new_len = LUSTRE_CFG_BUFLEN(cfg, 1) + strlen(new_name) - name_len; + + OBD_ALLOC(new_param, new_len); + if (new_param == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + strcpy(new_param, new_name); + if (value != NULL) + strcat(new_param, value); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) { + OBD_FREE(new_param, new_len); + RETURN(ERR_PTR(-ENOMEM)); + } + + lustre_cfg_bufs_reset(bufs, NULL); + lustre_cfg_bufs_init(bufs, cfg); + lustre_cfg_bufs_set_string(bufs, 1, new_param); + + new_cfg = lustre_cfg_new(cfg->lcfg_command, bufs); + + OBD_FREE(new_param, new_len); + OBD_FREE_PTR(bufs); + if (new_cfg == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + new_cfg->lcfg_num = cfg->lcfg_num; + new_cfg->lcfg_flags = cfg->lcfg_flags; + new_cfg->lcfg_nid = cfg->lcfg_nid; + new_cfg->lcfg_nal = cfg->lcfg_nal; + + RETURN(new_cfg); +} +EXPORT_SYMBOL(lustre_cfg_rename); + +void lustre_register_quota_process_config(int (*qpc)(struct lustre_cfg *lcfg)) +{ + quota_process_config = qpc; +} 
+EXPORT_SYMBOL(lustre_register_quota_process_config); + +/** Process configuration commands given in lustre_cfg form. + * These may come from direct calls (e.g. class_manual_cleanup) + * or processing the config llog, or ioctl from lctl. + */ +int class_process_config(struct lustre_cfg *lcfg) +{ + struct obd_device *obd; + int err; + + LASSERT(lcfg && !IS_ERR(lcfg)); + CDEBUG(D_IOCTL, "processing cmd: %x\n", lcfg->lcfg_command); + + /* Commands that don't need a device */ + switch(lcfg->lcfg_command) { + case LCFG_ATTACH: { + err = class_attach(lcfg); + GOTO(out, err); + } + case LCFG_ADD_UUID: { + CDEBUG(D_IOCTL, "adding mapping from uuid %s to nid "LPX64 + " (%s)\n", lustre_cfg_string(lcfg, 1), + lcfg->lcfg_nid, libcfs_nid2str(lcfg->lcfg_nid)); + + err = class_add_uuid(lustre_cfg_string(lcfg, 1), lcfg->lcfg_nid); + GOTO(out, err); + } + case LCFG_DEL_UUID: { + CDEBUG(D_IOCTL, "removing mappings for uuid %s\n", + (lcfg->lcfg_bufcount < 2 || LUSTRE_CFG_BUFLEN(lcfg, 1) == 0) + ? "<all uuids>" : lustre_cfg_string(lcfg, 1)); + + err = class_del_uuid(lustre_cfg_string(lcfg, 1)); + GOTO(out, err); + } + case LCFG_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s osc %s mdc %s\n", + lustre_cfg_string(lcfg, 1), + lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + /* set these mount options somewhere, so ll_fill_super + * can find them. 
*/ + err = class_add_profile(LUSTRE_CFG_BUFLEN(lcfg, 1), + lustre_cfg_string(lcfg, 1), + LUSTRE_CFG_BUFLEN(lcfg, 2), + lustre_cfg_string(lcfg, 2), + LUSTRE_CFG_BUFLEN(lcfg, 3), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err); + } + case LCFG_DEL_MOUNTOPT: { + CDEBUG(D_IOCTL, "mountopt: profile %s\n", + lustre_cfg_string(lcfg, 1)); + class_del_profile(lustre_cfg_string(lcfg, 1)); + GOTO(out, err = 0); + } + case LCFG_SET_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre timeout from %d to %d\n", + obd_timeout, lcfg->lcfg_num); + obd_timeout = max(lcfg->lcfg_num, 1U); + obd_timeout_set = 1; + GOTO(out, err = 0); + } + case LCFG_SET_LDLM_TIMEOUT: { + CDEBUG(D_IOCTL, "changing lustre ldlm_timeout from %d to %d\n", + ldlm_timeout, lcfg->lcfg_num); + ldlm_timeout = max(lcfg->lcfg_num, 1U); + if (ldlm_timeout >= obd_timeout) + ldlm_timeout = max(obd_timeout / 3, 1U); + ldlm_timeout_set = 1; + GOTO(out, err = 0); + } + case LCFG_SET_UPCALL: { + LCONSOLE_ERROR_MSG(0x15a, "recovery upcall is deprecated\n"); + /* COMPAT_146 Don't fail on old configs */ + GOTO(out, err = 0); + } + case LCFG_MARKER: { + struct cfg_marker *marker; + marker = lustre_cfg_buf(lcfg, 1); + CDEBUG(D_IOCTL, "marker %d (%#x) %.16s %s\n", marker->cm_step, + marker->cm_flags, marker->cm_tgtname, marker->cm_comment); + GOTO(out, err = 0); + } + case LCFG_PARAM: { + char *tmp; + /* llite has no obd */ + if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_LLITE, 0) == 0) && + client_process_config) { + err = (*client_process_config)(lcfg); + GOTO(out, err); + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_SYS, &tmp) == 0)) { + /* Global param settings */ + err = class_set_global(tmp, lcfg->lcfg_num, lcfg); + /* + * Client or server should not fail to mount if + * it hits an unknown configuration parameter. 
+ */ + if (err != 0) + CWARN("Ignoring unknown param %s\n", tmp); + + GOTO(out, err = 0); + } else if ((class_match_param(lustre_cfg_string(lcfg, 1), + PARAM_QUOTA, &tmp) == 0) && + quota_process_config) { + err = (*quota_process_config)(lcfg); + GOTO(out, err); + } + /* Fall through */ + break; + } + } + + /* Commands that require a device */ + obd = class_name2obd(lustre_cfg_string(lcfg, 0)); + if (obd == NULL) { + if (!LUSTRE_CFG_BUFLEN(lcfg, 0)) + CERROR("this lcfg command requires a device name\n"); + else + CERROR("no device for: %s\n", + lustre_cfg_string(lcfg, 0)); + + GOTO(out, err = -EINVAL); + } + + switch(lcfg->lcfg_command) { + case LCFG_SETUP: { + err = class_setup(obd, lcfg); + GOTO(out, err); + } + case LCFG_DETACH: { + err = class_detach(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_CLEANUP: { + err = class_cleanup(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_ADD_CONN: { + err = class_add_conn(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_DEL_CONN: { + err = class_del_conn(obd, lcfg); + GOTO(out, err = 0); + } + case LCFG_POOL_NEW: { + err = obd_pool_new(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + break; + } + case LCFG_POOL_ADD: { + err = obd_pool_add(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + break; + } + case LCFG_POOL_REM: { + err = obd_pool_rem(obd, lustre_cfg_string(lcfg, 2), + lustre_cfg_string(lcfg, 3)); + GOTO(out, err = 0); + break; + } + case LCFG_POOL_DEL: { + err = obd_pool_del(obd, lustre_cfg_string(lcfg, 2)); + GOTO(out, err = 0); + break; + } + default: { + err = obd_process_config(obd, sizeof(*lcfg), lcfg); + GOTO(out, err); + + } + } +out: + if ((err < 0) && !(lcfg->lcfg_command & LCFG_REQUIRED)) { + CWARN("Ignoring error %d on optional command %#x\n", err, + lcfg->lcfg_command); + err = 0; + } + return err; +} +EXPORT_SYMBOL(class_process_config); + +int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars, + struct lustre_cfg *lcfg, void *data) 
+{ + struct lprocfs_vars *var; + struct file fakefile; + struct seq_file fake_seqfile; + char *key, *sval; + int i, keylen, vallen; + int matched = 0, j = 0; + int rc = 0; + int skip = 0; + ENTRY; + + if (lcfg->lcfg_command != LCFG_PARAM) { + CERROR("Unknown command: %d\n", lcfg->lcfg_command); + RETURN(-EINVAL); + } + + /* fake a seq file so that var->fops->write can work... */ + fakefile.private_data = &fake_seqfile; + fake_seqfile.private = data; + /* e.g. tunefs.lustre --param mdt.group_upcall=foo /r/tmp/lustre-mdt + or lctl conf_param lustre-MDT0000.mdt.group_upcall=bar + or lctl conf_param lustre-OST0000.osc.max_dirty_mb=36 */ + for (i = 1; i < lcfg->lcfg_bufcount; i++) { + key = lustre_cfg_buf(lcfg, i); + /* Strip off prefix */ + class_match_param(key, prefix, &key); + sval = strchr(key, '='); + if (!sval || (*(sval + 1) == 0)) { + CERROR("Can't parse param %s (missing '=')\n", key); + /* rc = -EINVAL; continue parsing other params */ + continue; + } + keylen = sval - key; + sval++; + vallen = strlen(sval); + matched = 0; + j = 0; + /* Search proc entries */ + while (lvars[j].name) { + var = &lvars[j]; + if (class_match_param(key, (char *)var->name, 0) == 0 && + keylen == strlen(var->name)) { + matched++; + rc = -EROFS; + if (var->fops && var->fops->write) { + mm_segment_t oldfs; + oldfs = get_fs(); + set_fs(KERNEL_DS); + rc = (var->fops->write)(&fakefile, sval, + vallen, NULL); + set_fs(oldfs); + } + break; + } + j++; + } + if (!matched) { + /* If the prefix doesn't match, return error so we + can pass it down the stack */ + if (strnchr(key, keylen, '.')) + RETURN(-ENOSYS); + CERROR("%s: unknown param %s\n", + (char *)lustre_cfg_string(lcfg, 0), key); + /* rc = -EINVAL; continue parsing other params */ + skip++; + } else if (rc < 0) { + CERROR("writing proc entry %s err %d\n", + var->name, rc); + rc = 0; + } else { + CDEBUG(D_CONFIG, "%s.%.*s: Set parameter %.*s=%s\n", + lustre_cfg_string(lcfg, 0), + (int)strlen(prefix) - 1, prefix, + (int)(sval - key - 1), 
key, sval); + } + } + + if (rc > 0) + rc = 0; + if (!rc && skip) + rc = skip; + RETURN(rc); +} +EXPORT_SYMBOL(class_process_proc_param); + +extern int lustre_check_exclusion(struct super_block *sb, char *svname); + +/** Parse a configuration llog, doing various manipulations on them + * for various reasons, (modifications for compatibility, skip obsolete + * records, change uuids, etc), then class_process_config() resulting + * net records. + */ +int class_config_llog_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + struct config_llog_instance *clli = data; + int cfg_len = rec->lrh_len; + char *cfg_buf = (char*) (rec + 1); + int rc = 0; + ENTRY; + + //class_config_dump_handler(handle, rec, data); + + switch (rec->lrh_type) { + case OBD_CFG_REC: { + struct lustre_cfg *lcfg, *lcfg_new; + struct lustre_cfg_bufs bufs; + char *inst_name = NULL; + int inst_len = 0; + int inst = 0, swab = 0; + + lcfg = (struct lustre_cfg *)cfg_buf; + if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { + lustre_swab_lustre_cfg(lcfg); + swab = 1; + } + + rc = lustre_cfg_sanity_check(cfg_buf, cfg_len); + if (rc) + GOTO(out, rc); + + /* Figure out config state info */ + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + lustre_swab_cfg_marker(marker, swab, + LUSTRE_CFG_BUFLEN(lcfg, 1)); + CDEBUG(D_CONFIG, "Marker, inst_flg=%#x mark_flg=%#x\n", + clli->cfg_flags, marker->cm_flags); + if (marker->cm_flags & CM_START) { + /* all previous flags off */ + clli->cfg_flags = CFG_F_MARKER; + if (marker->cm_flags & CM_SKIP) { + clli->cfg_flags |= CFG_F_SKIP; + CDEBUG(D_CONFIG, "SKIP #%d\n", + marker->cm_step); + } else if ((marker->cm_flags & CM_EXCLUDE) || + (clli->cfg_sb && + lustre_check_exclusion(clli->cfg_sb, + marker->cm_tgtname))) { + clli->cfg_flags |= CFG_F_EXCLUDE; + CDEBUG(D_CONFIG, "EXCLUDE %d\n", + marker->cm_step); + } + } else if (marker->cm_flags & CM_END) { + clli->cfg_flags 
= 0; + } + } + /* A config command without a start marker before it is + illegal (post 146) */ + if (!(clli->cfg_flags & CFG_F_COMPAT146) && + !(clli->cfg_flags & CFG_F_MARKER) && + (lcfg->lcfg_command != LCFG_MARKER)) { + CWARN("Config not inside markers, ignoring! " + "(inst: %p, uuid: %s, flags: %#x)\n", + clli->cfg_instance, + clli->cfg_uuid.uuid, clli->cfg_flags); + clli->cfg_flags |= CFG_F_SKIP; + } + if (clli->cfg_flags & CFG_F_SKIP) { + CDEBUG(D_CONFIG, "skipping %#x\n", + clli->cfg_flags); + rc = 0; + /* No processing! */ + break; + } + + /* + * For interoperability between 1.8 and 2.0, + * rename "mds" obd device type to "mdt". + */ + { + char *typename = lustre_cfg_string(lcfg, 1); + char *index = lustre_cfg_string(lcfg, 2); + + if ((lcfg->lcfg_command == LCFG_ATTACH && typename && + strcmp(typename, "mds") == 0)) { + CWARN("For 1.8 interoperability, rename obd " + "type from mds to mdt\n"); + typename[2] = 't'; + } + if ((lcfg->lcfg_command == LCFG_SETUP && index && + strcmp(index, "type") == 0)) { + CDEBUG(D_INFO, "For 1.8 interoperability, " + "set this index to '0'\n"); + index[0] = '0'; + index[1] = 0; + } + } + + + if ((clli->cfg_flags & CFG_F_EXCLUDE) && + (lcfg->lcfg_command == LCFG_LOV_ADD_OBD)) + /* Add inactive instead */ + lcfg->lcfg_command = LCFG_LOV_ADD_INA; + + lustre_cfg_bufs_init(&bufs, lcfg); + + if (clli && clli->cfg_instance && + LUSTRE_CFG_BUFLEN(lcfg, 0) > 0){ + inst = 1; + inst_len = LUSTRE_CFG_BUFLEN(lcfg, 0) + + sizeof(clli->cfg_instance) * 2 + 4; + OBD_ALLOC(inst_name, inst_len); + if (inst_name == NULL) + GOTO(out, rc = -ENOMEM); + sprintf(inst_name, "%s-%p", + lustre_cfg_string(lcfg, 0), + clli->cfg_instance); + lustre_cfg_bufs_set_string(&bufs, 0, inst_name); + CDEBUG(D_CONFIG, "cmd %x, instance name: %s\n", + lcfg->lcfg_command, inst_name); + } + + /* we override the llog's uuid for clients, to ensure they + are unique */ + if (clli && clli->cfg_instance != NULL && + lcfg->lcfg_command == LCFG_ATTACH) { + 
lustre_cfg_bufs_set_string(&bufs, 2, + clli->cfg_uuid.uuid); + } + /* + * sptlrpc config record, we expect 2 data segments: + * [0]: fs_name/target_name, + * [1]: rule string + * moving them to index [1] and [2], and insert MGC's + * obdname at index [0]. + */ + if (clli && clli->cfg_instance == NULL && + lcfg->lcfg_command == LCFG_SPTLRPC_CONF) { + lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1], + bufs.lcfg_buflen[1]); + lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0], + bufs.lcfg_buflen[0]); + lustre_cfg_bufs_set_string(&bufs, 0, + clli->cfg_obdname); + } + + lcfg_new = lustre_cfg_new(lcfg->lcfg_command, &bufs); + + lcfg_new->lcfg_num = lcfg->lcfg_num; + lcfg_new->lcfg_flags = lcfg->lcfg_flags; + + /* XXX Hack to try to remain binary compatible with + * pre-newconfig logs */ + if (lcfg->lcfg_nal != 0 && /* pre-newconfig log? */ + (lcfg->lcfg_nid >> 32) == 0) { + __u32 addr = (__u32)(lcfg->lcfg_nid & 0xffffffff); + + lcfg_new->lcfg_nid = + LNET_MKNID(LNET_MKNET(lcfg->lcfg_nal, 0), addr); + CWARN("Converted pre-newconfig NAL %d NID %x to %s\n", + lcfg->lcfg_nal, addr, + libcfs_nid2str(lcfg_new->lcfg_nid)); + } else { + lcfg_new->lcfg_nid = lcfg->lcfg_nid; + } + + lcfg_new->lcfg_nal = 0; /* illegal value for obsolete field */ + + rc = class_process_config(lcfg_new); + lustre_cfg_free(lcfg_new); + + if (inst) + OBD_FREE(inst_name, inst_len); + break; + } + default: + CERROR("Unknown llog record type %#x encountered\n", + rec->lrh_type); + break; + } +out: + if (rc) { + CERROR("%s: cfg command failed: rc = %d\n", + handle->lgh_ctxt->loc_obd->obd_name, rc); + class_config_dump_handler(NULL, handle, rec, data); + } + RETURN(rc); +} +EXPORT_SYMBOL(class_config_llog_handler); + +int class_config_parse_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg) +{ + struct llog_process_cat_data cd = {0, 0}; + struct llog_handle *llh; + llog_cb_t callback; + int rc; + ENTRY; + + CDEBUG(D_INFO, "looking up llog %s\n", name); + rc = 
llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(parse_out, rc); + + /* continue processing from where we last stopped to end-of-log */ + if (cfg) { + cd.lpcd_first_idx = cfg->cfg_last_idx; + callback = cfg->cfg_callback; + LASSERT(callback != NULL); + } else { + callback = class_config_llog_handler; + } + + cd.lpcd_last_idx = 0; + + rc = llog_process(env, llh, callback, cfg, &cd); + + CDEBUG(D_CONFIG, "Processed log %s gen %d-%d (rc=%d)\n", name, + cd.lpcd_first_idx + 1, cd.lpcd_last_idx, rc); + if (cfg) + cfg->cfg_last_idx = cd.lpcd_last_idx; + +parse_out: + llog_close(env, llh); + RETURN(rc); +} +EXPORT_SYMBOL(class_config_parse_llog); + +/** + * parse config record and output dump in supplied buffer. + * This is separated from class_config_dump_handler() to use + * for ioctl needs as well + */ +int class_config_parse_rec(struct llog_rec_hdr *rec, char *buf, int size) +{ + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + char *ptr = buf; + char *end = buf + size; + int rc = 0; + + ENTRY; + + LASSERT(rec->lrh_type == OBD_CFG_REC); + rc = lustre_cfg_sanity_check(lcfg, rec->lrh_len); + if (rc < 0) + RETURN(rc); + + ptr += snprintf(ptr, end-ptr, "cmd=%05x ", lcfg->lcfg_command); + if (lcfg->lcfg_flags) + ptr += snprintf(ptr, end-ptr, "flags=%#08x ", + lcfg->lcfg_flags); + + if (lcfg->lcfg_num) + ptr += snprintf(ptr, end-ptr, "num=%#08x ", lcfg->lcfg_num); + + if (lcfg->lcfg_nid) + ptr += snprintf(ptr, end-ptr, "nid=%s("LPX64")\n ", + libcfs_nid2str(lcfg->lcfg_nid), + lcfg->lcfg_nid); + + if (lcfg->lcfg_command == LCFG_MARKER) { + struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1); + + ptr += snprintf(ptr, end-ptr, "marker=%d(%#x)%s '%s'", + marker->cm_step, marker->cm_flags, + marker->cm_tgtname, marker->cm_comment); + } else { + int i; + + for (i = 0; i < lcfg->lcfg_bufcount; i++) { + ptr += snprintf(ptr, end-ptr, "%d:%s ", i, + 
lustre_cfg_string(lcfg, i)); + } + } + /* return consumed bytes */ + rc = ptr - buf; + RETURN(rc); +} + +int class_config_dump_handler(const struct lu_env *env, + struct llog_handle *handle, + struct llog_rec_hdr *rec, void *data) +{ + char *outstr; + int rc = 0; + + ENTRY; + + OBD_ALLOC(outstr, 256); + if (outstr == NULL) + RETURN(-ENOMEM); + + if (rec->lrh_type == OBD_CFG_REC) { + class_config_parse_rec(rec, outstr, 256); + LCONSOLE(D_WARNING, " %s\n", outstr); + } else { + LCONSOLE(D_WARNING, "unhandled lrh_type: %#x\n", rec->lrh_type); + rc = -EINVAL; + } + + OBD_FREE(outstr, 256); + RETURN(rc); +} + +int class_config_dump_llog(const struct lu_env *env, struct llog_ctxt *ctxt, + char *name, struct config_llog_instance *cfg) +{ + struct llog_handle *llh; + int rc; + + ENTRY; + + LCONSOLE_INFO("Dumping config log %s\n", name); + + rc = llog_open(env, ctxt, &llh, NULL, name, LLOG_OPEN_EXISTS); + if (rc) + RETURN(rc); + + rc = llog_init_handle(env, llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(parse_out, rc); + + rc = llog_process(env, llh, class_config_dump_handler, cfg, NULL); +parse_out: + llog_close(env, llh); + + LCONSOLE_INFO("End config log %s\n", name); + RETURN(rc); +} +EXPORT_SYMBOL(class_config_dump_llog); + +/** Call class_cleanup and class_detach. + * "Manual" only in the sense that we're faking lcfg commands. 
+ */ +int class_manual_cleanup(struct obd_device *obd) +{ + char flags[3] = ""; + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + int rc; + ENTRY; + + if (!obd) { + CERROR("empty cleanup\n"); + RETURN(-EALREADY); + } + + if (obd->obd_force) + strcat(flags, "F"); + if (obd->obd_fail) + strcat(flags, "A"); + + CDEBUG(D_CONFIG, "Manual cleanup of %s (flags='%s')\n", + obd->obd_name, flags); + + lustre_cfg_bufs_reset(&bufs, obd->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, flags); + lcfg = lustre_cfg_new(LCFG_CLEANUP, &bufs); + if (!lcfg) + RETURN(-ENOMEM); + + rc = class_process_config(lcfg); + if (rc) { + CERROR("cleanup failed %d: %s\n", rc, obd->obd_name); + GOTO(out, rc); + } + + /* the lcfg is almost the same for both ops */ + lcfg->lcfg_command = LCFG_DETACH; + rc = class_process_config(lcfg); + if (rc) + CERROR("detach failed %d: %s\n", rc, obd->obd_name); +out: + lustre_cfg_free(lcfg); + RETURN(rc); +} +EXPORT_SYMBOL(class_manual_cleanup); + +/* + * uuid<->export lustre hash operations + */ + +static unsigned +uuid_hash(cfs_hash_t *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(((struct obd_uuid *)key)->uuid, + sizeof(((struct obd_uuid *)key)->uuid), mask); +} + +static void * +uuid_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + + return &exp->exp_client_uuid; +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +uuid_keycmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + + return obd_uuid_equals(key, &exp->exp_client_uuid) && + !exp->exp_failed; +} + +static void * +uuid_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_uuid_hash); +} + +static void +uuid_export_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + 
+ exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + class_export_get(exp); +} + +static void +uuid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_uuid_hash); + class_export_put(exp); +} + +static cfs_hash_ops_t uuid_hash_ops = { + .hs_hash = uuid_hash, + .hs_key = uuid_key, + .hs_keycmp = uuid_keycmp, + .hs_object = uuid_export_object, + .hs_get = uuid_export_get, + .hs_put_locked = uuid_export_put_locked, +}; + + +/* + * nid<->export hash operations + */ + +static unsigned +nid_hash(cfs_hash_t *hs, const void *key, unsigned mask) +{ + return cfs_hash_djb2_hash(key, sizeof(lnet_nid_t), mask); +} + +static void * +nid_key(struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + + RETURN(&exp->exp_connection->c_peer.nid); +} + +/* + * NOTE: It is impossible to find an export that is in failed + * state with this function + */ +static int +nid_kepcmp(const void *key, struct hlist_node *hnode) +{ + struct obd_export *exp; + + LASSERT(key); + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + + RETURN(exp->exp_connection->c_peer.nid == *(lnet_nid_t *)key && + !exp->exp_failed); +} + +static void * +nid_export_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct obd_export, exp_nid_hash); +} + +static void +nid_export_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + class_export_get(exp); +} + +static void +nid_export_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct obd_export *exp; + + exp = hlist_entry(hnode, struct obd_export, exp_nid_hash); + class_export_put(exp); +} + +static cfs_hash_ops_t nid_hash_ops = { + .hs_hash = nid_hash, + .hs_key = nid_key, + .hs_keycmp = nid_kepcmp, + .hs_object = nid_export_object, + .hs_get = nid_export_get, + .hs_put_locked = 
nid_export_put_locked, +}; + + +/* + * nid<->nidstats hash operations + */ + +static void * +nidstats_key(struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + + return &ns->nid; +} + +static int +nidstats_keycmp(const void *key, struct hlist_node *hnode) +{ + return *(lnet_nid_t *)nidstats_key(hnode) == *(lnet_nid_t *)key; +} + +static void * +nidstats_object(struct hlist_node *hnode) +{ + return hlist_entry(hnode, struct nid_stat, nid_hash); +} + +static void +nidstats_get(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_getref(ns); +} + +static void +nidstats_put_locked(cfs_hash_t *hs, struct hlist_node *hnode) +{ + struct nid_stat *ns; + + ns = hlist_entry(hnode, struct nid_stat, nid_hash); + nidstat_putref(ns); +} + +static cfs_hash_ops_t nid_stat_hash_ops = { + .hs_hash = nid_hash, + .hs_key = nidstats_key, + .hs_keycmp = nidstats_keycmp, + .hs_object = nidstats_object, + .hs_get = nidstats_get, + .hs_put_locked = nidstats_put_locked, +}; diff --git a/drivers/staging/lustre/lustre/obdclass/obd_mount.c b/drivers/staging/lustre/lustre/obdclass/obd_mount.c new file mode 100644 index 000000000000..99adad9793c5 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/obd_mount.c @@ -0,0 +1,1321 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obd_mount.c + * + * Client mount routines + * + * Author: Nathan Rutman <nathan@clusterfs.com> + */ + + +#define DEBUG_SUBSYSTEM S_CLASS +#define D_MOUNT (D_SUPER|D_CONFIG/*|D_WARNING */) +#define PRINT_CMD CDEBUG + +#include <obd.h> +#include <lvfs.h> +#include <lustre_fsfilt.h> +#include <obd_class.h> +#include <lustre/lustre_user.h> +#include <linux/version.h> +#include <lustre_log.h> +#include <lustre_disk.h> +#include <lustre_param.h> + +static int (*client_fill_super)(struct super_block *sb, + struct vfsmount *mnt); + +static void (*kill_super_cb)(struct super_block *sb); + +/**************** config llog ********************/ + +/** Get a config log from the MGS and process it. + * This func is called for both clients and servers. + * Continue to process new statements appended to the logs + * (whenever the config lock is revoked) until lustre_end_log + * is called. + * @param sb The superblock is used by the MGC to write to the local copy of + * the config log + * @param logname The name of the llog to replicate from the MGS + * @param cfg Since the same mgc may be used to follow multiple config logs + * (e.g. ost1, ost2, client), the config_llog_instance keeps the state for + * this log, and is added to the mgc's list of logs to follow. 
+ */ +int lustre_process_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs *bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + ENTRY; + + LASSERT(mgc); + LASSERT(cfg); + + OBD_ALLOC_PTR(bufs); + if (bufs == NULL) + RETURN(-ENOMEM); + + /* mgc_process_config */ + lustre_cfg_bufs_reset(bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(bufs, 1, logname); + lustre_cfg_bufs_set(bufs, 2, cfg, sizeof(*cfg)); + lustre_cfg_bufs_set(bufs, 3, &sb, sizeof(sb)); + lcfg = lustre_cfg_new(LCFG_LOG_START, bufs); + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + lustre_cfg_free(lcfg); + + OBD_FREE_PTR(bufs); + + if (rc == -EINVAL) + LCONSOLE_ERROR_MSG(0x15b, "%s: The configuration from log '%s'" + "failed from the MGS (%d). Make sure this " + "client and the MGS are running compatible " + "versions of Lustre.\n", + mgc->obd_name, logname, rc); + + if (rc) + LCONSOLE_ERROR_MSG(0x15c, "%s: The configuration from log '%s' " + "failed (%d). This may be the result of " + "communication errors between this node and " + "the MGS, a bad configuration, or other " + "errors. 
See the syslog for more " + "information.\n", mgc->obd_name, logname, + rc); + + /* class_obd_list(); */ + RETURN(rc); +} +EXPORT_SYMBOL(lustre_process_log); + +/* Stop watching this config log for updates */ +int lustre_end_log(struct super_block *sb, char *logname, + struct config_llog_instance *cfg) +{ + struct lustre_cfg *lcfg; + struct lustre_cfg_bufs bufs; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *mgc = lsi->lsi_mgc; + int rc; + ENTRY; + + if (!mgc) + RETURN(-ENOENT); + + /* mgc_process_config */ + lustre_cfg_bufs_reset(&bufs, mgc->obd_name); + lustre_cfg_bufs_set_string(&bufs, 1, logname); + if (cfg) + lustre_cfg_bufs_set(&bufs, 2, cfg, sizeof(*cfg)); + lcfg = lustre_cfg_new(LCFG_LOG_END, &bufs); + rc = obd_process_config(mgc, sizeof(*lcfg), lcfg); + lustre_cfg_free(lcfg); + RETURN(rc); +} +EXPORT_SYMBOL(lustre_end_log); + +/**************** obd start *******************/ + +/** lustre_cfg_bufs are a holdover from 1.4; we can still set these up from + * lctl (and do for echo cli/srv. + */ +int do_lcfg(char *cfgname, lnet_nid_t nid, int cmd, + char *s1, char *s2, char *s3, char *s4) +{ + struct lustre_cfg_bufs bufs; + struct lustre_cfg * lcfg = NULL; + int rc; + + CDEBUG(D_TRACE, "lcfg %s %#x %s %s %s %s\n", cfgname, + cmd, s1, s2, s3, s4); + + lustre_cfg_bufs_reset(&bufs, cfgname); + if (s1) + lustre_cfg_bufs_set_string(&bufs, 1, s1); + if (s2) + lustre_cfg_bufs_set_string(&bufs, 2, s2); + if (s3) + lustre_cfg_bufs_set_string(&bufs, 3, s3); + if (s4) + lustre_cfg_bufs_set_string(&bufs, 4, s4); + + lcfg = lustre_cfg_new(cmd, &bufs); + lcfg->lcfg_nid = nid; + rc = class_process_config(lcfg); + lustre_cfg_free(lcfg); + return(rc); +} +EXPORT_SYMBOL(do_lcfg); + +/** Call class_attach and class_setup. These methods in turn call + * obd type-specific methods. 
+ */ +int lustre_start_simple(char *obdname, char *type, char *uuid, + char *s1, char *s2, char *s3, char *s4) +{ + int rc; + CDEBUG(D_MOUNT, "Starting obd %s (typ=%s)\n", obdname, type); + + rc = do_lcfg(obdname, 0, LCFG_ATTACH, type, uuid, 0, 0); + if (rc) { + CERROR("%s attach error %d\n", obdname, rc); + return rc; + } + rc = do_lcfg(obdname, 0, LCFG_SETUP, s1, s2, s3, s4); + if (rc) { + CERROR("%s setup error %d\n", obdname, rc); + do_lcfg(obdname, 0, LCFG_DETACH, 0, 0, 0, 0); + } + return rc; +} + +DEFINE_MUTEX(mgc_start_lock); + +/** Set up a mgc obd to process startup logs + * + * \param sb [in] super block of the mgc obd + * + * \retval 0 success, otherwise error code + */ +int lustre_start_mgc(struct super_block *sb) +{ + struct obd_connect_data *data = NULL; + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + struct obd_export *exp; + struct obd_uuid *uuid; + class_uuid_t uuidc; + lnet_nid_t nid; + char *mgcname = NULL, *niduuid = NULL, *mgssec = NULL; + char *ptr; + int recov_bk; + int rc = 0, i = 0, j, len; + ENTRY; + + LASSERT(lsi->lsi_lmd); + + /* Find the first non-lo MGS nid for our MGC name */ + if (IS_SERVER(lsi)) { + /* mount -o mgsnode=nid */ + ptr = lsi->lsi_lmd->lmd_mgs; + if (lsi->lsi_lmd->lmd_mgs && + (class_parse_nid(lsi->lsi_lmd->lmd_mgs, &nid, &ptr) == 0)) { + i++; + } else if (IS_MGS(lsi)) { + lnet_process_id_t id; + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + if (LNET_NETTYP(LNET_NIDNET(id.nid)) == LOLND) + continue; + nid = id.nid; + i++; + break; + } + } + } else { /* client */ + /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + if (class_parse_nid(ptr, &nid, &ptr) == 0) + i++; + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + RETURN(-EINVAL); + } + + mutex_lock(&mgc_start_lock); + + len = strlen(LUSTRE_MGC_OBDNAME) + strlen(libcfs_nid2str(nid)) + 1; + OBD_ALLOC(mgcname, len); + OBD_ALLOC(niduuid, len + 2); + if (!mgcname || !niduuid) + GOTO(out_free, 
rc = -ENOMEM); + sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid)); + + mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : ""; + + OBD_ALLOC_PTR(data); + if (data == NULL) + GOTO(out_free, rc = -ENOMEM); + + obd = class_name2obd(mgcname); + if (obd && !obd->obd_stopping) { + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + + /* Re-using an existing MGC */ + atomic_inc(&obd->u.cli.cl_mgc_refcount); + + /* IR compatibility check, only for clients */ + if (lmd_is_client(lsi->lsi_lmd)) { + int has_ir; + int vallen = sizeof(*data); + __u32 *flags = &lsi->lsi_lmd->lmd_flags; + + rc = obd_get_info(NULL, obd->obd_self_export, + strlen(KEY_CONN_DATA), KEY_CONN_DATA, + &vallen, data, NULL); + LASSERT(rc == 0); + has_ir = OCD_HAS_FLAG(data, IMP_RECOV); + if (has_ir ^ !(*flags & LMD_FLG_NOIR)) { + /* LMD_FLG_NOIR is for test purpose only */ + LCONSOLE_WARN( + "Trying to mount a client with IR setting " + "not compatible with current mgc. " + "Force to use current mgc setting that is " + "IR %s.\n", + has_ir ? "enabled" : "disabled"); + if (has_ir) + *flags &= ~LMD_FLG_NOIR; + else + *flags |= LMD_FLG_NOIR; + } + } + + recov_bk = 0; + /* If we are restarting the MGS, don't try to keep the MGC's + old connection, or registration will fail. */ + if (IS_MGS(lsi)) { + CDEBUG(D_MOUNT, "New MGS with live MGC\n"); + recov_bk = 1; + } + + /* Try all connections, but only once (again). + We don't want to block another target from starting + (using its local copy of the log), but we do want to connect + if at all possible. 
*/ + recov_bk++; + CDEBUG(D_MOUNT, "%s: Set MGC reconnect %d\n", mgcname,recov_bk); + rc = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_INIT_RECOV_BACKUP), + KEY_INIT_RECOV_BACKUP, + sizeof(recov_bk), &recov_bk, NULL); + GOTO(out, rc = 0); + } + + CDEBUG(D_MOUNT, "Start MGC '%s'\n", mgcname); + + /* Add the primary nids for the MGS */ + i = 0; + sprintf(niduuid, "%s_%x", mgcname, i); + if (IS_SERVER(lsi)) { + ptr = lsi->lsi_lmd->lmd_mgs; + if (IS_MGS(lsi)) { + /* Use local nids (including LO) */ + lnet_process_id_t id; + while ((rc = LNetGetId(i++, &id)) != -ENOENT) { + rc = do_lcfg(mgcname, id.nid, + LCFG_ADD_UUID, niduuid, 0,0,0); + } + } else { + /* Use mgsnode= nids */ + /* mount -o mgsnode=nid */ + if (lsi->lsi_lmd->lmd_mgs) { + ptr = lsi->lsi_lmd->lmd_mgs; + } else if (class_find_param(ptr, PARAM_MGSNODE, + &ptr) != 0) { + CERROR("No MGS nids given.\n"); + GOTO(out_free, rc = -EINVAL); + } + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, 0,0,0); + i++; + } + } + } else { /* client */ + /* Use nids from mount line: uml1,1@elan:uml2,2@elan:/lustre */ + ptr = lsi->lsi_lmd->lmd_dev; + while (class_parse_nid(ptr, &nid, &ptr) == 0) { + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, 0,0,0); + i++; + /* Stop at the first failover nid */ + if (*ptr == ':') + break; + } + } + if (i == 0) { + CERROR("No valid MGS nids found.\n"); + GOTO(out_free, rc = -EINVAL); + } + lsi->lsi_lmd->lmd_mgs_failnodes = 1; + + /* Random uuid for MGC allows easier reconnects */ + OBD_ALLOC_PTR(uuid); + ll_generate_random_uuid(uuidc); + class_uuid_unparse(uuidc, uuid); + + /* Start the MGC */ + rc = lustre_start_simple(mgcname, LUSTRE_MGC_NAME, + (char *)uuid->uuid, LUSTRE_MGS_OBDNAME, + niduuid, 0, 0); + OBD_FREE_PTR(uuid); + if (rc) + GOTO(out_free, rc); + + /* Add any failover MGS nids */ + i = 1; + while (ptr && ((*ptr == ':' || + class_find_param(ptr, PARAM_MGSNODE, &ptr) == 0))) { + /* New failover node */ + 
sprintf(niduuid, "%s_%x", mgcname, i); + j = 0; + while (class_parse_nid_quiet(ptr, &nid, &ptr) == 0) { + j++; + rc = do_lcfg(mgcname, nid, + LCFG_ADD_UUID, niduuid, 0,0,0); + if (*ptr == ':') + break; + } + if (j > 0) { + rc = do_lcfg(mgcname, 0, LCFG_ADD_CONN, + niduuid, 0, 0, 0); + i++; + } else { + /* at ":/fsname" */ + break; + } + } + lsi->lsi_lmd->lmd_mgs_failnodes = i; + + obd = class_name2obd(mgcname); + if (!obd) { + CERROR("Can't find mgcobd %s\n", mgcname); + GOTO(out_free, rc = -ENOTCONN); + } + + rc = obd_set_info_async(NULL, obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + + /* Keep a refcount of servers/clients who started with "mount", + so we know when we can get rid of the mgc. */ + atomic_set(&obd->u.cli.cl_mgc_refcount, 1); + + /* Try all connections, but only once. */ + recov_bk = 1; + rc = obd_set_info_async(NULL, obd->obd_self_export, + sizeof(KEY_INIT_RECOV_BACKUP), + KEY_INIT_RECOV_BACKUP, + sizeof(recov_bk), &recov_bk, NULL); + if (rc) + /* nonfatal */ + CWARN("can't set %s %d\n", KEY_INIT_RECOV_BACKUP, rc); + + /* We connect to the MGS at setup, and don't disconnect until cleanup */ + data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_AT | + OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | + OBD_CONNECT_LVB_TYPE; + +#if LUSTRE_VERSION_CODE < OBD_OCD_VERSION(3, 2, 50, 0) + data->ocd_connect_flags |= OBD_CONNECT_MNE_SWAB; +#else +#warning "LU-1644: Remove old OBD_CONNECT_MNE_SWAB fixup and imp_need_mne_swab" +#endif + + if (lmd_is_client(lsi->lsi_lmd) && + lsi->lsi_lmd->lmd_flags & LMD_FLG_NOIR) + data->ocd_connect_flags &= ~OBD_CONNECT_IMP_RECOV; + data->ocd_version = LUSTRE_VERSION_CODE; + rc = obd_connect(NULL, &exp, obd, &(obd->obd_uuid), data, NULL); + if (rc) { + CERROR("connect failed %d\n", rc); + GOTO(out, rc); + } + + obd->u.cli.cl_mgc_mgsexp = exp; + +out: + /* Keep the mgc info in the sb. 
Note that many lsi's can point + to the same mgc.*/ + lsi->lsi_mgc = obd; +out_free: + mutex_unlock(&mgc_start_lock); + + if (data) + OBD_FREE_PTR(data); + if (mgcname) + OBD_FREE(mgcname, len); + if (niduuid) + OBD_FREE(niduuid, len + 2); + RETURN(rc); +} + +static int lustre_stop_mgc(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct obd_device *obd; + char *niduuid = 0, *ptr = 0; + int i, rc = 0, len = 0; + ENTRY; + + if (!lsi) + RETURN(-ENOENT); + obd = lsi->lsi_mgc; + if (!obd) + RETURN(-ENOENT); + lsi->lsi_mgc = NULL; + + mutex_lock(&mgc_start_lock); + LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0); + if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) { + /* This is not fatal, every client that stops + will call in here. */ + CDEBUG(D_MOUNT, "mgc still has %d references.\n", + atomic_read(&obd->u.cli.cl_mgc_refcount)); + GOTO(out, rc = -EBUSY); + } + + /* The MGC has no recoverable data in any case. + * force shotdown set in umount_begin */ + obd->obd_no_recov = 1; + + if (obd->u.cli.cl_mgc_mgsexp) { + /* An error is not fatal, if we are unable to send the + disconnect mgs ping evictor cleans up the export */ + rc = obd_disconnect(obd->u.cli.cl_mgc_mgsexp); + if (rc) + CDEBUG(D_MOUNT, "disconnect failed %d\n", rc); + } + + /* Save the obdname for cleaning the nid uuids, which are + obdname_XX */ + len = strlen(obd->obd_name) + 6; + OBD_ALLOC(niduuid, len); + if (niduuid) { + strcpy(niduuid, obd->obd_name); + ptr = niduuid + strlen(niduuid); + } + + rc = class_manual_cleanup(obd); + if (rc) + GOTO(out, rc); + + /* Clean the nid uuids */ + if (!niduuid) + GOTO(out, rc = -ENOMEM); + + for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) { + sprintf(ptr, "_%x", i); + rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID, + niduuid, 0, 0, 0); + if (rc) + CERROR("del MDC UUID %s failed: rc = %d\n", + niduuid, rc); + } +out: + if (niduuid) + OBD_FREE(niduuid, len); + + /* class_import_put will get rid of the additional connections 
*/ + mutex_unlock(&mgc_start_lock); + RETURN(rc); +} + +/***************** lustre superblock **************/ + +struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi; + ENTRY; + + OBD_ALLOC_PTR(lsi); + if (!lsi) + RETURN(NULL); + OBD_ALLOC_PTR(lsi->lsi_lmd); + if (!lsi->lsi_lmd) { + OBD_FREE_PTR(lsi); + RETURN(NULL); + } + + lsi->lsi_lmd->lmd_exclude_count = 0; + lsi->lsi_lmd->lmd_recovery_time_soft = 0; + lsi->lsi_lmd->lmd_recovery_time_hard = 0; + s2lsi_nocast(sb) = lsi; + /* we take 1 extra ref for our setup */ + atomic_set(&lsi->lsi_mounts, 1); + + /* Default umount style */ + lsi->lsi_flags = LSI_UMOUNT_FAILOVER; + + RETURN(lsi); +} + +static int lustre_free_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + LASSERT(lsi != NULL); + CDEBUG(D_MOUNT, "Freeing lsi %p\n", lsi); + + /* someone didn't call server_put_mount. */ + LASSERT(atomic_read(&lsi->lsi_mounts) == 0); + + if (lsi->lsi_lmd != NULL) { + if (lsi->lsi_lmd->lmd_dev != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_dev, + strlen(lsi->lsi_lmd->lmd_dev) + 1); + if (lsi->lsi_lmd->lmd_profile != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_profile, + strlen(lsi->lsi_lmd->lmd_profile) + 1); + if (lsi->lsi_lmd->lmd_mgssec != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgssec, + strlen(lsi->lsi_lmd->lmd_mgssec) + 1); + if (lsi->lsi_lmd->lmd_opts != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_opts, + strlen(lsi->lsi_lmd->lmd_opts) + 1); + if (lsi->lsi_lmd->lmd_exclude_count) + OBD_FREE(lsi->lsi_lmd->lmd_exclude, + sizeof(lsi->lsi_lmd->lmd_exclude[0]) * + lsi->lsi_lmd->lmd_exclude_count); + if (lsi->lsi_lmd->lmd_mgs != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgs, + strlen(lsi->lsi_lmd->lmd_mgs) + 1); + if (lsi->lsi_lmd->lmd_osd_type != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_osd_type, + strlen(lsi->lsi_lmd->lmd_osd_type) + 1); + if (lsi->lsi_lmd->lmd_params != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_params, 4096); + + OBD_FREE(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd)); + } + + 
LASSERT(lsi->lsi_llsbi == NULL); + OBD_FREE(lsi, sizeof(*lsi)); + s2lsi_nocast(sb) = NULL; + + RETURN(0); +} + +/* The lsi has one reference for every server that is using the disk - + e.g. MDT, MGS, and potentially MGC */ +int lustre_put_lsi(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + ENTRY; + + LASSERT(lsi != NULL); + + CDEBUG(D_MOUNT, "put %p %d\n", sb, atomic_read(&lsi->lsi_mounts)); + if (atomic_dec_and_test(&lsi->lsi_mounts)) { + if (IS_SERVER(lsi) && lsi->lsi_osd_exp) { + obd_disconnect(lsi->lsi_osd_exp); + /* wait till OSD is gone */ + obd_zombie_barrier(); + } + lustre_free_lsi(sb); + RETURN(1); + } + RETURN(0); +} + +/** Get the fsname ("lustre") from the server name ("lustre-OST003F"). + * @param [in] svname server name including type and index + * @param [out] fsname Buffer to copy filesystem name prefix into. + * Must have at least 'strlen(fsname) + 1' chars. + * @param [out] endptr if endptr isn't NULL it is set to end of fsname + * rc < 0 on error + */ +int server_name2fsname(const char *svname, char *fsname, const char **endptr) +{ + const char *dash = strrchr(svname, '-'); + if (!dash) { + dash = strrchr(svname, ':'); + if (!dash) + return -EINVAL; + } + + /* interpret <fsname>-MDTXXXXX-mdc as mdt, the better way is to pass + * in the fsname, then determine the server index */ + if (!strcmp(LUSTRE_MDC_NAME, dash + 1)) { + dash--; + for (; dash > svname && *dash != '-' && *dash != ':'; dash--) + ; + if (dash == svname) + return -EINVAL; + } + + if (fsname != NULL) { + strncpy(fsname, svname, dash - svname); + fsname[dash - svname] = '\0'; + } + + if (endptr != NULL) + *endptr = dash; + + return 0; +} +EXPORT_SYMBOL(server_name2fsname); + +/** + * Get service name (svname) from string + * rc < 0 on error + * if endptr isn't NULL it is set to end of fsname * + */ +int server_name2svname(const char *label, char *svname, const char **endptr, + size_t svsize) +{ + int rc; + const const char *dash; + + /* We use 
server_name2fsname() just for parsing */ + rc = server_name2fsname(label, NULL, &dash); + if (rc != 0) + return rc; + + if (*dash != '-') + return -1; + + if (strlcpy(svname, dash + 1, svsize) >= svsize) + return -E2BIG; + + return 0; +} +EXPORT_SYMBOL(server_name2svname); + + +/* Get the index from the obd name. + rc = server type, or + rc < 0 on error + if endptr isn't NULL it is set to end of name */ +int server_name2index(const char *svname, __u32 *idx, const char **endptr) +{ + unsigned long index; + int rc; + const char *dash; + + /* We use server_name2fsname() just for parsing */ + rc = server_name2fsname(svname, NULL, &dash); + if (rc != 0) + return rc; + + if (*dash != '-') + return -EINVAL; + + dash++; + + if (strncmp(dash, "MDT", 3) == 0) + rc = LDD_F_SV_TYPE_MDT; + else if (strncmp(dash, "OST", 3) == 0) + rc = LDD_F_SV_TYPE_OST; + else + return -EINVAL; + + dash += 3; + + if (strcmp(dash, "all") == 0) + return rc | LDD_F_SV_ALL; + + index = simple_strtoul(dash, (char **)endptr, 16); + *idx = index; + + return rc; +} +EXPORT_SYMBOL(server_name2index); + +/*************** mount common betweeen server and client ***************/ + +/* Common umount */ +int lustre_common_put_super(struct super_block *sb) +{ + int rc; + ENTRY; + + CDEBUG(D_MOUNT, "dropping sb %p\n", sb); + + /* Drop a ref to the MGC */ + rc = lustre_stop_mgc(sb); + if (rc && (rc != -ENOENT)) { + if (rc != -EBUSY) { + CERROR("Can't stop MGC: %d\n", rc); + RETURN(rc); + } + /* BUSY just means that there's some other obd that + needs the mgc. Let him clean it up. 
*/ + CDEBUG(D_MOUNT, "MGC still in use\n"); + } + /* Drop a ref to the mounted disk */ + lustre_put_lsi(sb); + lu_types_stop(); + RETURN(rc); +} +EXPORT_SYMBOL(lustre_common_put_super); + +static void lmd_print(struct lustre_mount_data *lmd) +{ + int i; + + PRINT_CMD(D_MOUNT, " mount data:\n"); + if (lmd_is_client(lmd)) + PRINT_CMD(D_MOUNT, "profile: %s\n", lmd->lmd_profile); + PRINT_CMD(D_MOUNT, "device: %s\n", lmd->lmd_dev); + PRINT_CMD(D_MOUNT, "flags: %x\n", lmd->lmd_flags); + + if (lmd->lmd_opts) + PRINT_CMD(D_MOUNT, "options: %s\n", lmd->lmd_opts); + + if (lmd->lmd_recovery_time_soft) + PRINT_CMD(D_MOUNT, "recovery time soft: %d\n", + lmd->lmd_recovery_time_soft); + + if (lmd->lmd_recovery_time_hard) + PRINT_CMD(D_MOUNT, "recovery time hard: %d\n", + lmd->lmd_recovery_time_hard); + + for (i = 0; i < lmd->lmd_exclude_count; i++) { + PRINT_CMD(D_MOUNT, "exclude %d: OST%04x\n", i, + lmd->lmd_exclude[i]); + } +} + +/* Is this server on the exclusion list */ +int lustre_check_exclusion(struct super_block *sb, char *svname) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + struct lustre_mount_data *lmd = lsi->lsi_lmd; + __u32 index; + int i, rc; + ENTRY; + + rc = server_name2index(svname, &index, NULL); + if (rc != LDD_F_SV_TYPE_OST) + /* Only exclude OSTs */ + RETURN(0); + + CDEBUG(D_MOUNT, "Check exclusion %s (%d) in %d of %s\n", svname, + index, lmd->lmd_exclude_count, lmd->lmd_dev); + + for(i = 0; i < lmd->lmd_exclude_count; i++) { + if (index == lmd->lmd_exclude[i]) { + CWARN("Excluding %s (on exclusion list)\n", svname); + RETURN(1); + } + } + RETURN(0); +} + +/* mount -v -o exclude=lustre-OST0001:lustre-OST0002 -t lustre ... */ +static int lmd_make_exclusion(struct lustre_mount_data *lmd, const char *ptr) +{ + const char *s1 = ptr, *s2; + __u32 index, *exclude_list; + int rc = 0, devmax; + ENTRY; + + /* The shortest an ost name can be is 8 chars: -OST0000. + We don't actually know the fsname at this time, so in fact + a user could specify any fsname. 
*/ + devmax = strlen(ptr) / 8 + 1; + + /* temp storage until we figure out how many we have */ + OBD_ALLOC(exclude_list, sizeof(index) * devmax); + if (!exclude_list) + RETURN(-ENOMEM); + + /* we enter this fn pointing at the '=' */ + while (*s1 && *s1 != ' ' && *s1 != ',') { + s1++; + rc = server_name2index(s1, &index, &s2); + if (rc < 0) { + CERROR("Can't parse server name '%s'\n", s1); + break; + } + if (rc == LDD_F_SV_TYPE_OST) + exclude_list[lmd->lmd_exclude_count++] = index; + else + CDEBUG(D_MOUNT, "ignoring exclude %.7s\n", s1); + s1 = s2; + /* now we are pointing at ':' (next exclude) + or ',' (end of excludes) */ + if (lmd->lmd_exclude_count >= devmax) + break; + } + if (rc >= 0) /* non-err */ + rc = 0; + + if (lmd->lmd_exclude_count) { + /* permanent, freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_exclude, sizeof(index) * + lmd->lmd_exclude_count); + if (lmd->lmd_exclude) { + memcpy(lmd->lmd_exclude, exclude_list, + sizeof(index) * lmd->lmd_exclude_count); + } else { + rc = -ENOMEM; + lmd->lmd_exclude_count = 0; + } + } + OBD_FREE(exclude_list, sizeof(index) * devmax); + RETURN(rc); +} + +static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr) +{ + char *tail; + int length; + + if (lmd->lmd_mgssec != NULL) { + OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1); + lmd->lmd_mgssec = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(lmd->lmd_mgssec, length + 1); + if (lmd->lmd_mgssec == NULL) + return -ENOMEM; + + memcpy(lmd->lmd_mgssec, ptr, length); + lmd->lmd_mgssec[length] = '\0'; + return 0; +} + +static int lmd_parse_string(char **handle, char *ptr) +{ + char *tail; + int length; + + if ((handle == NULL) || (ptr == NULL)) + return -EINVAL; + + if (*handle != NULL) { + OBD_FREE(*handle, strlen(*handle) + 1); + *handle = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(*handle, 
length + 1); + if (*handle == NULL) + return -ENOMEM; + + memcpy(*handle, ptr, length); + (*handle)[length] = '\0'; + + return 0; +} + +/* Collect multiple values for mgsnid specifiers */ +static int lmd_parse_mgs(struct lustre_mount_data *lmd, char **ptr) +{ + lnet_nid_t nid; + char *tail = *ptr; + char *mgsnid; + int length; + int oldlen = 0; + + /* Find end of nidlist */ + while (class_parse_nid_quiet(tail, &nid, &tail) == 0) {} + length = tail - *ptr; + if (length == 0) { + LCONSOLE_ERROR_MSG(0x159, "Can't parse NID '%s'\n", *ptr); + return -EINVAL; + } + + if (lmd->lmd_mgs != NULL) + oldlen = strlen(lmd->lmd_mgs) + 1; + + OBD_ALLOC(mgsnid, oldlen + length + 1); + if (mgsnid == NULL) + return -ENOMEM; + + if (lmd->lmd_mgs != NULL) { + /* Multiple mgsnid= are taken to mean failover locations */ + memcpy(mgsnid, lmd->lmd_mgs, oldlen); + mgsnid[oldlen - 1] = ':'; + OBD_FREE(lmd->lmd_mgs, oldlen); + } + memcpy(mgsnid + oldlen, *ptr, length); + mgsnid[oldlen + length] = '\0'; + lmd->lmd_mgs = mgsnid; + *ptr = tail; + + return 0; +} + +/** Parse mount line options + * e.g. mount -v -t lustre -o abort_recov uml1:uml2:/lustre-client /mnt/lustre + * dev is passed as device=uml1:/lustre by mount.lustre + */ +static int lmd_parse(char *options, struct lustre_mount_data *lmd) +{ + char *s1, *s2, *devname = NULL; + struct lustre_mount_data *raw = (struct lustre_mount_data *)options; + int rc = 0; + ENTRY; + + LASSERT(lmd); + if (!options) { + LCONSOLE_ERROR_MSG(0x162, "Missing mount data: check that " + "/sbin/mount.lustre is installed.\n"); + RETURN(-EINVAL); + } + + /* Options should be a string - try to detect old lmd data */ + if ((raw->lmd_magic & 0xffffff00) == (LMD_MAGIC & 0xffffff00)) { + LCONSOLE_ERROR_MSG(0x163, "You're using an old version of " + "/sbin/mount.lustre. 
Please install " + "version %s\n", LUSTRE_VERSION_STRING); + RETURN(-EINVAL); + } + lmd->lmd_magic = LMD_MAGIC; + + OBD_ALLOC(lmd->lmd_params, 4096); + if (lmd->lmd_params == NULL) + RETURN(-ENOMEM); + lmd->lmd_params[0] = '\0'; + + /* Set default flags here */ + + s1 = options; + while (*s1) { + int clear = 0; + int time_min = OBD_RECOVERY_TIME_MIN; + + /* Skip whitespace and extra commas */ + while (*s1 == ' ' || *s1 == ',') + s1++; + + /* Client options are parsed in ll_options: eg. flock, + user_xattr, acl */ + + /* Parse non-ldiskfs options here. Rather than modifying + ldiskfs, we just zero these out here */ + if (strncmp(s1, "abort_recov", 11) == 0) { + lmd->lmd_flags |= LMD_FLG_ABORT_RECOV; + clear++; + } else if (strncmp(s1, "recovery_time_soft=", 19) == 0) { + lmd->lmd_recovery_time_soft = max_t(int, + simple_strtoul(s1 + 19, NULL, 10), time_min); + clear++; + } else if (strncmp(s1, "recovery_time_hard=", 19) == 0) { + lmd->lmd_recovery_time_hard = max_t(int, + simple_strtoul(s1 + 19, NULL, 10), time_min); + clear++; + } else if (strncmp(s1, "noir", 4) == 0) { + lmd->lmd_flags |= LMD_FLG_NOIR; /* test purpose only. */ + clear++; + } else if (strncmp(s1, "nosvc", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSVC; + clear++; + } else if (strncmp(s1, "nomgs", 5) == 0) { + lmd->lmd_flags |= LMD_FLG_NOMGS; + clear++; + } else if (strncmp(s1, "noscrub", 7) == 0) { + lmd->lmd_flags |= LMD_FLG_NOSCRUB; + clear++; + } else if (strncmp(s1, PARAM_MGSNODE, + sizeof(PARAM_MGSNODE) - 1) == 0) { + s2 = s1 + sizeof(PARAM_MGSNODE) - 1; + /* Assume the next mount opt is the first + invalid nid we get to. 
*/ + rc = lmd_parse_mgs(lmd, &s2); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "writeconf", 9) == 0) { + lmd->lmd_flags |= LMD_FLG_WRITECONF; + clear++; + } else if (strncmp(s1, "update", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_UPDATE; + clear++; + } else if (strncmp(s1, "virgin", 6) == 0) { + lmd->lmd_flags |= LMD_FLG_VIRGIN; + clear++; + } else if (strncmp(s1, "noprimnode", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_NO_PRIMNODE; + clear++; + } else if (strncmp(s1, "mgssec=", 7) == 0) { + rc = lmd_parse_mgssec(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + /* ost exclusion list */ + } else if (strncmp(s1, "exclude=", 8) == 0) { + rc = lmd_make_exclusion(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "mgs", 3) == 0) { + /* We are an MGS */ + lmd->lmd_flags |= LMD_FLG_MGS; + clear++; + } else if (strncmp(s1, "svname=", 7) == 0) { + rc = lmd_parse_string(&lmd->lmd_profile, s1 + 7); + if (rc) + goto invalid; + clear++; + } else if (strncmp(s1, "param=", 6) == 0) { + int length; + char *tail = strchr(s1 + 6, ','); + if (tail == NULL) + length = strlen(s1); + else + length = tail - s1; + length -= 6; + strncat(lmd->lmd_params, s1 + 6, length); + strcat(lmd->lmd_params, " "); + clear++; + } else if (strncmp(s1, "osd=", 4) == 0) { + rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4); + if (rc) + goto invalid; + clear++; + } + /* Linux 2.4 doesn't pass the device, so we stuck it at the + end of the options. */ + else if (strncmp(s1, "device=", 7) == 0) { + devname = s1 + 7; + /* terminate options right before device. device + must be the last one. 
*/ + *s1 = '\0'; + break; + } + + /* Find next opt */ + s2 = strchr(s1, ','); + if (s2 == NULL) { + if (clear) + *s1 = '\0'; + break; + } + s2++; + if (clear) + memmove(s1, s2, strlen(s2) + 1); + else + s1 = s2; + } + + if (!devname) { + LCONSOLE_ERROR_MSG(0x164, "Can't find the device name " + "(need mount option 'device=...')\n"); + goto invalid; + } + + s1 = strstr(devname, ":/"); + if (s1) { + ++s1; + lmd->lmd_flags |= LMD_FLG_CLIENT; + /* Remove leading /s from fsname */ + while (*++s1 == '/') ; + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_profile, strlen(s1) + 8); + if (!lmd->lmd_profile) + RETURN(-ENOMEM); + sprintf(lmd->lmd_profile, "%s-client", s1); + } + + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_dev, strlen(devname) + 1); + if (!lmd->lmd_dev) + RETURN(-ENOMEM); + strcpy(lmd->lmd_dev, devname); + + /* Save mount options */ + s1 = options + strlen(options) - 1; + while (s1 >= options && (*s1 == ',' || *s1 == ' ')) + *s1-- = 0; + if (*options != 0) { + /* Freed in lustre_free_lsi */ + OBD_ALLOC(lmd->lmd_opts, strlen(options) + 1); + if (!lmd->lmd_opts) + RETURN(-ENOMEM); + strcpy(lmd->lmd_opts, options); + } + + lmd_print(lmd); + lmd->lmd_magic = LMD_MAGIC; + + RETURN(rc); + +invalid: + CERROR("Bad mount options %s\n", options); + RETURN(-EINVAL); +} + +struct lustre_mount_data2 { + void *lmd2_data; + struct vfsmount *lmd2_mnt; +}; + +/** This is the entry point for the mount call into Lustre. + * This is called when a server or client is mounted, + * and this is where we start setting things up. + * @param data Mount options (e.g. 
-o flock,abort_recov) + */ +int lustre_fill_super(struct super_block *sb, void *data, int silent) +{ + struct lustre_mount_data *lmd; + struct lustre_mount_data2 *lmd2 = data; + struct lustre_sb_info *lsi; + int rc; + ENTRY; + + CDEBUG(D_MOUNT|D_VFSTRACE, "VFS Op: sb %p\n", sb); + + lsi = lustre_init_lsi(sb); + if (!lsi) + RETURN(-ENOMEM); + lmd = lsi->lsi_lmd; + + /* + * Disable lockdep during mount, because mount locking patterns are + * `special'. + */ + lockdep_off(); + + /* + * LU-639: the obd cleanup of last mount may not finish yet, wait here. + */ + obd_zombie_barrier(); + + /* Figure out the lmd from the mount options */ + if (lmd_parse((char *)(lmd2->lmd2_data), lmd)) { + lustre_put_lsi(sb); + GOTO(out, rc = -EINVAL); + } + + if (lmd_is_client(lmd)) { + CDEBUG(D_MOUNT, "Mounting client %s\n", lmd->lmd_profile); + if (!client_fill_super) { + LCONSOLE_ERROR_MSG(0x165, "Nothing registered for " + "client mount! Is the 'lustre' " + "module loaded?\n"); + lustre_put_lsi(sb); + rc = -ENODEV; + } else { + rc = lustre_start_mgc(sb); + if (rc) { + lustre_put_lsi(sb); + GOTO(out, rc); + } + /* Connect and start */ + /* (should always be ll_fill_super) */ + rc = (*client_fill_super)(sb, lmd2->lmd2_mnt); + /* c_f_s will call lustre_common_put_super on failure */ + } + } else { + CERROR("This is client-side-only module, " + "cannot handle server mount.\n"); + rc = -EINVAL; + } + + /* If error happens in fill_super() call, @lsi will be killed there. + * This is why we do not put it here. */ + GOTO(out, rc); +out: + if (rc) { + CERROR("Unable to mount %s (%d)\n", + s2lsi(sb) ? lmd->lmd_dev : "", rc); + } else { + CDEBUG(D_SUPER, "Mount %s complete\n", + lmd->lmd_dev); + } + lockdep_on(); + return rc; +} + + +/* We can't call ll_fill_super by name because it lives in a module that + must be loaded after this one. 
*/ +/* Stash the client module's fill_super hook; invoked by lustre_fill_super(). */ +void lustre_register_client_fill_super(int (*cfs)(struct super_block *sb, + struct vfsmount *mnt)) +{ + client_fill_super = cfs; +} +EXPORT_SYMBOL(lustre_register_client_fill_super); + +void lustre_register_kill_super_cb(void (*cfs)(struct super_block *sb)) +{ + kill_super_cb = cfs; +} +EXPORT_SYMBOL(lustre_register_kill_super_cb); + +/***************** FS registration ******************/ +/* .mount entry for the lustre fs_type; bundles the raw mount data and (as yet + * unset) vfsmount so lustre_fill_super() can reach both. */ +struct dentry *lustre_mount(struct file_system_type *fs_type, int flags, + const char *devname, void *data) +{ + struct lustre_mount_data2 lmd2 = { data, NULL }; + + return mount_nodev(fs_type, flags, &lmd2, lustre_fill_super); +} + +/* Give the registered client callback a chance to clean up before the + * generic anonymous-super teardown. */ +void lustre_kill_super(struct super_block *sb) +{ + struct lustre_sb_info *lsi = s2lsi(sb); + + if (kill_super_cb && lsi && !IS_SERVER(lsi)) + (*kill_super_cb)(sb); + + kill_anon_super(sb); +} + +/** Register the "lustre" fs type + */ +struct file_system_type lustre_fs_type = { + .owner = THIS_MODULE, + .name = "lustre", + .mount = lustre_mount, + .kill_sb = lustre_kill_super, + .fs_flags = FS_BINARY_MOUNTDATA | FS_REQUIRES_DEV | + FS_HAS_FIEMAP | FS_RENAME_DOES_D_MOVE, +}; + +int lustre_register_fs(void) +{ + return register_filesystem(&lustre_fs_type); +} + +int lustre_unregister_fs(void) +{ + return unregister_filesystem(&lustre_fs_type); +} diff --git a/drivers/staging/lustre/lustre/obdclass/obdo.c b/drivers/staging/lustre/lustre/obdclass/obdo.c new file mode 100644 index 000000000000..01a0e1f83a68 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/obdo.c @@ -0,0 +1,362 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation.
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Intel Corporation. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/obdo.c + * + * Object Devices Class Driver + * These are the only exported functions, they provide some generic + * infrastructure for managing object devices + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +#include <obd_class.h> +#include <lustre/lustre_idl.h> + +/* Record the parent directory FID (seq/oid/ver) in @dst and flag the + * corresponding fields as valid. */ +void obdo_set_parent_fid(struct obdo *dst, const struct lu_fid *parent) +{ + dst->o_parent_oid = fid_oid(parent); + dst->o_parent_seq = fid_seq(parent); + dst->o_parent_ver = fid_ver(parent); + dst->o_valid |= OBD_MD_FLGENER | OBD_MD_FLFID; +} +EXPORT_SYMBOL(obdo_set_parent_fid); + +/* WARNING: the file systems must take care not to tinker with + attributes they don't manage (such as blocks).
*/ +/* Fill @dst from @src inode for each attribute named in @valid; bits for the + * fields actually copied are accumulated into dst->o_valid. */ +void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid) +{ + obd_flag newvalid = 0; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n", + valid, LTIME_S(src->i_mtime), + LTIME_S(src->i_ctime)); + + if (valid & OBD_MD_FLATIME) { + dst->o_atime = LTIME_S(src->i_atime); + newvalid |= OBD_MD_FLATIME; + } + if (valid & OBD_MD_FLMTIME) { + dst->o_mtime = LTIME_S(src->i_mtime); + newvalid |= OBD_MD_FLMTIME; + } + if (valid & OBD_MD_FLCTIME) { + dst->o_ctime = LTIME_S(src->i_ctime); + newvalid |= OBD_MD_FLCTIME; + } + if (valid & OBD_MD_FLSIZE) { + dst->o_size = i_size_read(src); + newvalid |= OBD_MD_FLSIZE; + } + if (valid & OBD_MD_FLBLOCKS) { /* allocation of space (x512 bytes) */ + dst->o_blocks = src->i_blocks; + newvalid |= OBD_MD_FLBLOCKS; + } + if (valid & OBD_MD_FLBLKSZ) { /* optimal block size */ + dst->o_blksize = ll_inode_blksize(src); + newvalid |= OBD_MD_FLBLKSZ; + } + if (valid & OBD_MD_FLTYPE) { + dst->o_mode = (dst->o_mode & S_IALLUGO) | + (src->i_mode & S_IFMT); + newvalid |= OBD_MD_FLTYPE; + } + if (valid & OBD_MD_FLMODE) { + dst->o_mode = (dst->o_mode & S_IFMT) | + (src->i_mode & S_IALLUGO); + newvalid |= OBD_MD_FLMODE; + } + if (valid & OBD_MD_FLUID) { + dst->o_uid = src->i_uid; + newvalid |= OBD_MD_FLUID; + } + if (valid & OBD_MD_FLGID) { + dst->o_gid = src->i_gid; + newvalid |= OBD_MD_FLGID; + } + if (valid & OBD_MD_FLFLAGS) { + dst->o_flags = ll_inode_flags(src); + newvalid |= OBD_MD_FLFLAGS; + } + dst->o_valid |= newvalid; +} +EXPORT_SYMBOL(obdo_from_inode); + +/* Copy the obdo fields selected by @valid from @src to @dst; @valid is + * OR-ed into dst->o_valid at the end. */ +void obdo_cpy_md(struct obdo *dst, struct obdo *src, obd_flag valid) +{ + CDEBUG(D_INODE, "src obdo "DOSTID" valid "LPX64", dst obdo "DOSTID"\n", + POSTID(&src->o_oi), src->o_valid, POSTID(&dst->o_oi)); + if (valid & OBD_MD_FLATIME) + dst->o_atime = src->o_atime; + if (valid & OBD_MD_FLMTIME) + dst->o_mtime = src->o_mtime; + if (valid & OBD_MD_FLCTIME) + dst->o_ctime = src->o_ctime; + if (valid & OBD_MD_FLSIZE) + 
dst->o_size = src->o_size; + if (valid & OBD_MD_FLBLOCKS) /* allocation of space */ + dst->o_blocks = src->o_blocks; + if (valid & OBD_MD_FLBLKSZ) + dst->o_blksize = src->o_blksize; + if (valid & OBD_MD_FLTYPE) + dst->o_mode = (dst->o_mode & ~S_IFMT) | (src->o_mode & S_IFMT); + if (valid & OBD_MD_FLMODE) + dst->o_mode = (dst->o_mode & S_IFMT) | (src->o_mode & ~S_IFMT); + if (valid & OBD_MD_FLUID) + dst->o_uid = src->o_uid; + if (valid & OBD_MD_FLGID) + dst->o_gid = src->o_gid; + if (valid & OBD_MD_FLFLAGS) + dst->o_flags = src->o_flags; + if (valid & OBD_MD_FLFID) { + dst->o_parent_seq = src->o_parent_seq; + dst->o_parent_ver = src->o_parent_ver; + } + if (valid & OBD_MD_FLGENER) + dst->o_parent_oid = src->o_parent_oid; + if (valid & OBD_MD_FLHANDLE) + dst->o_handle = src->o_handle; + if (valid & OBD_MD_FLCOOKIE) + dst->o_lcookie = src->o_lcookie; + + dst->o_valid |= valid; +} +EXPORT_SYMBOL(obdo_cpy_md); + +/* returns FALSE if comparison (by flags) is same, TRUE if changed */ +/* Only the fields named in @compare are examined; any mismatch makes the + * result non-zero. */ +int obdo_cmp_md(struct obdo *dst, struct obdo *src, obd_flag compare) +{ + int res = 0; + + if ( compare & OBD_MD_FLATIME ) + res = (res || (dst->o_atime != src->o_atime)); + if ( compare & OBD_MD_FLMTIME ) + res = (res || (dst->o_mtime != src->o_mtime)); + if ( compare & OBD_MD_FLCTIME ) + res = (res || (dst->o_ctime != src->o_ctime)); + if ( compare & OBD_MD_FLSIZE ) + res = (res || (dst->o_size != src->o_size)); + if ( compare & OBD_MD_FLBLOCKS ) /* allocation of space */ + res = (res || (dst->o_blocks != src->o_blocks)); + if ( compare & OBD_MD_FLBLKSZ ) + res = (res || (dst->o_blksize != src->o_blksize)); + if ( compare & OBD_MD_FLTYPE ) + res = (res || (((dst->o_mode ^ src->o_mode) & S_IFMT) != 0)); + if ( compare & OBD_MD_FLMODE ) + res = (res || (((dst->o_mode ^ src->o_mode) & ~S_IFMT) != 0)); + if ( compare & OBD_MD_FLUID ) + res = (res || (dst->o_uid != src->o_uid)); + if ( compare & OBD_MD_FLGID ) + res = (res || (dst->o_gid != src->o_gid)); + if ( compare & OBD_MD_FLFLAGS ) + res 
= (res || (dst->o_flags != src->o_flags)); + if ( compare & OBD_MD_FLNLINK ) + res = (res || (dst->o_nlink != src->o_nlink)); + if ( compare & OBD_MD_FLFID ) { + res = (res || (dst->o_parent_seq != src->o_parent_seq)); + res = (res || (dst->o_parent_ver != src->o_parent_ver)); + } + if ( compare & OBD_MD_FLGENER ) + res = (res || (dst->o_parent_oid != src->o_parent_oid)); + /* XXX Don't know if these should be included here - wasn't previously + if ( compare & OBD_MD_FLINLINE ) + res = (res || memcmp(dst->o_inline, src->o_inline)); + */ + return res; +} +EXPORT_SYMBOL(obdo_cmp_md); + +/* Pack the object id from @oa into a bulk I/O descriptor, forcing the MDT0 + * sequence when the group bit was never set. */ +void obdo_to_ioobj(struct obdo *oa, struct obd_ioobj *ioobj) +{ + ioobj->ioo_oid = oa->o_oi; + if (unlikely(!(oa->o_valid & OBD_MD_FLGROUP))) + ostid_set_seq_mdt0(&ioobj->ioo_oid); + + /* Since 2.4 this does not contain o_mode in the low 16 bits. + * Instead, it holds (bd_md_max_brw - 1) for multi-bulk BRW RPCs */ + ioobj->ioo_max_brw = 0; +} +EXPORT_SYMBOL(obdo_to_ioobj); + +/* Translate the VFS setattr fields selected by @ia_valid into obdo fields, + * setting the matching OBD_MD_* valid bits. */ +void obdo_from_iattr(struct obdo *oa, struct iattr *attr, unsigned int ia_valid) +{ + if (ia_valid & ATTR_ATIME) { + oa->o_atime = LTIME_S(attr->ia_atime); + oa->o_valid |= OBD_MD_FLATIME; + } + if (ia_valid & ATTR_MTIME) { + oa->o_mtime = LTIME_S(attr->ia_mtime); + oa->o_valid |= OBD_MD_FLMTIME; + } + if (ia_valid & ATTR_CTIME) { + oa->o_ctime = LTIME_S(attr->ia_ctime); + oa->o_valid |= OBD_MD_FLCTIME; + } + if (ia_valid & ATTR_SIZE) { + oa->o_size = attr->ia_size; + oa->o_valid |= OBD_MD_FLSIZE; + } + if (ia_valid & ATTR_MODE) { + oa->o_mode = attr->ia_mode; + oa->o_valid |= OBD_MD_FLTYPE | OBD_MD_FLMODE; + if (!current_is_in_group(oa->o_gid) && + !cfs_capable(CFS_CAP_FSETID)) + oa->o_mode &= ~S_ISGID; + } + if (ia_valid & ATTR_UID) { + oa->o_uid = attr->ia_uid; + oa->o_valid |= OBD_MD_FLUID; + } + if (ia_valid & ATTR_GID) { + oa->o_gid = attr->ia_gid; + oa->o_valid |= OBD_MD_FLGID; + } +} +EXPORT_SYMBOL(obdo_from_iattr); + +/* Inverse of obdo_from_iattr(): build a VFS iattr from the obdo fields that + * are both requested in @valid and marked valid in @oa. */ +void iattr_from_obdo(struct iattr *attr, struct obdo *oa, obd_flag valid) +{ + valid &= 
oa->o_valid; + + if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME)) + CDEBUG(D_INODE, "valid "LPX64", new time "LPU64"/"LPU64"\n", + oa->o_valid, oa->o_mtime, oa->o_ctime); + + attr->ia_valid = 0; + if (valid & OBD_MD_FLATIME) { + LTIME_S(attr->ia_atime) = oa->o_atime; + attr->ia_valid |= ATTR_ATIME; + } + if (valid & OBD_MD_FLMTIME) { + LTIME_S(attr->ia_mtime) = oa->o_mtime; + attr->ia_valid |= ATTR_MTIME; + } + if (valid & OBD_MD_FLCTIME) { + LTIME_S(attr->ia_ctime) = oa->o_ctime; + attr->ia_valid |= ATTR_CTIME; + } + if (valid & OBD_MD_FLSIZE) { + attr->ia_size = oa->o_size; + attr->ia_valid |= ATTR_SIZE; + } +#if 0 /* you shouldn't be able to change a file's type with setattr */ + if (valid & OBD_MD_FLTYPE) { + attr->ia_mode = (attr->ia_mode & ~S_IFMT)|(oa->o_mode & S_IFMT); + attr->ia_valid |= ATTR_MODE; + } +#endif + if (valid & OBD_MD_FLMODE) { + attr->ia_mode = (attr->ia_mode & S_IFMT)|(oa->o_mode & ~S_IFMT); + attr->ia_valid |= ATTR_MODE; + if (!current_is_in_group(oa->o_gid) && + !cfs_capable(CFS_CAP_FSETID)) + attr->ia_mode &= ~S_ISGID; + } + if (valid & OBD_MD_FLUID) { + attr->ia_uid = oa->o_uid; + attr->ia_valid |= ATTR_UID; + } + if (valid & OBD_MD_FLGID) { + attr->ia_gid = oa->o_gid; + attr->ia_valid |= ATTR_GID; + } +} +EXPORT_SYMBOL(iattr_from_obdo); + +/* As iattr_from_obdo(), but additionally carries blocks and flags from @oa + * into @op_data. */ +void md_from_obdo(struct md_op_data *op_data, struct obdo *oa, obd_flag valid) +{ + iattr_from_obdo(&op_data->op_attr, oa, valid); + if (valid & OBD_MD_FLBLOCKS) { + op_data->op_attr_blocks = oa->o_blocks; + op_data->op_attr.ia_valid |= ATTR_BLOCKS; + } + if (valid & OBD_MD_FLFLAGS) { + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags = + oa->o_flags; + op_data->op_attr.ia_valid |= ATTR_ATTR_FLAG; + } +} +EXPORT_SYMBOL(md_from_obdo); + +/* Reverse direction of md_from_obdo(): fill @oa from @op_data's attrs, + * blocks and flags. Here @valid holds ATTR_* bits, not OBD_MD_* bits. */ +void obdo_from_md(struct obdo *oa, struct md_op_data *op_data, + unsigned int valid) +{ + obdo_from_iattr(oa, &op_data->op_attr, valid); + if (valid & ATTR_BLOCKS) { + oa->o_blocks = op_data->op_attr_blocks; + oa->o_valid |= OBD_MD_FLBLOCKS; + } + if (valid & 
ATTR_ATTR_FLAG) { + oa->o_flags = + ((struct ll_iattr *)&op_data->op_attr)->ia_attr_flags; + oa->o_valid |= OBD_MD_FLFLAGS; + } +} +EXPORT_SYMBOL(obdo_from_md); + +/* Endianness converters for the fixed-width obdo fields. Note that o_oi and + * the o_parent_* fields are not converted by these two helpers. */ +void obdo_cpu_to_le(struct obdo *dobdo, struct obdo *sobdo) +{ + dobdo->o_size = cpu_to_le64(sobdo->o_size); + dobdo->o_mtime = cpu_to_le64(sobdo->o_mtime); + dobdo->o_atime = cpu_to_le64(sobdo->o_atime); + dobdo->o_ctime = cpu_to_le64(sobdo->o_ctime); + dobdo->o_blocks = cpu_to_le64(sobdo->o_blocks); + dobdo->o_mode = cpu_to_le32(sobdo->o_mode); + dobdo->o_uid = cpu_to_le32(sobdo->o_uid); + dobdo->o_gid = cpu_to_le32(sobdo->o_gid); + dobdo->o_flags = cpu_to_le32(sobdo->o_flags); + dobdo->o_nlink = cpu_to_le32(sobdo->o_nlink); + dobdo->o_blksize = cpu_to_le32(sobdo->o_blksize); + dobdo->o_valid = cpu_to_le64(sobdo->o_valid); +} +EXPORT_SYMBOL(obdo_cpu_to_le); + +/* Inverse of obdo_cpu_to_le(). */ +void obdo_le_to_cpu(struct obdo *dobdo, struct obdo *sobdo) +{ + dobdo->o_size = le64_to_cpu(sobdo->o_size); + dobdo->o_mtime = le64_to_cpu(sobdo->o_mtime); + dobdo->o_atime = le64_to_cpu(sobdo->o_atime); + dobdo->o_ctime = le64_to_cpu(sobdo->o_ctime); + dobdo->o_blocks = le64_to_cpu(sobdo->o_blocks); + dobdo->o_mode = le32_to_cpu(sobdo->o_mode); + dobdo->o_uid = le32_to_cpu(sobdo->o_uid); + dobdo->o_gid = le32_to_cpu(sobdo->o_gid); + dobdo->o_flags = le32_to_cpu(sobdo->o_flags); + dobdo->o_nlink = le32_to_cpu(sobdo->o_nlink); + dobdo->o_blksize = le32_to_cpu(sobdo->o_blksize); + dobdo->o_valid = le64_to_cpu(sobdo->o_valid); +} +EXPORT_SYMBOL(obdo_le_to_cpu); diff --git a/drivers/staging/lustre/lustre/obdclass/statfs_pack.c b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c new file mode 100644 index 000000000000..c3b7a78dba50 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/statfs_pack.c @@ -0,0 +1,75 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ * + * lustre/obdclass/statfs_pack.c + * + * (Un)packing of OST/MDS requests + * + * Author: Andreas Dilger <adilger@clusterfs.com> + */ + +#define DEBUG_SUBSYSTEM S_CLASS + + +#include <lustre_export.h> +#include <lustre_net.h> +#include <obd_support.h> +#include <obd_class.h> + +/* Copy kernel statfs results into the wire obd_statfs; fields not listed + * here are zeroed by the memset. */ +void statfs_pack(struct obd_statfs *osfs, struct kstatfs *sfs) +{ + memset(osfs, 0, sizeof(*osfs)); + osfs->os_type = sfs->f_type; + osfs->os_blocks = sfs->f_blocks; + osfs->os_bfree = sfs->f_bfree; + osfs->os_bavail = sfs->f_bavail; + osfs->os_files = sfs->f_files; + osfs->os_ffree = sfs->f_ffree; + osfs->os_bsize = sfs->f_bsize; + osfs->os_namelen = sfs->f_namelen; +} +EXPORT_SYMBOL(statfs_pack); + +/* Reverse of statfs_pack(). */ +void statfs_unpack(struct kstatfs *sfs, struct obd_statfs *osfs) +{ + memset(sfs, 0, sizeof(*sfs)); + sfs->f_type = osfs->os_type; + sfs->f_blocks = osfs->os_blocks; + sfs->f_bfree = osfs->os_bfree; + sfs->f_bavail = osfs->os_bavail; + sfs->f_files = osfs->os_files; + sfs->f_ffree = osfs->os_ffree; + sfs->f_bsize = osfs->os_bsize; + sfs->f_namelen = osfs->os_namelen; +} +EXPORT_SYMBOL(statfs_unpack); diff --git a/drivers/staging/lustre/lustre/obdclass/uuid.c b/drivers/staging/lustre/lustre/obdclass/uuid.c new file mode 100644 index 000000000000..af5f27f82bc5 --- /dev/null +++ b/drivers/staging/lustre/lustre/obdclass/uuid.c @@ -0,0 +1,82 @@ +/* + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/obdclass/uuid.c + * + * Public include file for the UUID library + */ + +#define DEBUG_SUBSYSTEM S_CLASS + +# include <linux/libcfs/libcfs.h> + +#include <obd_support.h> +#include <obd_class.h> + + +/* Read @nob bytes from *ptr, most significant byte first (big-endian), + * advancing the cursor past the consumed bytes. */ +static inline __u32 consume(int nob, __u8 **ptr) +{ + __u32 value; + + LASSERT(nob <= sizeof value); + + for (value = 0; nob > 0; --nob) + value = (value << 8) | *((*ptr)++); + return value; +} + +#define CONSUME(val, ptr) (val) = consume(sizeof(val), (ptr)) + +/* Split the 16-byte binary uuid into @nr 16-bit words (filled from the + * highest index down, matching CONSUME's forward cursor). */ +static void uuid_unpack(class_uuid_t in, __u16 *uu, int nr) +{ + __u8 *ptr = in; + + LASSERT(nr * sizeof *uu == sizeof(class_uuid_t)); + + while (nr-- > 0) + CONSUME(uu[nr], &ptr); +} + +/* Render @uu into @out in the canonical 8-4-4-4-12 hex form. */ +void class_uuid_unparse(class_uuid_t uu, struct obd_uuid *out) +{ + /* uu as an array of __u16's */ + __u16 uuid[sizeof(class_uuid_t) / sizeof(__u16)]; + + CLASSERT(ARRAY_SIZE(uuid) == 8); + + uuid_unpack(uu, uuid, ARRAY_SIZE(uuid)); + sprintf(out->uuid, "%04x%04x-%04x-%04x-%04x-%04x%04x%04x", + uuid[0], uuid[1], uuid[2], uuid[3], + uuid[4], uuid[5], uuid[6], uuid[7]); +} +EXPORT_SYMBOL(class_uuid_unparse); |