21 files changed, 640 insertions, 42 deletions
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 652bea9054f0..06344d986eb9 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -597,7 +597,7 @@ endmenu # "Memory Debugging"
 
 config DEBUG_SHIRQ
 	bool "Debug shared IRQ handlers"
-	depends on DEBUG_KERNEL && GENERIC_HARDIRQS
+	depends on DEBUG_KERNEL
 	help
 	  Enable this to generate a spurious interrupt as soon as a shared
 	  interrupt handler is registered, and just before one is deregistered.
@@ -1461,7 +1461,7 @@ config BACKTRACE_SELF_TEST
 
 config RBTREE_TEST
 	tristate "Red-Black tree test"
-	depends on m && DEBUG_KERNEL
+	depends on DEBUG_KERNEL
 	help
 	  A benchmark measuring the performance of the rbtree library.
 	  Also includes rbtree invariant checks.
diff --git a/lib/Makefile b/lib/Makefile
index f2cb3082697c..f3bb2cb98adf 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -13,7 +13,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 sha1.o md5.o irq_regs.o reciprocal_div.o argv_split.o \
 	 proportions.o flex_proportions.o prio_heap.o ratelimit.o show_mem.o \
 	 is_single_threaded.o plist.o decompress.o kobject_uevent.o \
-	 earlycpio.o percpu-refcount.o
+	 earlycpio.o percpu-refcount.o percpu_ida.o
 
 obj-$(CONFIG_ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS) += usercopy.o
 lib-$(CONFIG_MMU) += ioremap.o
@@ -25,7 +25,8 @@ obj-y	+= lockref.o
 obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \
 	 bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \
 	 gcd.o lcm.o list_sort.o uuid.o flex_array.o iovec.o clz_ctz.o \
-	 bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o
+	 bsearch.o find_last_bit.o find_next_bit.o llist.o memweight.o kfifo.o \
+	 percpu_ida.o
 obj-y += string_helpers.o
 obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += kstrtox.o
diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c
index 5fbed5caba6e..4f134d8907a7 100644
--- a/lib/cpu_rmap.c
+++ b/lib/cpu_rmap.c
@@ -8,9 +8,7 @@
  */
 
 #include <linux/cpu_rmap.h>
-#ifdef CONFIG_GENERIC_HARDIRQS
 #include <linux/interrupt.h>
-#endif
 #include <linux/export.h>
 
 /*
@@ -213,8 +211,6 @@ int cpu_rmap_update(struct cpu_rmap *rmap, u16 index,
 }
 EXPORT_SYMBOL(cpu_rmap_update);
 
-#ifdef CONFIG_GENERIC_HARDIRQS
-
 /* Glue between IRQ affinity notifiers and CPU rmaps */
 
 struct irq_glue {
@@ -309,5 +305,3 @@ int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq)
 	return rc;
 }
 EXPORT_SYMBOL(irq_cpu_rmap_add);
-
-#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/lib/crc-t10dif.c b/lib/crc-t10dif.c
index 43bc5b071f96..dfe6ec17c0a5 100644
--- a/lib/crc-t10dif.c
+++ b/lib/crc-t10dif.c
@@ -14,8 +14,10 @@
 #include <linux/err.h>
 #include <linux/init.h>
 #include <crypto/hash.h>
+#include <linux/static_key.h>
 
 static struct crypto_shash *crct10dif_tfm;
+static struct static_key crct10dif_fallback __read_mostly;
 
 __u16 crc_t10dif(const unsigned char *buffer, size_t len)
 {
@@ -25,6 +27,9 @@ __u16 crc_t10dif(const unsigned char *buffer, size_t len)
 	} desc;
 	int err;
 
+	if (static_key_false(&crct10dif_fallback))
+		return crc_t10dif_generic(0, buffer, len);
+
 	desc.shash.tfm = crct10dif_tfm;
 	desc.shash.flags = 0;
 	*(__u16 *)desc.ctx = 0;
@@ -39,7 +44,11 @@ EXPORT_SYMBOL(crc_t10dif);
 static int __init crc_t10dif_mod_init(void)
 {
 	crct10dif_tfm = crypto_alloc_shash("crct10dif", 0, 0);
-	return PTR_RET(crct10dif_tfm);
+	if (IS_ERR(crct10dif_tfm)) {
+		static_key_slow_inc(&crct10dif_fallback);
+		crct10dif_tfm = NULL;
+	}
+	return 0;
 }
 
 static void __exit crc_t10dif_mod_fini(void)
diff --git a/lib/crc32.c b/lib/crc32.c
index 072fbd8234d5..410093dbe51c 100644
--- a/lib/crc32.c
+++ b/lib/crc32.c
@@ -131,11 +131,14 @@ crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256])
 #endif
 
 /**
- * crc32_le() - Calculate bitwise little-endian Ethernet AUTODIN II CRC32
- * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for
- *	other uses, or the previous crc32 value if computing incrementally.
- * @p: pointer to buffer over which CRC is run
+ * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II
+ *			CRC32/CRC32C
+ * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for other
+ *	 uses, or the previous crc32/crc32c value if computing incrementally.
+ * @p: pointer to buffer over which CRC32/CRC32C is run
  * @len: length of buffer @p
+ * @tab: little-endian Ethernet table
+ * @polynomial: CRC32/CRC32c LE polynomial
  */
 static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p,
 					  size_t len, const u32 (*tab)[256],
@@ -201,11 +204,13 @@ EXPORT_SYMBOL(crc32_le);
 EXPORT_SYMBOL(__crc32c_le);
 
 /**
- * crc32_be() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
+ * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
  * @crc: seed value for computation.  ~0 for Ethernet, sometimes 0 for
  *	other uses, or the previous crc32 value if computing incrementally.
- * @p: pointer to buffer over which CRC is run
+ * @p: pointer to buffer over which CRC32 is run
  * @len: length of buffer @p
+ * @tab: big-endian Ethernet table
+ * @polynomial: CRC32 BE polynomial
  */
 static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p,
 					  size_t len, const u32 (*tab)[256],
diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c
index 19ff89e34eec..d619b28c456f 100644
--- a/lib/decompress_inflate.c
+++ b/lib/decompress_inflate.c
@@ -48,7 +48,7 @@ STATIC int INIT gunzip(unsigned char *buf, int len,
 		out_len = 0x8000; /* 32 K */
 		out_buf = malloc(out_len);
 	} else {
-		out_len = 0x7fffffff; /* no limit */
+		out_len = ((size_t)~0) - (size_t)out_buf; /* no limit */
 	}
 	if (!out_buf) {
 		error("Out of memory while allocating output buffer");
diff --git a/lib/div64.c b/lib/div64.c
index a163b6caef73..4382ad77777e 100644
--- a/lib/div64.c
+++ b/lib/div64.c
@@ -79,6 +79,46 @@ EXPORT_SYMBOL(div_s64_rem);
 #endif
 
 /**
+ * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder
+ * @dividend:	64bit dividend
+ * @divisor:	64bit divisor
+ * @remainder:  64bit remainder
+ *
+ * This implementation is a comparable to algorithm used by div64_u64.
+ * But this operation, which includes math for calculating the remainder,
+ * is kept distinct to avoid slowing down the div64_u64 operation on 32bit
+ * systems.
+ */
+#ifndef div64_u64_rem
+u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
+{
+	u32 high = divisor >> 32;
+	u64 quot;
+
+	if (high == 0) {
+		u32 rem32;
+		quot = div_u64_rem(dividend, divisor, &rem32);
+		*remainder = rem32;
+	} else {
+		int n = 1 + fls(high);
+		quot = div_u64(dividend >> n, divisor >> n);
+
+		if (quot != 0)
+			quot--;
+
+		*remainder = dividend - quot * divisor;
+		if (*remainder >= divisor) {
+			quot++;
+			*remainder -= divisor;
+		}
+	}
+
+	return quot;
+}
+EXPORT_SYMBOL(div64_u64_rem);
+#endif
+
+/**
  * div64_u64 - unsigned 64bit divide with 64bit divisor
  * @dividend:	64bit dividend
  * @divisor:	64bit divisor
diff --git a/lib/genalloc.c b/lib/genalloc.c
index b35cfa9bc3d4..26cf20be72b7 100644
--- a/lib/genalloc.c
+++ b/lib/genalloc.c
@@ -37,6 +37,11 @@
 #include <linux/of_address.h>
 #include <linux/of_device.h>
 
+static inline size_t chunk_size(const struct gen_pool_chunk *chunk)
+{
+	return chunk->end_addr - chunk->start_addr + 1;
+}
+
 static int set_bits_ll(unsigned long *addr, unsigned long mask_to_set)
 {
 	unsigned long val, nval;
@@ -182,13 +187,13 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy
 	int nbytes = sizeof(struct gen_pool_chunk) +
 				BITS_TO_LONGS(nbits) * sizeof(long);
 
-	chunk = kmalloc_node(nbytes, GFP_KERNEL | __GFP_ZERO, nid);
+	chunk = kzalloc_node(nbytes, GFP_KERNEL, nid);
 	if (unlikely(chunk == NULL))
 		return -ENOMEM;
 
 	chunk->phys_addr = phys;
 	chunk->start_addr = virt;
-	chunk->end_addr = virt + size;
+	chunk->end_addr = virt + size - 1;
 	atomic_set(&chunk->avail, size);
 
 	spin_lock(&pool->lock);
@@ -213,7 +218,7 @@ phys_addr_t gen_pool_virt_to_phys(struct gen_pool *pool, unsigned long addr)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
-		if (addr >= chunk->start_addr && addr < chunk->end_addr) {
+		if (addr >= chunk->start_addr && addr <= chunk->end_addr) {
 			paddr = chunk->phys_addr + (addr - chunk->start_addr);
 			break;
 		}
@@ -242,7 +247,7 @@ void gen_pool_destroy(struct gen_pool *pool)
 		chunk = list_entry(_chunk, struct gen_pool_chunk, next_chunk);
 		list_del(&chunk->next_chunk);
 
-		end_bit = (chunk->end_addr - chunk->start_addr) >> order;
+		end_bit = chunk_size(chunk) >> order;
 		bit = find_next_bit(chunk->bits, end_bit, 0);
 		BUG_ON(bit < end_bit);
 
@@ -283,7 +288,7 @@ unsigned long gen_pool_alloc(struct gen_pool *pool, size_t size)
 		if (size > atomic_read(&chunk->avail))
 			continue;
 
-		end_bit = (chunk->end_addr - chunk->start_addr) >> order;
+		end_bit = chunk_size(chunk) >> order;
 retry:
 		start_bit = pool->algo(chunk->bits, end_bit, start_bit, nbits,
 				pool->data);
@@ -330,8 +335,8 @@ void gen_pool_free(struct gen_pool *pool, unsigned long addr, size_t size)
 	nbits = (size + (1UL << order) - 1) >> order;
 	rcu_read_lock();
 	list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk) {
-		if (addr >= chunk->start_addr && addr < chunk->end_addr) {
-			BUG_ON(addr + size > chunk->end_addr);
+		if (addr >= chunk->start_addr && addr <= chunk->end_addr) {
+			BUG_ON(addr + size - 1 > chunk->end_addr);
 			start_bit = (addr - chunk->start_addr) >> order;
 			remain = bitmap_clear_ll(chunk->bits, start_bit, nbits);
 			BUG_ON(remain);
@@ -400,7 +405,7 @@ size_t gen_pool_size(struct gen_pool *pool)
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(chunk, &pool->chunks, next_chunk)
-		size += chunk->end_addr - chunk->start_addr;
+		size += chunk_size(chunk);
 	rcu_read_unlock();
 	return size;
 }
@@ -519,7 +524,6 @@ struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order,
 /**
  * dev_get_gen_pool - Obtain the gen_pool (if any) for a device
  * @dev: device to retrieve the gen_pool from
- * @name: Optional name for the gen_pool, usually NULL
  *
  * Returns the gen_pool for the device if one is present, or NULL.
  */
diff --git a/lib/hexdump.c b/lib/hexdump.c
index 3f0494c9d57a..8499c810909a 100644
--- a/lib/hexdump.c
+++ b/lib/hexdump.c
@@ -14,6 +14,8 @@
 
 const char hex_asc[] = "0123456789abcdef";
 EXPORT_SYMBOL(hex_asc);
+const char hex_asc_upper[] = "0123456789ABCDEF";
+EXPORT_SYMBOL(hex_asc_upper);
 
 /**
  * hex_to_bin - convert a hex digit to its real value
diff --git a/lib/kobject.c b/lib/kobject.c
index 962175134702..084f7b18d0c0 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -592,7 +592,7 @@ static void kobject_release(struct kref *kref)
 {
 	struct kobject *kobj = container_of(kref, struct kobject, kref);
 #ifdef CONFIG_DEBUG_KOBJECT_RELEASE
-	pr_debug("kobject: '%s' (%p): %s, parent %p (delayed)\n",
+	pr_info("kobject: '%s' (%p): %s, parent %p (delayed)\n",
 		 kobject_name(kobj), kobj, __func__, kobj->parent);
 	INIT_DELAYED_WORK(&kobj->release, kobject_delayed_cleanup);
 	schedule_delayed_work(&kobj->release, HZ);
@@ -933,10 +933,7 @@ const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj)
 
 bool kobj_ns_current_may_mount(enum kobj_ns_type type)
 {
-	bool may_mount = false;
-
-	if (type == KOBJ_NS_TYPE_NONE)
-		return true;
+	bool may_mount = true;
 
 	spin_lock(&kobj_ns_type_lock);
 	if ((type > KOBJ_NS_TYPE_NONE) && (type < KOBJ_NS_TYPES) &&
diff --git a/lib/lockref.c b/lib/lockref.c
index e2cd2c0a8821..6f9d434c1521 100644
--- a/lib/lockref.c
+++ b/lib/lockref.c
@@ -4,6 +4,22 @@
 #ifdef CONFIG_CMPXCHG_LOCKREF
 
 /*
+ * Allow weakly-ordered memory architectures to provide barrier-less
+ * cmpxchg semantics for lockref updates.
+ */
+#ifndef cmpxchg64_relaxed
+# define cmpxchg64_relaxed cmpxchg64
+#endif
+
+/*
+ * Allow architectures to override the default cpu_relax() within CMPXCHG_LOOP.
+ * This is useful for architectures with an expensive cpu_relax().
+ */
+#ifndef arch_mutex_cpu_relax
+# define arch_mutex_cpu_relax() cpu_relax()
+#endif
+
+/*
  * Note that the "cmpxchg()" reloads the "old" value for the
  * failure case.
  */
@@ -14,12 +30,13 @@
 	while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) {  	\
 		struct lockref new = old, prev = old;				\
 		CODE								\
-		old.lock_count = cmpxchg(&lockref->lock_count,			\
-					 old.lock_count, new.lock_count);	\
+		old.lock_count = cmpxchg64_relaxed(&lockref->lock_count,	\
+						   old.lock_count,		\
+						   new.lock_count);		\
 		if (likely(old.lock_count == prev.lock_count)) {		\
 			SUCCESS;						\
 		}								\
-		cpu_relax();							\
+		arch_mutex_cpu_relax();						\
 	}									\
 } while (0)
 
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index 411be80ddb46..df6839e3ce08 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -283,8 +283,8 @@ _output_error:
 	return (int) (-(((char *) ip) - source));
 }
 
-int lz4_decompress(const char *src, size_t *src_len, char *dest,
-		size_t actual_dest_len)
+int lz4_decompress(const unsigned char *src, size_t *src_len,
+		unsigned char *dest, size_t actual_dest_len)
 {
 	int ret = -1;
 	int input_len = 0;
@@ -302,8 +302,8 @@ exit_0:
 EXPORT_SYMBOL(lz4_decompress);
 #endif
 
-int lz4_decompress_unknownoutputsize(const char *src, size_t src_len,
-		char *dest, size_t *dest_len)
+int lz4_decompress_unknownoutputsize(const unsigned char *src, size_t src_len,
+		unsigned char *dest, size_t *dest_len)
 {
 	int ret = -1;
 	int out_len = 0;
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 7deeb6297a48..1a53d497a8c5 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -53,6 +53,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release)
 	ref->release = release;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(percpu_ref_init);
 
 /**
  * percpu_ref_cancel_init - cancel percpu_ref_init()
@@ -84,6 +85,7 @@ void percpu_ref_cancel_init(struct percpu_ref *ref)
 		free_percpu(ref->pcpu_count);
 	}
 }
+EXPORT_SYMBOL_GPL(percpu_ref_cancel_init);
 
 static void percpu_ref_kill_rcu(struct rcu_head *rcu)
 {
@@ -156,3 +158,4 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
 
 	call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu);
 }
+EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
new file mode 100644
index 000000000000..bab1ba2a4c71
--- /dev/null
+++ b/lib/percpu_ida.c
@@ -0,0 +1,335 @@
+/*
+ * Percpu IDA library
+ *
+ * Copyright (C) 2013 Datera, Inc. Kent Overstreet
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/bitmap.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/hardirq.h>
+#include <linux/idr.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/percpu_ida.h>
+
+/*
+ * Number of tags we move between the percpu freelist and the global freelist at
+ * a time
+ */
+#define IDA_PCPU_BATCH_MOVE	32U
+
+/* Max size of percpu freelist, */
+#define IDA_PCPU_SIZE		((IDA_PCPU_BATCH_MOVE * 3) / 2)
+
+struct percpu_ida_cpu {
+	/*
+	 * Even though this is percpu, we need a lock for tag stealing by remote
+	 * CPUs:
+	 */
+	spinlock_t			lock;
+
+	/* nr_free/freelist form a stack of free IDs */
+	unsigned			nr_free;
+	unsigned			freelist[];
+};
+
+static inline void move_tags(unsigned *dst, unsigned *dst_nr,
+			     unsigned *src, unsigned *src_nr,
+			     unsigned nr)
+{
+	*src_nr -= nr;
+	memcpy(dst + *dst_nr, src + *src_nr, sizeof(unsigned) * nr);
+	*dst_nr += nr;
+}
+
+/*
+ * Try to steal tags from a remote cpu's percpu freelist.
+ *
+ * We first check how many percpu freelists have tags - we don't steal tags
+ * unless enough percpu freelists have tags on them that it's possible more than
+ * half the total tags could be stuck on remote percpu freelists.
+ *
+ * Then we iterate through the cpus until we find some tags - we don't attempt
+ * to find the "best" cpu to steal from, to keep cacheline bouncing to a
+ * minimum.
+ */
+static inline void steal_tags(struct percpu_ida *pool,
+			      struct percpu_ida_cpu *tags)
+{
+	unsigned cpus_have_tags, cpu = pool->cpu_last_stolen;
+	struct percpu_ida_cpu *remote;
+
+	for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags);
+	     cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2;
+	     cpus_have_tags--) {
+		cpu = cpumask_next(cpu, &pool->cpus_have_tags);
+
+		if (cpu >= nr_cpu_ids) {
+			cpu = cpumask_first(&pool->cpus_have_tags);
+			if (cpu >= nr_cpu_ids)
+				BUG();
+		}
+
+		pool->cpu_last_stolen = cpu;
+		remote = per_cpu_ptr(pool->tag_cpu, cpu);
+
+		cpumask_clear_cpu(cpu, &pool->cpus_have_tags);
+
+		if (remote == tags)
+			continue;
+
+		spin_lock(&remote->lock);
+
+		if (remote->nr_free) {
+			memcpy(tags->freelist,
+			       remote->freelist,
+			       sizeof(unsigned) * remote->nr_free);
+
+			tags->nr_free = remote->nr_free;
+			remote->nr_free = 0;
+		}
+
+		spin_unlock(&remote->lock);
+
+		if (tags->nr_free)
+			break;
+	}
+}
+
+/*
+ * Pop up to IDA_PCPU_BATCH_MOVE IDs off the global freelist, and push them onto
+ * our percpu freelist:
+ */
+static inline void alloc_global_tags(struct percpu_ida *pool,
+				     struct percpu_ida_cpu *tags)
+{
+	move_tags(tags->freelist, &tags->nr_free,
+		  pool->freelist, &pool->nr_free,
+		  min(pool->nr_free, IDA_PCPU_BATCH_MOVE));
+}
+
+static inline unsigned alloc_local_tag(struct percpu_ida *pool,
+				       struct percpu_ida_cpu *tags)
+{
+	int tag = -ENOSPC;
+
+	spin_lock(&tags->lock);
+	if (tags->nr_free)
+		tag = tags->freelist[--tags->nr_free];
+	spin_unlock(&tags->lock);
+
+	return tag;
+}
+
+/**
+ * percpu_ida_alloc - allocate a tag
+ * @pool: pool to allocate from
+ * @gfp: gfp flags
+ *
+ * Returns a tag - an integer in the range [0..nr_tags) (passed to
+ * tag_pool_init()), or otherwise -ENOSPC on allocation failure.
+ *
+ * Safe to be called from interrupt context (assuming it isn't passed
+ * __GFP_WAIT, of course).
+ *
+ * @gfp indicates whether or not to wait until a free id is available (it's not
+ * used for internal memory allocations); thus if passed __GFP_WAIT we may sleep
+ * however long it takes until another thread frees an id (same semantics as a
+ * mempool).
+ *
+ * Will not fail if passed __GFP_WAIT.
+ */
+int percpu_ida_alloc(struct percpu_ida *pool, gfp_t gfp)
+{
+	DEFINE_WAIT(wait);
+	struct percpu_ida_cpu *tags;
+	unsigned long flags;
+	int tag;
+
+	local_irq_save(flags);
+	tags = this_cpu_ptr(pool->tag_cpu);
+
+	/* Fastpath */
+	tag = alloc_local_tag(pool, tags);
+	if (likely(tag >= 0)) {
+		local_irq_restore(flags);
+		return tag;
+	}
+
+	while (1) {
+		spin_lock(&pool->lock);
+
+		/*
+		 * prepare_to_wait() must come before steal_tags(), in case
+		 * percpu_ida_free() on another cpu flips a bit in
+		 * cpus_have_tags
+		 *
+		 * global lock held and irqs disabled, don't need percpu lock
+		 */
+		prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+
+		if (!tags->nr_free)
+			alloc_global_tags(pool, tags);
+		if (!tags->nr_free)
+			steal_tags(pool, tags);
+
+		if (tags->nr_free) {
+			tag = tags->freelist[--tags->nr_free];
+			if (tags->nr_free)
+				cpumask_set_cpu(smp_processor_id(),
+						&pool->cpus_have_tags);
+		}
+
+		spin_unlock(&pool->lock);
+		local_irq_restore(flags);
+
+		if (tag >= 0 || !(gfp & __GFP_WAIT))
+			break;
+
+		schedule();
+
+		local_irq_save(flags);
+		tags = this_cpu_ptr(pool->tag_cpu);
+	}
+
+	finish_wait(&pool->wait, &wait);
+	return tag;
+}
+EXPORT_SYMBOL_GPL(percpu_ida_alloc);
+
+/**
+ * percpu_ida_free - free a tag
+ * @pool: pool @tag was allocated from
+ * @tag: a tag previously allocated with percpu_ida_alloc()
+ *
+ * Safe to be called from interrupt context.
+ */
+void percpu_ida_free(struct percpu_ida *pool, unsigned tag)
+{
+	struct percpu_ida_cpu *tags;
+	unsigned long flags;
+	unsigned nr_free;
+
+	BUG_ON(tag >= pool->nr_tags);
+
+	local_irq_save(flags);
+	tags = this_cpu_ptr(pool->tag_cpu);
+
+	spin_lock(&tags->lock);
+	tags->freelist[tags->nr_free++] = tag;
+
+	nr_free = tags->nr_free;
+	spin_unlock(&tags->lock);
+
+	if (nr_free == 1) {
+		cpumask_set_cpu(smp_processor_id(),
+				&pool->cpus_have_tags);
+		wake_up(&pool->wait);
+	}
+
+	if (nr_free == IDA_PCPU_SIZE) {
+		spin_lock(&pool->lock);
+
+		/*
+		 * Global lock held and irqs disabled, don't need percpu
+		 * lock
+		 */
+		if (tags->nr_free == IDA_PCPU_SIZE) {
+			move_tags(pool->freelist, &pool->nr_free,
+				  tags->freelist, &tags->nr_free,
+				  IDA_PCPU_BATCH_MOVE);
+
+			wake_up(&pool->wait);
+		}
+		spin_unlock(&pool->lock);
+	}
+
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(percpu_ida_free);
+
+/**
+ * percpu_ida_destroy - release a tag pool's resources
+ * @pool: pool to free
+ *
+ * Frees the resources allocated by percpu_ida_init().
+ */
+void percpu_ida_destroy(struct percpu_ida *pool)
+{
+	free_percpu(pool->tag_cpu);
+	free_pages((unsigned long) pool->freelist,
+		   get_order(pool->nr_tags * sizeof(unsigned)));
+}
+EXPORT_SYMBOL_GPL(percpu_ida_destroy);
+
+/**
+ * percpu_ida_init - initialize a percpu tag pool
+ * @pool: pool to initialize
+ * @nr_tags: number of tags that will be available for allocation
+ *
+ * Initializes @pool so that it can be used to allocate tags - integers in the
+ * range [0, nr_tags). Typically, they'll be used by driver code to refer to a
+ * preallocated array of tag structures.
+ *
+ * Allocation is percpu, but sharding is limited by nr_tags - for best
+ * performance, the workload should not span more cpus than nr_tags / 128.
+ */
+int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
+{
+	unsigned i, cpu, order;
+
+	memset(pool, 0, sizeof(*pool));
+
+	init_waitqueue_head(&pool->wait);
+	spin_lock_init(&pool->lock);
+	pool->nr_tags = nr_tags;
+
+	/* Guard against overflow */
+	if (nr_tags > (unsigned) INT_MAX + 1) {
+		pr_err("percpu_ida_init(): nr_tags too large\n");
+		return -EINVAL;
+	}
+
+	order = get_order(nr_tags * sizeof(unsigned));
+	pool->freelist = (void *) __get_free_pages(GFP_KERNEL, order);
+	if (!pool->freelist)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_tags; i++)
+		pool->freelist[i] = i;
+
+	pool->nr_free = nr_tags;
+
+	pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) +
+				       IDA_PCPU_SIZE * sizeof(unsigned),
+				       sizeof(unsigned));
+	if (!pool->tag_cpu)
+		goto err;
+
+	for_each_possible_cpu(cpu)
+		spin_lock_init(&per_cpu_ptr(pool->tag_cpu, cpu)->lock);
+
+	return 0;
+err:
+	percpu_ida_destroy(pool);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(percpu_ida_init);
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index e7964296fd50..7811ed3b4e70 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -32,6 +32,7 @@
 #include <linux/string.h>
 #include <linux/bitops.h>
 #include <linux/rcupdate.h>
+#include <linux/hardirq.h>		/* in_interrupt() */
 
 
 #ifdef __KERNEL__
@@ -207,7 +208,12 @@ radix_tree_node_alloc(struct radix_tree_root *root)
 	struct radix_tree_node *ret = NULL;
 	gfp_t gfp_mask = root_gfp_mask(root);
 
-	if (!(gfp_mask & __GFP_WAIT)) {
+	/*
+	 * Preload code isn't irq safe and it doesn't make sence to use
+	 * preloading in the interrupt anyway as all the allocations have to
+	 * be atomic. So just do normal allocation when in interrupt.
+	 */
+	if (!(gfp_mask & __GFP_WAIT) && !in_interrupt()) {
 		struct radix_tree_preload *rtp;
 
 		/*
@@ -264,7 +270,7 @@ radix_tree_node_free(struct radix_tree_node *node)
  * To make use of this facility, the radix tree must be initialised without
  * __GFP_WAIT being passed to INIT_RADIX_TREE().
  */
-int radix_tree_preload(gfp_t gfp_mask)
+static int __radix_tree_preload(gfp_t gfp_mask)
 {
 	struct radix_tree_preload *rtp;
 	struct radix_tree_node *node;
@@ -288,9 +294,40 @@ int radix_tree_preload(gfp_t gfp_mask)
 out:
 	return ret;
 }
+
+/*
+ * Load up this CPU's radix_tree_node buffer with sufficient objects to
+ * ensure that the addition of a single element in the tree cannot fail.  On
+ * success, return zero, with preemption disabled.  On error, return -ENOMEM
+ * with preemption not disabled.
+ *
+ * To make use of this facility, the radix tree must be initialised without
+ * __GFP_WAIT being passed to INIT_RADIX_TREE().
+ */
+int radix_tree_preload(gfp_t gfp_mask)
+{
+	/* Warn on non-sensical use... */
+	WARN_ON_ONCE(!(gfp_mask & __GFP_WAIT));
+	return __radix_tree_preload(gfp_mask);
+}
 EXPORT_SYMBOL(radix_tree_preload);
 
 /*
+ * The same as above function, except we don't guarantee preloading happens.
+ * We do it, if we decide it helps. On success, return zero with preemption
+ * disabled. On error, return -ENOMEM with preemption not disabled.
+ */
+int radix_tree_maybe_preload(gfp_t gfp_mask)
+{
+	if (gfp_mask & __GFP_WAIT)
+		return __radix_tree_preload(gfp_mask);
+	/* Preloading doesn't help anything with this gfp mask, skip it */
+	preempt_disable();
+	return 0;
+}
+EXPORT_SYMBOL(radix_tree_maybe_preload);
+
+/*
  *	Return the maximum key which can be store into a
  *	radix tree with height HEIGHT.
  */
diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index b4625787c7ee..c7dab0645554 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -6,6 +6,7 @@ raid6_pq-y	+= algos.o recov.o tables.o int1.o int2.o int4.o \
 raid6_pq-$(CONFIG_X86) += recov_ssse3.o recov_avx2.o mmx.o sse1.o sse2.o avx2.o
 raid6_pq-$(CONFIG_ALTIVEC) += altivec1.o altivec2.o altivec4.o altivec8.o
 raid6_pq-$(CONFIG_KERNEL_MODE_NEON) += neon.o neon1.o neon2.o neon4.o neon8.o
+raid6_pq-$(CONFIG_TILEGX) += tilegx8.o
 
 hostprogs-y	+= mktables
 
@@ -110,6 +111,11 @@ $(obj)/neon8.c:   UNROLL := 8
 $(obj)/neon8.c:   $(src)/neon.uc $(src)/unroll.awk FORCE
 	$(call if_changed,unroll)
 
+targets += tilegx8.c
+$(obj)/tilegx8.c:   UNROLL := 8
+$(obj)/tilegx8.c:   $(src)/tilegx.uc $(src)/unroll.awk FORCE
+	$(call if_changed,unroll)
+
 quiet_cmd_mktable = TABLE   $@
       cmd_mktable = $(obj)/mktables > $@ || ( rm -f $@ && exit 1 )
 
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 74e6f5629dbc..f0b1aa3586d1 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -66,6 +66,9 @@ const struct raid6_calls * const raid6_algos[] = {
 	&raid6_altivec4,
 	&raid6_altivec8,
 #endif
+#if defined(CONFIG_TILEGX)
+	&raid6_tilegx8,
+#endif
 	&raid6_intx1,
 	&raid6_intx2,
 	&raid6_intx4,
diff --git a/lib/raid6/test/Makefile b/lib/raid6/test/Makefile
index 28afa1a06e03..29090f3db677 100644
--- a/lib/raid6/test/Makefile
+++ b/lib/raid6/test/Makefile
@@ -40,13 +40,16 @@ else ifeq ($(HAS_NEON),yes)
         OBJS   += neon.o neon1.o neon2.o neon4.o neon8.o
         CFLAGS += -DCONFIG_KERNEL_MODE_NEON=1
 else
-        HAS_ALTIVEC := $(shell echo -e '\#include <altivec.h>\nvector int a;' |\
+        HAS_ALTIVEC := $(shell printf '\#include <altivec.h>\nvector int a;\n' |\
                          gcc -c -x c - >&/dev/null && \
                          rm ./-.o && echo yes)
         ifeq ($(HAS_ALTIVEC),yes)
                 OBJS += altivec1.o altivec2.o altivec4.o altivec8.o
         endif
 endif
+ifeq ($(ARCH),tilegx)
+OBJS += tilegx8.o
+endif
 
 .c.o:
 	$(CC) $(CFLAGS) -c -o $@ $<
@@ -109,11 +112,15 @@ int16.c: int.uc ../unroll.awk
 int32.c: int.uc ../unroll.awk
 	$(AWK) ../unroll.awk -vN=32 < int.uc > $@
 
+tilegx8.c: tilegx.uc ../unroll.awk
+	$(AWK) ../unroll.awk -vN=8 < tilegx.uc > $@
+
 tables.c: mktables
 	./mktables > tables.c
 
 clean:
 	rm -f *.o *.a mktables mktables.c *.uc int*.c altivec*.c neon*.c tables.c raid6test
+	rm -f tilegx*.c
 
 spotless: clean
 	rm -f *~
diff --git a/lib/raid6/tilegx.uc b/lib/raid6/tilegx.uc
new file mode 100644
index 000000000000..e7c29459cbcd
--- /dev/null
+++ b/lib/raid6/tilegx.uc
@@ -0,0 +1,86 @@
+/* -*- linux-c -*- ------------------------------------------------------- *
+ *
+ *   Copyright 2002 H. Peter Anvin - All Rights Reserved
+ *   Copyright 2012 Tilera Corporation - All Rights Reserved
+ *
+ *   This program is free software; you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, Inc., 53 Temple Place Ste 330,
+ *   Boston MA 02111-1307, USA; either version 2 of the License, or
+ *   (at your option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * tilegx$#.c
+ *
+ * $#-way unrolled TILE-Gx SIMD for RAID-6 math.
+ *
+ * This file is postprocessed using unroll.awk.
+ *
+ */
+
+#include <linux/raid/pq.h>
+
+/* Create 8 byte copies of constant byte */
+# define NBYTES(x) (__insn_v1addi(0, x))
+# define NSIZE  8
+
+/*
+ * The SHLBYTE() operation shifts each byte left by 1, *not*
+ * rolling over into the next byte
+ */
+static inline __attribute_const__ u64 SHLBYTE(u64 v)
+{
+	/* Vector One Byte Shift Left Immediate. */
+	return __insn_v1shli(v, 1);
+}
+
+/*
+ * The MASK() operation returns 0xFF in any byte for which the high
+ * bit is 1, 0x00 for any byte for which the high bit is 0.
+ */
+static inline __attribute_const__ u64 MASK(u64 v)
+{
+	/* Vector One Byte Shift Right Signed Immediate. */
+	return __insn_v1shrsi(v, 7);
+}
+
+
+void raid6_tilegx$#_gen_syndrome(int disks, size_t bytes, void **ptrs)
+{
+	u8 **dptr = (u8 **)ptrs;
+	u64 *p, *q;
+	int d, z, z0;
+
+	u64 wd$$, wq$$, wp$$, w1$$, w2$$;
+	u64 x1d = NBYTES(0x1d);
+	u64 * z0ptr;
+
+	z0 = disks - 3;			/* Highest data disk */
+	p = (u64 *)dptr[z0+1];	/* XOR parity */
+	q = (u64 *)dptr[z0+2];	/* RS syndrome */
+
+	z0ptr = (u64 *)&dptr[z0][0];
+	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
+		wq$$ = wp$$ = *z0ptr++;
+		for ( z = z0-1 ; z >= 0 ; z-- ) {
+			wd$$ = *(u64 *)&dptr[z][d+$$*NSIZE];
+			wp$$ = wp$$ ^ wd$$;
+			w2$$ = MASK(wq$$);
+			w1$$ = SHLBYTE(wq$$);
+			w2$$ = w2$$ & x1d;
+			w1$$ = w1$$ ^ w2$$;
+			wq$$ = w1$$ ^ wd$$;
+		}
+		*p++ = wp$$;
+		*q++ = wq$$;
+	}
+}
+
+const struct raid6_calls raid6_tilegx$# = {
+	raid6_tilegx$#_gen_syndrome,
+	NULL,
+	"tilegx$#",
+	0
+};
diff --git a/lib/rbtree.c b/lib/rbtree.c
index c0e31fe2fabf..65f4effd117f 100644
--- a/lib/rbtree.c
+++ b/lib/rbtree.c
@@ -518,3 +518,43 @@ void rb_replace_node(struct rb_node *victim, struct rb_node *new,
 	*new = *victim;
 }
 EXPORT_SYMBOL(rb_replace_node);
+
+static struct rb_node *rb_left_deepest_node(const struct rb_node *node)
+{
+	for (;;) {
+		if (node->rb_left)
+			node = node->rb_left;
+		else if (node->rb_right)
+			node = node->rb_right;
+		else
+			return (struct rb_node *)node;
+	}
+}
+
+struct rb_node *rb_next_postorder(const struct rb_node *node)
+{
+	const struct rb_node *parent;
+	if (!node)
+		return NULL;
+	parent = rb_parent(node);
+
+	/* If we're sitting on node, we've already seen our children */
+	if (parent && node == parent->rb_left && parent->rb_right) {
+		/* If we are the parent's left node, go to the parent's right
+		 * node then all the way down to the left */
+		return rb_left_deepest_node(parent->rb_right);
+	} else
+		/* Otherwise we are the parent's right node, and the parent
+		 * should be next */
+		return (struct rb_node *)parent;
+}
+EXPORT_SYMBOL(rb_next_postorder);
+
+struct rb_node *rb_first_postorder(const struct rb_root *root)
+{
+	if (!root->rb_node)
+		return NULL;
+
+	return rb_left_deepest_node(root->rb_node);
+}
+EXPORT_SYMBOL(rb_first_postorder);
diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c
index 122f02f9941b..31dd4ccd3baa 100644
--- a/lib/rbtree_test.c
+++ b/lib/rbtree_test.c
@@ -114,6 +114,16 @@ static int black_path_count(struct rb_node *rb)
 	return count;
 }
 
+static void check_postorder(int nr_nodes)
+{
+	struct rb_node *rb;
+	int count = 0;
+	for (rb = rb_first_postorder(&root); rb; rb = rb_next_postorder(rb))
+		count++;
+
+	WARN_ON_ONCE(count != nr_nodes);
+}
+
 static void check(int nr_nodes)
 {
 	struct rb_node *rb;
@@ -136,6 +146,8 @@ static void check(int nr_nodes)
 
 	WARN_ON_ONCE(count != nr_nodes);
 	WARN_ON_ONCE(count < (1 << black_path_count(rb_last(&root))) - 1);
+
+	check_postorder(nr_nodes);
 }
 
 static void check_augmented(int nr_nodes)