aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/lib
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/lib')
-rw-r--r--arch/x86/lib/Makefile5
-rw-r--r--arch/x86/lib/Makefile_3211
-rw-r--r--arch/x86/lib/Makefile_6413
-rw-r--r--arch/x86/lib/bitops_32.c70
-rw-r--r--arch/x86/lib/bitops_64.c175
-rw-r--r--arch/x86/lib/bitstr_64.c28
-rw-r--r--arch/x86/lib/checksum_32.S546
-rw-r--r--arch/x86/lib/clear_page_64.S59
-rw-r--r--arch/x86/lib/copy_page_64.S119
-rw-r--r--arch/x86/lib/copy_user_64.S354
-rw-r--r--arch/x86/lib/copy_user_nocache_64.S217
-rw-r--r--arch/x86/lib/csum-copy_64.S249
-rw-r--r--arch/x86/lib/csum-partial_64.c150
-rw-r--r--arch/x86/lib/csum-wrappers_64.c135
-rw-r--r--arch/x86/lib/delay_32.c103
-rw-r--r--arch/x86/lib/delay_64.c57
-rw-r--r--arch/x86/lib/getuser_32.S78
-rw-r--r--arch/x86/lib/getuser_64.S109
-rw-r--r--arch/x86/lib/io_64.c23
-rw-r--r--arch/x86/lib/iomap_copy_64.S30
-rw-r--r--arch/x86/lib/memcpy_32.c43
-rw-r--r--arch/x86/lib/memcpy_64.S131
-rw-r--r--arch/x86/lib/memmove_64.c21
-rw-r--r--arch/x86/lib/memset_64.S133
-rw-r--r--arch/x86/lib/mmx_32.c403
-rw-r--r--arch/x86/lib/msr-on-cpu.c119
-rw-r--r--arch/x86/lib/putuser_32.S98
-rw-r--r--arch/x86/lib/putuser_64.S106
-rw-r--r--arch/x86/lib/rwlock_64.S38
-rw-r--r--arch/x86/lib/semaphore_32.S219
-rw-r--r--arch/x86/lib/string_32.c257
-rw-r--r--arch/x86/lib/strstr_32.c31
-rw-r--r--arch/x86/lib/thunk_64.S67
-rw-r--r--arch/x86/lib/usercopy_32.c882
-rw-r--r--arch/x86/lib/usercopy_64.c166
35 files changed, 5245 insertions, 0 deletions
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
new file mode 100644
index 000000000000..329da276c6f1
--- /dev/null
+++ b/arch/x86/lib/Makefile
@@ -0,0 +1,5 @@
+ifeq ($(CONFIG_X86_32),y)
+include ${srctree}/arch/x86/lib/Makefile_32
+else
+include ${srctree}/arch/x86/lib/Makefile_64
+endif
diff --git a/arch/x86/lib/Makefile_32 b/arch/x86/lib/Makefile_32
new file mode 100644
index 000000000000..98d1f1e2e2ef
--- /dev/null
+++ b/arch/x86/lib/Makefile_32
@@ -0,0 +1,11 @@
+#
+# Makefile for i386-specific library files.
+#
+
+
+lib-y = checksum_32.o delay_32.o usercopy_32.o getuser_32.o putuser_32.o memcpy_32.o strstr_32.o \
+ bitops_32.o semaphore_32.o string_32.o
+
+lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
+
+obj-$(CONFIG_SMP) += msr-on-cpu.o
diff --git a/arch/x86/lib/Makefile_64 b/arch/x86/lib/Makefile_64
new file mode 100644
index 000000000000..bbabad3c9335
--- /dev/null
+++ b/arch/x86/lib/Makefile_64
@@ -0,0 +1,13 @@
+#
+# Makefile for x86_64-specific library files.
+#
+
+CFLAGS_csum-partial_64.o := -funroll-loops
+
+obj-y := io_64.o iomap_copy_64.o
+obj-$(CONFIG_SMP) += msr-on-cpu.o
+
+lib-y := csum-partial_64.o csum-copy_64.o csum-wrappers_64.o delay_64.o \
+ usercopy_64.o getuser_64.o putuser_64.o \
+ thunk_64.o clear_page_64.o copy_page_64.o bitstr_64.o bitops_64.o
+lib-y += memcpy_64.o memmove_64.o memset_64.o copy_user_64.o rwlock_64.o copy_user_nocache_64.o
diff --git a/arch/x86/lib/bitops_32.c b/arch/x86/lib/bitops_32.c
new file mode 100644
index 000000000000..afd0045595d4
--- /dev/null
+++ b/arch/x86/lib/bitops_32.c
@@ -0,0 +1,70 @@
+#include <linux/bitops.h>
+#include <linux/module.h>
+
+/**
+ * find_next_bit - find the next set bit in a memory region
+ * @addr: The address to base the search on
+ * @size: The maximum size to search
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns the bit number of the first set bit at or after @offset. When
+ * @offset is not a multiple of 32 the rest of that word is examined here;
+ * everything after it is delegated to find_first_bit(), whose result is
+ * added to the number of bits already skipped.
+ */
+int find_next_bit(const unsigned long *addr, int size, int offset)
+{
+ const unsigned long *p = addr + (offset >> 5);
+ int set = 0, bit = offset & 31, res;
+
+ if (bit) {
+ /*
+ * Look for nonzero in the first 32 bits:
+ * bsfl sets ZF when its source operand is zero, in which case
+ * the result is forced to 32 ("nothing found in this word").
+ */
+ __asm__("bsfl %1,%0\n\t"
+ "jne 1f\n\t"
+ "movl $32, %0\n"
+ "1:"
+ : "=r" (set)
+ : "r" (*p >> bit));
+ if (set < (32 - bit))
+ return set + offset;
+ /* Nothing in this word: account for the bits skipped, move on. */
+ set = 32 - bit;
+ p++;
+ }
+ /*
+ * No set bit yet, search remaining full words for a bit
+ */
+ res = find_first_bit (p, size - 32 * (p - addr));
+ return (offset + set + res);
+}
+EXPORT_SYMBOL(find_next_bit);
+
+/**
+ * find_next_zero_bit - find the next zero bit in a memory region
+ * @addr: The address to base the search on
+ * @size: The maximum size to search
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns the bit number of the first zero bit at or after @offset. When
+ * @offset is not a multiple of 32 the rest of that word is examined here;
+ * everything after it is delegated to find_first_zero_bit(), whose result
+ * is added to the number of bits already skipped.
+ */
+int find_next_zero_bit(const unsigned long *addr, int size, int offset)
+{
+ const unsigned long *p = addr + (offset >> 5);
+ int set = 0, bit = offset & 31, res;
+
+ if (bit) {
+ /*
+ * Look for zero in the first 32 bits.
+ * The word is shifted and then inverted, so the top @bit
+ * positions read as ones; bsfl may therefore report a position
+ * at or beyond 32 - bit, which the range check below rejects.
+ */
+ __asm__("bsfl %1,%0\n\t"
+ "jne 1f\n\t"
+ "movl $32, %0\n"
+ "1:"
+ : "=r" (set)
+ : "r" (~(*p >> bit)));
+ if (set < (32 - bit))
+ return set + offset;
+ /* Nothing in this word: account for the bits skipped, move on. */
+ set = 32 - bit;
+ p++;
+ }
+ /*
+ * No zero yet, search remaining full words for a zero
+ */
+ res = find_first_zero_bit(p, size - 32 * (p - addr));
+ return (offset + set + res);
+}
+EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86/lib/bitops_64.c b/arch/x86/lib/bitops_64.c
new file mode 100644
index 000000000000..95b6d9639fba
--- /dev/null
+++ b/arch/x86/lib/bitops_64.c
@@ -0,0 +1,175 @@
+#include <linux/bitops.h>
+
+#undef find_first_zero_bit
+#undef find_next_zero_bit
+#undef find_first_bit
+#undef find_next_bit
+
+/*
+ * __find_first_zero_bit - scan 64-bit words for the first zero bit
+ * @addr: first word to scan
+ * @size: number of bits to scan; rounded up to whole 64-bit words
+ *
+ * Returns the bit index, relative to @addr, of the first zero bit in the
+ * scanned words. If every scanned word is all ones, the index just past
+ * the last scanned word is returned. If the rounded word count is zero,
+ * 0 is returned without reading memory.
+ */
+static inline long
+__find_first_zero_bit(const unsigned long * addr, unsigned long size)
+{
+ long d0, d1, d2;
+ long res;
+
+ /*
+ * We must test the size in words, not in bits, because
+ * otherwise incoming sizes in the range -63..-1 will not run
+ * any scasq instructions, and then the flags used by the je
+ * instruction will have whatever random value was in place
+ * before. Nobody should call us like that, but
+ * find_next_zero_bit() does when offset and size are at the
+ * same word and it fails to find a zero itself.
+ */
+ size += 63;
+ size >>= 6;
+ if (!size)
+ return 0;
+ /*
+ * rax is preloaded with -1 and rdx with 0. repe scasq stops at the
+ * first word that differs from -1; the xorq then leaves a 1 in rax
+ * for every zero bit of that word and bsfq picks the lowest one.
+ * Label 1 converts the byte distance from addr into a bit count and
+ * adds it to rdx.
+ */
+ asm volatile(
+ " repe; scasq\n"
+ " je 1f\n"
+ " xorq -8(%%rdi),%%rax\n"
+ " subq $8,%%rdi\n"
+ " bsfq %%rax,%%rdx\n"
+ "1: subq %[addr],%%rdi\n"
+ " shlq $3,%%rdi\n"
+ " addq %%rdi,%%rdx"
+ :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2)
+ :"0" (0ULL), "1" (size), "2" (addr), "3" (-1ULL),
+ [addr] "S" (addr) : "memory");
+ /*
+ * Any register would do for [addr] above, but GCC tends to
+ * prefer rbx over rsi, even though rsi is readily available
+ * and doesn't have to be saved.
+ */
+ return res;
+}
+
+/**
+ * find_first_zero_bit - find the first zero bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first zero bit, not the number of the byte
+ * containing a bit.
+ *
+ * Out-of-line wrapper around the inline helper __find_first_zero_bit().
+ */
+long find_first_zero_bit(const unsigned long * addr, unsigned long size)
+{
+ return __find_first_zero_bit (addr, size);
+}
+
+/**
+ * find_next_zero_bit - find the next zero bit in a memory region
+ * @addr: The address to base the search on
+ * @size: The maximum size to search
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns the bit number of the first zero bit at or after @offset. If
+ * none is found the value returned is whatever offset + skipped bits +
+ * __find_first_zero_bit() adds up to, which is not below the point where
+ * the word scan stopped.
+ */
+long find_next_zero_bit (const unsigned long * addr, long size, long offset)
+{
+ const unsigned long * p = addr + (offset >> 6);
+ unsigned long set = 0;
+ unsigned long res, bit = offset&63;
+
+ if (bit) {
+ /*
+ * Look for zero in first word
+ * (cmoveq substitutes 64 when bsfq saw an all-zero source,
+ * i.e. when the inverted, shifted word had no bit set).
+ */
+ asm("bsfq %1,%0\n\t"
+ "cmoveq %2,%0"
+ : "=r" (set)
+ : "r" (~(*p >> bit)), "r"(64L));
+ if (set < (64 - bit))
+ return set + offset;
+ /* Nothing in this word: account for the bits skipped, move on. */
+ set = 64 - bit;
+ p++;
+ }
+ /*
+ * No zero yet, search remaining full words for a zero
+ */
+ res = __find_first_zero_bit (p, size - 64 * (p - addr));
+
+ return (offset + set + res);
+}
+
+/*
+ * __find_first_bit - scan 64-bit words for the first set bit
+ * @addr: first word to scan
+ * @size: number of bits to scan; rounded up to whole 64-bit words
+ *
+ * Returns the bit index, relative to @addr, of the first set bit in the
+ * scanned words. If every scanned word is zero, the index just past the
+ * last scanned word is returned. If the rounded word count is zero, 0 is
+ * returned without reading memory.
+ */
+static inline long
+__find_first_bit(const unsigned long * addr, unsigned long size)
+{
+ long d0, d1;
+ long res;
+
+ /*
+ * We must test the size in words, not in bits, because
+ * otherwise incoming sizes in the range -63..-1 will not run
+ * any scasq instructions, and then the flags used by the jz
+ * instruction will have whatever random value was in place
+ * before. Nobody should call us like that, but
+ * find_next_bit() does when offset and size are at the same
+ * word and it fails to find a one itself.
+ */
+ size += 63;
+ size >>= 6;
+ if (!size)
+ return 0;
+ /*
+ * rax is preloaded with 0, so repe scasq stops at the first nonzero
+ * word. rdi is stepped back onto that word and bsfq yields its
+ * lowest set bit in rax. Label 1 converts the byte distance from
+ * addr into a bit count and adds it to rax (which is still 0 when
+ * no nonzero word was found).
+ */
+ asm volatile(
+ " repe; scasq\n"
+ " jz 1f\n"
+ " subq $8,%%rdi\n"
+ " bsfq (%%rdi),%%rax\n"
+ "1: subq %[addr],%%rdi\n"
+ " shlq $3,%%rdi\n"
+ " addq %%rdi,%%rax"
+ :"=a" (res), "=&c" (d0), "=&D" (d1)
+ :"0" (0ULL), "1" (size), "2" (addr),
+ [addr] "r" (addr) : "memory");
+ return res;
+}
+
+/**
+ * find_first_bit - find the first set bit in a memory region
+ * @addr: The address to start the search at
+ * @size: The maximum size to search
+ *
+ * Returns the bit-number of the first set bit, not the number of the byte
+ * containing a bit.
+ *
+ * Out-of-line wrapper around the inline helper __find_first_bit().
+ */
+long find_first_bit(const unsigned long * addr, unsigned long size)
+{
+ return __find_first_bit(addr,size);
+}
+
+/**
+ * find_next_bit - find the next set bit in a memory region
+ * @addr: The address to base the search on
+ * @size: The maximum size to search
+ * @offset: The bitnumber to start searching at
+ *
+ * Returns the bit number of the first set bit at or after @offset. If
+ * none is found the value returned is whatever offset + skipped bits +
+ * __find_first_bit() adds up to, which is not below the point where the
+ * word scan stopped.
+ */
+long find_next_bit(const unsigned long * addr, long size, long offset)
+{
+ const unsigned long * p = addr + (offset >> 6);
+ unsigned long set = 0, bit = offset & 63, res;
+
+ if (bit) {
+ /*
+ * Look for nonzero in the first 64 bits:
+ * (cmoveq substitutes 64 when bsfq saw an all-zero source)
+ */
+ asm("bsfq %1,%0\n\t"
+ "cmoveq %2,%0\n\t"
+ : "=r" (set)
+ : "r" (*p >> bit), "r" (64L));
+ if (set < (64 - bit))
+ return set + offset;
+ /* Nothing in this word: account for the bits skipped, move on. */
+ set = 64 - bit;
+ p++;
+ }
+ /*
+ * No set bit yet, search remaining full words for a bit
+ */
+ res = __find_first_bit (p, size - 64 * (p - addr));
+ return (offset + set + res);
+}
+
+#include <linux/module.h>
+
+EXPORT_SYMBOL(find_next_bit);
+EXPORT_SYMBOL(find_first_bit);
+EXPORT_SYMBOL(find_first_zero_bit);
+EXPORT_SYMBOL(find_next_zero_bit);
diff --git a/arch/x86/lib/bitstr_64.c b/arch/x86/lib/bitstr_64.c
new file mode 100644
index 000000000000..24676609a6ac
--- /dev/null
+++ b/arch/x86/lib/bitstr_64.c
@@ -0,0 +1,28 @@
+#include <linux/module.h>
+#include <linux/bitops.h>
+
+/**
+ * find_next_zero_string - find a run of consecutive zero bits in a bitmap
+ * @bitmap: bitmap to search
+ * @start: bit number to start searching at
+ * @nbits: number of valid bits in @bitmap
+ * @len: length of the run of zero bits wanted
+ *
+ * Returns the bit number of the first bit of the run, or -1 (converted to
+ * unsigned long) if no run of @len zero bits fits inside the first @nbits
+ * bits. A run that ends exactly on the last valid bit is accepted.
+ */
+unsigned long
+find_next_zero_string(unsigned long *bitmap, long start, long nbits, int len)
+{
+	unsigned long n, end, i;
+
+ again:
+	n = find_next_zero_bit(bitmap, nbits, start);
+	/*
+	 * find_next_zero_bit() signals "no zero bit" with a value at or
+	 * beyond nbits, not with -1.
+	 */
+	if (n >= (unsigned long)nbits)
+		return -1;
+
+	/* could test bitsliced, but it's hardly worth it */
+	end = n+len;
+	/* end is one past the run, so end == nbits still fits */
+	if (end > nbits)
+		return -1;
+	for (i = n+1; i < end; i++) {
+		if (test_bit(i, bitmap)) {
+			/* run broken at bit i: restart just behind it */
+			start = i+1;
+			goto again;
+		}
+	}
+	return n;
+}
+
+EXPORT_SYMBOL(find_next_zero_string);
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
new file mode 100644
index 000000000000..adbccd0bbb78
--- /dev/null
+++ b/arch/x86/lib/checksum_32.S
@@ -0,0 +1,546 @@
+/*
+ * INET An implementation of the TCP/IP protocol suite for the LINUX
+ * operating system. INET is implemented using the BSD Socket
+ * interface as the means of communication with the user level.
+ *
+ * IP/TCP/UDP checksumming routines
+ *
+ * Authors: Jorge Cwik, <jorge@laser.satlink.net>
+ * Arnt Gulbrandsen, <agulbra@nvg.unit.no>
+ * Tom May, <ftom@netcom.com>
+ * Pentium Pro/II routines:
+ * Alexander Kjeldaas <astor@guardian.no>
+ * Finn Arne Gangstad <finnag@guardian.no>
+ * Lots of code moved from tcp.c and ip.c; see those files
+ * for more names.
+ *
+ * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception
+ * handling.
+ * Andi Kleen, add zeroing on error
+ * converted to pure assembler
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/errno.h>
+
+/*
+ * computes a partial checksum, e.g. for TCP/UDP fragments
+ */
+
+/*
+unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
+ */
+
+.text
+
+#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
+
+ /*
+ * Experiments with Ethernet and SLIP connections show that buff
+ * is aligned on either a 2-byte or 4-byte boundary. We get at
+ * least a twofold speedup on 486 and Pentium if it is 4-byte aligned.
+ * Fortunately, it is easy to convert 2-byte alignment to 4-byte
+ * alignment for the unrolled loop.
+ *
+ * Stack arguments (after the two pushes below): buff at 12(%esp),
+ * len at 16(%esp), sum at 20(%esp). The running sum lives in %eax and
+ * is what is left there at the ret. When buff starts on an odd address
+ * the sum is rotated by 8 bits after the leading byte and rotated
+ * again at label 7, so the byte lanes line up.
+ */
+ENTRY(csum_partial)
+ CFI_STARTPROC
+ pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
+ movl 20(%esp),%eax # Function arg: unsigned int sum
+ movl 16(%esp),%ecx # Function arg: int len
+ movl 12(%esp),%esi # Function arg: unsigned char *buff
+ testl $3, %esi # Check alignment.
+ jz 2f # Jump if alignment is ok.
+ testl $1, %esi # Check alignment.
+ jz 10f # Jump if alignment is boundary of 2bytes.
+
+ # buf is odd
+ dec %ecx
+ jl 8f
+ movzbl (%esi), %ebx
+ adcl %ebx, %eax
+ roll $8, %eax
+ inc %esi
+ testl $2, %esi
+ jz 2f
+10:
+ subl $2, %ecx # Alignment uses up two bytes.
+ jae 1f # Jump if we had at least two bytes.
+ addl $2, %ecx # ecx was < 2. Deal with it.
+ jmp 4f
+1: movw (%esi), %bx
+ addl $2, %esi
+ addw %bx, %ax
+ adcl $0, %eax
+2:
+ movl %ecx, %edx # keep the byte count for the tail handling
+ shrl $5, %ecx # number of 32-byte blocks
+ jz 2f
+ testl %esi, %esi # clears CF before the adcl chain
+1: movl (%esi), %ebx
+ adcl %ebx, %eax
+ movl 4(%esi), %ebx
+ adcl %ebx, %eax
+ movl 8(%esi), %ebx
+ adcl %ebx, %eax
+ movl 12(%esi), %ebx
+ adcl %ebx, %eax
+ movl 16(%esi), %ebx
+ adcl %ebx, %eax
+ movl 20(%esi), %ebx
+ adcl %ebx, %eax
+ movl 24(%esi), %ebx
+ adcl %ebx, %eax
+ movl 28(%esi), %ebx
+ adcl %ebx, %eax
+ lea 32(%esi), %esi
+ dec %ecx
+ jne 1b
+ adcl $0, %eax
+2: movl %edx, %ecx
+ andl $0x1c, %edx # bytes left in whole dwords (0..28)
+ je 4f
+ shrl $2, %edx # This clears CF
+3: adcl (%esi), %eax
+ lea 4(%esi), %esi
+ dec %edx
+ jne 3b
+ adcl $0, %eax
+4: andl $3, %ecx # 0..3 trailing bytes
+ jz 7f
+ cmpl $2, %ecx
+ jb 5f
+ movw (%esi),%cx
+ leal 2(%esi),%esi
+ je 6f
+ shll $16,%ecx
+5: movb (%esi),%cl
+6: addl %ecx,%eax
+ adcl $0, %eax
+7:
+ testl $1, 12(%esp) # was the original buff odd?
+ jz 8f
+ roll $8, %eax
+8:
+ popl %ebx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebx
+ popl %esi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE esi
+ ret
+ CFI_ENDPROC
+ENDPROC(csum_partial)
+
+#else
+
+/*
+ * Version for PentiumII/PPro
+ *
+ * Same stack arguments as the other variant: buff at 12(%esp), len at
+ * 16(%esp), sum at 20(%esp) once %esi and %ebx have been pushed; the sum
+ * is accumulated in %eax. The main loop is a 32-entry unrolled adcl table
+ * (128 bytes per pass) that is entered part-way through by a computed
+ * jump so that the first pass handles the (len & 0x7c) odd bytes.
+ */
+
+ENTRY(csum_partial)
+ CFI_STARTPROC
+ pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
+ movl 20(%esp),%eax # Function arg: unsigned int sum
+ movl 16(%esp),%ecx # Function arg: int len
+ movl 12(%esp),%esi # Function arg: const unsigned char *buf
+
+ testl $3, %esi
+ jnz 25f
+10:
+ movl %ecx, %edx # keep len for the 1-3 byte tail
+ movl %ecx, %ebx
+ andl $0x7c, %ebx # bytes handled by the partial first pass
+ shrl $7, %ecx # number of full 128-byte passes
+ addl %ebx,%esi
+ shrl $2, %ebx
+ negl %ebx
+ /* presumably 3 = encoded size of one "adcl disp8(%esi),%eax" */
+ lea 45f(%ebx,%ebx,2), %ebx
+ testl %esi, %esi # clears CF before entering the adcl table
+ jmp *%ebx
+
+ # Handle 2-byte-aligned regions
+20: addw (%esi), %ax
+ lea 2(%esi), %esi
+ adcl $0, %eax
+ jmp 10b
+25:
+ testl $1, %esi
+ jz 30f
+ # buf is odd
+ dec %ecx
+ jl 90f
+ movzbl (%esi), %ebx
+ addl %ebx, %eax
+ adcl $0, %eax
+ roll $8, %eax
+ inc %esi
+ testl $2, %esi
+ jz 10b
+
+30: subl $2, %ecx
+ ja 20b
+ je 32f
+ addl $2, %ecx
+ jz 80f
+ movzbl (%esi),%ebx # csumming 1 byte, 2-aligned
+ addl %ebx, %eax
+ adcl $0, %eax
+ jmp 80f
+32:
+ addw (%esi), %ax # csumming 2 bytes, 2-aligned
+ adcl $0, %eax
+ jmp 80f
+
+40:
+ addl -128(%esi), %eax
+ adcl -124(%esi), %eax
+ adcl -120(%esi), %eax
+ adcl -116(%esi), %eax
+ adcl -112(%esi), %eax
+ adcl -108(%esi), %eax
+ adcl -104(%esi), %eax
+ adcl -100(%esi), %eax
+ adcl -96(%esi), %eax
+ adcl -92(%esi), %eax
+ adcl -88(%esi), %eax
+ adcl -84(%esi), %eax
+ adcl -80(%esi), %eax
+ adcl -76(%esi), %eax
+ adcl -72(%esi), %eax
+ adcl -68(%esi), %eax
+ adcl -64(%esi), %eax
+ adcl -60(%esi), %eax
+ adcl -56(%esi), %eax
+ adcl -52(%esi), %eax
+ adcl -48(%esi), %eax
+ adcl -44(%esi), %eax
+ adcl -40(%esi), %eax
+ adcl -36(%esi), %eax
+ adcl -32(%esi), %eax
+ adcl -28(%esi), %eax
+ adcl -24(%esi), %eax
+ adcl -20(%esi), %eax
+ adcl -16(%esi), %eax
+ adcl -12(%esi), %eax
+ adcl -8(%esi), %eax
+ adcl -4(%esi), %eax
+45:
+ lea 128(%esi), %esi
+ adcl $0, %eax
+ dec %ecx
+ jge 40b
+ movl %edx, %ecx
+50: andl $3, %ecx
+ jz 80f
+
+ # Handle the last 1-3 bytes without jumping
+ notl %ecx # 1->2, 2->1, 3->0, higher bits are masked
+ movl $0xffffff,%ebx # by the shll and shrl instructions
+ shll $3,%ecx
+ shrl %cl,%ebx
+ andl -128(%esi),%ebx # esi is 4-aligned so should be ok
+ addl %ebx,%eax
+ adcl $0,%eax
+80:
+ testl $1, 12(%esp) # was the original buf odd?
+ jz 90f
+ roll $8, %eax
+90:
+ popl %ebx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebx
+ popl %esi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE esi
+ ret
+ CFI_ENDPROC
+ENDPROC(csum_partial)
+
+#endif
+
+/*
+unsigned int csum_partial_copy_generic (const char *src, char *dst,
+ int len, int sum, int *src_err_ptr, int *dst_err_ptr)
+ */
+
+/*
+ * Copy from ds while checksumming, otherwise like csum_partial
+ *
+ * The macros SRC and DST specify the type of access for the instruction.
+ * Thus we can call a custom exception handler for all access types.
+ *
+ * Each macro emits the wrapped instruction at local label 9999 and adds an
+ * __ex_table entry pairing that address with a fixup label: 6001 for SRC
+ * (fault while reading the source) and 6002 for DST (fault while writing
+ * the destination). The 6001/6002 labels are provided by the functions
+ * that use these macros.
+ *
+ * FIXME: could someone double-check whether I haven't mixed up some SRC and
+ * DST definitions? It's damn hard to trigger all cases. I hope I got
+ * them all but there's no guarantee.
+ */
+
+#define SRC(y...) \
+ 9999: y; \
+ .section __ex_table, "a"; \
+ .long 9999b, 6001f ; \
+ .previous
+
+#define DST(y...) \
+ 9999: y; \
+ .section __ex_table, "a"; \
+ .long 9999b, 6002f ; \
+ .previous
+
+#ifndef CONFIG_X86_USE_PPRO_CHECKSUM
+
+/*
+ * Stack layout after the prologue below: three saved registers at 0..8,
+ * one scratch dword at FP (12), the return address at ARGBASE (16), then
+ * the arguments: src at ARGBASE+4, dst at ARGBASE+8, len at ARGBASE+12,
+ * sum at ARGBASE+16, src_err_ptr at ARGBASE+20, dst_err_ptr at ARGBASE+24.
+ */
+#define ARGBASE 16
+#define FP 12
+
+ENTRY(csum_partial_copy_generic)
+ CFI_STARTPROC
+ subl $4,%esp
+ CFI_ADJUST_CFA_OFFSET 4
+ pushl %edi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edi, 0
+ pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
+ movl ARGBASE+16(%esp),%eax # sum
+ movl ARGBASE+12(%esp),%ecx # len
+ movl ARGBASE+4(%esp),%esi # src
+ movl ARGBASE+8(%esp),%edi # dst
+
+ testl $2, %edi # Check alignment.
+ jz 2f # Jump if alignment is ok.
+ subl $2, %ecx # Alignment uses up two bytes.
+ jae 1f # Jump if we had at least two bytes.
+ addl $2, %ecx # ecx was < 2. Deal with it.
+ jmp 4f
+SRC(1: movw (%esi), %bx )
+ addl $2, %esi
+DST( movw %bx, (%edi) )
+ addl $2, %edi
+ addw %bx, %ax
+ adcl $0, %eax
+2:
+ movl %ecx, FP(%esp) # remember the byte count for the tail
+ shrl $5, %ecx # number of 32-byte blocks
+ jz 2f
+ testl %esi, %esi # clears CF before the adcl chain
+SRC(1: movl (%esi), %ebx )
+SRC( movl 4(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, (%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 4(%edi) )
+
+SRC( movl 8(%esi), %ebx )
+SRC( movl 12(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, 8(%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 12(%edi) )
+
+SRC( movl 16(%esi), %ebx )
+SRC( movl 20(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, 16(%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 20(%edi) )
+
+SRC( movl 24(%esi), %ebx )
+SRC( movl 28(%esi), %edx )
+ adcl %ebx, %eax
+DST( movl %ebx, 24(%edi) )
+ adcl %edx, %eax
+DST( movl %edx, 28(%edi) )
+
+ lea 32(%esi), %esi
+ lea 32(%edi), %edi
+ dec %ecx
+ jne 1b
+ adcl $0, %eax
+2: movl FP(%esp), %edx
+ movl %edx, %ecx
+ andl $0x1c, %edx # bytes left in whole dwords (0..28)
+ je 4f
+ shrl $2, %edx # This clears CF
+SRC(3: movl (%esi), %ebx )
+ adcl %ebx, %eax
+DST( movl %ebx, (%edi) )
+ lea 4(%esi), %esi
+ lea 4(%edi), %edi
+ dec %edx
+ jne 3b
+ adcl $0, %eax
+4: andl $3, %ecx # 0..3 trailing bytes
+ jz 7f
+ cmpl $2, %ecx
+ jb 5f
+SRC( movw (%esi), %cx )
+ leal 2(%esi), %esi
+DST( movw %cx, (%edi) )
+ leal 2(%edi), %edi
+ je 6f
+ shll $16,%ecx
+SRC(5: movb (%esi), %cl )
+DST( movb %cl, (%edi) )
+6: addl %ecx, %eax
+ adcl $0, %eax
+7:
+5000:
+
+# Exception handler:
+/* Both handlers store -EFAULT through the matching error pointer and then
+ resume at 5000, i.e. at the epilogue that follows this section. */
+.section .fixup, "ax"
+
+6001:
+ movl ARGBASE+20(%esp), %ebx # src_err_ptr
+ movl $-EFAULT, (%ebx)
+
+ # zero the complete destination - computing the rest
+ # is too much work
+ movl ARGBASE+8(%esp), %edi # dst
+ movl ARGBASE+12(%esp), %ecx # len
+ xorl %eax,%eax
+ rep ; stosb
+
+ jmp 5000b
+
+6002:
+ movl ARGBASE+24(%esp), %ebx # dst_err_ptr
+ movl $-EFAULT,(%ebx)
+ jmp 5000b
+
+.previous
+
+ popl %ebx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebx
+ popl %esi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE esi
+ popl %edi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edi
+ popl %ecx # equivalent to addl $4,%esp
+ CFI_ADJUST_CFA_OFFSET -4
+ ret
+ CFI_ENDPROC
+ENDPROC(csum_partial_copy_generic)
+
+#else
+
+/*
+ * Version for PentiumII/PPro
+ *
+ * ROUND1/ROUND copy one dword from x(%esi) to x(%edi) through %ebx and add
+ * it into %eax; ROUND1 starts a carry chain with addl, ROUND continues it
+ * with adcl. Both the load and the store are covered by the SRC/DST
+ * exception-table macros.
+ */
+
+#define ROUND1(x) \
+ SRC(movl x(%esi), %ebx ) ; \
+ addl %ebx, %eax ; \
+ DST(movl %ebx, x(%edi) ) ;
+
+#define ROUND(x) \
+ SRC(movl x(%esi), %ebx ) ; \
+ adcl %ebx, %eax ; \
+ DST(movl %ebx, x(%edi) ) ;
+
+/* Three registers are pushed below, so the return address sits at
+ ARGBASE (12) and the arguments start at ARGBASE+4. */
+#define ARGBASE 12
+
+ENTRY(csum_partial_copy_generic)
+ CFI_STARTPROC
+ pushl %ebx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ebx, 0
+ pushl %edi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edi, 0
+ pushl %esi
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET esi, 0
+ movl ARGBASE+4(%esp),%esi #src
+ movl ARGBASE+8(%esp),%edi #dst
+ movl ARGBASE+12(%esp),%ecx #len
+ movl ARGBASE+16(%esp),%eax #sum
+# movl %ecx, %edx
+ movl %ecx, %ebx
+ movl %esi, %edx
+ shrl $6, %ecx # number of full 64-byte passes
+ andl $0x3c, %ebx # bytes handled by the partial first pass
+ negl %ebx
+ subl %ebx, %esi
+ subl %ebx, %edi
+ lea -1(%esi),%edx
+ andl $-32,%edx
+ /* presumably 2 * 4 = encoded size of one ROUND per dword copied */
+ lea 3f(%ebx,%ebx), %ebx
+ testl %esi, %esi # clears CF before entering the ROUND table
+ jmp *%ebx
+1: addl $64,%esi
+ addl $64,%edi
+ SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl)
+ ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52)
+ ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36)
+ ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20)
+ ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4)
+3: adcl $0,%eax
+ addl $64, %edx
+ dec %ecx
+ jge 1b
+4: movl ARGBASE+12(%esp),%edx #len
+ andl $3, %edx # 0..3 trailing bytes
+ jz 7f
+ cmpl $2, %edx
+ jb 5f
+SRC( movw (%esi), %dx )
+ leal 2(%esi), %esi
+DST( movw %dx, (%edi) )
+ leal 2(%edi), %edi
+ je 6f
+ shll $16,%edx
+5:
+SRC( movb (%esi), %dl )
+DST( movb %dl, (%edi) )
+6: addl %edx, %eax
+ adcl $0, %eax
+7:
+/* Fault handlers: store -EFAULT through the matching error pointer and
+ resume at label 7, i.e. at the epilogue that follows this section. */
+.section .fixup, "ax"
+6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr
+ movl $-EFAULT, (%ebx)
+ # zero the complete destination (computing the rest is too much work)
+ movl ARGBASE+8(%esp),%edi # dst
+ movl ARGBASE+12(%esp),%ecx # len
+ xorl %eax,%eax
+ rep; stosb
+ jmp 7b
+6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr
+ movl $-EFAULT, (%ebx)
+ jmp 7b
+.previous
+
+ popl %esi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE esi
+ popl %edi
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edi
+ popl %ebx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ebx
+ ret
+ CFI_ENDPROC
+ENDPROC(csum_partial_copy_generic)
+
+#undef ROUND
+#undef ROUND1
+
+#endif
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
new file mode 100644
index 000000000000..9a10a78bb4a4
--- /dev/null
+++ b/arch/x86/lib/clear_page_64.S
@@ -0,0 +1,59 @@
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * Zero a page.
+ * rdi page
+ *
+ * clear_page_c is the string-instruction variant: 4096/8 quadwords are
+ * stored with a single "rep stosq". It is a local label (no ENTRY); the
+ * .altinstructions record at the end of this file patches a jump to it
+ * over the start of clear_page when X86_FEATURE_REP_GOOD is set.
+ */
+ ALIGN
+clear_page_c:
+ CFI_STARTPROC
+ movl $4096/8,%ecx
+ xorl %eax,%eax
+ rep stosq
+ ret
+ CFI_ENDPROC
+ENDPROC(clear_page_c)
+
+/*
+ * clear_page - zero a 4096-byte page at %rdi with an unrolled loop.
+ * Each pass stores eight zero quadwords (64 bytes); 4096/64 passes.
+ */
+ENTRY(clear_page)
+ CFI_STARTPROC
+ xorl %eax,%eax
+ movl $4096/64,%ecx
+ .p2align 4
+.Lloop:
+ /* The flags set by this decl are what the jnz below tests; the
+ movq and leaq in between leave the flags alone. */
+ decl %ecx
+#define PUT(x) movq %rax,x*8(%rdi)
+ movq %rax,(%rdi)
+ PUT(1)
+ PUT(2)
+ PUT(3)
+ PUT(4)
+ PUT(5)
+ PUT(6)
+ PUT(7)
+ leaq 64(%rdi),%rdi
+ jnz .Lloop
+ nop
+ ret
+ CFI_ENDPROC
+.Lclear_page_end:
+ENDPROC(clear_page)
+
+ /* Some CPUs run faster using the string instructions.
+ It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+ /*
+ * Replacement code: a 2-byte short jump. Its displacement is computed
+ * as if the jump already sat at clear_page (target minus the end of
+ * the 2-byte instruction), because that is where it gets copied to.
+ */
+ .section .altinstr_replacement,"ax"
+1: .byte 0xeb /* jmp <disp8> */
+ .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
+2:
+ .previous
+ /* Patch record tying clear_page to the replacement above. */
+ .section .altinstructions,"a"
+ .align 8
+ .quad clear_page /* code to be patched */
+ .quad 1b /* replacement code */
+ .byte X86_FEATURE_REP_GOOD /* feature that selects the replacement */
+ .byte .Lclear_page_end - clear_page /* length of the original code */
+ .byte 2b - 1b /* length of the replacement */
+ .previous
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
new file mode 100644
index 000000000000..727a5d46d2fc
--- /dev/null
+++ b/arch/x86/lib/copy_page_64.S
@@ -0,0 +1,119 @@
+/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * copy_page_c - copy a 4096-byte page from %rsi to %rdi with a single
+ * "rep movsq" (4096/8 quadwords). Local label; the .altinstructions
+ * record at the end of this file patches a jump to it over the start of
+ * copy_page when X86_FEATURE_REP_GOOD is set.
+ */
+ ALIGN
+copy_page_c:
+ CFI_STARTPROC
+ movl $4096/8,%ecx
+ rep movsq
+ ret
+ CFI_ENDPROC
+ENDPROC(copy_page_c)
+
+/* Don't use streaming store because it's better when the target
+ ends up in cache. */
+
+/* Could vary the prefetch distance based on SMP/UP */
+
+/*
+ * copy_page - copy a 4096-byte page from %rsi to %rdi, 64 bytes per pass.
+ *
+ * The first loop runs (4096/64)-5 passes and prefetches 5*64 bytes ahead
+ * of the source pointer; the second loop does the last 5 passes without
+ * prefetching, so no prefetch is issued beyond the end of the source page.
+ *
+ * NOTE(review): %r13 is saved and restored but not used by the body.
+ */
+ENTRY(copy_page)
+ CFI_STARTPROC
+ subq $3*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 3*8
+ movq %rbx,(%rsp)
+ CFI_REL_OFFSET rbx, 0
+ movq %r12,1*8(%rsp)
+ CFI_REL_OFFSET r12, 1*8
+ movq %r13,2*8(%rsp)
+ CFI_REL_OFFSET r13, 2*8
+
+ movl $(4096/64)-5,%ecx
+ .p2align 4
+.Loop64:
+ /* flags from this dec are consumed by the jnz at the loop bottom */
+ dec %rcx
+
+ movq (%rsi), %rax
+ movq 8 (%rsi), %rbx
+ movq 16 (%rsi), %rdx
+ movq 24 (%rsi), %r8
+ movq 32 (%rsi), %r9
+ movq 40 (%rsi), %r10
+ movq 48 (%rsi), %r11
+ movq 56 (%rsi), %r12
+
+ prefetcht0 5*64(%rsi)
+
+ movq %rax, (%rdi)
+ movq %rbx, 8 (%rdi)
+ movq %rdx, 16 (%rdi)
+ movq %r8, 24 (%rdi)
+ movq %r9, 32 (%rdi)
+ movq %r10, 40 (%rdi)
+ movq %r11, 48 (%rdi)
+ movq %r12, 56 (%rdi)
+
+ leaq 64 (%rsi), %rsi
+ leaq 64 (%rdi), %rdi
+
+ jnz .Loop64
+
+ movl $5,%ecx
+ .p2align 4
+.Loop2:
+ /* flags from this decl are consumed by the jnz at the loop bottom */
+ decl %ecx
+
+ movq (%rsi), %rax
+ movq 8 (%rsi), %rbx
+ movq 16 (%rsi), %rdx
+ movq 24 (%rsi), %r8
+ movq 32 (%rsi), %r9
+ movq 40 (%rsi), %r10
+ movq 48 (%rsi), %r11
+ movq 56 (%rsi), %r12
+
+ movq %rax, (%rdi)
+ movq %rbx, 8 (%rdi)
+ movq %rdx, 16 (%rdi)
+ movq %r8, 24 (%rdi)
+ movq %r9, 32 (%rdi)
+ movq %r10, 40 (%rdi)
+ movq %r11, 48 (%rdi)
+ movq %r12, 56 (%rdi)
+
+ leaq 64(%rdi),%rdi
+ leaq 64(%rsi),%rsi
+
+ jnz .Loop2
+
+ movq (%rsp),%rbx
+ CFI_RESTORE rbx
+ movq 1*8(%rsp),%r12
+ CFI_RESTORE r12
+ movq 2*8(%rsp),%r13
+ CFI_RESTORE r13
+ addq $3*8,%rsp
+ CFI_ADJUST_CFA_OFFSET -3*8
+ ret
+.Lcopy_page_end:
+ CFI_ENDPROC
+ENDPROC(copy_page)
+
+ /* Some CPUs run faster using the string copy instructions.
+ It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+ /*
+ * Replacement code: a 2-byte short jump. Its displacement is computed
+ * as if the jump already sat at copy_page (target minus the end of
+ * the 2-byte instruction), because that is where it gets copied to.
+ */
+ .section .altinstr_replacement,"ax"
+1: .byte 0xeb /* jmp <disp8> */
+ .byte (copy_page_c - copy_page) - (2f - 1b) /* offset */
+2:
+ .previous
+ /* Patch record tying copy_page to the replacement above. */
+ .section .altinstructions,"a"
+ .align 8
+ .quad copy_page /* code to be patched */
+ .quad 1b /* replacement code */
+ .byte X86_FEATURE_REP_GOOD /* feature that selects the replacement */
+ .byte .Lcopy_page_end - copy_page /* length of the original code */
+ .byte 2b - 1b /* length of the replacement */
+ .previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
new file mode 100644
index 000000000000..70bebd310408
--- /dev/null
+++ b/arch/x86/lib/copy_user_64.S
@@ -0,0 +1,354 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#define FIX_ALIGNMENT 1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+ /*
+ * ALTERNATIVE_JUMP feature,orig,alt
+ *
+ * Emits a 5-byte near jump to \orig and records, in .altinstructions,
+ * a 5-byte replacement jump to \alt to be patched in when \feature is
+ * set. The replacement's displacement is taken relative to label 1
+ * (the end of the original jump), since that is where the replacement
+ * bytes end up once copied over the original.
+ */
+ .macro ALTERNATIVE_JUMP feature,orig,alt
+0:
+ .byte 0xe9 /* 32bit jump */
+ .long \orig-1f /* by default jump to orig */
+1:
+ .section .altinstr_replacement,"ax"
+2: .byte 0xe9 /* near jump with 32bit immediate */
+ .long \alt-1b /* offset */ /* or alternatively to alt */
+ .previous
+ .section .altinstructions,"a"
+ .align 8
+ .quad 0b /* code to be patched */
+ .quad 2b /* replacement code */
+ .byte \feature /* when feature is set */
+ .byte 5 /* length of the original jump */
+ .byte 5 /* length of the replacement jump */
+ .previous
+ .endm
+
+/*
+ * Standard copy_to_user with segment limit checking
+ *
+ * rdi destination (user), rsi source, rdx count.
+ * The end address rdi+rdx is checked for wrap-around and against the
+ * thread's address limit; on failure bad_to_user returns the full count.
+ * Otherwise control passes to one of the generic copy routines with the
+ * zero flag in %ecx cleared, so a faulting copy does not zero the rest of
+ * the destination.
+ */
+ENTRY(copy_to_user)
+ CFI_STARTPROC
+ GET_THREAD_INFO(%rax)
+ movq %rdi,%rcx
+ addq %rdx,%rcx
+ jc bad_to_user
+ cmpq threadinfo_addr_limit(%rax),%rcx
+ jae bad_to_user
+ /* The generic copy routines take the zero flag in %ecx, which still
+ holds the end address from the limit check above. */
+ xorl %ecx,%ecx /* clear zero flag */
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ CFI_ENDPROC
+
+/*
+ * copy_user_generic - enter the generic copy routines with the zero flag
+ * set (%ecx = 1). No address limit check is done here.
+ * rdi destination, rsi source, rdx count.
+ */
+ENTRY(copy_user_generic)
+ CFI_STARTPROC
+ movl $1,%ecx /* set zero flag */
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ CFI_ENDPROC
+
+/*
+ * __copy_from_user_inatomic - enter the generic copy routines with the
+ * zero flag cleared (%ecx = 0), so the destination is not zeroed on a
+ * fault. No address limit check is done here.
+ * rdi destination, rsi source, rdx count.
+ */
+ENTRY(__copy_from_user_inatomic)
+ CFI_STARTPROC
+ xorl %ecx,%ecx /* clear zero flag */
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ CFI_ENDPROC
+
+/*
+ * Standard copy_from_user with segment limit checking
+ *
+ * rdi destination, rsi source (user), rdx count.
+ * The end address rsi+rdx is checked for wrap-around and against the
+ * thread's address limit; on failure bad_from_user zeroes the destination
+ * and returns the full count. Otherwise the generic copy routines are
+ * entered with the zero flag set (%ecx = 1).
+ */
+ENTRY(copy_from_user)
+ CFI_STARTPROC
+ GET_THREAD_INFO(%rax)
+ movq %rsi,%rcx
+ addq %rdx,%rcx
+ jc bad_from_user
+ cmpq threadinfo_addr_limit(%rax),%rcx
+ jae bad_from_user
+ movl $1,%ecx /* set zero flag */
+ ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
+ CFI_ENDPROC
+ENDPROC(copy_from_user)
+
+ /*
+ * Failure exits for the limit checks in copy_from_user/copy_to_user.
+ * bad_from_user zeroes %edx bytes at %rdi and then falls through to
+ * bad_to_user, which returns the byte count from %edx in %eax
+ * (nothing was copied).
+ */
+ .section .fixup,"ax"
+ /* must zero dest */
+bad_from_user:
+ CFI_STARTPROC
+ movl %edx,%ecx
+ xorl %eax,%eax
+ rep
+ stosb
+bad_to_user:
+ movl %edx,%eax
+ ret
+ CFI_ENDPROC
+END(bad_from_user)
+ .previous
+
+
+/*
+ * copy_user_generic_unrolled - memory copy with exception handling.
+ * This version is for CPUs like P4 that don't have efficient micro code for rep movsq
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ * ecx zero flag -- if true zero destination on error
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ *
+ * The zero flag is pushed on entry and read back from (%rsp) by the fault
+ * handling code at .Lzero_rest. The copy runs in three stages: 64-byte
+ * blocks (.Lloop), remaining quadwords (.Lloop_8) and remaining bytes
+ * (.Lloop_1); every load and store has an __ex_table entry.
+ */
+ENTRY(copy_user_generic_unrolled)
+ CFI_STARTPROC
+ pushq %rbx
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rbx, 0
+ pushq %rcx
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rcx, 0
+ xorl %eax,%eax /*zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+ /* check for bad alignment of destination */
+ movl %edi,%ecx
+ andl $7,%ecx
+ jnz .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+ movq %rdx,%rcx
+
+ movl $64,%ebx
+ shrq $6,%rdx
+ decq %rdx
+ js .Lhandle_tail
+
+ .p2align 4
+.Lloop:
+.Ls1: movq (%rsi),%r11
+.Ls2: movq 1*8(%rsi),%r8
+.Ls3: movq 2*8(%rsi),%r9
+.Ls4: movq 3*8(%rsi),%r10
+.Ld1: movq %r11,(%rdi)
+.Ld2: movq %r8,1*8(%rdi)
+.Ld3: movq %r9,2*8(%rdi)
+.Ld4: movq %r10,3*8(%rdi)
+
+.Ls5: movq 4*8(%rsi),%r11
+.Ls6: movq 5*8(%rsi),%r8
+.Ls7: movq 6*8(%rsi),%r9
+.Ls8: movq 7*8(%rsi),%r10
+.Ld5: movq %r11,4*8(%rdi)
+.Ld6: movq %r8,5*8(%rdi)
+.Ld7: movq %r9,6*8(%rdi)
+.Ld8: movq %r10,7*8(%rdi)
+
+ decq %rdx
+
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+
+ jns .Lloop
+
+ .p2align 4
+.Lhandle_tail:
+ movl %ecx,%edx
+ andl $63,%ecx
+ shrl $3,%ecx
+ jz .Lhandle_7
+ movl $8,%ebx
+ .p2align 4
+.Lloop_8:
+.Ls9: movq (%rsi),%r8
+.Ld9: movq %r8,(%rdi)
+ decl %ecx
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movl %edx,%ecx
+ andl $7,%ecx
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+.Ls10: movb (%rsi),%bl
+.Ld10: movb %bl,(%rdi)
+ incq %rdi
+ incq %rsi
+ decl %ecx
+ jnz .Lloop_1
+
+ CFI_REMEMBER_STATE
+.Lende:
+ popq %rcx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rcx
+ popq %rbx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rbx
+ ret
+ CFI_RESTORE_STATE
+
+#ifdef FIX_ALIGNMENT
+ /* align destination */
+ .p2align 4
+.Lbad_alignment:
+ movl $8,%r9d
+ subl %ecx,%r9d
+ movl %r9d,%ecx
+ cmpq %r9,%rdx
+ jz .Lhandle_7
+ js .Lhandle_7
+.Lalign_1:
+.Ls11: movb (%rsi),%bl
+.Ld11: movb %bl,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .Lalign_1
+ subq %r9,%rdx
+ jmp .Lafter_bad_alignment
+#endif
+
+ /* table sorted by exception address */
+ .section __ex_table,"a"
+ .align 8
+ .quad .Ls1,.Ls1e
+ .quad .Ls2,.Ls2e
+ .quad .Ls3,.Ls3e
+ .quad .Ls4,.Ls4e
+ .quad .Ld1,.Ls1e
+ .quad .Ld2,.Ls2e
+ .quad .Ld3,.Ls3e
+ .quad .Ld4,.Ls4e
+ .quad .Ls5,.Ls5e
+ .quad .Ls6,.Ls6e
+ .quad .Ls7,.Ls7e
+ .quad .Ls8,.Ls8e
+ .quad .Ld5,.Ls5e
+ .quad .Ld6,.Ls6e
+ .quad .Ld7,.Ls7e
+ .quad .Ld8,.Ls8e
+ .quad .Ls9,.Le_quad
+ .quad .Ld9,.Le_quad
+ .quad .Ls10,.Le_byte
+ .quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+ .quad .Ls11,.Lzero_rest
+ .quad .Ld11,.Lzero_rest
+#endif
+ .quad .Le5,.Le_zero
+ .previous
+
+ /* compute 64-offset for main loop. 8 bytes accuracy with error on the
+ pessimistic side. this is gross. it would be better to fix the
+ interface. */
+ /* eax: zero, ebx: 64 */
+ /* The labels below fall through into each other, so a fault at
+ quadword k of the block adds 8 once for each of quadwords k..8. */
+.Ls1e: addl $8,%eax
+.Ls2e: addl $8,%eax
+.Ls3e: addl $8,%eax
+.Ls4e: addl $8,%eax
+.Ls5e: addl $8,%eax
+.Ls6e: addl $8,%eax
+.Ls7e: addl $8,%eax
+.Ls8e: addl $8,%eax
+ addq %rbx,%rdi /* +64 */
+ subq %rax,%rdi /* correct destination with computed offset */
+
+ shlq $6,%rdx /* loop counter * 64 (stride length) */
+ addq %rax,%rdx /* add offset to loopcnt */
+ andl $63,%ecx /* remaining bytes */
+ addq %rcx,%rdx /* add them */
+ jmp .Lzero_rest
+
+ /* exception on quad word loop in tail handling */
+ /* ecx: loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+ shll $3,%ecx
+ andl $7,%edx
+ addl %ecx,%edx
+ /* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+ cmpl $0,(%rsp) /* zero flag pushed on entry */
+ jz .Le_zero
+ movq %rdx,%rcx
+.Le_byte:
+ xorl %eax,%eax
+.Le5: rep
+ stosb
+ /* when there is another exception while zeroing the rest just return */
+.Le_zero:
+ movq %rdx,%rax
+ jmp .Lende
+ CFI_ENDPROC
+ENDPROC(copy_user_generic_unrolled)
+
+
+ /* Some CPUs run faster using the string copy instructions.
+ This is also a lot simpler. Use them when possible.
+ Patch in jmps to this code instead of copying it fully
+ to avoid unwanted aliasing in the exception tables. */
+
+ /* rdi destination
+ * rsi source
+ * rdx count
+ * ecx zero flag
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ *
+ * Only 4GB of copy is supported. This shouldn't be a problem
+ * because the kernel normally only writes from/to page sized chunks
+ * even if user space passed a longer buffer.
+ * And more would be dangerous because both Intel and AMD have
+ * errata with rep movsq > 4GB. If someone feels the need to fix
+ * this please consider this.
+ */
+ENTRY(copy_user_generic_string)
+ CFI_STARTPROC
+ movl %ecx,%r8d /* save zero flag */
+ movl %edx,%ecx
+ shrl $3,%ecx /* number of quadwords */
+ andl $7,%edx /* trailing bytes */
+ jz 10f
+1: rep
+ movsq
+ movl %edx,%ecx
+2: rep
+ movsb
+9: movl %ecx,%eax
+ ret
+
+ /* multiple of 8 byte */
+10: rep
+ movsq
+ xor %eax,%eax
+ ret
+
+ /* exception handling */
+3: lea (%rdx,%rcx,8),%rax /* exception on quad loop */
+ jmp 6f
+5: movl %ecx,%eax /* exception on byte loop */
+ /* eax: left over bytes */
+6: testl %r8d,%r8d /* zero flag set? */
+ jz 7f
+ movl %eax,%ecx /* initialize x86 loop counter */
+ push %rax
+ xorl %eax,%eax
+8: rep
+ stosb /* zero the rest */
+11: pop %rax /* also the landing point for a fault at 8 */
+7: ret
+ CFI_ENDPROC
+END(copy_user_generic_string)
+
+ /* faulting instruction, fixup address */
+ .section __ex_table,"a"
+ .quad 1b,3b
+ .quad 2b,5b
+ .quad 8b,11b
+ .quad 10b,3b
+ .previous
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
new file mode 100644
index 000000000000..4620efb12f13
--- /dev/null
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -0,0 +1,217 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v2.
+ *
+ * Functions to copy from and to user space.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+#define FIX_ALIGNMENT 1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/cpufeature.h>
+
+/*
+ * copy_user_nocache - Uncached memory copy with exception handling
+ * This will force destination/source out of cache for more performance.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ * rcx zero flag when 1 zero on exception
+ *
+ * Output:
+ * eax uncopied bytes or 0 if successful.
+ */
+ENTRY(__copy_user_nocache)
+ CFI_STARTPROC
+ pushq %rbx
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rbx, 0
+ pushq %rcx /* save zero flag */
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rcx, 0
+
+ xorl %eax,%eax /* zero for the exception handler */
+
+#ifdef FIX_ALIGNMENT
+ /* check for bad alignment of destination */
+ movl %edi,%ecx
+ andl $7,%ecx
+ jnz .Lbad_alignment
+.Lafter_bad_alignment:
+#endif
+
+ movq %rdx,%rcx /* rcx: byte count, consumed by the tail handling */
+
+ movl $64,%ebx /* ebx: main loop stride, added back by the .Ls*e fixups */
+ shrq $6,%rdx /* rdx: number of whole 64-byte blocks */
+ decq %rdx
+ js .Lhandle_tail /* no whole block to copy */
+
+ .p2align 4
+.Lloop:
+.Ls1: movq (%rsi),%r11
+.Ls2: movq 1*8(%rsi),%r8
+.Ls3: movq 2*8(%rsi),%r9
+.Ls4: movq 3*8(%rsi),%r10
+.Ld1: movnti %r11,(%rdi)
+.Ld2: movnti %r8,1*8(%rdi)
+.Ld3: movnti %r9,2*8(%rdi)
+.Ld4: movnti %r10,3*8(%rdi)
+
+.Ls5: movq 4*8(%rsi),%r11
+.Ls6: movq 5*8(%rsi),%r8
+.Ls7: movq 6*8(%rsi),%r9
+.Ls8: movq 7*8(%rsi),%r10
+.Ld5: movnti %r11,4*8(%rdi)
+.Ld6: movnti %r8,5*8(%rdi)
+.Ld7: movnti %r9,6*8(%rdi)
+.Ld8: movnti %r10,7*8(%rdi)
+
+ dec %rdx
+
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+
+ jns .Lloop /* lea leaves the flags alone: this tests the dec above */
+
+ .p2align 4
+.Lhandle_tail:
+ movl %ecx,%edx /* edx: byte count, kept for .Lhandle_7 */
+ andl $63,%ecx
+ shrl $3,%ecx /* ecx: whole quadwords left after the 64-byte blocks */
+ jz .Lhandle_7
+ movl $8,%ebx
+ .p2align 4
+.Lloop_8:
+.Ls9: movq (%rsi),%r8
+.Ld9: movnti %r8,(%rdi)
+ decl %ecx
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jnz .Lloop_8 /* flags still come from the decl above */
+
+.Lhandle_7:
+ movl %edx,%ecx
+ andl $7,%ecx /* ecx: trailing bytes */
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+.Ls10: movb (%rsi),%bl
+.Ld10: movb %bl,(%rdi)
+ incq %rdi
+ incq %rsi
+ decl %ecx
+ jnz .Lloop_1
+
+ CFI_REMEMBER_STATE
+.Lende:
+ popq %rcx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE %rcx
+ popq %rbx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rbx
+ ret
+ CFI_RESTORE_STATE
+
+#ifdef FIX_ALIGNMENT
+ /* align destination */
+ .p2align 4
+.Lbad_alignment:
+ movl $8,%r9d
+ subl %ecx,%r9d /* r9d: bytes needed to reach 8-byte alignment */
+ movl %r9d,%ecx
+ cmpq %r9,%rdx
+ jz .Lhandle_7 /* count does not exceed the alignment bytes: */
+ js .Lhandle_7 /* leave everything to the byte loop */
+.Lalign_1:
+.Ls11: movb (%rsi),%bl
+.Ld11: movb %bl,(%rdi)
+ incq %rsi
+ incq %rdi
+ decl %ecx
+ jnz .Lalign_1
+ subq %r9,%rdx /* count -= bytes copied for alignment */
+ jmp .Lafter_bad_alignment
+#endif
+
+ /* table sorted by exception address */
+ .section __ex_table,"a"
+ .align 8
+ .quad .Ls1,.Ls1e
+ .quad .Ls2,.Ls2e
+ .quad .Ls3,.Ls3e
+ .quad .Ls4,.Ls4e
+ .quad .Ld1,.Ls1e
+ .quad .Ld2,.Ls2e
+ .quad .Ld3,.Ls3e
+ .quad .Ld4,.Ls4e
+ .quad .Ls5,.Ls5e
+ .quad .Ls6,.Ls6e
+ .quad .Ls7,.Ls7e
+ .quad .Ls8,.Ls8e
+ .quad .Ld5,.Ls5e
+ .quad .Ld6,.Ls6e
+ .quad .Ld7,.Ls7e
+ .quad .Ld8,.Ls8e
+ .quad .Ls9,.Le_quad
+ .quad .Ld9,.Le_quad
+ .quad .Ls10,.Le_byte
+ .quad .Ld10,.Le_byte
+#ifdef FIX_ALIGNMENT
+ .quad .Ls11,.Lzero_rest /* NOTE(review): rdx still holds the full count here while rdi may have advanced - confirm the zeroed range */
+ .quad .Ld11,.Lzero_rest
+#endif
+ .quad .Le5,.Le_zero
+ .previous
+
+ /* compute 64-offset for main loop. 8 bytes accuracy with error on the
+ pessimistic side. this is gross. it would be better to fix the
+ interface. */
+ /* eax: zero, ebx: 64 */
+.Ls1e: addl $8,%eax /* each entry point falls through the ones below it */
+.Ls2e: addl $8,%eax
+.Ls3e: addl $8,%eax
+.Ls4e: addl $8,%eax
+.Ls5e: addl $8,%eax
+.Ls6e: addl $8,%eax
+.Ls7e: addl $8,%eax
+.Ls8e: addl $8,%eax
+ addq %rbx,%rdi /* +64 */
+ subq %rax,%rdi /* correct destination with computed offset */
+
+ shlq $6,%rdx /* loop counter * 64 (stride length) */
+ addq %rax,%rdx /* add offset to loopcnt */
+ andl $63,%ecx /* remaining bytes */
+ addq %rcx,%rdx /* add them */
+ jmp .Lzero_rest
+
+ /* exception on quad word loop in tail handling */
+ /* ecx: loopcnt/8, %edx: length, rdi: correct */
+.Le_quad:
+ shll $3,%ecx
+ andl $7,%edx
+ addl %ecx,%edx
+ /* edx: bytes to zero, rdi: dest, eax:zero */
+.Lzero_rest:
+ cmpl $0,(%rsp) /* zero flag set? */
+ jz .Le_zero
+ movq %rdx,%rcx
+.Le_byte: /* NOTE(review): the .Ls10/.Ld10 fixups enter here, i.e. below the zero flag test, with ecx = bytes left in the byte loop */
+ xorl %eax,%eax
+.Le5: rep
+ stosb
+ /* when there is another exception while zeroing the rest just return */
+.Le_zero:
+ movq %rdx,%rax /* return rdx as the uncopied byte count */
+ jmp .Lende
+ CFI_ENDPROC
+ENDPROC(__copy_user_nocache)
+
+
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
new file mode 100644
index 000000000000..f0dba36578ea
--- /dev/null
+++ b/arch/x86/lib/csum-copy_64.S
@@ -0,0 +1,249 @@
+/*
+ * Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of this archive
+ * for more details. No warranty for anything given at all.
+ */
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/errno.h>
+
+/*
+ * Checksum copy with exception handling.
+ * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
+ * destination is zeroed.
+ *
+ * Input
+ * rdi source
+ * rsi destination
+ * edx len (32bit)
+ * ecx sum (32bit)
+ * r8 src_err_ptr (int)
+ * r9 dst_err_ptr (int)
+ *
+ * Output
+ * eax 32bit sum (64bit accumulator folded at .Lfold). undefined in case of exception.
+ *
+ * Wrappers need to take care of valid exception sum and zeroing.
+ * They also should align source or destination to 8 bytes.
+ */
+
+ .macro source /* tag the next instruction: a fault there resumes at .Lbad_source */
+10:
+ .section __ex_table,"a"
+ .align 8
+ .quad 10b,.Lbad_source
+ .previous
+ .endm
+
+ .macro dest /* tag the next instruction: a fault there resumes at .Lbad_dest */
+20:
+ .section __ex_table,"a"
+ .align 8
+ .quad 20b,.Lbad_dest
+ .previous
+ .endm
+
+ .macro ignore L=.Lignore /* tag the next instruction: a fault there resumes at label L */
+30:
+ .section __ex_table,"a"
+ .align 8
+ .quad 30b,\L
+ .previous
+ .endm
+
+
+ENTRY(csum_partial_copy_generic)
+ CFI_STARTPROC
+ cmpl $3*64,%edx
+ jle .Lignore /* both outcomes continue at .Lignore just below */
+
+.Lignore:
+ subq $7*8,%rsp
+ CFI_ADJUST_CFA_OFFSET 7*8
+ movq %rbx,2*8(%rsp)
+ CFI_REL_OFFSET rbx, 2*8
+ movq %r12,3*8(%rsp)
+ CFI_REL_OFFSET r12, 3*8
+ movq %r14,4*8(%rsp)
+ CFI_REL_OFFSET r14, 4*8
+ movq %r13,5*8(%rsp)
+ CFI_REL_OFFSET r13, 5*8
+ movq %rbp,6*8(%rsp)
+ CFI_REL_OFFSET rbp, 6*8
+
+ movq %r8,(%rsp) /* src_err_ptr, read back by .Lbad_source */
+ movq %r9,1*8(%rsp) /* dst_err_ptr, read back by .Lbad_dest */
+
+ movl %ecx,%eax /* eax: running sum, seeded with the caller's sum */
+ movl %edx,%ecx /* ecx: length */
+
+ xorl %r9d,%r9d /* r9: constant zero for adding in carries */
+ movq %rcx,%r12
+
+ shrq $6,%r12 /* r12: number of 64-byte blocks */
+ jz .Lhandle_tail /* < 64 */
+
+ clc
+
+ /* main loop. copy and checksum in 64 byte blocks */
+ /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
+ /* r11: temp3, rdx: temp4, r12 loopcnt */
+ /* r10: temp5, rbp: temp6, r14 temp7, r13 temp8 */
+ .p2align 4
+.Lloop:
+ source
+ movq (%rdi),%rbx
+ source
+ movq 8(%rdi),%r8
+ source
+ movq 16(%rdi),%r11
+ source
+ movq 24(%rdi),%rdx
+
+ source
+ movq 32(%rdi),%r10
+ source
+ movq 40(%rdi),%rbp
+ source
+ movq 48(%rdi),%r14
+ source
+ movq 56(%rdi),%r13
+
+ ignore 2f
+ prefetcht0 5*64(%rdi) /* a fault on the prefetch just resumes at 2: */
+2:
+ adcq %rbx,%rax
+ adcq %r8,%rax
+ adcq %r11,%rax
+ adcq %rdx,%rax
+ adcq %r10,%rax
+ adcq %rbp,%rax
+ adcq %r14,%rax
+ adcq %r13,%rax
+
+ decl %r12d /* dec does not touch CF, so the carry chain is kept */
+
+ dest
+ movq %rbx,(%rsi)
+ dest
+ movq %r8,8(%rsi)
+ dest
+ movq %r11,16(%rsi)
+ dest
+ movq %rdx,24(%rsi)
+
+ dest
+ movq %r10,32(%rsi)
+ dest
+ movq %rbp,40(%rsi)
+ dest
+ movq %r14,48(%rsi)
+ dest
+ movq %r13,56(%rsi)
+
+3:
+
+ leaq 64(%rdi),%rdi
+ leaq 64(%rsi),%rsi
+
+ jnz .Lloop /* ZF still comes from the decl above */
+
+ adcq %r9,%rax /* add in the last carry of the loop */
+
+ /* do last up to 56 bytes */
+.Lhandle_tail:
+ /* ecx: count */
+ movl %ecx,%r10d /* r10d: length, kept for the sub-quadword tails */
+ andl $63,%ecx
+ shrl $3,%ecx
+ jz .Lfold
+ clc
+ .p2align 4
+.Lloop_8:
+ source
+ movq (%rdi),%rbx
+ adcq %rbx,%rax
+ decl %ecx
+ dest
+ movq %rbx,(%rsi)
+ leaq 8(%rsi),%rsi /* preserve carry */
+ leaq 8(%rdi),%rdi
+ jnz .Lloop_8
+ adcq %r9,%rax /* add in carry */
+
+.Lfold:
+ /* reduce checksum to 32bits */
+ movl %eax,%ebx
+ shrq $32,%rax
+ addl %ebx,%eax
+ adcl %r9d,%eax
+
+ /* do last up to 6 bytes */
+.Lhandle_7:
+ movl %r10d,%ecx
+ andl $7,%ecx
+ shrl $1,%ecx /* ecx: number of 16-bit words left */
+ jz .Lhandle_1
+ movl $2,%edx
+ xorl %ebx,%ebx
+ clc
+ .p2align 4
+.Lloop_1:
+ source
+ movw (%rdi),%bx
+ adcl %ebx,%eax
+ decl %ecx
+ dest
+ movw %bx,(%rsi)
+ leaq 2(%rdi),%rdi
+ leaq 2(%rsi),%rsi
+ jnz .Lloop_1
+ adcl %r9d,%eax /* add in carry */
+
+ /* handle last odd byte */
+.Lhandle_1:
+ testl $1,%r10d
+ jz .Lende
+ xorl %ebx,%ebx
+ source
+ movb (%rdi),%bl
+ dest
+ movb %bl,(%rsi)
+ addl %ebx,%eax
+ adcl %r9d,%eax /* carry */
+
+ CFI_REMEMBER_STATE
+.Lende:
+ movq 2*8(%rsp),%rbx
+ CFI_RESTORE rbx
+ movq 3*8(%rsp),%r12
+ CFI_RESTORE r12
+ movq 4*8(%rsp),%r14
+ CFI_RESTORE r14
+ movq 5*8(%rsp),%r13
+ CFI_RESTORE r13
+ movq 6*8(%rsp),%rbp
+ CFI_RESTORE rbp
+ addq $7*8,%rsp
+ CFI_ADJUST_CFA_OFFSET -7*8
+ ret
+ CFI_RESTORE_STATE
+
+ /* Exception handlers. Very simple, zeroing is done in the wrappers */
+.Lbad_source:
+ movq (%rsp),%rax /* saved src_err_ptr */
+ testq %rax,%rax
+ jz .Lende /* NULL pointer: nothing to report into */
+ movl $-EFAULT,(%rax)
+ jmp .Lende
+
+.Lbad_dest:
+ movq 8(%rsp),%rax /* saved dst_err_ptr */
+ testq %rax,%rax
+ jz .Lende /* NULL pointer: nothing to report into */
+ movl $-EFAULT,(%rax)
+ jmp .Lende
+ CFI_ENDPROC
+ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
new file mode 100644
index 000000000000..bc503f506903
--- /dev/null
+++ b/arch/x86/lib/csum-partial_64.c
@@ -0,0 +1,150 @@
+/*
+ * arch/x86/lib/csum-partial_64.c
+ *
+ * This file contains network checksum routines that are better done
+ * in an architecture-specific manner due to speed.
+ */
+
+#include <linux/compiler.h>
+#include <linux/module.h>
+#include <asm/checksum.h>
+
+static inline unsigned short from32to16(unsigned a) /* fold a 32-bit sum into 16 bits */
+{
+ unsigned short b = a >> 16; /* start from the upper 16 bits */
+ asm("addw %w2,%w0\n\t" /* b += low 16 bits of a */
+ "adcw $0,%w0\n" /* add the carry of that addition back in */
+ : "=r" (b)
+ : "0" (b), "r" (a));
+ return b;
+}
+
+/*
+ * Do a 64-bit checksum on an arbitrary memory area.
+ * Returns a 32bit checksum.
+ *
+ * This isn't as time critical as it used to be because many NICs
+ * do hardware checksumming these days.
+ *
+ * Things tried and found to not make it faster:
+ * Manual Prefetching
+ * Unrolling to an 128 bytes inner loop.
+ * Using interleaving with more registers to break the carry chains.
+ */
+static unsigned do_csum(const unsigned char *buff, unsigned len)
+{
+ unsigned odd, count;
+ unsigned long result = 0;
+
+ if (unlikely(len == 0))
+ return result;
+ odd = 1 & (unsigned long) buff; /* buffer starts on an odd address */
+ if (unlikely(odd)) {
+ result = *buff << 8; /* leading byte goes in shifted; bytes are swapped back at the end */
+ len--;
+ buff++;
+ }
+ count = len >> 1; /* nr of 16-bit words.. */
+ if (count) {
+ if (2 & (unsigned long) buff) { /* consume one 16-bit word to reach 4-byte alignment */
+ result += *(unsigned short *)buff;
+ count--;
+ len -= 2;
+ buff += 2;
+ }
+ count >>= 1; /* nr of 32-bit words.. */
+ if (count) {
+ unsigned long zero;
+ unsigned count64;
+ if (4 & (unsigned long) buff) { /* consume one 32-bit word to reach 8-byte alignment */
+ result += *(unsigned int *) buff;
+ count--;
+ len -= 4;
+ buff += 4;
+ }
+ count >>= 1; /* nr of 64-bit words.. */
+
+ /* main loop using 64byte blocks */
+ zero = 0;
+ count64 = count >> 3;
+ while (count64) {
+ asm("addq 0*8(%[src]),%[res]\n\t"
+ "adcq 1*8(%[src]),%[res]\n\t"
+ "adcq 2*8(%[src]),%[res]\n\t"
+ "adcq 3*8(%[src]),%[res]\n\t"
+ "adcq 4*8(%[src]),%[res]\n\t"
+ "adcq 5*8(%[src]),%[res]\n\t"
+ "adcq 6*8(%[src]),%[res]\n\t"
+ "adcq 7*8(%[src]),%[res]\n\t"
+ "adcq %[zero],%[res]" /* add in the carry of the last adcq */
+ : [res] "=r" (result)
+ : [src] "r" (buff), [zero] "r" (zero),
+ "[res]" (result));
+ buff += 64;
+ count64--;
+ }
+
+ /* last up to 7 8-byte blocks */
+ count %= 8;
+ while (count) {
+ asm("addq %1,%0\n\t"
+ "adcq %2,%0\n" /* %2 is zero: this only adds the carry */
+ : "=r" (result)
+ : "m" (*(unsigned long *)buff),
+ "r" (zero), "0" (result));
+ --count;
+ buff += 8;
+ }
+ result = add32_with_carry(result>>32,
+ result&0xffffffff); /* combine the two 32-bit halves */
+
+ if (len & 4) { /* one trailing 32-bit word */
+ result += *(unsigned int *) buff;
+ buff += 4;
+ }
+ }
+ if (len & 2) { /* one trailing 16-bit word */
+ result += *(unsigned short *) buff;
+ buff += 2;
+ }
+ }
+ if (len & 1) /* one trailing byte */
+ result += *buff;
+ result = add32_with_carry(result>>32, result & 0xffffffff);
+ if (unlikely(odd)) {
+ result = from32to16(result);
+ result = ((result >> 8) & 0xff) | ((result & 0xff) << 8); /* swap the two bytes back for the odd start */
+ }
+ return result;
+}
+
+/*
+ * computes the checksum of a memory block at buff, length len,
+ * and adds in "sum" (32-bit)
+ *
+ * returns a 32-bit number suitable for feeding into itself
+ * or csum_tcpudp_magic
+ *
+ * this function must be called with even lengths, except
+ * for the last fragment, which may be odd
+ *
+ * it's best to have buff aligned on a 64-bit boundary
+ */
+__wsum csum_partial(const void *buff, int len, __wsum sum)
+{
+	unsigned partial = do_csum(buff, len);	/* sum over buff[0..len) alone */
+	return (__force __wsum)add32_with_carry(partial, (__force u32)sum);
+}
+
+EXPORT_SYMBOL(csum_partial);
+
+/*
+ * this routine is used for miscellaneous IP-like checksums, mainly
+ * in icmp.c
+ */
+__sum16 ip_compute_csum(const void *buff, int len)
+{
+ return csum_fold(csum_partial(buff,len,0)); /* checksum with initial sum 0, then csum_fold() the result */
+}
+EXPORT_SYMBOL(ip_compute_csum);
+
diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c
new file mode 100644
index 000000000000..fd42a4a095fc
--- /dev/null
+++ b/arch/x86/lib/csum-wrappers_64.c
@@ -0,0 +1,135 @@
+/* Copyright 2002,2003 Andi Kleen, SuSE Labs.
+ * Subject to the GNU Public License v.2
+ *
+ * Wrappers of assembly checksum functions for x86-64.
+ */
+
+#include <asm/checksum.h>
+#include <linux/module.h>
+
+/**
+ * csum_partial_copy_from_user - Copy and checksum from user space.
+ * @src: source address (user space)
+ * @dst: destination address
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * @errp: set to -EFAULT for a bad source address.
+ *
+ * Returns a 32bit unfolded checksum of the buffer.
+ * src and dst are best aligned to 64bits.
+ */
+__wsum
+csum_partial_copy_from_user(const void __user *src, void *dst,
+ int len, __wsum isum, int *errp)
+{
+ might_sleep();
+ *errp = 0;
+ if (likely(access_ok(VERIFY_READ,src, len))) {
+ /* Why 6, not 7? To handle odd addresses aligned we
+ would need to do considerable complications to fix the
+ checksum which is defined as a 16bit accumulator. The
+ fix alignment code is primarily for performance
+ compatibility with 32bit and that will handle odd
+ addresses slowly too. */
+ if (unlikely((unsigned long)src & 6)) {
+ while (((unsigned long)src & 6) && len >= 2) { /* copy 16-bit words until bits 1-2 of src are clear */
+ __u16 val16;
+ *errp = __get_user(val16, (const __u16 __user *)src);
+ if (*errp)
+ return isum; /* returns without the memset() done at the end of the function */
+ *(__u16 *)dst = val16;
+ isum = (__force __wsum)add32_with_carry(
+ (__force unsigned)isum, val16);
+ src += 2;
+ dst += 2;
+ len -= 2;
+ }
+ }
+ isum = csum_partial_copy_generic((__force const void *)src,
+ dst, len, isum, errp, NULL);
+ if (likely(*errp == 0))
+ return isum;
+ }
+ *errp = -EFAULT; /* reached when access_ok() fails or the copy reported an error */
+ memset(dst,0,len); /* dst/len may already be advanced by the alignment loop */
+ return isum;
+}
+
+EXPORT_SYMBOL(csum_partial_copy_from_user);
+
+/**
+ * csum_partial_copy_to_user - Copy and checksum to user space.
+ * @src: source address
+ * @dst: destination address (user space)
+ * @len: number of bytes to be copied.
+ * @isum: initial sum that is added into the result (32bit unfolded)
+ * @errp: set to -EFAULT for a bad destination address.
+ *
+ * Returns a 32bit unfolded checksum of the buffer.
+ * src and dst are best aligned to 64bits.
+ */
+__wsum
+csum_partial_copy_to_user(const void *src, void __user *dst,
+ int len, __wsum isum, int *errp)
+{
+ might_sleep();
+ if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) {
+ *errp = -EFAULT;
+ return 0; /* failed range check: the sum returned is 0, not isum */
+ }
+
+ if (unlikely((unsigned long)dst & 6)) {
+ while (((unsigned long)dst & 6) && len >= 2) { /* copy 16-bit words until bits 1-2 of dst are clear */
+ __u16 val16 = *(__u16 *)src;
+ isum = (__force __wsum)add32_with_carry(
+ (__force unsigned)isum, val16);
+ *errp = __put_user(val16, (__u16 __user *)dst);
+ if (*errp)
+ return isum;
+ src += 2;
+ dst += 2;
+ len -= 2;
+ }
+ }
+
+ *errp = 0;
+ return csum_partial_copy_generic(src, (void __force *)dst,len,isum,NULL,errp); /* only the destination error pointer is passed */
+}
+
+EXPORT_SYMBOL(csum_partial_copy_to_user);
+
+/**
+ * csum_partial_copy_nocheck - Copy and checksum.
+ * @src: source address
+ * @dst: destination address
+ * @len: number of bytes to be copied.
+ * @sum: initial sum that is added into the result (32bit unfolded)
+ *
+ * Returns a 32bit unfolded checksum of the buffer.
+ */
+__wsum
+csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum)
+{
+ return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL); /* no error pointers are passed */
+}
+EXPORT_SYMBOL(csum_partial_copy_nocheck);
+
+__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
+ const struct in6_addr *daddr,
+ __u32 len, unsigned short proto, __wsum sum)
+{
+ __u64 rest, sum64;
+
+ rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) +
+ (__force __u64)sum; /* length, protocol and caller's sum added in 64 bits */
+ asm(" addq (%[saddr]),%[sum]\n" /* add both addresses as two quadwords each */
+ " adcq 8(%[saddr]),%[sum]\n"
+ " adcq (%[daddr]),%[sum]\n"
+ " adcq 8(%[daddr]),%[sum]\n"
+ " adcq $0,%[sum]\n" /* add in the final carry */
+ : [sum] "=r" (sum64)
+ : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr));
+ return csum_fold((__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32)); /* 64 -> 32 bits, then csum_fold() */
+}
+
+EXPORT_SYMBOL(csum_ipv6_magic);
diff --git a/arch/x86/lib/delay_32.c b/arch/x86/lib/delay_32.c
new file mode 100644
index 000000000000..f6edb11364df
--- /dev/null
+++ b/arch/x86/lib/delay_32.c
@@ -0,0 +1,103 @@
+/*
+ * Precise Delay Loops for i386
+ *
+ * Copyright (C) 1993 Linus Torvalds
+ * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * The __delay function must _NOT_ be inlined as its execution time
+ * depends wildly on alignment on many x86 processors. The additional
+ * jump magic is needed to get the timing stable on all the CPU's
+ * we have to worry about.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+
+#include <asm/processor.h>
+#include <asm/delay.h>
+#include <asm/timer.h>
+
+#ifdef CONFIG_SMP
+# include <asm/smp.h>
+#endif
+
+/* simple loop based delay: count @loops down in a register */
+static void delay_loop(unsigned long loops)
+{
+ int d0; /* dummy output: eax is consumed by the countdown */
+
+ __asm__ __volatile__(
+ "\tjmp 1f\n"
+ ".align 16\n"
+ "1:\tjmp 2f\n"
+ ".align 16\n"
+ "2:\tdecl %0\n\tjns 2b" /* decrement until the value goes negative */
+ :"=&a" (d0)
+ :"0" (loops));
+}
+
+/* TSC based delay: spin until the value read by rdtscl() has advanced by @loops */
+static void delay_tsc(unsigned long loops)
+{
+ unsigned long bclock, now;
+
+ rdtscl(bclock); /* starting value */
+ do {
+ rep_nop();
+ rdtscl(now);
+ } while ((now-bclock) < loops); /* unsigned difference, so a wrap of the counter is harmless */
+}
+
+/*
+ * Since we calibrate only once at boot, this
+ * function pointer should be set once at boot and not changed
+ */
+static void (*delay_fn)(unsigned long) = delay_loop; /* delay_loop until use_tsc_delay() is called */
+
+void use_tsc_delay(void)
+{
+ delay_fn = delay_tsc; /* from now on __delay() runs delay_tsc() */
+}
+
+int read_current_timer(unsigned long *timer_val)
+{
+	/* a timer value is only provided while the TSC delay is in use */
+	if (delay_fn != delay_tsc)
+		return -1;
+	rdtscl(*timer_val);
+	return 0;
+}
+
+void __delay(unsigned long loops)
+{
+ delay_fn(loops); /* delay_loop or delay_tsc, whichever is installed */
+}
+
+inline void __const_udelay(unsigned long xloops)
+{
+ int d0; /* dummy output for eax */
+
+ xloops *= 4; /* compensated by the HZ/4 factor below */
+ __asm__("mull %0" /* edx:eax = eax * edx; the high half (edx) is kept */
+ :"=d" (xloops), "=&a" (d0)
+ :"1" (xloops), "0"
+ (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4)));
+
+ __delay(++xloops); /* one extra loop on top of the truncated product */
+}
+
+void __udelay(unsigned long usecs)
+{
+ __const_udelay(usecs * 0x000010c7); /* 0x10c7 = 2**32 / 1000000 (rounded up) */
+}
+
+void __ndelay(unsigned long nsecs)
+{
+ __const_udelay(nsecs * 0x00005); /* 5 = 2**32 / 1000000000 (rounded up) */
+}
+
+EXPORT_SYMBOL(__delay);
+EXPORT_SYMBOL(__const_udelay);
+EXPORT_SYMBOL(__udelay);
+EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/lib/delay_64.c b/arch/x86/lib/delay_64.c
new file mode 100644
index 000000000000..2dbebd308347
--- /dev/null
+++ b/arch/x86/lib/delay_64.c
@@ -0,0 +1,57 @@
+/*
+ * Precise Delay Loops for x86-64
+ *
+ * Copyright (C) 1993 Linus Torvalds
+ * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * The __delay function must _NOT_ be inlined as its execution time
+ * depends wildly on alignment on many x86 processors.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <asm/delay.h>
+#include <asm/msr.h>
+
+#ifdef CONFIG_SMP
+#include <asm/smp.h>
+#endif
+
+int read_current_timer(unsigned long *timer_value)
+{
+ rdtscll(*timer_value); /* store the value read by rdtscll() */
+ return 0; /* this version always reports success */
+}
+
+/* Busy-wait until the TSC has advanced by at least @loops. */
+void __delay(unsigned long loops)
+{
+	/* 64-bit reads: a 32-bit difference could never reach loops >= 2^32 */
+	unsigned long bclock, now;
+
+	rdtscll(bclock);
+	do {
+		rep_nop();
+		rdtscll(now);
+	} while ((now - bclock) < loops);
+}
+EXPORT_SYMBOL(__delay);
+
+inline void __const_udelay(unsigned long xloops)
+{
+ __delay(((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) + 1); /* drop the 2**32 scale of xloops, plus one extra loop */
+}
+EXPORT_SYMBOL(__const_udelay);
+
+void __udelay(unsigned long usecs)
+{
+ __const_udelay(usecs * 0x000010c7); /* 0x10c7 = 2**32 / 1000000 (rounded up) */
+}
+EXPORT_SYMBOL(__udelay);
+
+void __ndelay(unsigned long nsecs)
+{
+ __const_udelay(nsecs * 0x00005); /* 5 = 2**32 / 1000000000 (rounded up) */
+}
+EXPORT_SYMBOL(__ndelay);
diff --git a/arch/x86/lib/getuser_32.S b/arch/x86/lib/getuser_32.S
new file mode 100644
index 000000000000..6d84b53f12a2
--- /dev/null
+++ b/arch/x86/lib/getuser_32.S
@@ -0,0 +1,78 @@
+/*
+ * __get_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/thread_info.h>
+
+
+/*
+ * __get_user_X
+ *
+ * Inputs: %eax contains the address
+ *
+ * Outputs: %eax is error code (0 or -EFAULT)
+ * %edx contains zero-extended value
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+.text
+ENTRY(__get_user_1)
+ CFI_STARTPROC
+ GET_THREAD_INFO(%edx)
+ cmpl TI_addr_limit(%edx),%eax
+ jae bad_get_user /* address at or above the limit: fail */
+1: movzbl (%eax),%edx /* a fault here is fixed up to bad_get_user */
+ xorl %eax,%eax /* error code 0 */
+ ret
+ CFI_ENDPROC
+ENDPROC(__get_user_1)
+
+ENTRY(__get_user_2)
+ CFI_STARTPROC
+ addl $1,%eax /* eax: address of the last byte read */
+ jc bad_get_user /* the addition wrapped around */
+ GET_THREAD_INFO(%edx)
+ cmpl TI_addr_limit(%edx),%eax
+ jae bad_get_user
+2: movzwl -1(%eax),%edx /* -1 undoes the adjustment above */
+ xorl %eax,%eax /* error code 0 */
+ ret
+ CFI_ENDPROC
+ENDPROC(__get_user_2)
+
+ENTRY(__get_user_4)
+ CFI_STARTPROC
+ addl $3,%eax /* eax: address of the last byte read */
+ jc bad_get_user /* the addition wrapped around */
+ GET_THREAD_INFO(%edx)
+ cmpl TI_addr_limit(%edx),%eax
+ jae bad_get_user
+3: movl -3(%eax),%edx /* -3 undoes the adjustment above */
+ xorl %eax,%eax /* error code 0 */
+ ret
+ CFI_ENDPROC
+ENDPROC(__get_user_4)
+
+bad_get_user:
+ CFI_STARTPROC
+ xorl %edx,%edx /* the returned value is zero on failure */
+ movl $-14,%eax /* -14 is -EFAULT */
+ ret
+ CFI_ENDPROC
+END(bad_get_user)
+
+.section __ex_table,"a"
+ .long 1b,bad_get_user /* load in __get_user_1 */
+ .long 2b,bad_get_user /* load in __get_user_2 */
+ .long 3b,bad_get_user /* load in __get_user_4 */
+.previous
diff --git a/arch/x86/lib/getuser_64.S b/arch/x86/lib/getuser_64.S
new file mode 100644
index 000000000000..5448876261f8
--- /dev/null
+++ b/arch/x86/lib/getuser_64.S
@@ -0,0 +1,109 @@
+/*
+ * __get_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ * (C) Copyright 2005 Andi Kleen
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+
+/*
+ * __get_user_X
+ *
+ * Inputs: %rcx contains the address.
+ * The register is modified, but all changes are undone
+ * before returning because the C code doesn't know about it.
+ *
+ * Outputs: %rax is error code (0 or -EFAULT)
+ * %rdx contains zero-extended value
+ *
+ * %r8 is destroyed.
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/page.h>
+#include <asm/errno.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+
+ .text
+ENTRY(__get_user_1)
+ CFI_STARTPROC
+ GET_THREAD_INFO(%r8)
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae bad_get_user /* address at or above the limit: fail */
+1: movzb (%rcx),%edx /* a fault here is fixed up to bad_get_user */
+ xorl %eax,%eax /* error code 0 */
+ ret
+ CFI_ENDPROC
+ENDPROC(__get_user_1)
+
+ENTRY(__get_user_2)
+ CFI_STARTPROC
+ GET_THREAD_INFO(%r8)
+ addq $1,%rcx /* rcx: address of the last byte read */
+ jc 20f /* the addition wrapped around */
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae 20f
+ decq %rcx /* back to the caller's address */
+2: movzwl (%rcx),%edx
+ xorl %eax,%eax /* error code 0 */
+ ret
+20: decq %rcx /* restore rcx before failing */
+ jmp bad_get_user
+ CFI_ENDPROC
+ENDPROC(__get_user_2)
+
+ENTRY(__get_user_4)
+ CFI_STARTPROC
+ GET_THREAD_INFO(%r8)
+ addq $3,%rcx /* rcx: address of the last byte read */
+ jc 30f /* the addition wrapped around */
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae 30f
+ subq $3,%rcx /* back to the caller's address */
+3: movl (%rcx),%edx
+ xorl %eax,%eax /* error code 0 */
+ ret
+30: subq $3,%rcx /* restore rcx before failing */
+ jmp bad_get_user
+ CFI_ENDPROC
+ENDPROC(__get_user_4)
+
+ENTRY(__get_user_8)
+ CFI_STARTPROC
+ GET_THREAD_INFO(%r8)
+ addq $7,%rcx /* rcx: address of the last byte read */
+ jc 40f /* the addition wrapped around */
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae 40f
+ subq $7,%rcx /* back to the caller's address */
+4: movq (%rcx),%rdx
+ xorl %eax,%eax /* error code 0 */
+ ret
+40: subq $7,%rcx /* restore rcx before failing */
+ jmp bad_get_user
+ CFI_ENDPROC
+ENDPROC(__get_user_8)
+
+bad_get_user:
+ CFI_STARTPROC
+ xorl %edx,%edx /* the returned value is zero on failure */
+ movq $(-EFAULT),%rax /* error code */
+ ret
+ CFI_ENDPROC
+END(bad_get_user)
+
+.section __ex_table,"a"
+ .quad 1b,bad_get_user /* load in __get_user_1 */
+ .quad 2b,bad_get_user /* load in __get_user_2 */
+ .quad 3b,bad_get_user /* load in __get_user_4 */
+ .quad 4b,bad_get_user /* load in __get_user_8 */
+.previous
diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c
new file mode 100644
index 000000000000..87b4a4e18039
--- /dev/null
+++ b/arch/x86/lib/io_64.c
@@ -0,0 +1,23 @@
+#include <linux/string.h>
+#include <asm/io.h>
+#include <linux/module.h>
+
+void __memcpy_toio(unsigned long dst,const void*src,unsigned len)
+{
+ __inline_memcpy((void *) dst,src,len); /* dst arrives as an integer address and is cast to a pointer */
+}
+EXPORT_SYMBOL(__memcpy_toio);
+
+void __memcpy_fromio(void *dst,unsigned long src,unsigned len)
+{
+ __inline_memcpy(dst,(const void *) src,len); /* src arrives as an integer address and is cast to a pointer */
+}
+EXPORT_SYMBOL(__memcpy_fromio);
+
+void memset_io(volatile void __iomem *a, int b, size_t c)
+{
+ /* XXX: memset can mangle the IO patterns quite a bit.
+ Perhaps it would be better to use a dumb one. */
+ memset((void *)a,b,c); /* the cast drops the volatile/__iomem qualifiers */
+}
+EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S
new file mode 100644
index 000000000000..05a95e713da8
--- /dev/null
+++ b/arch/x86/lib/iomap_copy_64.S
@@ -0,0 +1,30 @@
+/*
+ * Copyright 2006 PathScale, Inc. All Rights Reserved.
+ *
+ * This file is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * override generic version in lib/iomap_copy.c
+ */
+ENTRY(__iowrite32_copy)
+ CFI_STARTPROC
+ movl %edx,%ecx /* ecx: repeat count, taken from edx */
+ rep movsd /* copy ecx 32-bit units from (%rsi) to (%rdi) */
+ ret
+ CFI_ENDPROC
+ENDPROC(__iowrite32_copy)
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
new file mode 100644
index 000000000000..8ac51b82a632
--- /dev/null
+++ b/arch/x86/lib/memcpy_32.c
@@ -0,0 +1,43 @@
+#include <linux/string.h>
+#include <linux/module.h>
+
+#undef memcpy
+#undef memset
+
+void *memcpy(void *to, const void *from, size_t n)
+{
+#ifdef CONFIG_X86_USE_3DNOW
+ return __memcpy3d(to, from, n); /* variant selected by CONFIG_X86_USE_3DNOW */
+#else
+ return __memcpy(to, from, n); /* default variant */
+#endif
+}
+EXPORT_SYMBOL(memcpy);
+
+void *memset(void *s, int c, size_t count)
+{
+ return __memset(s, c, count); /* out-of-line entry point that forwards to __memset */
+}
+EXPORT_SYMBOL(memset);
+
+void *memmove(void *dest, const void *src, size_t n)
+{
+ int d0, d1, d2; /* dummy outputs for ecx, esi and edi */
+
+ if (dest < src) {
+ memcpy(dest,src,n); /* presumably relies on memcpy copying in ascending order */
+ } else {
+ __asm__ __volatile__(
+ "std\n\t" /* direction flag set: movsb walks downwards */
+ "rep\n\t"
+ "movsb\n\t"
+ "cld" /* clear the direction flag again */
+ : "=&c" (d0), "=&S" (d1), "=&D" (d2)
+ :"0" (n),
+ "1" (n-1+(const char *)src), /* esi: last source byte */
+ "2" (n-1+(char *)dest) /* edi: last destination byte */
+ :"memory");
+ }
+ return dest;
+}
+EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
new file mode 100644
index 000000000000..c22981fa2f3a
--- /dev/null
+++ b/arch/x86/lib/memcpy_64.S
@@ -0,0 +1,131 @@
+/* Copyright 2002 Andi Kleen */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
+
+/*
+ * memcpy - Copy a memory block.
+ *
+ * Input:
+ * rdi destination
+ * rsi source
+ * rdx count
+ *
+ * Output:
+ * rax original destination
+ */
+
+ ALIGN
+memcpy_c: /* string-instruction variant, reached through the alternatives record for memcpy */
+ CFI_STARTPROC
+ movq %rdi,%rax /* return value: original destination */
+ movl %edx,%ecx
+ shrl $3,%ecx /* ecx: whole quadwords */
+ andl $7,%edx /* edx: trailing bytes */
+ rep movsq
+ movl %edx,%ecx
+ rep movsb
+ ret
+ CFI_ENDPROC
+ENDPROC(memcpy_c)
+
+ENTRY(__memcpy)
+ENTRY(memcpy)
+ CFI_STARTPROC
+ pushq %rbx /* NOTE(review): rbx is saved and restored but not otherwise used below */
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET rbx, 0
+ movq %rdi,%rax /* return value: original destination */
+
+ movl %edx,%ecx
+ shrl $6,%ecx /* ecx: number of 64-byte blocks */
+ jz .Lhandle_tail
+
+ .p2align 4
+.Lloop_64:
+ decl %ecx /* sets ZF for the jnz at the end; mov/lea leave flags alone */
+
+ movq (%rsi),%r11
+ movq 8(%rsi),%r8
+
+ movq %r11,(%rdi)
+ movq %r8,1*8(%rdi)
+
+ movq 2*8(%rsi),%r9
+ movq 3*8(%rsi),%r10
+
+ movq %r9,2*8(%rdi)
+ movq %r10,3*8(%rdi)
+
+ movq 4*8(%rsi),%r11
+ movq 5*8(%rsi),%r8
+
+ movq %r11,4*8(%rdi)
+ movq %r8,5*8(%rdi)
+
+ movq 6*8(%rsi),%r9
+ movq 7*8(%rsi),%r10
+
+ movq %r9,6*8(%rdi)
+ movq %r10,7*8(%rdi)
+
+ leaq 64(%rsi),%rsi
+ leaq 64(%rdi),%rdi
+ jnz .Lloop_64
+
+.Lhandle_tail:
+ movl %edx,%ecx
+ andl $63,%ecx
+ shrl $3,%ecx /* ecx: whole quadwords left after the 64-byte blocks */
+ jz .Lhandle_7
+ .p2align 4
+.Lloop_8:
+ decl %ecx
+ movq (%rsi),%r8
+ movq %r8,(%rdi)
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movl %edx,%ecx
+ andl $7,%ecx /* ecx: trailing bytes */
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+ movb (%rsi),%r8b
+ movb %r8b,(%rdi)
+ incq %rdi
+ incq %rsi
+ decl %ecx
+ jnz .Lloop_1
+
+.Lende:
+ popq %rbx
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE rbx
+ ret
+.Lfinal:
+ CFI_ENDPROC
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
+
+ /* Some CPUs run faster using the string copy instructions.
+ It is also a lot simpler. Use this when possible */
+
+ .section .altinstr_replacement,"ax"
+1: .byte 0xeb /* jmp <disp8> */
+ .byte (memcpy_c - memcpy) - (2f - 1b) /* offset to memcpy_c, counted from the end of the 2-byte jmp */
+2:
+ .previous
+ .section .altinstructions,"a"
+ .align 8
+ .quad memcpy /* code to be patched */
+ .quad 1b /* replacement bytes */
+ .byte X86_FEATURE_REP_GOOD /* presumably the CPU feature that enables the replacement */
+ /* Replace only the beginning: memcpy is used to apply alternatives, so it
+ * is silly to overwrite itself with nops - reboot is the only outcome... */
+ .byte 2b - 1b /* presumably the length to overwrite: just the jmp */
+ .byte 2b - 1b /* presumably the replacement length */
+ .previous
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
new file mode 100644
index 000000000000..751ebae8ec42
--- /dev/null
+++ b/arch/x86/lib/memmove_64.c
@@ -0,0 +1,21 @@
+/* Normally compiler builtins are used, but sometimes the compiler calls out
+ of line code. Based on asm-i386/string.h.
+ */
+#define _STRING_C
+#include <linux/string.h>
+#include <linux/module.h>
+
+#undef memmove
+void *memmove(void * dest,const void *src,size_t count)
+{
+	char *dst_end = (char *)dest + count;
+	const char *src_end = (const char *)src + count;
+	/* dest below src: hand off to memcpy (assumed to copy in ascending order) */
+	if (dest < src)
+		return memcpy(dest, src, count);
+	/* otherwise walk backwards from the last byte */
+	for (; count; count--)
+		*--dst_end = *--src_end;
+	return dest;
+}
+EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
new file mode 100644
index 000000000000..2c5948116bd2
--- /dev/null
+++ b/arch/x86/lib/memset_64.S
@@ -0,0 +1,133 @@
+/* Copyright 2002 Andi Kleen, SuSE Labs */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+
+/*
+ * ISO C memset - set a memory block to a byte value.
+ *
+ * rdi destination
+ * rsi value (char)
+ * rdx count (bytes)
+ *
+ * rax original destination
+ */
+ ALIGN
+memset_c: /* string-instruction variant, reached through the alternatives record for memset */
+ CFI_STARTPROC
+ movq %rdi,%r9 /* r9: original destination, returned in rax */
+ movl %edx,%r8d
+ andl $7,%r8d /* r8d: trailing bytes */
+ movl %edx,%ecx
+ shrl $3,%ecx /* ecx: whole quadwords */
+ /* expand byte value */
+ movzbl %sil,%esi
+ movabs $0x0101010101010101,%rax
+ mulq %rsi /* with rax, clobbers rdx */
+ rep stosq
+ movl %r8d,%ecx
+ rep stosb
+ movq %r9,%rax
+ ret
+ CFI_ENDPROC
+ENDPROC(memset_c)
+
+ENTRY(memset)
+ENTRY(__memset)
+ CFI_STARTPROC
+ movq %rdi,%r10 /* r10: original destination, returned in rax */
+ movq %rdx,%r11 /* r11: byte count (rdx is clobbered by the mul below) */
+
+ /* expand byte value */
+ movzbl %sil,%ecx
+ movabs $0x0101010101010101,%rax
+ mul %rcx /* with rax, clobbers rdx */
+
+ /* align dst */
+ movl %edi,%r9d
+ andl $7,%r9d /* r9d: misalignment of the destination */
+ jnz .Lbad_alignment
+ CFI_REMEMBER_STATE
+.Lafter_bad_alignment:
+
+ movl %r11d,%ecx
+ shrl $6,%ecx /* ecx: number of 64-byte blocks */
+ jz .Lhandle_tail
+
+ .p2align 4
+.Lloop_64:
+ decl %ecx /* sets ZF for the jnz below; mov/lea leave flags alone */
+ movq %rax,(%rdi)
+ movq %rax,8(%rdi)
+ movq %rax,16(%rdi)
+ movq %rax,24(%rdi)
+ movq %rax,32(%rdi)
+ movq %rax,40(%rdi)
+ movq %rax,48(%rdi)
+ movq %rax,56(%rdi)
+ leaq 64(%rdi),%rdi
+ jnz .Lloop_64
+
+ /* Handle tail in loops. The loops should be faster than hard
+ to predict jump tables. */
+ .p2align 4
+.Lhandle_tail:
+ movl %r11d,%ecx
+ andl $63&(~7),%ecx /* bytes left that form whole quadwords */
+ jz .Lhandle_7
+ shrl $3,%ecx
+ .p2align 4
+.Lloop_8:
+ decl %ecx
+ movq %rax,(%rdi)
+ leaq 8(%rdi),%rdi
+ jnz .Lloop_8
+
+.Lhandle_7:
+ movl %r11d,%ecx
+ andl $7,%ecx /* ecx: trailing bytes */
+ jz .Lende
+ .p2align 4
+.Lloop_1:
+ decl %ecx
+ movb %al,(%rdi)
+ leaq 1(%rdi),%rdi
+ jnz .Lloop_1
+
+.Lende:
+ movq %r10,%rax
+ ret
+
+ CFI_RESTORE_STATE
+.Lbad_alignment:
+ cmpq $7,%r11
+ jbe .Lhandle_7 /* 7 bytes or fewer: byte loop only */
+ movq %rax,(%rdi) /* unaligned store */
+ movq $8,%r8
+ subq %r9,%r8 /* r8: bytes up to the next 8-byte boundary */
+ addq %r8,%rdi
+ subq %r8,%r11
+ jmp .Lafter_bad_alignment
+.Lfinal:
+ CFI_ENDPROC
+ENDPROC(memset)
+ENDPROC(__memset)
+
+ /* Some CPUs run faster using the string instructions.
+ It is also a lot simpler. Use this when possible */
+
+#include <asm/cpufeature.h>
+
+ .section .altinstr_replacement,"ax"
+1: .byte 0xeb /* jmp <disp8> */
+ .byte (memset_c - memset) - (2f - 1b) /* offset to memset_c, counted from the end of the 2-byte jmp */
+2:
+ .previous
+ .section .altinstructions,"a"
+ .align 8
+ .quad memset /* code to be patched */
+ .quad 1b /* replacement bytes */
+ .byte X86_FEATURE_REP_GOOD /* presumably the CPU feature that enables the replacement */
+ .byte .Lfinal - memset /* presumably the original length: the whole function */
+ .byte 2b - 1b /* presumably the replacement length */
+ .previous
diff --git a/arch/x86/lib/mmx_32.c b/arch/x86/lib/mmx_32.c
new file mode 100644
index 000000000000..28084d2e8dd4
--- /dev/null
+++ b/arch/x86/lib/mmx_32.c
@@ -0,0 +1,403 @@
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+#include <linux/hardirq.h>
+#include <linux/module.h>
+
+#include <asm/i387.h>
+
+
+/*
+ * MMX 3DNow! library helper functions
+ *
+ * To do:
+ * We can use MMX just for prefetch in IRQ's. This may be a win.
+ * (reported so on K6-III)
+ * We should use a better code neutral filler for the short jump
+ * leal ebx. [ebx] is apparently best for K6-2, but Cyrix ??
+ * We also want to clobber the filler register so we don't get any
+ * register forwarding stalls on the filler.
+ *
+ * Add *user handling. Checksums are not a win with MMX on any CPU
+ * tested so far for any MMX solution figured.
+ *
+ * 22/09/2000 - Arjan van de Ven
+ * Improved for non-engineering-sample Athlons
+ *
+ */
+/*
+ * _mmx_memcpy - copy len bytes from 'from' to 'to' through the MMX registers.
+ *
+ * In interrupt context the copy is handed to __memcpy() instead. Otherwise
+ * the bulk is moved in 64-byte blocks between kernel_fpu_begin() and
+ * kernel_fpu_end(), and the remaining len & 63 bytes go through __memcpy().
+ * Returns the original 'to' pointer.
+ */
+void *_mmx_memcpy(void *to, const void *from, size_t len)
+{
+ void *p;
+ int i;
+
+ if (unlikely(in_interrupt()))
+ return __memcpy(to, from, len);
+
+ p = to; /* 'to' is advanced below; keep the start for the return value */
+ i = len >> 6; /* len/64 */
+
+ kernel_fpu_begin();
+
+ __asm__ __volatile__ (
+ "1: prefetch (%0)\n" /* This set is 28 bytes */
+ " prefetch 64(%0)\n"
+ " prefetch 128(%0)\n"
+ " prefetch 192(%0)\n"
+ " prefetch 256(%0)\n"
+ "2: \n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ /* overwrites label 1 with bytes EB 1A, a short jmp over the rest of the set */
+ " jmp 2b\n" /* continue after the prefetch set */
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b, 3b\n" /* a fault at 1 is redirected to the fixup at 3 */
+ ".previous"
+ : : "r" (from) );
+
+
+ for(; i>5; i--) /* while more than five blocks remain: prefetch 320 bytes (five blocks) ahead */
+ {
+ __asm__ __volatile__ (
+ "1: prefetch 320(%0)\n"
+ "2: movq (%0), %%mm0\n" /* first 32 bytes via mm0-mm3 */
+ " movq 8(%0), %%mm1\n"
+ " movq 16(%0), %%mm2\n"
+ " movq 24(%0), %%mm3\n"
+ " movq %%mm0, (%1)\n"
+ " movq %%mm1, 8(%1)\n"
+ " movq %%mm2, 16(%1)\n"
+ " movq %%mm3, 24(%1)\n"
+ " movq 32(%0), %%mm0\n" /* second 32 bytes, reusing mm0-mm3 */
+ " movq 40(%0), %%mm1\n"
+ " movq 48(%0), %%mm2\n"
+ " movq 56(%0), %%mm3\n"
+ " movq %%mm0, 32(%1)\n"
+ " movq %%mm1, 40(%1)\n"
+ " movq %%mm2, 48(%1)\n"
+ " movq %%mm3, 56(%1)\n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ /* patches the prefetch at 1 into a short jmp to label 2 */
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b, 3b\n"
+ ".previous"
+ : : "r" (from), "r" (to) : "memory");
+ from+=64;
+ to+=64;
+ }
+
+ for(; i>0; i--) /* remaining (at most five) blocks: same copy, no prefetch */
+ {
+ __asm__ __volatile__ (
+ " movq (%0), %%mm0\n"
+ " movq 8(%0), %%mm1\n"
+ " movq 16(%0), %%mm2\n"
+ " movq 24(%0), %%mm3\n"
+ " movq %%mm0, (%1)\n"
+ " movq %%mm1, 8(%1)\n"
+ " movq %%mm2, 16(%1)\n"
+ " movq %%mm3, 24(%1)\n"
+ " movq 32(%0), %%mm0\n"
+ " movq 40(%0), %%mm1\n"
+ " movq 48(%0), %%mm2\n"
+ " movq 56(%0), %%mm3\n"
+ " movq %%mm0, 32(%1)\n"
+ " movq %%mm1, 40(%1)\n"
+ " movq %%mm2, 48(%1)\n"
+ " movq %%mm3, 56(%1)\n"
+ : : "r" (from), "r" (to) : "memory");
+ from+=64;
+ to+=64;
+ }
+ /*
+ * Now do the tail of the block
+ */
+ __memcpy(to, from, len&63); /* fewer than 64 bytes are left */
+ kernel_fpu_end();
+ return p;
+}
+
+#ifdef CONFIG_MK7
+
+/*
+ * The K7 has streaming cache bypass load/store. The Cyrix III, K6 and
+ * other MMX using processors do not.
+ */
+/*
+ * fast_clear_page (CONFIG_MK7) - zero 4096 bytes starting at 'page' with
+ * movntq stores of a zeroed mm0, 64 bytes per loop iteration, followed by
+ * an sfence. Runs between kernel_fpu_begin() and kernel_fpu_end().
+ */
+static void fast_clear_page(void *page)
+{
+ int i;
+
+ kernel_fpu_begin();
+
+ __asm__ __volatile__ (
+ " pxor %%mm0, %%mm0\n" : : /* mm0 = 0, the value stored below */
+ );
+
+ for(i=0;i<4096/64;i++) /* 64 iterations of 64 bytes */
+ {
+ __asm__ __volatile__ (
+ " movntq %%mm0, (%0)\n"
+ " movntq %%mm0, 8(%0)\n"
+ " movntq %%mm0, 16(%0)\n"
+ " movntq %%mm0, 24(%0)\n"
+ " movntq %%mm0, 32(%0)\n"
+ " movntq %%mm0, 40(%0)\n"
+ " movntq %%mm0, 48(%0)\n"
+ " movntq %%mm0, 56(%0)\n"
+ : : "r" (page) : "memory");
+ page+=64;
+ }
+ /* since movntq is weakly-ordered, a "sfence" is needed to become
+ * ordered again.
+ */
+ __asm__ __volatile__ (
+ " sfence \n" : :
+ );
+ kernel_fpu_end();
+}
+/*
+ * fast_copy_page (CONFIG_MK7) - copy 4096 bytes from 'from' to 'to' with
+ * movq loads and movntq stores, 64 bytes per iteration. The first
+ * (4096-320)/64 iterations prefetch 320 bytes ahead; the last five do not,
+ * so no prefetch address lies beyond the source page.
+ */
+static void fast_copy_page(void *to, void *from)
+{
+ int i;
+
+ kernel_fpu_begin();
+
+ /* maybe the prefetch stuff can go before the expensive fnsave...
+ * but that is for later. -AV
+ */
+ __asm__ __volatile__ (
+ "1: prefetch (%0)\n"
+ " prefetch 64(%0)\n"
+ " prefetch 128(%0)\n"
+ " prefetch 192(%0)\n"
+ " prefetch 256(%0)\n"
+ "2: \n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ /* overwrites label 1 with bytes EB 1A, a short jmp over the rest of the set */
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b, 3b\n" /* a fault at 1 is redirected to the fixup at 3 */
+ ".previous"
+ : : "r" (from) );
+
+ for(i=0; i<(4096-320)/64; i++) /* 59 blocks copied with prefetch */
+ {
+ __asm__ __volatile__ (
+ "1: prefetch 320(%0)\n"
+ "2: movq (%0), %%mm0\n"
+ " movntq %%mm0, (%1)\n"
+ " movq 8(%0), %%mm1\n"
+ " movntq %%mm1, 8(%1)\n"
+ " movq 16(%0), %%mm2\n"
+ " movntq %%mm2, 16(%1)\n"
+ " movq 24(%0), %%mm3\n"
+ " movntq %%mm3, 24(%1)\n"
+ " movq 32(%0), %%mm4\n"
+ " movntq %%mm4, 32(%1)\n"
+ " movq 40(%0), %%mm5\n"
+ " movntq %%mm5, 40(%1)\n"
+ " movq 48(%0), %%mm6\n"
+ " movntq %%mm6, 48(%1)\n"
+ " movq 56(%0), %%mm7\n"
+ " movntq %%mm7, 56(%1)\n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ /* patches the prefetch at 1 into a short jmp to label 2 */
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b, 3b\n"
+ ".previous"
+ : : "r" (from), "r" (to) : "memory");
+ from+=64;
+ to+=64;
+ }
+ for(i=(4096-320)/64; i<4096/64; i++) /* last five blocks, no prefetch */
+ {
+ __asm__ __volatile__ (
+ "2: movq (%0), %%mm0\n"
+ " movntq %%mm0, (%1)\n"
+ " movq 8(%0), %%mm1\n"
+ " movntq %%mm1, 8(%1)\n"
+ " movq 16(%0), %%mm2\n"
+ " movntq %%mm2, 16(%1)\n"
+ " movq 24(%0), %%mm3\n"
+ " movntq %%mm3, 24(%1)\n"
+ " movq 32(%0), %%mm4\n"
+ " movntq %%mm4, 32(%1)\n"
+ " movq 40(%0), %%mm5\n"
+ " movntq %%mm5, 40(%1)\n"
+ " movq 48(%0), %%mm6\n"
+ " movntq %%mm6, 48(%1)\n"
+ " movq 56(%0), %%mm7\n"
+ " movntq %%mm7, 56(%1)\n"
+ : : "r" (from), "r" (to) : "memory");
+ from+=64;
+ to+=64;
+ }
+ /* since movntq is weakly-ordered, a "sfence" is needed to become
+ * ordered again.
+ */
+ __asm__ __volatile__ (
+ " sfence \n" : :
+ );
+ kernel_fpu_end();
+}
+
+#else
+
+/*
+ * Generic MMX implementation without K7 specific streaming
+ */
+/*
+ * fast_clear_page (generic MMX) - zero 4096 bytes starting at 'page' with
+ * ordinary movq stores of a zeroed mm0, 128 bytes per loop iteration.
+ * Runs between kernel_fpu_begin() and kernel_fpu_end().
+ */
+static void fast_clear_page(void *page)
+{
+ int i;
+
+ kernel_fpu_begin();
+
+ __asm__ __volatile__ (
+ " pxor %%mm0, %%mm0\n" : : /* mm0 = 0, the value stored below */
+ );
+
+ for(i=0;i<4096/128;i++) /* 32 iterations of 128 bytes */
+ {
+ __asm__ __volatile__ (
+ " movq %%mm0, (%0)\n"
+ " movq %%mm0, 8(%0)\n"
+ " movq %%mm0, 16(%0)\n"
+ " movq %%mm0, 24(%0)\n"
+ " movq %%mm0, 32(%0)\n"
+ " movq %%mm0, 40(%0)\n"
+ " movq %%mm0, 48(%0)\n"
+ " movq %%mm0, 56(%0)\n"
+ " movq %%mm0, 64(%0)\n"
+ " movq %%mm0, 72(%0)\n"
+ " movq %%mm0, 80(%0)\n"
+ " movq %%mm0, 88(%0)\n"
+ " movq %%mm0, 96(%0)\n"
+ " movq %%mm0, 104(%0)\n"
+ " movq %%mm0, 112(%0)\n"
+ " movq %%mm0, 120(%0)\n"
+ : : "r" (page) : "memory");
+ page+=128;
+ }
+
+ kernel_fpu_end();
+}
+/*
+ * fast_copy_page (generic MMX) - copy 4096 bytes from 'from' to 'to' in
+ * 64-byte blocks through mm0-mm3, prefetching 320 bytes ahead on every
+ * iteration. Runs between kernel_fpu_begin() and kernel_fpu_end().
+ */
+static void fast_copy_page(void *to, void *from)
+{
+ int i;
+
+
+ kernel_fpu_begin();
+
+ __asm__ __volatile__ (
+ "1: prefetch (%0)\n"
+ " prefetch 64(%0)\n"
+ " prefetch 128(%0)\n"
+ " prefetch 192(%0)\n"
+ " prefetch 256(%0)\n"
+ "2: \n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x1AEB, 1b\n" /* jmp on 26 bytes */ /* overwrites label 1 with bytes EB 1A, a short jmp over the rest of the set */
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b, 3b\n" /* a fault at 1 is redirected to the fixup at 3 */
+ ".previous"
+ : : "r" (from) );
+
+ for(i=0; i<4096/64; i++) /* NOTE(review): the last five iterations prefetch addresses past the 4096-byte source -- confirm this is intended */
+ {
+ __asm__ __volatile__ (
+ "1: prefetch 320(%0)\n"
+ "2: movq (%0), %%mm0\n"
+ " movq 8(%0), %%mm1\n"
+ " movq 16(%0), %%mm2\n"
+ " movq 24(%0), %%mm3\n"
+ " movq %%mm0, (%1)\n"
+ " movq %%mm1, 8(%1)\n"
+ " movq %%mm2, 16(%1)\n"
+ " movq %%mm3, 24(%1)\n"
+ " movq 32(%0), %%mm0\n"
+ " movq 40(%0), %%mm1\n"
+ " movq 48(%0), %%mm2\n"
+ " movq 56(%0), %%mm3\n"
+ " movq %%mm0, 32(%1)\n"
+ " movq %%mm1, 40(%1)\n"
+ " movq %%mm2, 48(%1)\n"
+ " movq %%mm3, 56(%1)\n"
+ ".section .fixup, \"ax\"\n"
+ "3: movw $0x05EB, 1b\n" /* jmp on 5 bytes */ /* patches the prefetch at 1 into a short jmp to label 2 */
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b, 3b\n"
+ ".previous"
+ : : "r" (from), "r" (to) : "memory");
+ from+=64;
+ to+=64;
+ }
+ kernel_fpu_end();
+}
+
+
+#endif
+
+/*
+ * Favour MMX for page clear and copy.
+ */
+/* slow_zero_page - zero 4096 bytes at 'page' with "rep stosl"; no MMX state is touched. */
+static void slow_zero_page(void * page)
+{
+ int d0, d1; /* dummy outputs: ecx and edi are modified by the asm */
+ __asm__ __volatile__( \
+ "cld\n\t" \
+ "rep ; stosl" \
+ : "=&c" (d0), "=&D" (d1)
+ :"a" (0),"1" (page),"0" (1024) /* eax = 0, edi = page, ecx = 1024 longwords */
+ :"memory");
+}
+
+/*
+ * mmx_clear_page - zero one page.
+ *
+ * Uses slow_zero_page() when called in interrupt context and
+ * fast_clear_page() otherwise.
+ */
+void mmx_clear_page(void * page)
+{
+	if (unlikely(in_interrupt())) {
+		slow_zero_page(page);
+		return;
+	}
+
+	fast_clear_page(page);
+}
+/* slow_copy_page - copy 4096 bytes from 'from' to 'to' with "rep movsl"; no MMX state is touched. */
+static void slow_copy_page(void *to, void *from)
+{
+ int d0, d1, d2; /* dummy outputs: ecx, edi and esi are modified by the asm */
+ __asm__ __volatile__( \
+ "cld\n\t" \
+ "rep ; movsl" \
+ : "=&c" (d0), "=&D" (d1), "=&S" (d2) \
+ : "0" (1024),"1" ((long) to),"2" ((long) from) /* ecx = 1024 longwords, edi = to, esi = from */ \
+ : "memory");
+}
+
+
+/*
+ * mmx_copy_page - copy one page from 'from' to 'to'.
+ *
+ * Uses slow_copy_page() when called in interrupt context and
+ * fast_copy_page() otherwise.
+ */
+void mmx_copy_page(void *to, void *from)
+{
+	if (unlikely(in_interrupt())) {
+		slow_copy_page(to, from);
+		return;
+	}
+
+	fast_copy_page(to, from);
+}
+
+EXPORT_SYMBOL(_mmx_memcpy);
+EXPORT_SYMBOL(mmx_clear_page);
+EXPORT_SYMBOL(mmx_copy_page);
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
new file mode 100644
index 000000000000..7767962f25d3
--- /dev/null
+++ b/arch/x86/lib/msr-on-cpu.c
@@ -0,0 +1,119 @@
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/smp.h>
+#include <asm/msr.h>
+
+struct msr_info { /* argument/result block handed to the cross-CPU callbacks below */
+ u32 msr_no; /* MSR to access */
+ u32 l, h; /* the two 32-bit value words: read results, or values to write */
+ int err; /* return value of rdmsr_safe()/wrmsr_safe(); set only by the *_safe callbacks */
+};
+
+/* Cross-call callback: read the MSR named in the msr_info into its l/h fields. */
+static void __rdmsr_on_cpu(void *info)
+{
+	struct msr_info *req = info;
+
+	rdmsr(req->msr_no, req->l, req->h);
+}
+
+/* Cross-call callback: rdmsr_safe() into the msr_info, result code into ->err. */
+static void __rdmsr_safe_on_cpu(void *info)
+{
+	struct msr_info *req = info;
+	int ret = rdmsr_safe(req->msr_no, &req->l, &req->h);
+
+	req->err = ret;
+}
+
+/*
+ * _rdmsr_on_cpu - read MSR @msr_no on CPU @cpu into *@l and *@h.
+ * @safe: non-zero selects rdmsr_safe(), zero selects rdmsr().
+ *
+ * Runs with preemption disabled. If @cpu is the current CPU the MSR is read
+ * directly, otherwise the read is done on @cpu via
+ * smp_call_function_single().
+ *
+ * Returns 0 on success, the rdmsr_safe() result when @safe is set, or the
+ * error returned by smp_call_function_single() if the cross-call could not
+ * be made. In that last case *@l and *@h are set to 0.
+ */
+static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe)
+{
+	int err = 0;
+
+	preempt_disable();
+	if (smp_processor_id() == cpu) {
+		if (safe)
+			err = rdmsr_safe(msr_no, l, h);
+		else
+			rdmsr(msr_no, *l, *h);
+	} else {
+		/* Zero-filled so a failed cross-call cannot hand back stack garbage. */
+		struct msr_info rv = { .msr_no = msr_no };
+
+		err = smp_call_function_single(cpu,
+				safe ? __rdmsr_safe_on_cpu : __rdmsr_on_cpu,
+				&rv, 0, 1);
+		if (!err)
+			err = rv.err;	/* stays 0 for the non-safe callback */
+		*l = rv.l;
+		*h = rv.h;
+	}
+	preempt_enable();
+	return err;
+}
+
+/* Cross-call callback: write the l/h fields of the msr_info to its MSR. */
+static void __wrmsr_on_cpu(void *info)
+{
+	struct msr_info *req = info;
+
+	wrmsr(req->msr_no, req->l, req->h);
+}
+
+/* Cross-call callback: wrmsr_safe() from the msr_info, result code into ->err. */
+static void __wrmsr_safe_on_cpu(void *info)
+{
+	struct msr_info *req = info;
+	int ret = wrmsr_safe(req->msr_no, req->l, req->h);
+
+	req->err = ret;
+}
+
+/*
+ * _wrmsr_on_cpu - write @l/@h to MSR @msr_no on CPU @cpu.
+ * @safe: non-zero selects wrmsr_safe(), zero selects wrmsr().
+ *
+ * Runs with preemption disabled. If @cpu is the current CPU the MSR is
+ * written directly, otherwise the write is done on @cpu via
+ * smp_call_function_single().
+ *
+ * Returns 0 on success, the wrmsr_safe() result when @safe is set, or the
+ * error returned by smp_call_function_single() if the cross-call could not
+ * be made.
+ */
+static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe)
+{
+	int err = 0;
+
+	preempt_disable();
+	if (smp_processor_id() == cpu) {
+		if (safe)
+			err = wrmsr_safe(msr_no, l, h);
+		else
+			wrmsr(msr_no, l, h);
+	} else {
+		/* .err starts at 0 and is only written by the safe callback. */
+		struct msr_info rv = { .msr_no = msr_no, .l = l, .h = h };
+
+		err = smp_call_function_single(cpu,
+				safe ? __wrmsr_safe_on_cpu : __wrmsr_on_cpu,
+				&rv, 0, 1);
+		if (!err)
+			err = rv.err;
+	}
+	preempt_enable();
+	return err;
+}
+
+/* Write @l/@h to MSR @msr_no on CPU @cpu using plain wrmsr(); no status is returned. */
+void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+	(void)_wrmsr_on_cpu(cpu, msr_no, l, h, 0);
+}
+
+/* Read MSR @msr_no on CPU @cpu into *@l and *@h using plain rdmsr(); no status is returned. */
+void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+	(void)_rdmsr_on_cpu(cpu, msr_no, l, h, 0);
+}
+
+/* These "safe" variants are slower and should be used when the target MSR
+ may not actually exist. */
+int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+	const int use_safe = 1;	/* select wrmsr_safe() in the helper */
+
+	return _wrmsr_on_cpu(cpu, msr_no, l, h, use_safe);
+}
+
+/* Like rdmsr_on_cpu(), but uses rdmsr_safe() and returns the helper's status. */
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
+{
+	const int use_safe = 1;	/* select rdmsr_safe() in the helper */
+
+	return _rdmsr_on_cpu(cpu, msr_no, l, h, use_safe);
+}
+
+EXPORT_SYMBOL(rdmsr_on_cpu);
+EXPORT_SYMBOL(wrmsr_on_cpu);
+EXPORT_SYMBOL(rdmsr_safe_on_cpu);
+EXPORT_SYMBOL(wrmsr_safe_on_cpu);
diff --git a/arch/x86/lib/putuser_32.S b/arch/x86/lib/putuser_32.S
new file mode 100644
index 000000000000..f58fba109d18
--- /dev/null
+++ b/arch/x86/lib/putuser_32.S
@@ -0,0 +1,98 @@
+/*
+ * __put_user functions.
+ *
+ * (C) Copyright 2005 Linus Torvalds
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/thread_info.h>
+
+
+/*
+ * __put_user_X
+ *
+ * Inputs: %eax[:%edx] contains the data
+ * %ecx contains the address
+ *
+ * Outputs: %eax is error code (0 or -EFAULT)
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+#define ENTER CFI_STARTPROC ; \
+ pushl %ebx ; /* ebx is used as scratch below, so save it */ \
+ CFI_ADJUST_CFA_OFFSET 4 ; \
+ CFI_REL_OFFSET ebx, 0 ; \
+ GET_THREAD_INFO(%ebx) /* presumably ebx = current thread_info; it is indexed with TI_addr_limit below */
+#define EXIT popl %ebx ; /* restore the ebx saved by ENTER */ \
+ CFI_ADJUST_CFA_OFFSET -4 ; \
+ CFI_RESTORE ebx ; \
+ ret ; \
+ CFI_ENDPROC
+
+.text
+ENTRY(__put_user_1) /* store %al at (%ecx); %eax = 0 or -EFAULT */
+ ENTER
+ cmpl TI_addr_limit(%ebx),%ecx
+ jae bad_put_user /* address is at or above the limit */
+1: movb %al,(%ecx) /* a fault here is redirected to bad_put_user via __ex_table */
+ xorl %eax,%eax /* success */
+ EXIT
+ENDPROC(__put_user_1)
+
+ENTRY(__put_user_2) /* store %ax at (%ecx); %eax = 0 or -EFAULT */
+ ENTER
+ movl TI_addr_limit(%ebx),%ebx
+ subl $1,%ebx /* the last byte written is at %ecx+1 */
+ cmpl %ebx,%ecx
+ jae bad_put_user
+2: movw %ax,(%ecx)
+ xorl %eax,%eax /* success */
+ EXIT
+ENDPROC(__put_user_2)
+
+ENTRY(__put_user_4) /* store %eax at (%ecx); %eax = 0 or -EFAULT */
+ ENTER
+ movl TI_addr_limit(%ebx),%ebx
+ subl $3,%ebx /* the last byte written is at %ecx+3 */
+ cmpl %ebx,%ecx
+ jae bad_put_user
+3: movl %eax,(%ecx)
+ xorl %eax,%eax /* success */
+ EXIT
+ENDPROC(__put_user_4)
+
+ENTRY(__put_user_8) /* store %eax at (%ecx) and %edx at 4(%ecx); %eax = 0 or -EFAULT */
+ ENTER
+ movl TI_addr_limit(%ebx),%ebx
+ subl $7,%ebx /* the last byte written is at %ecx+7 */
+ cmpl %ebx,%ecx
+ jae bad_put_user
+4: movl %eax,(%ecx)
+5: movl %edx,4(%ecx) /* both stores have their own __ex_table entry */
+ xorl %eax,%eax /* success */
+ EXIT
+ENDPROC(__put_user_8)
+
+bad_put_user: /* common failure exit; reached after ENTER has pushed %ebx */
+ CFI_STARTPROC simple
+ CFI_DEF_CFA esp, 2*4
+ CFI_OFFSET eip, -1*4
+ CFI_OFFSET ebx, -2*4
+ movl $-14,%eax /* the error code returned to the caller (-EFAULT per the header comment) */
+ EXIT
+END(bad_put_user)
+
+.section __ex_table,"a" /* one entry per user-space store above */
+ .long 1b,bad_put_user
+ .long 2b,bad_put_user
+ .long 3b,bad_put_user
+ .long 4b,bad_put_user
+ .long 5b,bad_put_user
+.previous
diff --git a/arch/x86/lib/putuser_64.S b/arch/x86/lib/putuser_64.S
new file mode 100644
index 000000000000..4989f5a8fa9b
--- /dev/null
+++ b/arch/x86/lib/putuser_64.S
@@ -0,0 +1,106 @@
+/*
+ * __put_user functions.
+ *
+ * (C) Copyright 1998 Linus Torvalds
+ * (C) Copyright 2005 Andi Kleen
+ *
+ * These functions have a non-standard call interface
+ * to make them more efficient, especially as they
+ * return an error value in addition to the "real"
+ * return value.
+ */
+
+/*
+ * __put_user_X
+ *
+ * Inputs: %rcx contains the address
+ * %rdx contains new value
+ *
+ * Outputs: %rax is error code (0 or -EFAULT)
+ *
+ * %r8 is destroyed.
+ *
+ * These functions should not modify any other registers,
+ * as they get called from within inline assembly.
+ */
+
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/page.h>
+#include <asm/errno.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+
+ .text
+ENTRY(__put_user_1) /* store %dl at (%rcx); %rax = 0 or -EFAULT */
+ CFI_STARTPROC
+ GET_THREAD_INFO(%r8) /* presumably r8 = current thread_info; indexed with threadinfo_addr_limit below */
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae bad_put_user /* address is at or above the limit */
+1: movb %dl,(%rcx) /* a fault here is redirected to bad_put_user via __ex_table */
+ xorl %eax,%eax /* success */
+ ret
+ CFI_ENDPROC
+ENDPROC(__put_user_1)
+
+ENTRY(__put_user_2) /* store %dx at (%rcx); %rax = 0 or -EFAULT */
+ CFI_STARTPROC
+ GET_THREAD_INFO(%r8)
+ addq $1,%rcx /* rcx = address of the last byte written */
+ jc 20f /* address wrapped around */
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae 20f
+ decq %rcx /* back to the start address */
+2: movw %dx,(%rcx)
+ xorl %eax,%eax /* success */
+ ret
+20: decq %rcx /* restore rcx before failing */
+ jmp bad_put_user
+ CFI_ENDPROC
+ENDPROC(__put_user_2)
+
+ENTRY(__put_user_4) /* store %edx at (%rcx); %rax = 0 or -EFAULT */
+ CFI_STARTPROC
+ GET_THREAD_INFO(%r8)
+ addq $3,%rcx /* rcx = address of the last byte written */
+ jc 30f /* address wrapped around */
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae 30f
+ subq $3,%rcx /* back to the start address */
+3: movl %edx,(%rcx)
+ xorl %eax,%eax /* success */
+ ret
+30: subq $3,%rcx /* restore rcx before failing */
+ jmp bad_put_user
+ CFI_ENDPROC
+ENDPROC(__put_user_4)
+
+ENTRY(__put_user_8) /* store %rdx at (%rcx); %rax = 0 or -EFAULT */
+ CFI_STARTPROC
+ GET_THREAD_INFO(%r8)
+ addq $7,%rcx /* rcx = address of the last byte written */
+ jc 40f /* address wrapped around */
+ cmpq threadinfo_addr_limit(%r8),%rcx
+ jae 40f
+ subq $7,%rcx /* back to the start address */
+4: movq %rdx,(%rcx)
+ xorl %eax,%eax /* success */
+ ret
+40: subq $7,%rcx /* restore rcx before failing */
+ jmp bad_put_user
+ CFI_ENDPROC
+ENDPROC(__put_user_8)
+
+bad_put_user: /* common failure exit for the __put_user_X entries above */
+ CFI_STARTPROC
+ movq $(-EFAULT),%rax
+ ret
+ CFI_ENDPROC
+END(bad_put_user)
+
+.section __ex_table,"a" /* one entry per user-space store above */
+ .quad 1b,bad_put_user
+ .quad 2b,bad_put_user
+ .quad 3b,bad_put_user
+ .quad 4b,bad_put_user
+.previous
diff --git a/arch/x86/lib/rwlock_64.S b/arch/x86/lib/rwlock_64.S
new file mode 100644
index 000000000000..0cde1f807314
--- /dev/null
+++ b/arch/x86/lib/rwlock_64.S
@@ -0,0 +1,38 @@
+/* Slow paths of read/write spinlocks. */
+
+#include <linux/linkage.h>
+#include <asm/rwlock.h>
+#include <asm/alternative-asm.i>
+#include <asm/dwarf2.h>
+
+/* rdi: pointer to rwlock_t */
+ENTRY(__write_lock_failed)
+ CFI_STARTPROC
+ LOCK_PREFIX
+ addl $RW_LOCK_BIAS,(%rdi) /* presumably undoes the caller's failed subtract -- confirm against the inline fast path */
+1: rep
+ nop /* "rep; nop" is the PAUSE spin hint */
+ cmpl $RW_LOCK_BIAS,(%rdi)
+ jne 1b /* spin until the lock word equals RW_LOCK_BIAS */
+ LOCK_PREFIX
+ subl $RW_LOCK_BIAS,(%rdi) /* try again */
+ jnz __write_lock_failed /* result not zero: start over */
+ ret
+ CFI_ENDPROC
+END(__write_lock_failed)
+
+/* rdi: pointer to rwlock_t */
+ENTRY(__read_lock_failed)
+ CFI_STARTPROC
+ LOCK_PREFIX
+ incl (%rdi) /* presumably undoes the caller's failed decrement -- confirm against the inline fast path */
+1: rep
+ nop /* "rep; nop" is the PAUSE spin hint */
+ cmpl $1,(%rdi)
+ js 1b /* spin while (lock word - 1) is negative */
+ LOCK_PREFIX
+ decl (%rdi) /* try again */
+ js __read_lock_failed /* result negative: start over */
+ ret
+ CFI_ENDPROC
+END(__read_lock_failed)
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
new file mode 100644
index 000000000000..c01eb39c0b43
--- /dev/null
+++ b/arch/x86/lib/semaphore_32.S
@@ -0,0 +1,219 @@
+/*
+ * i386 semaphore implementation.
+ *
+ * (C) Copyright 1999 Linus Torvalds
+ *
+ * Portions Copyright 1999 Red Hat, Inc.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
+ */
+
+#include <linux/linkage.h>
+#include <asm/rwlock.h>
+#include <asm/alternative-asm.i>
+#include <asm/frame.i>
+#include <asm/dwarf2.h>
+
+/*
+ * The semaphore operations have a special calling sequence that
+ * allow us to do a simpler in-line version of them. These routines
+ * need to convert that sequence back into the C sequence when
+ * there is contention on the semaphore.
+ *
+ * %eax contains the semaphore pointer on entry. Save the C-clobbered
+ * registers (%eax, %edx and %ecx) except %eax which is either a return
+ * value or just clobbered.
+ */
+ .section .sched.text
+ENTRY(__down_failed) /* contended down(): call the C function __down with %edx/%ecx preserved */
+ CFI_STARTPROC
+ FRAME
+ pushl %edx /* save the registers the C callee may clobber */
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx,0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx,0
+ call __down /* %eax still holds the semaphore pointer from entry */
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ecx
+ popl %edx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edx
+ ENDFRAME
+ ret
+ CFI_ENDPROC
+ END(__down_failed)
+
+ENTRY(__down_failed_interruptible) /* contended down_interruptible(): call __down_interruptible with %edx/%ecx preserved */
+ CFI_STARTPROC
+ FRAME
+ pushl %edx /* save the registers the C callee may clobber */
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx,0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx,0
+ call __down_interruptible /* %eax is not saved, so the callee's %eax reaches the caller */
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ecx
+ popl %edx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edx
+ ENDFRAME
+ ret
+ CFI_ENDPROC
+ END(__down_failed_interruptible)
+
+ENTRY(__down_failed_trylock) /* failed down_trylock(): call __down_trylock with %edx/%ecx preserved */
+ CFI_STARTPROC
+ FRAME
+ pushl %edx /* save the registers the C callee may clobber */
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx,0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx,0
+ call __down_trylock /* %eax is not saved, so the callee's %eax reaches the caller */
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ecx
+ popl %edx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edx
+ ENDFRAME
+ ret
+ CFI_ENDPROC
+ END(__down_failed_trylock)
+
+ENTRY(__up_wakeup) /* up() slow path: call the C function __up with %edx/%ecx preserved */
+ CFI_STARTPROC
+ FRAME
+ pushl %edx /* save the registers the C callee may clobber */
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx,0
+ pushl %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx,0
+ call __up /* %eax still holds the semaphore pointer from entry */
+ popl %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE ecx
+ popl %edx
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE edx
+ ENDFRAME
+ ret
+ CFI_ENDPROC
+ END(__up_wakeup)
+
+/*
+ * rw spinlock fallbacks
+ */
+#ifdef CONFIG_SMP
+ENTRY(__write_lock_failed) /* %eax: address of the lock word */
+ CFI_STARTPROC simple
+ FRAME
+2: LOCK_PREFIX
+ addl $ RW_LOCK_BIAS,(%eax) /* presumably undoes the caller's failed subtract -- confirm against the inline fast path */
+1: rep; nop /* PAUSE spin hint */
+ cmpl $ RW_LOCK_BIAS,(%eax)
+ jne 1b /* spin until the lock word equals RW_LOCK_BIAS */
+ LOCK_PREFIX
+ subl $ RW_LOCK_BIAS,(%eax) /* try again */
+ jnz 2b /* result not zero: start over */
+ ENDFRAME
+ ret
+ CFI_ENDPROC
+ END(__write_lock_failed)
+
+ENTRY(__read_lock_failed) /* %eax: address of the lock word */
+ CFI_STARTPROC
+ FRAME
+2: LOCK_PREFIX
+ incl (%eax) /* presumably undoes the caller's failed decrement -- confirm against the inline fast path */
+1: rep; nop /* PAUSE spin hint */
+ cmpl $1,(%eax)
+ js 1b /* spin while (lock word - 1) is negative */
+ LOCK_PREFIX
+ decl (%eax) /* try again */
+ js 2b /* result negative: start over */
+ ENDFRAME
+ ret
+ CFI_ENDPROC
+ END(__read_lock_failed)
+
+#endif
+
+#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
+
+/* Fix up special calling conventions */
+ENTRY(call_rwsem_down_read_failed) /* call rwsem_down_read_failed with %ecx and %edx preserved */
+ CFI_STARTPROC
+ push %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx,0
+ push %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx,0
+ call rwsem_down_read_failed
+ pop %edx /* restored in reverse push order */
+ CFI_ADJUST_CFA_OFFSET -4
+ pop %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ ret
+ CFI_ENDPROC
+ END(call_rwsem_down_read_failed)
+
+ENTRY(call_rwsem_down_write_failed) /* call rwsem_down_write_failed with %ecx preserved; %edx is not saved here */
+ CFI_STARTPROC
+ push %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx,0
+ calll rwsem_down_write_failed
+ pop %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ ret
+ CFI_ENDPROC
+ END(call_rwsem_down_write_failed)
+
+ENTRY(call_rwsem_wake) /* call rwsem_wake with %ecx preserved, unless the %dx test below says there is nothing to do */
+ CFI_STARTPROC
+ decw %dx /* do nothing if still outstanding active readers */
+ jnz 1f /* skip the call entirely */
+ push %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx,0
+ call rwsem_wake
+ pop %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+1: ret
+ CFI_ENDPROC
+ END(call_rwsem_wake)
+
+/* Fix up special calling conventions */
+ENTRY(call_rwsem_downgrade_wake) /* call rwsem_downgrade_wake with %ecx and %edx preserved */
+ CFI_STARTPROC
+ push %ecx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET ecx,0
+ push %edx
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET edx,0
+ call rwsem_downgrade_wake
+ pop %edx /* restored in reverse push order */
+ CFI_ADJUST_CFA_OFFSET -4
+ pop %ecx
+ CFI_ADJUST_CFA_OFFSET -4
+ ret
+ CFI_ENDPROC
+ END(call_rwsem_downgrade_wake)
+
+#endif
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
new file mode 100644
index 000000000000..2c773fefa3dd
--- /dev/null
+++ b/arch/x86/lib/string_32.c
@@ -0,0 +1,257 @@
+/*
+ * Most of the string-functions are rather heavily hand-optimized,
+ * see especially strsep,strstr,str[c]spn. They should work, but are not
+ * very easy to understand. Everything is done entirely within the register
+ * set, making the functions fast and clean. String instructions have been
+ * used through-out, making for "slightly" unclear code :-)
+ *
+ * AK: On P4 and K7 using non string instruction implementations might be faster
+ * for large memory blocks. But most of them are unlikely to be used on large
+ * strings.
+ */
+
+#include <linux/string.h>
+#include <linux/module.h>
+/*
+ * strcpy - copy the NUL-terminated string src, including its terminator,
+ * to dest. Returns dest.
+ */
+#ifdef __HAVE_ARCH_STRCPY
+char *strcpy(char * dest,const char *src)
+{
+ int d0, d1, d2; /* dummy outputs: esi, edi and eax are modified by the asm */
+ asm volatile( "1:\tlodsb\n\t" /* al = *src++ */
+ "stosb\n\t" /* *dest++ = al */
+ "testb %%al,%%al\n\t"
+ "jne 1b" /* stop once the NUL has been copied */
+ : "=&S" (d0), "=&D" (d1), "=&a" (d2)
+ :"0" (src),"1" (dest) : "memory");
+ return dest;
+}
+EXPORT_SYMBOL(strcpy);
+#endif
+/*
+ * strncpy - copy at most count bytes of src to dest. If src ends before
+ * count bytes have been written, the rest of dest is filled with NUL
+ * bytes; if it does not, dest is left without a terminator. Returns dest.
+ */
+#ifdef __HAVE_ARCH_STRNCPY
+char *strncpy(char * dest,const char *src,size_t count)
+{
+ int d0, d1, d2, d3; /* dummy outputs: esi, edi, ecx and eax are modified by the asm */
+ asm volatile( "1:\tdecl %2\n\t" /* one byte less to go */
+ "js 2f\n\t" /* count exhausted: done, no padding */
+ "lodsb\n\t"
+ "stosb\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b\n\t"
+ "rep\n\t" /* NUL copied: al is 0, so this pads the remaining ecx bytes */
+ "stosb\n"
+ "2:"
+ : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
+ :"0" (src),"1" (dest),"2" (count) : "memory");
+ return dest;
+}
+EXPORT_SYMBOL(strncpy);
+#endif
+/*
+ * strcat - append the NUL-terminated string src, including its terminator,
+ * at the terminating NUL of dest. Returns dest.
+ */
+#ifdef __HAVE_ARCH_STRCAT
+char *strcat(char * dest,const char * src)
+{
+ int d0, d1, d2, d3; /* dummy outputs: esi, edi, eax and ecx are modified by the asm */
+ asm volatile( "repne\n\t" /* scan dest for the NUL (al = 0, ecx = 0xffffffff) */
+ "scasb\n\t"
+ "decl %1\n" /* edi stopped one past the NUL: step back onto it */
+ "1:\tlodsb\n\t"
+ "stosb\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b" /* stop once the NUL of src has been copied */
+ : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
+ : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory");
+ return dest;
+}
+EXPORT_SYMBOL(strcat);
+#endif
+/*
+ * strncat - append at most count bytes of src at the terminating NUL of
+ * dest, then store a terminating NUL. Returns dest.
+ */
+#ifdef __HAVE_ARCH_STRNCAT
+char *strncat(char * dest,const char * src,size_t count)
+{
+ int d0, d1, d2, d3; /* dummy outputs: esi, edi, eax and ecx are modified by the asm */
+ asm volatile( "repne\n\t" /* scan dest for the NUL (al = 0, ecx = 0xffffffff) */
+ "scasb\n\t"
+ "decl %1\n\t" /* edi stopped one past the NUL: step back onto it */
+ "movl %8,%3\n" /* ecx = count */
+ "1:\tdecl %3\n\t"
+ "js 2f\n\t" /* count exhausted */
+ "lodsb\n\t"
+ "stosb\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b\n"
+ "2:\txorl %2,%2\n\t" /* al = 0 */
+ "stosb" /* store the terminator */
+ : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
+ : "0" (src),"1" (dest),"2" (0),"3" (0xffffffffu), "g" (count)
+ : "memory");
+ return dest;
+}
+EXPORT_SYMBOL(strncat);
+#endif
+/*
+ * strcmp - compare two NUL-terminated strings byte by byte.
+ * Returns 0 if they are equal, otherwise -1 or 1 depending on whether the
+ * first differing byte of cs is below or above that of ct (unsigned).
+ */
+#ifdef __HAVE_ARCH_STRCMP
+int strcmp(const char * cs,const char * ct)
+{
+ int d0, d1; /* dummy outputs: esi and edi are modified by the asm */
+ int res;
+ asm volatile( "1:\tlodsb\n\t" /* al = *cs++ */
+ "scasb\n\t" /* compare al with *ct++ */
+ "jne 2f\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b\n\t"
+ "xorl %%eax,%%eax\n\t" /* reached the NUL with no difference */
+ "jmp 3f\n"
+ "2:\tsbbl %%eax,%%eax\n\t" /* eax = -1 if the compare borrowed, else 0 */
+ "orb $1,%%al\n" /* turns that into -1 or 1 */
+ "3:"
+ :"=a" (res), "=&S" (d0), "=&D" (d1)
+ :"1" (cs),"2" (ct)
+ :"memory");
+ return res;
+}
+EXPORT_SYMBOL(strcmp);
+#endif
+/*
+ * strncmp - compare at most count bytes of two strings.
+ * Returns 0 if no difference is found within count bytes or before the
+ * NUL, otherwise -1 or 1 as for strcmp.
+ */
+#ifdef __HAVE_ARCH_STRNCMP
+int strncmp(const char * cs,const char * ct,size_t count)
+{
+ int res;
+ int d0, d1, d2; /* dummy outputs: esi, edi and ecx are modified by the asm */
+ asm volatile( "1:\tdecl %3\n\t"
+ "js 2f\n\t" /* count exhausted: equal so far */
+ "lodsb\n\t"
+ "scasb\n\t"
+ "jne 3f\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b\n"
+ "2:\txorl %%eax,%%eax\n\t"
+ "jmp 4f\n"
+ "3:\tsbbl %%eax,%%eax\n\t" /* eax = -1 if the compare borrowed, else 0 */
+ "orb $1,%%al\n" /* turns that into -1 or 1 */
+ "4:"
+ :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
+ :"1" (cs),"2" (ct),"3" (count)
+ :"memory");
+ return res;
+}
+EXPORT_SYMBOL(strncmp);
+#endif
+/*
+ * strchr - find the first byte equal to (char)c in the string s.
+ * Returns a pointer to it, or NULL if the terminating NUL is reached first.
+ * The byte is compared before the NUL test, so c == 0 finds the terminator.
+ */
+#ifdef __HAVE_ARCH_STRCHR
+char *strchr(const char * s, int c)
+{
+ int d0; /* dummy output: esi is modified by the asm */
+ char * res;
+ asm volatile( "movb %%al,%%ah\n" /* keep the wanted byte in ah; al is reused by lodsb */
+ "1:\tlodsb\n\t"
+ "cmpb %%ah,%%al\n\t"
+ "je 2f\n\t"
+ "testb %%al,%%al\n\t"
+ "jne 1b\n\t"
+ "movl $1,%1\n" /* not found: esi = 1 so that the result below is 0 */
+ "2:\tmovl %1,%0\n\t"
+ "decl %0" /* esi is one past the byte just loaded */
+ :"=a" (res), "=&S" (d0)
+ :"1" (s),"0" (c)
+ :"memory");
+ return res;
+}
+EXPORT_SYMBOL(strchr);
+#endif
+/*
+ * strrchr - find the last byte equal to (char)c in the string s.
+ * Returns a pointer to it, or NULL if there is none. The terminating NUL
+ * itself is compared too, so c == 0 yields a pointer to the terminator.
+ */
+#ifdef __HAVE_ARCH_STRRCHR
+char *strrchr(const char * s, int c)
+{
+ int d0, d1; /* dummy outputs: esi and eax are modified by the asm */
+ char * res;
+ asm volatile( "movb %%al,%%ah\n" /* keep the wanted byte in ah; al is reused by lodsb */
+ "1:\tlodsb\n\t"
+ "cmpb %%ah,%%al\n\t"
+ "jne 2f\n\t"
+ "leal -1(%%esi),%0\n" /* remember this match; later matches overwrite it */
+ "2:\ttestb %%al,%%al\n\t"
+ "jne 1b"
+ :"=g" (res), "=&S" (d0), "=&a" (d1)
+ :"0" (0),"1" (s),"2" (c) /* res starts out as 0 */
+ :"memory");
+ return res;
+}
+EXPORT_SYMBOL(strrchr);
+#endif
+/*
+ * strlen - return the number of bytes in s before the terminating NUL.
+ */
+#ifdef __HAVE_ARCH_STRLEN
+size_t strlen(const char * s)
+{
+ int d0; /* dummy output: edi is modified by the asm */
+ int res;
+ asm volatile( "repne\n\t" /* scan for al = 0, counting ecx down from 0xffffffff */
+ "scasb\n\t"
+ "notl %0\n\t" /* ~ecx = bytes scanned, including the NUL */
+ "decl %0" /* drop the NUL from the count */
+ :"=c" (res), "=&D" (d0)
+ :"1" (s),"a" (0), "0" (0xffffffffu)
+ :"memory");
+ return res;
+}
+EXPORT_SYMBOL(strlen);
+#endif
+/*
+ * memchr - find the first byte equal to (char)c in the count bytes at cs.
+ * Returns a pointer to it, or NULL if there is none or count is 0.
+ */
+#ifdef __HAVE_ARCH_MEMCHR
+void *memchr(const void *cs,int c,size_t count)
+{
+ int d0; /* dummy output: ecx is modified by the asm */
+ void *res;
+ if (!count)
+ return NULL;
+ asm volatile( "repne\n\t" /* scan at most ecx bytes for al */
+ "scasb\n\t"
+ "je 1f\n\t" /* found: edi is one past the match */
+ "movl $1,%0\n" /* not found: edi = 1 so that the result below is 0 */
+ "1:\tdecl %0"
+ :"=D" (res), "=&c" (d0)
+ :"a" (c),"0" (cs),"1" (count)
+ :"memory");
+ return res;
+}
+EXPORT_SYMBOL(memchr);
+#endif
+/*
+ * memscan - find the first byte equal to (char)c in the size bytes at addr.
+ * Returns a pointer to it, or addr + size (one past the area) if there is
+ * none. Returns addr unchanged when size is 0.
+ */
+#ifdef __HAVE_ARCH_MEMSCAN
+void *memscan(void * addr, int c, size_t size)
+{
+ if (!size)
+ return addr;
+ asm volatile("repnz; scasb\n\t" /* scan at most ecx bytes for al */
+ "jnz 1f\n\t" /* not found: edi is already one past the area */
+ "dec %%edi\n" /* found: step back onto the match */
+ "1:"
+ : "=D" (addr), "=c" (size)
+ : "0" (addr), "1" (size), "a" (c)
+ : "memory");
+ return addr;
+}
+EXPORT_SYMBOL(memscan);
+#endif
+/*
+ * strnlen - return the length of s, looking at no more than count bytes.
+ * The result is count if no NUL is found within the first count bytes.
+ */
+#ifdef __HAVE_ARCH_STRNLEN
+size_t strnlen(const char *s, size_t count)
+{
+ int d0; /* dummy output: edx is modified by the asm */
+ int res;
+ asm volatile( "movl %2,%0\n\t" /* eax = running pointer, starting at s */
+ "jmp 2f\n"
+ "1:\tcmpb $0,(%0)\n\t"
+ "je 3f\n\t" /* hit the NUL */
+ "incl %0\n"
+ "2:\tdecl %1\n\t"
+ "cmpl $-1,%1\n\t" /* count bytes examined? */
+ "jne 1b\n"
+ "3:\tsubl %2,%0" /* length = running pointer - s */
+ :"=a" (res), "=&d" (d0)
+ :"c" (s),"1" (count)
+ :"memory");
+ return res;
+}
+EXPORT_SYMBOL(strnlen);
+#endif
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
new file mode 100644
index 000000000000..a3dafbf59dae
--- /dev/null
+++ b/arch/x86/lib/strstr_32.c
@@ -0,0 +1,31 @@
+#include <linux/string.h>
+/*
+ * strstr - find the first occurrence of the string ct in the string cs.
+ * Returns a pointer to the start of the match, or NULL (0) if cs ends
+ * without one. An empty ct matches at cs.
+ */
+char * strstr(const char * cs,const char * ct)
+{
+int d0, d1; /* dummy outputs: ecx and esi are modified by the asm */
+register char * __res;
+__asm__ __volatile__(
+ "movl %6,%%edi\n\t" /* edi = ct */
+ "repne\n\t" /* scan ct for its NUL (al = 0, ecx = 0xffffffff) */
+ "scasb\n\t"
+ "notl %%ecx\n\t"
+ "decl %%ecx\n\t" /* NOTE! This also sets Z if searchstring='' */
+ "movl %%ecx,%%edx\n" /* edx = strlen(ct), kept for every attempt */
+ "1:\tmovl %6,%%edi\n\t" /* restart at the beginning of ct */
+ "movl %%esi,%%eax\n\t" /* eax = candidate start inside cs */
+ "movl %%edx,%%ecx\n\t"
+ "repe\n\t" /* compare strlen(ct) bytes */
+ "cmpsb\n\t"
+ "je 2f\n\t" /* also works for empty string, see above */
+ "xchgl %%eax,%%esi\n\t" /* esi back to the candidate start */
+ "incl %%esi\n\t" /* next candidate is one byte further */
+ "cmpb $0,-1(%%eax)\n\t" /* NOTE(review): appears to test the last byte compared in cs for NUL -- confirm */
+ "jne 1b\n\t"
+ "xorl %%eax,%%eax\n\t" /* no match: return 0 */
+ "2:"
+ :"=a" (__res), "=&c" (d0), "=&S" (d1)
+ :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
+ :"dx", "di");
+return __res;
+}
+
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
new file mode 100644
index 000000000000..55e586d352d3
--- /dev/null
+++ b/arch/x86/lib/thunk_64.S
@@ -0,0 +1,67 @@
+/*
+ * Save registers before calling assembly functions. This avoids
+ * disturbance of register allocation in some inline assembly constructs.
+ * Copyright 2001,2002 by Andi Kleen, SuSE Labs.
+ * Subject to the GNU public license, v.2. No warranty of any kind.
+ */
+
+ #include <linux/linkage.h>
+ #include <asm/dwarf2.h>
+ #include <asm/calling.h>
+ #include <asm/rwlock.h>
+
+ /* rdi: arg1 ... normal C conventions. rax is saved/restored. */
+ .macro thunk name,func /* emits a global symbol \name that wraps a call to \func */
+ .globl \name
+\name:
+ CFI_STARTPROC
+ SAVE_ARGS /* from asm/calling.h; presumably saves the argument registers -- confirm */
+ call \func
+ jmp restore /* shared epilogue defined at the end of this file */
+ CFI_ENDPROC
+ .endm
+
+ /* rdi: arg1 ... normal C conventions. rax is passed from C. */
+ .macro thunk_retrax name,func /* like thunk, but leaves through restore_norax */
+ .globl \name
+\name:
+ CFI_STARTPROC
+ SAVE_ARGS /* from asm/calling.h; presumably saves the argument registers -- confirm */
+ call \func
+ jmp restore_norax /* epilogue that uses RESTORE_ARGS 1; presumably keeps \func's %rax -- confirm */
+ CFI_ENDPROC
+ .endm
+
+
+ .section .sched.text /* all thunks below are emitted into this section */
+#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
+ thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
+ thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
+ thunk rwsem_wake_thunk,rwsem_wake
+ thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
+#endif
+
+ thunk __down_failed,__down
+ thunk_retrax __down_failed_interruptible,__down_interruptible /* thunk_retrax: exits via restore_norax */
+ thunk_retrax __down_failed_trylock,__down_trylock /* thunk_retrax: exits via restore_norax */
+ thunk __up_wakeup,__up
+
+#ifdef CONFIG_TRACE_IRQFLAGS
+ thunk trace_hardirqs_on_thunk,trace_hardirqs_on
+ thunk trace_hardirqs_off_thunk,trace_hardirqs_off
+#endif
+
+ /* SAVE_ARGS below is used only for the .cfi directives it contains. */
+ CFI_STARTPROC
+ SAVE_ARGS
+restore: /* jump target of every "thunk" expansion */
+ RESTORE_ARGS
+ ret
+ CFI_ENDPROC
+
+ CFI_STARTPROC
+ SAVE_ARGS /* placed before the label, so code entering at restore_norax never executes it */
+restore_norax: /* jump target of every "thunk_retrax" expansion */
+ RESTORE_ARGS 1 /* NOTE(review): the argument presumably tells RESTORE_ARGS to leave %rax alone -- confirm in asm/calling.h */
+ ret
+ CFI_ENDPROC
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
new file mode 100644
index 000000000000..9f38b12b4af1
--- /dev/null
+++ b/arch/x86/lib/usercopy_32.c
@@ -0,0 +1,882 @@
+/*
+ * User address space access functions.
+ * The non inlined parts of asm-i386/uaccess.h are here.
+ *
+ * Copyright 1997 Andi Kleen <ak@muc.de>
+ * Copyright 1997 Linus Torvalds
+ */
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/backing-dev.h>
+#include <linux/interrupt.h>
+#include <asm/uaccess.h>
+#include <asm/mmx.h>
+
+/*
+ * Decide whether the generic rep-movs copy may be used for this transfer.
+ *
+ * With CONFIG_X86_INTEL_USERCOPY the answer is 0 for copies of 64 bytes or
+ * more whose two addresses differ in any bit selected by movsl_mask.mask;
+ * in every other case (and always without that option) the answer is 1.
+ */
+static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
+{
+	int ok = 1;
+
+#ifdef CONFIG_X86_INTEL_USERCOPY
+	if (n >= 64 && ((a1 ^ a2) & movsl_mask.mask) != 0)
+		ok = 0;
+#endif
+	return ok;
+}
+/* Pointer-friendly wrapper: casts both addresses to unsigned long. */
+#define movsl_is_ok(a1,a2,n) \
+	__movsl_is_ok((unsigned long)(a1),(unsigned long)(a2),(n))
+
+/*
+ * Copy a NUL-terminated string from userspace.
+ *
+ * A byte-at-a-time lodsb/stosb loop that stops after a NUL byte has been
+ * copied or after "count" bytes. On the normal exit "res" is the initial
+ * count minus the remaining count, i.e. the number of bytes copied before
+ * the NUL, or "count" itself if no NUL was found. A zero count skips the
+ * loop and leaves "res" equal to count.
+ *
+ * Only the lodsb at label 0 has an exception table entry; if it faults,
+ * the fixup at label 3 stores -EFAULT in "res".
+ *
+ * "count" is an output operand as well, so the variable passed in by the
+ * caller is modified.
+ */
+
+#define __do_strncpy_from_user(dst,src,count,res) \
+do { \
+ int __d0, __d1, __d2; \
+ might_sleep(); \
+ __asm__ __volatile__( \
+ " testl %1,%1\n" \
+ " jz 2f\n" \
+ "0: lodsb\n" \
+ " stosb\n" \
+ " testb %%al,%%al\n" \
+ " jz 1f\n" \
+ " decl %1\n" \
+ " jnz 0b\n" \
+ "1: subl %1,%0\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: movl %5,%0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ ".section __ex_table,\"a\"\n" \
+ " .align 4\n" \
+ " .long 0b,3b\n" \
+ ".previous" \
+ : "=d"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
+ "=&D" (__d2) \
+ : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
+ : "memory"); \
+} while (0)
+
+/**
+ * __strncpy_from_user: - Copy a NUL terminated string from userspace, with less checking.
+ * @dst:   Kernel-space destination buffer; must hold at least @count bytes.
+ * @src:   User-space source address.
+ * @count: Upper bound on the bytes copied, trailing NUL included.
+ *
+ * No access_ok() check is made here; the caller must already have
+ * validated the source block.
+ *
+ * Returns the string length (trailing NUL not counted) on success.
+ *
+ * Returns @count, after copying exactly @count bytes, if no NUL was seen
+ * within @count bytes.
+ *
+ * Returns -EFAULT if reading userspace faulted (part of the data may
+ * already have been copied).
+ */
+long
+__strncpy_from_user(char *dst, const char __user *src, long count)
+{
+	long copied;
+
+	__do_strncpy_from_user(dst, src, count, copied);
+	return copied;
+}
+EXPORT_SYMBOL(__strncpy_from_user);
+
+/**
+ * strncpy_from_user: - Copy a NUL terminated string from userspace.
+ * @dst:   Kernel-space destination buffer; must hold at least @count bytes.
+ * @src:   User-space source address.
+ * @count: Upper bound on the bytes copied, trailing NUL included.
+ *
+ * The first byte of @src is checked with access_ok() before any copying
+ * is attempted.
+ *
+ * Returns the string length (trailing NUL not counted) on success.
+ *
+ * Returns @count, after copying exactly @count bytes, if no NUL was seen
+ * within @count bytes.
+ *
+ * Returns -EFAULT if the address check fails or reading userspace faulted
+ * (in the latter case part of the data may already have been copied).
+ */
+long
+strncpy_from_user(char *dst, const char __user *src, long count)
+{
+	long ret;
+
+	if (!access_ok(VERIFY_READ, src, 1))
+		return -EFAULT;
+
+	__do_strncpy_from_user(dst, src, count, ret);
+	return ret;
+}
+EXPORT_SYMBOL(strncpy_from_user);
+
+/*
+ * Zero Userspace
+ *
+ * Stores size / 4 zero dwords with rep stosl, then the remaining size & 3
+ * zero bytes with rep stosb, starting at "addr". "size" is an output
+ * operand: afterwards it holds whatever is left in ecx, which is 0 once
+ * both string instructions have run to completion.
+ *
+ * A fault in the dword phase (label 0) goes to fixup label 3, which
+ * computes (size & 3) + 4 * remaining dwords; a fault in the byte phase
+ * (label 1) exits directly with the remaining byte count in ecx.
+ *
+ * The asm statement carries no "memory" clobber.
+ */
+
+#define __do_clear_user(addr,size) \
+do { \
+ int __d0; \
+ might_sleep(); \
+ __asm__ __volatile__( \
+ "0: rep; stosl\n" \
+ " movl %2,%0\n" \
+ "1: rep; stosb\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: lea 0(%2,%0,4),%0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ ".section __ex_table,\"a\"\n" \
+ " .align 4\n" \
+ " .long 0b,3b\n" \
+ " .long 1b,2b\n" \
+ ".previous" \
+ : "=&c"(size), "=&D" (__d0) \
+ : "r"(size & 3), "0"(size / 4), "1"(addr), "a"(0)); \
+} while (0)
+
+/**
+ * clear_user: - Zero a block of memory in user space.
+ * @to: User-space address where zeroing starts.
+ * @n:  Byte count to zero.
+ *
+ * The block is validated with access_ok() first; if that check fails
+ * nothing is written and @n is returned unchanged.
+ *
+ * Returns number of bytes that could not be cleared.
+ * On success, this will be zero.
+ */
+unsigned long
+clear_user(void __user *to, unsigned long n)
+{
+	might_sleep();
+
+	if (!access_ok(VERIFY_WRITE, to, n))
+		return n;
+
+	__do_clear_user(to, n);
+	return n;
+}
+EXPORT_SYMBOL(clear_user);
+
+/**
+ * __clear_user: - Zero a block of memory in user space, with less checking.
+ * @to: User-space address where zeroing starts.
+ * @n:  Byte count to zero.
+ *
+ * No access_ok() check is made here; the caller must already have
+ * validated the block.
+ *
+ * Returns number of bytes that could not be cleared.
+ * On success, this will be zero.
+ */
+unsigned long
+__clear_user(void __user *to, unsigned long n)
+{
+	unsigned long left = n;
+
+	__do_clear_user(to, left);
+	return left;
+}
+EXPORT_SYMBOL(__clear_user);
+
+/**
+ * strnlen_user: - Get the size of a string in user space.
+ * @s: The string to measure.
+ * @n: The maximum valid length
+ *
+ * Get the size of a NUL-terminated string in user space.
+ *
+ * Returns the size of the string INCLUDING the terminating NUL.
+ * On exception, returns 0.
+ * If the string is too long, returns a value greater than @n.
+ *
+ * Implementation notes: mask is -__addr_ok(@s), presumably all ones for an
+ * acceptable address and 0 otherwise (confirm against asm/uaccess.h). It
+ * limits the scan count (ecx = @n & mask) and is ANDed into the result, so
+ * a zero mask yields 0. repne scasb searches for the zero byte held in al;
+ * the result is the number of bytes scanned, plus one if no NUL was found
+ * (setne). A fault during the scan goes to fixup label 2, which produces 0.
+ * @n == 0 takes the label 3 path and produces 1 before masking.
+ */
+long strnlen_user(const char __user *s, long n)
+{
+ unsigned long mask = -__addr_ok(s);
+ unsigned long res, tmp;
+
+ might_sleep();
+
+ __asm__ __volatile__(
+ " testl %0, %0\n"
+ " jz 3f\n"
+ " andl %0,%%ecx\n"
+ "0: repne; scasb\n"
+ " setne %%al\n"
+ " subl %%ecx,%0\n"
+ " addl %0,%%eax\n"
+ "1:\n"
+ ".section .fixup,\"ax\"\n"
+ "2: xorl %%eax,%%eax\n"
+ " jmp 1b\n"
+ "3: movb $1,%%al\n"
+ " jmp 1b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 0b,2b\n"
+ ".previous"
+ :"=r" (n), "=D" (s), "=a" (res), "=c" (tmp)
+ :"0" (n), "1" (s), "2" (0), "3" (mask)
+ :"cc");
+ return res & mask;
+}
+EXPORT_SYMBOL(strnlen_user);
+
+#ifdef CONFIG_X86_INTEL_USERCOPY
+static unsigned long
+__copy_user_intel(void __user *to, const void *from, unsigned long size)
+{ /* Unrolled copy, 64 bytes per loop pass; returns the count left in ecx. */
+ int d0, d1; /* dummy outputs tied to the edi (to) and esi (from) operands */
+ __asm__ __volatile__(
+ " .align 2,0x90\n"
+ "1: movl 32(%4), %%eax\n" /* %4 = from; eax is overwritten at 3: before any use */
+ " cmpl $67, %0\n"
+ " jbe 3f\n"
+ "2: movl 64(%4), %%eax\n" /* likewise only touches the source further ahead */
+ " .align 2,0x90\n"
+ "3: movl 0(%4), %%eax\n"
+ "4: movl 4(%4), %%edx\n"
+ "5: movl %%eax, 0(%3)\n"
+ "6: movl %%edx, 4(%3)\n"
+ "7: movl 8(%4), %%eax\n"
+ "8: movl 12(%4),%%edx\n"
+ "9: movl %%eax, 8(%3)\n"
+ "10: movl %%edx, 12(%3)\n"
+ "11: movl 16(%4), %%eax\n"
+ "12: movl 20(%4), %%edx\n"
+ "13: movl %%eax, 16(%3)\n"
+ "14: movl %%edx, 20(%3)\n"
+ "15: movl 24(%4), %%eax\n"
+ "16: movl 28(%4), %%edx\n"
+ "17: movl %%eax, 24(%3)\n"
+ "18: movl %%edx, 28(%3)\n"
+ "19: movl 32(%4), %%eax\n"
+ "20: movl 36(%4), %%edx\n"
+ "21: movl %%eax, 32(%3)\n"
+ "22: movl %%edx, 36(%3)\n"
+ "23: movl 40(%4), %%eax\n"
+ "24: movl 44(%4), %%edx\n"
+ "25: movl %%eax, 40(%3)\n"
+ "26: movl %%edx, 44(%3)\n"
+ "27: movl 48(%4), %%eax\n"
+ "28: movl 52(%4), %%edx\n"
+ "29: movl %%eax, 48(%3)\n"
+ "30: movl %%edx, 52(%3)\n"
+ "31: movl 56(%4), %%eax\n"
+ "32: movl 60(%4), %%edx\n"
+ "33: movl %%eax, 56(%3)\n"
+ "34: movl %%edx, 60(%3)\n"
+ " addl $-64, %0\n"
+ " addl $64, %4\n"
+ " addl $64, %3\n"
+ " cmpl $63, %0\n"
+ " ja 1b\n"
+ "35: movl %0, %%eax\n" /* tail: at most 63 bytes remain */
+ " shrl $2, %0\n"
+ " andl $3, %%eax\n"
+ " cld\n"
+ "99: rep; movsl\n" /* remaining whole dwords */
+ "36: movl %%eax, %0\n"
+ "37: rep; movsb\n" /* final 0..3 bytes */
+ "100:\n"
+ ".section .fixup,\"ax\"\n"
+ "101: lea 0(%%eax,%0,4),%0\n" /* fault in rep movsl: left = eax + 4 * ecx */
+ " jmp 100b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 1b,100b\n" /* faults at labels 1..37 exit with ecx as it stands */
+ " .long 2b,100b\n"
+ " .long 3b,100b\n"
+ " .long 4b,100b\n"
+ " .long 5b,100b\n"
+ " .long 6b,100b\n"
+ " .long 7b,100b\n"
+ " .long 8b,100b\n"
+ " .long 9b,100b\n"
+ " .long 10b,100b\n"
+ " .long 11b,100b\n"
+ " .long 12b,100b\n"
+ " .long 13b,100b\n"
+ " .long 14b,100b\n"
+ " .long 15b,100b\n"
+ " .long 16b,100b\n"
+ " .long 17b,100b\n"
+ " .long 18b,100b\n"
+ " .long 19b,100b\n"
+ " .long 20b,100b\n"
+ " .long 21b,100b\n"
+ " .long 22b,100b\n"
+ " .long 23b,100b\n"
+ " .long 24b,100b\n"
+ " .long 25b,100b\n"
+ " .long 26b,100b\n"
+ " .long 27b,100b\n"
+ " .long 28b,100b\n"
+ " .long 29b,100b\n"
+ " .long 30b,100b\n"
+ " .long 31b,100b\n"
+ " .long 32b,100b\n"
+ " .long 33b,100b\n"
+ " .long 34b,100b\n"
+ " .long 35b,100b\n"
+ " .long 36b,100b\n"
+ " .long 37b,100b\n"
+ " .long 99b,101b\n"
+ ".previous"
+ : "=&c"(size), "=&D" (d0), "=&S" (d1)
+ : "1"(to), "2"(from), "0"(size)
+ : "eax", "edx", "memory");
+ return size; /* 0 once the final rep movsb has completed */
+}
+
+static unsigned long
+__copy_user_zeroing_intel(void *to, const void __user *from, unsigned long size)
+{ /* Unrolled 64-byte copy whose fault path zero-fills the destination. */
+ int d0, d1; /* dummy outputs tied to the edi (to) and esi (from) operands */
+ __asm__ __volatile__(
+ " .align 2,0x90\n"
+ "0: movl 32(%4), %%eax\n" /* %4 = from; eax is overwritten at 2: before any use */
+ " cmpl $67, %0\n"
+ " jbe 2f\n"
+ "1: movl 64(%4), %%eax\n"
+ " .align 2,0x90\n"
+ "2: movl 0(%4), %%eax\n" /* only the loads from the source are labelled */
+ "21: movl 4(%4), %%edx\n"
+ " movl %%eax, 0(%3)\n"
+ " movl %%edx, 4(%3)\n"
+ "3: movl 8(%4), %%eax\n"
+ "31: movl 12(%4),%%edx\n"
+ " movl %%eax, 8(%3)\n"
+ " movl %%edx, 12(%3)\n"
+ "4: movl 16(%4), %%eax\n"
+ "41: movl 20(%4), %%edx\n"
+ " movl %%eax, 16(%3)\n"
+ " movl %%edx, 20(%3)\n"
+ "10: movl 24(%4), %%eax\n"
+ "51: movl 28(%4), %%edx\n"
+ " movl %%eax, 24(%3)\n"
+ " movl %%edx, 28(%3)\n"
+ "11: movl 32(%4), %%eax\n"
+ "61: movl 36(%4), %%edx\n"
+ " movl %%eax, 32(%3)\n"
+ " movl %%edx, 36(%3)\n"
+ "12: movl 40(%4), %%eax\n"
+ "71: movl 44(%4), %%edx\n"
+ " movl %%eax, 40(%3)\n"
+ " movl %%edx, 44(%3)\n"
+ "13: movl 48(%4), %%eax\n"
+ "81: movl 52(%4), %%edx\n"
+ " movl %%eax, 48(%3)\n"
+ " movl %%edx, 52(%3)\n"
+ "14: movl 56(%4), %%eax\n"
+ "91: movl 60(%4), %%edx\n"
+ " movl %%eax, 56(%3)\n"
+ " movl %%edx, 60(%3)\n"
+ " addl $-64, %0\n"
+ " addl $64, %4\n"
+ " addl $64, %3\n"
+ " cmpl $63, %0\n"
+ " ja 0b\n"
+ "5: movl %0, %%eax\n" /* tail: at most 63 bytes remain */
+ " shrl $2, %0\n"
+ " andl $3, %%eax\n"
+ " cld\n"
+ "6: rep; movsl\n"
+ " movl %%eax,%0\n"
+ "7: rep; movsb\n"
+ "8:\n"
+ ".section .fixup,\"ax\"\n"
+ "9: lea 0(%%eax,%0,4),%0\n" /* fault in rep movsl: left = eax + 4 * ecx */
+ "16: pushl %0\n" /* zero-fill ecx bytes at edi; ecx and eax are preserved */
+ " pushl %%eax\n"
+ " xorl %%eax,%%eax\n"
+ " rep; stosb\n"
+ " popl %%eax\n"
+ " popl %0\n"
+ " jmp 8b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 0b,16b\n"
+ " .long 1b,16b\n"
+ " .long 2b,16b\n"
+ " .long 21b,16b\n"
+ " .long 3b,16b\n"
+ " .long 31b,16b\n"
+ " .long 4b,16b\n"
+ " .long 41b,16b\n"
+ " .long 10b,16b\n"
+ " .long 51b,16b\n"
+ " .long 11b,16b\n"
+ " .long 61b,16b\n"
+ " .long 12b,16b\n"
+ " .long 71b,16b\n"
+ " .long 13b,16b\n"
+ " .long 81b,16b\n"
+ " .long 14b,16b\n"
+ " .long 91b,16b\n"
+ " .long 6b,9b\n"
+ " .long 7b,16b\n"
+ ".previous"
+ : "=&c"(size), "=&D" (d0), "=&S" (d1)
+ : "1"(to), "2"(from), "0"(size)
+ : "eax", "edx", "memory");
+ return size; /* count left in ecx; 0 once the final rep movsb has completed */
+}
+
+/*
+ * Non Temporal Hint version of __copy_user_zeroing_intel. It is cache aware.
+ * hyoshiok@miraclelinux.com
+ *
+ * The 64-byte unrolled loop stores with movnti and is followed by an
+ * sfence; the tail (at most 63 bytes) uses rep movsl / rep movsb. On a
+ * fault the fixup zero-fills ecx bytes at the destination pointer with
+ * rep stosb. Returns the byte count left in ecx, which is 0 once the final
+ * rep movsb has completed.
+ */
+
+static unsigned long __copy_user_zeroing_intel_nocache(void *to,
+ const void __user *from, unsigned long size)
+{
+ int d0, d1;
+
+ __asm__ __volatile__(
+ " .align 2,0x90\n"
+ "0: movl 32(%4), %%eax\n"
+ " cmpl $67, %0\n"
+ " jbe 2f\n"
+ "1: movl 64(%4), %%eax\n"
+ " .align 2,0x90\n"
+ "2: movl 0(%4), %%eax\n"
+ "21: movl 4(%4), %%edx\n"
+ " movnti %%eax, 0(%3)\n"
+ " movnti %%edx, 4(%3)\n"
+ "3: movl 8(%4), %%eax\n"
+ "31: movl 12(%4),%%edx\n"
+ " movnti %%eax, 8(%3)\n"
+ " movnti %%edx, 12(%3)\n"
+ "4: movl 16(%4), %%eax\n"
+ "41: movl 20(%4), %%edx\n"
+ " movnti %%eax, 16(%3)\n"
+ " movnti %%edx, 20(%3)\n"
+ "10: movl 24(%4), %%eax\n"
+ "51: movl 28(%4), %%edx\n"
+ " movnti %%eax, 24(%3)\n"
+ " movnti %%edx, 28(%3)\n"
+ "11: movl 32(%4), %%eax\n"
+ "61: movl 36(%4), %%edx\n"
+ " movnti %%eax, 32(%3)\n"
+ " movnti %%edx, 36(%3)\n"
+ "12: movl 40(%4), %%eax\n"
+ "71: movl 44(%4), %%edx\n"
+ " movnti %%eax, 40(%3)\n"
+ " movnti %%edx, 44(%3)\n"
+ "13: movl 48(%4), %%eax\n"
+ "81: movl 52(%4), %%edx\n"
+ " movnti %%eax, 48(%3)\n"
+ " movnti %%edx, 52(%3)\n"
+ "14: movl 56(%4), %%eax\n"
+ "91: movl 60(%4), %%edx\n"
+ " movnti %%eax, 56(%3)\n"
+ " movnti %%edx, 60(%3)\n"
+ " addl $-64, %0\n"
+ " addl $64, %4\n"
+ " addl $64, %3\n"
+ " cmpl $63, %0\n"
+ " ja 0b\n"
+ " sfence \n"
+ "5: movl %0, %%eax\n"
+ " shrl $2, %0\n"
+ " andl $3, %%eax\n"
+ " cld\n"
+ "6: rep; movsl\n"
+ " movl %%eax,%0\n"
+ "7: rep; movsb\n"
+ "8:\n"
+ ".section .fixup,\"ax\"\n"
+ "9: lea 0(%%eax,%0,4),%0\n" /* fault in rep movsl: left = eax + 4 * ecx */
+ "16: pushl %0\n" /* zero-fill ecx bytes at edi; ecx and eax are preserved */
+ " pushl %%eax\n"
+ " xorl %%eax,%%eax\n"
+ " rep; stosb\n"
+ " popl %%eax\n"
+ " popl %0\n"
+ " jmp 8b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 0b,16b\n"
+ " .long 1b,16b\n"
+ " .long 2b,16b\n"
+ " .long 21b,16b\n"
+ " .long 3b,16b\n"
+ " .long 31b,16b\n"
+ " .long 4b,16b\n"
+ " .long 41b,16b\n"
+ " .long 10b,16b\n"
+ " .long 51b,16b\n"
+ " .long 11b,16b\n"
+ " .long 61b,16b\n"
+ " .long 12b,16b\n"
+ " .long 71b,16b\n"
+ " .long 13b,16b\n"
+ " .long 81b,16b\n"
+ " .long 14b,16b\n"
+ " .long 91b,16b\n"
+ " .long 6b,9b\n"
+ " .long 7b,16b\n"
+ ".previous"
+ : "=&c"(size), "=&D" (d0), "=&S" (d1)
+ : "1"(to), "2"(from), "0"(size)
+ : "eax", "edx", "memory");
+ return size;
+}
+
+static unsigned long __copy_user_intel_nocache(void *to,
+ const void __user *from, unsigned long size)
+{ /* movnti copy, 64 bytes per loop pass; the fault path does not zero-fill. */
+ int d0, d1; /* dummy outputs tied to the edi (to) and esi (from) operands */
+
+ __asm__ __volatile__(
+ " .align 2,0x90\n"
+ "0: movl 32(%4), %%eax\n" /* %4 = from; eax is overwritten at 2: before any use */
+ " cmpl $67, %0\n"
+ " jbe 2f\n"
+ "1: movl 64(%4), %%eax\n"
+ " .align 2,0x90\n"
+ "2: movl 0(%4), %%eax\n"
+ "21: movl 4(%4), %%edx\n"
+ " movnti %%eax, 0(%3)\n"
+ " movnti %%edx, 4(%3)\n"
+ "3: movl 8(%4), %%eax\n"
+ "31: movl 12(%4),%%edx\n"
+ " movnti %%eax, 8(%3)\n"
+ " movnti %%edx, 12(%3)\n"
+ "4: movl 16(%4), %%eax\n"
+ "41: movl 20(%4), %%edx\n"
+ " movnti %%eax, 16(%3)\n"
+ " movnti %%edx, 20(%3)\n"
+ "10: movl 24(%4), %%eax\n"
+ "51: movl 28(%4), %%edx\n"
+ " movnti %%eax, 24(%3)\n"
+ " movnti %%edx, 28(%3)\n"
+ "11: movl 32(%4), %%eax\n"
+ "61: movl 36(%4), %%edx\n"
+ " movnti %%eax, 32(%3)\n"
+ " movnti %%edx, 36(%3)\n"
+ "12: movl 40(%4), %%eax\n"
+ "71: movl 44(%4), %%edx\n"
+ " movnti %%eax, 40(%3)\n"
+ " movnti %%edx, 44(%3)\n"
+ "13: movl 48(%4), %%eax\n"
+ "81: movl 52(%4), %%edx\n"
+ " movnti %%eax, 48(%3)\n"
+ " movnti %%edx, 52(%3)\n"
+ "14: movl 56(%4), %%eax\n"
+ "91: movl 60(%4), %%edx\n"
+ " movnti %%eax, 56(%3)\n"
+ " movnti %%edx, 60(%3)\n"
+ " addl $-64, %0\n"
+ " addl $64, %4\n"
+ " addl $64, %3\n"
+ " cmpl $63, %0\n"
+ " ja 0b\n"
+ " sfence \n"
+ "5: movl %0, %%eax\n" /* tail: at most 63 bytes remain */
+ " shrl $2, %0\n"
+ " andl $3, %%eax\n"
+ " cld\n"
+ "6: rep; movsl\n"
+ " movl %%eax,%0\n"
+ "7: rep; movsb\n"
+ "8:\n"
+ ".section .fixup,\"ax\"\n"
+ "9: lea 0(%%eax,%0,4),%0\n" /* fault in rep movsl: left = eax + 4 * ecx */
+ "16: jmp 8b\n" /* all other faults: exit with ecx as it stands */
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 0b,16b\n"
+ " .long 1b,16b\n"
+ " .long 2b,16b\n"
+ " .long 21b,16b\n"
+ " .long 3b,16b\n"
+ " .long 31b,16b\n"
+ " .long 4b,16b\n"
+ " .long 41b,16b\n"
+ " .long 10b,16b\n"
+ " .long 51b,16b\n"
+ " .long 11b,16b\n"
+ " .long 61b,16b\n"
+ " .long 12b,16b\n"
+ " .long 71b,16b\n"
+ " .long 13b,16b\n"
+ " .long 81b,16b\n"
+ " .long 14b,16b\n"
+ " .long 91b,16b\n"
+ " .long 6b,9b\n"
+ " .long 7b,16b\n"
+ ".previous"
+ : "=&c"(size), "=&D" (d0), "=&S" (d1)
+ : "1"(to), "2"(from), "0"(size)
+ : "eax", "edx", "memory");
+ return size; /* count left in ecx; 0 once the final rep movsb has completed */
+}
+
+#else
+
+/*
+ * Leave these declared but undefined. There should not be any references to
+ * them.
+ */
+unsigned long __copy_user_zeroing_intel(void *to, const void __user *from,
+ unsigned long size);
+unsigned long __copy_user_intel(void __user *to, const void *from,
+ unsigned long size);
+unsigned long __copy_user_zeroing_intel_nocache(void *to,
+ const void __user *from, unsigned long size);
+#endif /* CONFIG_X86_INTEL_USERCOPY */
+
+/*
+ * Generic arbitrary sized copy.
+ *
+ * __copy_user(to, from, size): a size of 7 or less goes straight to one
+ * rep movsb. Otherwise ((-to) & 7) head bytes are moved with rep movsb,
+ * which brings the destination pointer to an 8-byte boundary; the rest is
+ * copied as rest / 4 dwords with rep movsl followed by rest & 3 bytes with
+ * rep movsb. "size" is an output operand and ends up holding the number of
+ * bytes left uncopied (0 when all three phases complete).
+ *
+ * Fault handling: in the head phase (label 4) the fixup adds the not yet
+ * started remainder to ecx; in the dword phase (label 0) it computes
+ * (rest & 3) + 4 * remaining dwords; in the final byte phase (label 1) ecx
+ * already is the remaining count.
+ *
+ * __copy_user_zeroing(to, from, size): the same copy sequence, but every
+ * fault path also runs label 6, which zero-fills the uncopied byte count
+ * at the destination pointer with rep stosb while preserving ecx and eax.
+ */
+#define __copy_user(to,from,size) \
+do { \
+ int __d0, __d1, __d2; \
+ __asm__ __volatile__( \
+ " cmp $7,%0\n" \
+ " jbe 1f\n" \
+ " movl %1,%0\n" \
+ " negl %0\n" \
+ " andl $7,%0\n" \
+ " subl %0,%3\n" \
+ "4: rep; movsb\n" \
+ " movl %3,%0\n" \
+ " shrl $2,%0\n" \
+ " andl $3,%3\n" \
+ " .align 2,0x90\n" \
+ "0: rep; movsl\n" \
+ " movl %3,%0\n" \
+ "1: rep; movsb\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "5: addl %3,%0\n" \
+ " jmp 2b\n" \
+ "3: lea 0(%3,%0,4),%0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ ".section __ex_table,\"a\"\n" \
+ " .align 4\n" \
+ " .long 4b,5b\n" \
+ " .long 0b,3b\n" \
+ " .long 1b,2b\n" \
+ ".previous" \
+ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
+ : "3"(size), "0"(size), "1"(to), "2"(from) \
+ : "memory"); \
+} while (0)
+
+#define __copy_user_zeroing(to,from,size) \
+do { \
+ int __d0, __d1, __d2; \
+ __asm__ __volatile__( \
+ " cmp $7,%0\n" \
+ " jbe 1f\n" \
+ " movl %1,%0\n" \
+ " negl %0\n" \
+ " andl $7,%0\n" \
+ " subl %0,%3\n" \
+ "4: rep; movsb\n" \
+ " movl %3,%0\n" \
+ " shrl $2,%0\n" \
+ " andl $3,%3\n" \
+ " .align 2,0x90\n" \
+ "0: rep; movsl\n" \
+ " movl %3,%0\n" \
+ "1: rep; movsb\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "5: addl %3,%0\n" \
+ " jmp 6f\n" \
+ "3: lea 0(%3,%0,4),%0\n" \
+ "6: pushl %0\n" \
+ " pushl %%eax\n" \
+ " xorl %%eax,%%eax\n" \
+ " rep; stosb\n" \
+ " popl %%eax\n" \
+ " popl %0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ ".section __ex_table,\"a\"\n" \
+ " .align 4\n" \
+ " .long 4b,5b\n" \
+ " .long 0b,3b\n" \
+ " .long 1b,6b\n" \
+ ".previous" \
+ : "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
+ : "3"(size), "0"(size), "1"(to), "2"(from) \
+ : "memory"); \
+} while (0)
+
+/*
+ * Low-level copy from kernel memory to user space.
+ *
+ * Returns the number of bytes that were NOT copied (0 on full success).
+ *
+ * Unless CONFIG_X86_WP_WORKS_OK is set, CPUs flagged with wp_works_ok == 0
+ * get a manual page-by-page copy for destinations below TASK_SIZE; all
+ * other cases use __copy_user() or __copy_user_intel(), selected by
+ * movsl_is_ok().
+ */
+unsigned long __copy_to_user_ll(void __user *to, const void *from,
+				unsigned long n)
+{
+#ifndef CONFIG_X86_WP_WORKS_OK
+	if (unlikely(boot_cpu_data.wp_works_ok == 0) &&
+			((unsigned long )to) < TASK_SIZE) {
+		/*
+		 * In an atomic section (see
+		 * mm/filemap.c:file_read_actor) report the whole
+		 * length as uncopied so the caller takes the slow path.
+		 */
+		if (in_atomic())
+			return n;
+
+		/*
+		 * The CPU ignores the WP bit for supervisor-mode
+		 * writes, and because of preemption or SMP the page
+		 * tables may change at any moment, so each page is
+		 * looked up and written by hand.
+		 * Manfred <manfred@colorfullife.com>
+		 */
+		while (n) {
+			unsigned long page_off = ((unsigned long)to) % PAGE_SIZE;
+			unsigned long chunk = PAGE_SIZE - page_off;
+			struct page *page;
+			void *kaddr;
+			int got;
+
+			/* Never go past the end of the current page or of the request. */
+			if (chunk > n)
+				chunk = n;
+
+			/* Retry the lookup for as long as init hits -ENOMEM. */
+			for (;;) {
+				down_read(&current->mm->mmap_sem);
+				got = get_user_pages(current, current->mm,
+						(unsigned long )to, 1, 1, 0, &page, NULL);
+
+				if (!(got == -ENOMEM && is_init(current)))
+					break;
+
+				up_read(&current->mm->mmap_sem);
+				congestion_wait(WRITE, HZ/50);
+			}
+
+			if (got != 1) {
+				up_read(&current->mm->mmap_sem);
+				break;
+			}
+
+			kaddr = kmap_atomic(page, KM_USER0);
+			memcpy(kaddr + page_off, from, chunk);
+			kunmap_atomic(kaddr, KM_USER0);
+			set_page_dirty_lock(page);
+			put_page(page);
+			up_read(&current->mm->mmap_sem);
+
+			from += chunk;
+			to += chunk;
+			n -= chunk;
+		}
+		return n;
+	}
+#endif
+	if (!movsl_is_ok(to, from, n))
+		return __copy_user_intel(to, from, n);
+
+	__copy_user(to, from, n);
+	return n;
+}
+EXPORT_SYMBOL(__copy_to_user_ll);
+
+/*
+ * Low-level copy from user space using the zeroing copy variants.
+ * Returns the number of bytes that were not copied.
+ */
+unsigned long __copy_from_user_ll(void *to, const void __user *from,
+					unsigned long n)
+{
+	if (!movsl_is_ok(to, from, n))
+		return __copy_user_zeroing_intel(to, from, n);
+
+	__copy_user_zeroing(to, from, n);
+	return n;
+}
+EXPORT_SYMBOL(__copy_from_user_ll);
+
+/*
+ * Low-level copy from user space using the non-zeroing copy variants.
+ * Returns the number of bytes that were not copied.
+ */
+unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from,
+					 unsigned long n)
+{
+	if (!movsl_is_ok(to, from, n))
+		return __copy_user_intel((void __user *)to,
+					 (const void *)from, n);
+
+	__copy_user(to, from, n);
+	return n;
+}
+EXPORT_SYMBOL(__copy_from_user_ll_nozero);
+
+/*
+ * Copy from user space, using the movnti-based zeroing routine when
+ * CONFIG_X86_INTEL_USERCOPY is set, the copy is larger than 64 bytes and
+ * cpu_has_xmm2 is true; otherwise the generic zeroing copy is used.
+ * Returns the number of bytes that were not copied.
+ */
+unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,
+					unsigned long n)
+{
+#ifdef CONFIG_X86_INTEL_USERCOPY
+	if (n > 64 && cpu_has_xmm2)
+		return __copy_user_zeroing_intel_nocache(to, from, n);
+#endif
+	__copy_user_zeroing(to, from, n);
+	return n;
+}
+
+/*
+ * Copy from user space, using the movnti-based non-zeroing routine when
+ * CONFIG_X86_INTEL_USERCOPY is set, the copy is larger than 64 bytes and
+ * cpu_has_xmm2 is true; otherwise the generic copy is used.
+ * Returns the number of bytes that were not copied.
+ */
+unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,
+					unsigned long n)
+{
+#ifdef CONFIG_X86_INTEL_USERCOPY
+	if (n > 64 && cpu_has_xmm2)
+		return __copy_user_intel_nocache(to, from, n);
+#endif
+	__copy_user(to, from, n);
+	return n;
+}
+
+/**
+ * copy_to_user: - Copy a block of data into user space.
+ * @to:   User-space destination address.
+ * @from: Kernel-space source address.
+ * @n:    Byte count to copy.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * The destination range is validated with access_ok() before anything is
+ * written; if that check fails nothing is copied and @n is returned.
+ *
+ * Returns number of bytes that could not be copied.
+ * On success, this will be zero.
+ */
+unsigned long
+copy_to_user(void __user *to, const void *from, unsigned long n)
+{
+	if (!access_ok(VERIFY_WRITE, to, n))
+		return n;
+
+	return __copy_to_user(to, from, n);
+}
+EXPORT_SYMBOL(copy_to_user);
+
+/**
+ * copy_from_user: - Copy a block of data from user space.
+ * @to:   Kernel-space destination address.
+ * @from: User-space source address.
+ * @n:    Byte count to copy.
+ *
+ * Context: User context only. This function may sleep.
+ *
+ * The source range is validated with access_ok() first. If that check
+ * fails, the whole destination buffer is filled with zero bytes and @n is
+ * returned.
+ *
+ * Returns number of bytes that could not be copied.
+ * On success, this will be zero.
+ *
+ * If some data could not be copied, this function will pad the copied
+ * data to the requested size using zero bytes.
+ */
+unsigned long
+copy_from_user(void *to, const void __user *from, unsigned long n)
+{
+	if (!access_ok(VERIFY_READ, from, n)) {
+		memset(to, 0, n);
+		return n;
+	}
+
+	return __copy_from_user(to, from, n);
+}
+EXPORT_SYMBOL(copy_from_user);
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
new file mode 100644
index 000000000000..893d43f838cc
--- /dev/null
+++ b/arch/x86/lib/usercopy_64.c
@@ -0,0 +1,166 @@
+/*
+ * User address space access functions.
+ *
+ * Copyright 1997 Andi Kleen <ak@muc.de>
+ * Copyright 1997 Linus Torvalds
+ * Copyright 2002 Andi Kleen <ak@suse.de>
+ */
+#include <linux/module.h>
+#include <asm/uaccess.h>
+
+/*
+ * Copy a NUL-terminated string from userspace.
+ *
+ * A byte-at-a-time lodsb/stosb loop that stops after a NUL byte has been
+ * copied or after "count" bytes. On the normal exit "res" is the initial
+ * count minus the remaining count, i.e. the number of bytes copied before
+ * the NUL, or "count" itself if no NUL was found. A zero count skips the
+ * loop and leaves "res" equal to count.
+ *
+ * Only the lodsb at label 0 has an exception table entry; if it faults,
+ * the fixup at label 3 stores -EFAULT in "res".
+ *
+ * "count" is an output operand as well, so the variable passed in by the
+ * caller is modified.
+ */
+
+#define __do_strncpy_from_user(dst,src,count,res) \
+do { \
+ long __d0, __d1, __d2; \
+ might_sleep(); \
+ __asm__ __volatile__( \
+ " testq %1,%1\n" \
+ " jz 2f\n" \
+ "0: lodsb\n" \
+ " stosb\n" \
+ " testb %%al,%%al\n" \
+ " jz 1f\n" \
+ " decq %1\n" \
+ " jnz 0b\n" \
+ "1: subq %1,%0\n" \
+ "2:\n" \
+ ".section .fixup,\"ax\"\n" \
+ "3: movq %5,%0\n" \
+ " jmp 2b\n" \
+ ".previous\n" \
+ ".section __ex_table,\"a\"\n" \
+ " .align 8\n" \
+ " .quad 0b,3b\n" \
+ ".previous" \
+ : "=r"(res), "=c"(count), "=&a" (__d0), "=&S" (__d1), \
+ "=&D" (__d2) \
+ : "i"(-EFAULT), "0"(count), "1"(count), "3"(src), "4"(dst) \
+ : "memory"); \
+} while (0)
+
+/*
+ * Copy a NUL-terminated string from user space without an access_ok()
+ * check. Returns the string length (NUL not counted), count if no NUL was
+ * found within count bytes, or -EFAULT if the user read faulted.
+ */
+long
+__strncpy_from_user(char *dst, const char __user *src, long count)
+{
+	long copied;
+
+	__do_strncpy_from_user(dst, src, count, copied);
+	return copied;
+}
+EXPORT_SYMBOL(__strncpy_from_user);
+
+/*
+ * Checked variant of __strncpy_from_user(): the first byte of src must
+ * pass access_ok(), otherwise -EFAULT is returned without copying.
+ */
+long
+strncpy_from_user(char *dst, const char __user *src, long count)
+{
+	if (!access_ok(VERIFY_READ, src, 1))
+		return -EFAULT;
+
+	return __strncpy_from_user(dst, src, count);
+}
+EXPORT_SYMBOL(strncpy_from_user);
+
+/*
+ * Zero Userspace
+ *
+ * Writes size / 8 zero quadwords and then size & 7 zero bytes starting at
+ * addr, without any access_ok() check. Returns the value left in rcx: 0
+ * when both loops complete; after a fault in the quadword loop the fixup
+ * at label 3 computes (size & 7) + 8 * remaining quadwords; after a fault
+ * in the byte loop the remaining byte count is returned as it stands.
+ *
+ * NOTE(review): both loop counters are decremented with the 32-bit
+ * "decl %ecx" although the quadword count is loaded as a 64-bit value --
+ * confirm that callers never pass a size whose size / 8 exceeds 32 bits.
+ */
+
+unsigned long __clear_user(void __user *addr, unsigned long size)
+{
+ long __d0;
+ might_sleep();
+ /* no memory constraint because it doesn't change any memory gcc knows
+ about */
+ asm volatile(
+ " testq %[size8],%[size8]\n"
+ " jz 4f\n"
+ "0: movq %[zero],(%[dst])\n"
+ " addq %[eight],%[dst]\n"
+ " decl %%ecx ; jnz 0b\n"
+ "4: movq %[size1],%%rcx\n"
+ " testl %%ecx,%%ecx\n"
+ " jz 2f\n"
+ "1: movb %b[zero],(%[dst])\n"
+ " incq %[dst]\n"
+ " decl %%ecx ; jnz 1b\n"
+ "2:\n"
+ ".section .fixup,\"ax\"\n"
+ "3: lea 0(%[size1],%[size8],8),%[size8]\n"
+ " jmp 2b\n"
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 8\n"
+ " .quad 0b,3b\n"
+ " .quad 1b,2b\n"
+ ".previous"
+ : [size8] "=c"(size), [dst] "=&D" (__d0)
+ : [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),
+ [zero] "r" (0UL), [eight] "r" (8UL));
+ return size;
+}
+EXPORT_SYMBOL(__clear_user);
+
+/*
+ * Checked variant of __clear_user(): if the range fails access_ok(),
+ * nothing is written and n is returned. Otherwise returns the number of
+ * bytes that could not be cleared.
+ */
+unsigned long clear_user(void __user *to, unsigned long n)
+{
+	if (!access_ok(VERIFY_WRITE, to, n))
+		return n;
+
+	return __clear_user(to, n);
+}
+EXPORT_SYMBOL(clear_user);
+
+/*
+ * Size of a user-space string, terminating NUL included.
+ *
+ * Returns 0 if reading a byte faults, and n + 1 if no NUL shows up among
+ * the bytes at offsets 0 through n.
+ */
+
+long __strnlen_user(const char __user *s, long n)
+{
+	long len;
+	char c;
+
+	for (len = 0; len <= n; len++, s++) {
+		if (__get_user(c, s))
+			return 0;
+		if (c == '\0')
+			return len + 1;
+	}
+	return n + 1;
+}
+EXPORT_SYMBOL(__strnlen_user);
+
+/*
+ * Checked variant of __strnlen_user().
+ *
+ * Only the first byte of s is validated with access_ok(), the same way
+ * strncpy_from_user() validates its source. n is merely an upper bound on
+ * the length: range-checking all n bytes would reject a short, perfectly
+ * valid string that sits close to the top of the user address range
+ * whenever the caller passes a generous limit. Bytes after the first are
+ * read through __get_user(), whose failure makes __strnlen_user() return 0.
+ * NOTE(review): this relies on reads past the end of user space faulting
+ * rather than reaching kernel memory -- confirm for the address layout.
+ *
+ * Returns the string size including the NUL, 0 on a bad address or fault,
+ * and a value greater than n if the string is too long.
+ */
+long strnlen_user(const char __user *s, long n)
+{
+	if (!access_ok(VERIFY_READ, s, 1))
+		return 0;
+	return __strnlen_user(s, n);
+}
+EXPORT_SYMBOL(strnlen_user);
+
+/*
+ * Size of a user-space string, terminating NUL included, with no upper
+ * bound. Each byte is fetched with get_user(); returns 0 as soon as one
+ * of those reads fails.
+ */
+long strlen_user(const char __user *s)
+{
+	long len = 0;
+	char c;
+
+	while (!get_user(c, s)) {
+		if (c == '\0')
+			return len + 1;
+		len++;
+		s++;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(strlen_user);
+
+/*
+ * Copy len bytes between two user-space buffers. Both ranges must pass
+ * access_ok(); if either check fails nothing is copied and len is
+ * returned. Otherwise the result of copy_user_generic() is returned.
+ */
+unsigned long copy_in_user(void __user *to, const void __user *from, unsigned len)
+{
+	if (!access_ok(VERIFY_WRITE, to, len) || !access_ok(VERIFY_READ, from, len))
+		return len;
+
+	return copy_user_generic((__force void *)to, (__force void *)from, len);
+}
+EXPORT_SYMBOL(copy_in_user);
+