author	Ma Ling <ling.ma@intel.com>	2010-09-17 03:12:40 +0800
committer	H. Peter Anvin <hpa@zytor.com>	2010-09-24 18:57:11 -0700
commit	3b4b682becdfa9f42321aa024d5cc84f71f06d8c (patch)
tree	807cac0bfbc45d45f156dae742b31f9a66aaa559 /arch/x86/lib/memmove_64.c
parent	x86, mem: Optimize memcpy by avoiding memory false dependece (diff)
x86, mem: Optimize memmove for small size and unaligned cases
The movs instruction combines accesses to speed up copying, but there are two cases to watch for:

1. movs has a long startup latency, so small copies are handled with general mov instructions instead.
2. movs is not good for unaligned data; even when, for example, the source offset is 0x10 and the destination offset is 0x0, that case is avoided and handled with general mov instructions.

Signed-off-by: Ma Ling <ling.ma@intel.com>
LKML-Reference: <1284664360-6138-1-git-send-email-ling.ma@intel.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
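As a rough illustration (not part of the commit), the C sketch below mirrors the strategy the new assembly uses: pick the copy direction from the pointer order, move 32-byte blocks with plain 64-bit loads and stores, and finish the 0-31 leftover bytes with overlapping head/tail moves that are fully loaded before anything is written back. The names memmove_sketch, load8 and store8 are made up for this example, and the sketch leaves out the rep movsq fast path the commit takes for copies of 680 bytes or more when the low address bytes of src and dest match.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* 8-byte load/store helpers standing in for the movq instructions;
 * a fixed-size memcpy compiles down to a single unaligned access. */
static uint64_t load8(const unsigned char *p) { uint64_t v; memcpy(&v, p, 8); return v; }
static void store8(unsigned char *p, uint64_t v) { memcpy(p, &v, 8); }

void *memmove_sketch(void *dest, const void *src, size_t count)
{
	unsigned char *d = dest;
	const unsigned char *s = src;
	uint64_t a, b, c, e;

	if ((uintptr_t)d <= (uintptr_t)s || (uintptr_t)d >= (uintptr_t)s + count) {
		/* Forward copy: gobble 32 bytes per iteration, all loads before stores. */
		while (count >= 32) {
			a = load8(s);      b = load8(s + 8);
			c = load8(s + 16); e = load8(s + 24);
			store8(d, a);      store8(d + 8, b);
			store8(d + 16, c); store8(d + 24, e);
			s += 32; d += 32; count -= 32;
		}
	} else {
		/* dest overlaps past src: copy backward from the tail. */
		s += count; d += count;
		while (count >= 32) {
			s -= 32; d -= 32; count -= 32;
			a = load8(s);      b = load8(s + 8);
			c = load8(s + 16); e = load8(s + 24);
			store8(d, a);      store8(d + 8, b);
			store8(d + 16, c); store8(d + 24, e);
		}
		s -= count; d -= count;	/* back to the uncopied head */
	}

	/* 0..31 leftover bytes: read the head and tail of the region first,
	 * then write both, so overlapping ranges still see the source data. */
	if (count >= 16) {
		a = load8(s);              b = load8(s + 8);
		c = load8(s + count - 16); e = load8(s + count - 8);
		store8(d, a);              store8(d + 8, b);
		store8(d + count - 16, c); store8(d + count - 8, e);
	} else if (count >= 8) {
		a = load8(s); b = load8(s + count - 8);
		store8(d, a); store8(d + count - 8, b);
	} else if (count >= 1) {
		/* The asm narrows further to 4-, 2- and 1-byte moves; a small
		 * bounce buffer keeps the sketch short while staying overlap-safe. */
		unsigned char tmp[8];
		memcpy(tmp, s, count);
		memcpy(d, tmp, count);
	}
	return dest;
}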
Diffstat (limited to 'arch/x86/lib/memmove_64.c')
-rw-r--r--	arch/x86/lib/memmove_64.c | 225
1 file changed, 180 insertions(+), 45 deletions(-)
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
index ecacc4b3d9e5..6d0f0ec41b34 100644
--- a/arch/x86/lib/memmove_64.c
+++ b/arch/x86/lib/memmove_64.c
@@ -8,50 +8,185 @@
#undef memmove
void *memmove(void *dest, const void *src, size_t count)
{
- unsigned long d0, d1, d2, d3;
- if (dest < src) {
- if ((dest + count) < src)
- return memcpy(dest, src, count);
- else
- __asm__ __volatile__(
- "movq %0, %3\n\t"
- "shr $3, %0\n\t"
- "andq $7, %3\n\t"
- "rep\n\t"
- "movsq\n\t"
- "movq %3, %0\n\t"
- "rep\n\t"
- "movsb"
- : "=&c" (d0), "=&S" (d1), "=&D" (d2), "=r" (d3)
- :"0" (count),
- "1" (src),
- "2" (dest)
- :"memory");
- } else {
- if((src + count) < dest)
- return memcpy(dest, src, count);
- else
- __asm__ __volatile__(
- "movq %0, %3\n\t"
- "lea -8(%1, %0), %1\n\t"
- "lea -8(%2, %0), %2\n\t"
- "shr $3, %0\n\t"
- "andq $7, %3\n\t"
- "std\n\t"
- "rep\n\t"
- "movsq\n\t"
- "lea 7(%1), %1\n\t"
- "lea 7(%2), %2\n\t"
- "movq %3, %0\n\t"
- "rep\n\t"
- "movsb\n\t"
- "cld"
- : "=&c" (d0), "=&S" (d1), "=&D" (d2), "=r" (d3)
- :"0" (count),
- "1" (src),
- "2" (dest)
- :"memory");
- }
- return dest;
+ unsigned long d0,d1,d2,d3,d4,d5,d6,d7;
+ char *ret;
+
+ __asm__ __volatile__(
+ /* Handle copies of 32 bytes or more in the loop */
+ "mov %2, %3\n\t"
+ "cmp $0x20, %0\n\t"
+ "jb 1f\n\t"
+
+ /* Decide forward/backward copy mode */
+ "cmp %2, %1\n\t"
+ "jb 2f\n\t"
+
+ /*
+ * movsq instruction has a high startup latency,
+ * so we handle small sizes with general registers.
+ */
+ "cmp $680, %0\n\t"
+ "jb 3f\n\t"
+ /*
+ * movsq instruction is only good for aligned case.
+ */
+ "cmpb %%dil, %%sil\n\t"
+ "je 4f\n\t"
+ "3:\n\t"
+ "sub $0x20, %0\n\t"
+ /*
+ * We gobble 32 bytes forward in each loop.
+ */
+ "5:\n\t"
+ "sub $0x20, %0\n\t"
+ "movq 0*8(%1), %4\n\t"
+ "movq 1*8(%1), %5\n\t"
+ "movq 2*8(%1), %6\n\t"
+ "movq 3*8(%1), %7\n\t"
+ "leaq 4*8(%1), %1\n\t"
+
+ "movq %4, 0*8(%2)\n\t"
+ "movq %5, 1*8(%2)\n\t"
+ "movq %6, 2*8(%2)\n\t"
+ "movq %7, 3*8(%2)\n\t"
+ "leaq 4*8(%2), %2\n\t"
+ "jae 5b\n\t"
+ "addq $0x20, %0\n\t"
+ "jmp 1f\n\t"
+ /*
+ * Handle data forward by movsq.
+ */
+ ".p2align 4\n\t"
+ "4:\n\t"
+ "movq %0, %8\n\t"
+ "movq -8(%1, %0), %4\n\t"
+ "lea -8(%2, %0), %5\n\t"
+ "shrq $3, %8\n\t"
+ "rep movsq\n\t"
+ "movq %4, (%5)\n\t"
+ "jmp 13f\n\t"
+ /*
+ * Handle data backward by movsq.
+ */
+ ".p2align 4\n\t"
+ "7:\n\t"
+ "movq %0, %8\n\t"
+ "movq (%1), %4\n\t"
+ "movq %2, %5\n\t"
+ "leaq -8(%1, %0), %1\n\t"
+ "leaq -8(%2, %0), %2\n\t"
+ "shrq $3, %8\n\t"
+ "std\n\t"
+ "rep movsq\n\t"
+ "cld\n\t"
+ "movq %4, (%5)\n\t"
+ "jmp 13f\n\t"
+
+ /*
+ * Start to prepare for backward copy.
+ */
+ ".p2align 4\n\t"
+ "2:\n\t"
+ "cmp $680, %0\n\t"
+ "jb 6f \n\t"
+ "cmp %%dil, %%sil\n\t"
+ "je 7b \n\t"
+ "6:\n\t"
+ /*
+ * Calculate copy position to tail.
+ */
+ "addq %0, %1\n\t"
+ "addq %0, %2\n\t"
+ "subq $0x20, %0\n\t"
+ /*
+ * We gobble 32 bytes backward in each loop.
+ */
+ "8:\n\t"
+ "subq $0x20, %0\n\t"
+ "movq -1*8(%1), %4\n\t"
+ "movq -2*8(%1), %5\n\t"
+ "movq -3*8(%1), %6\n\t"
+ "movq -4*8(%1), %7\n\t"
+ "leaq -4*8(%1), %1\n\t"
+
+ "movq %4, -1*8(%2)\n\t"
+ "movq %5, -2*8(%2)\n\t"
+ "movq %6, -3*8(%2)\n\t"
+ "movq %7, -4*8(%2)\n\t"
+ "leaq -4*8(%2), %2\n\t"
+ "jae 8b\n\t"
+ /*
+ * Calculate copy position to head.
+ */
+ "addq $0x20, %0\n\t"
+ "subq %0, %1\n\t"
+ "subq %0, %2\n\t"
+ "1:\n\t"
+ "cmpq $16, %0\n\t"
+ "jb 9f\n\t"
+ /*
+ * Move data from 16 bytes to 31 bytes.
+ */
+ "movq 0*8(%1), %4\n\t"
+ "movq 1*8(%1), %5\n\t"
+ "movq -2*8(%1, %0), %6\n\t"
+ "movq -1*8(%1, %0), %7\n\t"
+ "movq %4, 0*8(%2)\n\t"
+ "movq %5, 1*8(%2)\n\t"
+ "movq %6, -2*8(%2, %0)\n\t"
+ "movq %7, -1*8(%2, %0)\n\t"
+ "jmp 13f\n\t"
+ ".p2align 4\n\t"
+ "9:\n\t"
+ "cmpq $8, %0\n\t"
+ "jb 10f\n\t"
+ /*
+ * Move data from 8 bytes to 15 bytes.
+ */
+ "movq 0*8(%1), %4\n\t"
+ "movq -1*8(%1, %0), %5\n\t"
+ "movq %4, 0*8(%2)\n\t"
+ "movq %5, -1*8(%2, %0)\n\t"
+ "jmp 13f\n\t"
+ "10:\n\t"
+ "cmpq $4, %0\n\t"
+ "jb 11f\n\t"
+ /*
+ * Move data from 4 bytes to 7 bytes.
+ */
+ "movl (%1), %4d\n\t"
+ "movl -4(%1, %0), %5d\n\t"
+ "movl %4d, (%2)\n\t"
+ "movl %5d, -4(%2, %0)\n\t"
+ "jmp 13f\n\t"
+ "11:\n\t"
+ "cmp $2, %0\n\t"
+ "jb 12f\n\t"
+ /*
+ * Move data from 2 bytes to 3 bytes.
+ */
+ "movw (%1), %4w\n\t"
+ "movw -2(%1, %0), %5w\n\t"
+ "movw %4w, (%2)\n\t"
+ "movw %5w, -2(%2, %0)\n\t"
+ "jmp 13f\n\t"
+ "12:\n\t"
+ "cmp $1, %0\n\t"
+ "jb 13f\n\t"
+ /*
+ * Move data for 1 byte.
+ */
+ "movb (%1), %4b\n\t"
+ "movb %4b, (%2)\n\t"
+ "13:\n\t"
+ : "=&d" (d0), "=&S" (d1), "=&D" (d2), "=&a" (ret) ,
+ "=r"(d3), "=r"(d4), "=r"(d5), "=r"(d6), "=&c" (d7)
+ :"0" (count),
+ "1" (src),
+ "2" (dest)
+ :"memory");
+
+ return ret;
+
}
EXPORT_SYMBOL(memmove);
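For a quick user-space sanity check of the overlap cases this routine has to get right (this exercises the C library's memmove, not the kernel copy, and is purely illustrative):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[32];

	/* Backward-overlap case: dest > src and the ranges overlap. */
	strcpy(buf, "0123456789abcdef");
	memmove(buf + 4, buf, 12);
	printf("%s\n", buf);            /* prints 01230123456789ab */

	/* Forward-overlap case: dest < src and the ranges overlap. */
	strcpy(buf, "0123456789abcdef");
	memmove(buf, buf + 4, 12);
	printf("%s\n", buf);            /* prints 456789abcdefcdef */

	/* Small copy at odd offsets (the sub-32-byte path above). */
	strcpy(buf, "xxxxxxxxyyyyyyyy");
	memmove(buf + 1, buf + 9, 7);
	printf("%s\n", buf);            /* x followed by fifteen 'y's */

	return 0;
}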