Merge branch 'upstream-fixes'

author: Jeff Garzik <jgarzik@pobox.com> 2006-02-09 04:29:00 -0500
committer: Jeff Garzik <jgarzik@pobox.com> 2006-02-09 04:29:00 -0500
commit: 9caafa6c8686e319cf4d5f3757b3972c6c522b7c (patch)
tree: b38979b835b5d22e681b175d0b98a3c7560d9c59 /arch/x86_64/lib/memset.S
parent: [libata sata_sil] implement 'slow_down' module parameter (diff)
parent: Merge branch 'master' (diff)
download: linux-dev-9caafa6c8686e319cf4d5f3757b3972c6c522b7c.tar.xz
linux-dev-9caafa6c8686e319cf4d5f3757b3972c6c522b7c.zip
1 files changed, 94 insertions, 0 deletions
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 2aa48f24ed1e..ad397f2c7de8 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -13,6 +13,98 @@
 	.p2align 4
 memset:	
 __memset:
+	/* In: rdi = dst, sil = fill byte, rdx = count.  Out: rax = dst. */
+	movq %rdi,%r10		/* r10 = original dst, handed back in rax at .Lende */
+	movq %rdx,%r11		/* r11 = bytes left; rdx itself is clobbered by mul */
+	/* expand byte value: rax = fill byte replicated into all 8 bytes */
+	movzbl %sil,%ecx
+	movabs $0x0101010101010101,%rax
+	mul    %rcx		/* with rax, clobbers rdx */

+	/* align dst */
+	movl  %edi,%r9d
+	andl  $7,%r9d		/* r9 = dst & 7, non-zero when dst is not 8-byte aligned */
+	jnz  .Lbad_alignment
+.Lafter_bad_alignment:

+	movq %r11,%rcx		/* 64-bit copy: a 32-bit one truncated counts >= 4GB */
+	shrq $6,%rcx		/* rcx = number of whole 64-byte blocks */
+	jz	 .Lhandle_tail

+	.p2align 4
+.Lloop_64:
+	decq   %rcx		/* ZF is consumed by the jnz below; movq/leaq keep flags */
+	movq  %rax,(%rdi)
+	movq  %rax,8(%rdi)
+	movq  %rax,16(%rdi)
+	movq  %rax,24(%rdi)
+	movq  %rax,32(%rdi)
+	movq  %rax,40(%rdi)
+	movq  %rax,48(%rdi)
+	movq  %rax,56(%rdi)
+	leaq  64(%rdi),%rdi
+	jnz    .Lloop_64

	/* Handle tail in loops. The loops should be faster than hard
	   to predict jump tables. */
+	.p2align 4
+.Lhandle_tail:
+	movl	%r11d,%ecx	/* only bits 3..5 are kept below, so 32 bits suffice */
+	andl    $63&(~7),%ecx	/* bytes past the 64-byte blocks that form whole qwords */
+	jz 		.Lhandle_7
+	shrl	$3,%ecx		/* ecx = number of qwords to store */
+	.p2align 4
+.Lloop_8:
+	decl   %ecx
+	movq  %rax,(%rdi)
+	leaq  8(%rdi),%rdi
+	jnz    .Lloop_8

+.Lhandle_7:
+	movl	%r11d,%ecx
+	andl	$7,%ecx		/* ecx = trailing bytes (0..7) */
+	jz      .Lende
+	.p2align 4
+.Lloop_1:
+	decl    %ecx
+	movb 	%al,(%rdi)
+	leaq	1(%rdi),%rdi
+	jnz     .Lloop_1

+.Lende:
+	movq	%r10,%rax	/* return the original dst */
+	ret

+.Lbad_alignment:
+	cmpq $7,%r11
+	jbe	.Lhandle_7	/* 7 bytes or fewer: the byte loop handles everything */
+	movq %rax,(%rdi)	/* unaligned store; count >= 8 so it stays in bounds */
+	movq $8,%r8
+	subq %r9,%r8		/* r8 = 8 - (dst & 7) = bytes up to the next 8-byte boundary */
+	addq %r8,%rdi
+	subq %r8,%r11		/* those leading bytes were written by the store above */
+	jmp .Lafter_bad_alignment
+
	/* Some CPUs run faster using the string instructions.
	   That version is also a lot simpler. Use it when possible. */
+
+#include <asm/cpufeature.h>
+
+	.section .altinstructions,"a"	/* one record pairing memset with memset_c */
+	.align 8
+	.quad  memset			/* address of the generic routine */
+	.quad  memset_c			/* address of the string-instruction variant */
+	.byte  X86_FEATURE_REP_GOOD	/* CPU feature this record is keyed on */
+	.byte  memset_c_end-memset_c	/* size of memset_c in bytes; NOTE(review): presumably the original-code length field - confirm against struct alt_instr */
+	.byte  memset_c_end-memset_c	/* same size again; NOTE(review): presumably the replacement length field - confirm */
+	.previous
+
+	.section .altinstr_replacement,"ax"
+ /* rdi	destination
+  * rsi value
+  * rdx count
+  */
+memset_c:
 	movq %rdi,%r9
 	movl %edx,%r8d
 	andl $7,%r8d		
@@ -29,3 +121,5 @@ __memset:
 	stosb
 	movq %r9,%rax
 	ret
+memset_c_end:
+	.previous
author	Jeff Garzik <jgarzik@pobox.com>	2006-02-09 04:29:00 -0500
committer	Jeff Garzik <jgarzik@pobox.com>	2006-02-09 04:29:00 -0500
commit	9caafa6c8686e319cf4d5f3757b3972c6c522b7c (patch)
tree	b38979b835b5d22e681b175d0b98a3c7560d9c59 /arch/x86_64/lib/memset.S
parent	[libata sata_sil] implement 'slow_down' module parameter (diff)
parent	Merge branch 'master' (diff)
download	linux-dev-9caafa6c8686e319cf4d5f3757b3972c6c522b7c.tar.xz linux-dev-9caafa6c8686e319cf4d5f3757b3972c6c522b7c.zip