From 215e00efd21a3a044eb08a0f0572ef683b777d67 Mon Sep 17 00:00:00 2001
From: Samuel Neves
Date: Mon, 12 Nov 2018 08:29:46 +0000
Subject: chacha20: use DRAP idiom

Switch the frame pointer from a saved copy of %rsp in %r9
("mov %rsp,%r9") to the DRAP idiom, a pointer to the caller's frame
kept in %r10 ("lea 8(%rsp),%r10"), the same convention compilers use
when realigning the stack. This effectively means swapping the usage
of %r9 and %r10 globally: frame accesses and the Win64 xmm spill slots
move from %r9 to %r10, with their offsets shifted by 8 since %r10
points one slot above where %r9 did, while the former scratch uses of
%r10 (the CPU capability bits, the .Lrot16/.Lsigma pointers, the
tail-loop byte index) move to %r9.

Signed-off-by: Samuel Neves
---
 src/crypto/zinc/chacha20/chacha20-x86_64.pl | 471 ++++++++++++++--------------
 1 file changed, 235 insertions(+), 236 deletions(-)

diff --git a/src/crypto/zinc/chacha20/chacha20-x86_64.pl b/src/crypto/zinc/chacha20/chacha20-x86_64.pl
index e6f8d71..49e698c 100644
--- a/src/crypto/zinc/chacha20/chacha20-x86_64.pl
+++ b/src/crypto/zinc/chacha20/chacha20-x86_64.pl
@@ -281,16 +281,16 @@ $code.=<<___;
 ___
 if(!kernel) {
 $code.=<<___;
-	mov OPENSSL_ia32cap_P+4(%rip),%r10
+	mov OPENSSL_ia32cap_P+4(%rip),%r9
 ___
 $code.=<<___ if ($avx>2);
-	bt \$48,%r10 # check for AVX512F
+	bt \$48,%r9 # check for AVX512F
 	jc .LChaCha20_avx512
-	test %r10,%r10 # check for AVX512VL
+	test %r9,%r9 # check for AVX512VL
 	js .LChaCha20_avx512vl
 ___
 $code.=<<___;
-	test \$`1<<(41-32)`,%r10d
+	test \$`1<<(41-32)`,%r9d
 	jnz .LChaCha20_ssse3
 ___
 }
@@ -499,7 +499,7 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
 	&por ($b,$t);
 }
 
-my $xframe = $win64 ? 32+8 : 8;
+my $xframe = $win64 ? 32 : 0;
 
 if($kernel) {
 	$code .= "#ifdef CONFIG_AS_SSSE3\n";
@@ -546,8 +546,8 @@ ___
 &declare_function("chacha20_ssse3", 32);
 $code.=<<___;
 .cfi_startproc
-	mov %rsp,%r9 # frame pointer
-.cfi_def_cfa_register %r9
+	lea 8(%rsp),%r10 # frame pointer
+.cfi_def_cfa_register %r10
 ___
 $code.=<<___ if ($avx && !$kernel);
-	test \$`1<<(43-32)`,%r10d
+	test \$`1<<(43-32)`,%r9d
@@ -563,8 +563,8 @@ $code.=<<___;
 	sub \$64+$xframe,%rsp
 	and \$-16,%rsp
 ___
 $code.=<<___ if ($win64);
-	movaps %xmm6,-0x28(%r9)
-	movaps %xmm7,-0x18(%r9)
+	movaps %xmm6,-0x20(%r10)
+	movaps %xmm7,-0x10(%r10)
 .Lssse3_body:
 ___
@@ -660,11 +660,11 @@ $code.=<<___;
 .Ldone_ssse3:
 ___
 $code.=<<___ if ($win64);
-	movaps -0x28(%r9),%xmm6
-	movaps -0x18(%r9),%xmm7
+	movaps -0x20(%r10),%xmm6
+	movaps -0x10(%r10),%xmm7
 ___
 $code.=<<___;
-	lea (%r9),%rsp
+	lea -8(%r10),%rsp
 .cfi_def_cfa_register %rsp
 .Lssse3_epilogue:
 	ret
@@ -721,7 +721,7 @@ sub SSSE3ROUND_2x {
 	&por ($b1,$t1);
 }
 
-my $xframe = $win64 ? 0x68 : 8;
+my $xframe = $win64 ? 0x60 : 0;
 
 $code.=<<___;
 .type ChaCha20_128,\@function,5
@@ -729,18 +729,18 @@ $code.=<<___;
 ChaCha20_128:
 .cfi_startproc
 .LChaCha20_128:
-	mov %rsp,%r9 # frame pointer
-.cfi_def_cfa_register %r9
+	lea 8(%rsp),%r10 # frame pointer
+.cfi_def_cfa_register %r10
 	sub \$64+$xframe,%rsp
 	and \$-16,%rsp
 ___
 $code.=<<___ if ($win64);
-	movaps %xmm6,-0x68(%r9)
-	movaps %xmm7,-0x58(%r9)
-	movaps %xmm8,-0x48(%r9)
-	movaps %xmm9,-0x38(%r9)
-	movaps %xmm10,-0x28(%r9)
-	movaps %xmm11,-0x18(%r9)
+	movaps %xmm6,-0x60(%r10)
+	movaps %xmm7,-0x50(%r10)
+	movaps %xmm8,-0x40(%r10)
+	movaps %xmm9,-0x30(%r10)
+	movaps %xmm10,-0x20(%r10)
+	movaps %xmm11,-0x10(%r10)
 .L128_body:
 ___
 $code.=<<___;
@@ -823,15 +823,15 @@ $code.=<<___;
 	movdqu $d1,0x70($out)
 ___
 $code.=<<___ if ($win64);
-	movaps -0x68(%r9),%xmm6
-	movaps -0x58(%r9),%xmm7
-	movaps -0x48(%r9),%xmm8
-	movaps -0x38(%r9),%xmm9
-	movaps -0x28(%r9),%xmm10
-	movaps -0x18(%r9),%xmm11
+	movaps -0x60(%r10),%xmm6
+	movaps -0x50(%r10),%xmm7
+	movaps -0x40(%r10),%xmm8
+	movaps -0x30(%r10),%xmm9
+	movaps -0x20(%r10),%xmm10
+	movaps -0x10(%r10),%xmm11
 ___
 $code.=<<___;
-	lea (%r9),%rsp
+	lea -8(%r10),%rsp
 .cfi_def_cfa_register %rsp
 .L128_epilogue:
 	ret
@@ -919,7 +919,7 @@ my @x=map("\"$_\"",@xx);
 	"&pslld (@x[$b1],7)",
 	"&por (@x[$b0],$t1)",
 	"&psrld ($t0,25)",
-	"&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
+	"&movdqa ($t1,'(%r9)')", # .Lrot16(%rip)
 	"&por (@x[$b1],$t0)",
 
 	"&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
@@ -966,12 +966,12 @@ my @x=map("\"$_\"",@xx);
 	"&pslld (@x[$b3],7)",
 	"&por (@x[$b2],$t1)",
 	"&psrld ($t0,25)",
-	"&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
+	"&movdqa ($t1,'(%r9)')", # .Lrot16(%rip)
 	"&por (@x[$b3],$t0)"
 	);
 }
 
-my $xframe = $win64 ? 0xa8 : 8;
+my $xframe = $win64 ? 0xa0 : 0;
 
 $code.=<<___;
 .type ChaCha20_4x,\@function,5
@@ -979,15 +979,15 @@ $code.=<<___;
 ChaCha20_4x:
 .cfi_startproc
 .LChaCha20_4x:
-	mov %rsp,%r9 # frame pointer
-.cfi_def_cfa_register %r9
+	lea 8(%rsp),%r10 # frame pointer
+.cfi_def_cfa_register %r10
 ___
 $code.=<<___ if (!$kernel);
-	mov %r10,%r11
+	mov %r9,%r11
 ___
 $code.=<<___ if ($avx>1 && !$kernel);
-	shr \$32,%r10 # OPENSSL_ia32cap_P+8
-	test \$`1<<5`,%r10 # test AVX2
+	shr \$32,%r9 # OPENSSL_ia32cap_P+8
+	test \$`1<<5`,%r9 # test AVX2
 	jnz .LChaCha20_8x
 ___
 $code.=<<___;
@@ -1013,16 +1013,16 @@ ___
 # ...
 # +0x140
 $code.=<<___ if ($win64);
-	movaps %xmm6,-0xa8(%r9)
-	movaps %xmm7,-0x98(%r9)
-	movaps %xmm8,-0x88(%r9)
-	movaps %xmm9,-0x78(%r9)
-	movaps %xmm10,-0x68(%r9)
-	movaps %xmm11,-0x58(%r9)
-	movaps %xmm12,-0x48(%r9)
-	movaps %xmm13,-0x38(%r9)
-	movaps %xmm14,-0x28(%r9)
-	movaps %xmm15,-0x18(%r9)
+	movaps %xmm6,-0xa0(%r10)
+	movaps %xmm7,-0x90(%r10)
+	movaps %xmm8,-0x80(%r10)
+	movaps %xmm9,-0x70(%r10)
+	movaps %xmm10,-0x60(%r10)
+	movaps %xmm11,-0x50(%r10)
+	movaps %xmm12,-0x40(%r10)
+	movaps %xmm13,-0x30(%r10)
+	movaps %xmm14,-0x20(%r10)
+	movaps %xmm15,-0x10(%r10)
 .L4x_body:
 ___
 $code.=<<___;
@@ -1031,7 +1031,7 @@ $code.=<<___;
 	movdqu 16($key),$xt3 # key[2]
 	movdqu ($counter),$xd3 # key[3]
 	lea 0x100(%rsp),%rcx # size optimization
-	lea .Lrot16(%rip),%r10
+	lea .Lrot16(%rip),%r9
 	lea .Lrot24(%rip),%r11
 
 	pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
@@ -1095,7 +1095,7 @@ $code.=<<___;
 .Loop_enter4x:
 	movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
 	movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
-	movdqa (%r10),$xt3 # .Lrot16(%rip)
+	movdqa (%r9),$xt3 # .Lrot16(%rip)
 	mov \$10,%eax
 	movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
 	jmp .Loop4x
@@ -1272,7 +1272,7 @@ $code.=<<___;
 	jae .L64_or_more4x
 
 	#movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
-	xor %r10,%r10
+	xor %r9,%r9
 	#movdqa $xt0,0x00(%rsp)
 	movdqa $xb0,0x10(%rsp)
 	movdqa $xc0,0x20(%rsp)
@@ -1297,7 +1297,7 @@ $code.=<<___;
 
 	movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
 	lea 0x40($inp),$inp # inp+=64*1
-	xor %r10,%r10
+	xor %r9,%r9
 	movdqa $xt0,0x00(%rsp)
 	movdqa $xb1,0x10(%rsp)
 	lea 0x40($out),$out # out+=64*1
@@ -1337,7 +1337,7 @@ $code.=<<___;
 
 	movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
 	lea 0x80($inp),$inp # inp+=64*2
-	xor %r10,%r10
+	xor %r9,%r9
 	movdqa $xt0,0x00(%rsp)
 	movdqa $xb2,0x10(%rsp)
 	lea 0x80($out),$out # out+=64*2
@@ -1392,7 +1392,7 @@ $code.=<<___;
 
 	movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
 	lea 0x40($inp),$inp # inp+=64*3
-	xor %r10,%r10
+	xor %r9,%r9
 	movdqa $xt0,0x00(%rsp)
 	movdqa $xb3,0x10(%rsp)
 	lea 0x40($out),$out # out+=64*3
@@ -1401,30 +1401,30 @@ $code.=<<___;
 	movdqa $xd3,0x30(%rsp)
 
 .Loop_tail4x:
-	movzb ($inp,%r10),%eax
-	movzb (%rsp,%r10),%ecx
-	lea 1(%r10),%r10
+	movzb ($inp,%r9),%eax
+	movzb (%rsp,%r9),%ecx
+	lea 1(%r9),%r9
 	xor %ecx,%eax
-	mov %al,-1($out,%r10)
+	mov %al,-1($out,%r9)
 	dec $len
 	jnz .Loop_tail4x
 
 .Ldone4x:
 ___
 $code.=<<___ if ($win64);
-	movaps -0xa8(%r9),%xmm6
-	movaps -0x98(%r9),%xmm7
-	movaps -0x88(%r9),%xmm8
-	movaps -0x78(%r9),%xmm9
-	movaps -0x68(%r9),%xmm10
-	movaps -0x58(%r9),%xmm11
-	movaps -0x48(%r9),%xmm12
-	movaps -0x38(%r9),%xmm13
-	movaps -0x28(%r9),%xmm14
-	movaps -0x18(%r9),%xmm15
-___
-$code.=<<___;
-	lea (%r9),%rsp
+	movaps -0xa0(%r10),%xmm6
+	movaps -0x90(%r10),%xmm7
+	movaps -0x80(%r10),%xmm8
+	movaps -0x70(%r10),%xmm9
+	movaps -0x60(%r10),%xmm10
+	movaps -0x50(%r10),%xmm11
+	movaps -0x40(%r10),%xmm12
+	movaps -0x30(%r10),%xmm13
+	movaps -0x20(%r10),%xmm14
+	movaps -0x10(%r10),%xmm15
+___
+$code.=<<___;
+	lea -8(%r10),%rsp
 .cfi_def_cfa_register %rsp
 .L4x_epilogue:
 	ret
@@ -1512,17 +1512,16 @@ my @x=map("\"$_\"",@xx);
 	);
 }
 
-my $xframe = $win64 ? 0xa8 : 8;
+my $xframe = $win64 ? 0xa0 : 0;
 
+&declare_function("chacha20_xop", 32);
 $code.=<<___;
-.type ChaCha20_4xop,\@function,5
-.align 32
-ChaCha20_4xop:
 .cfi_startproc
 .LChaCha20_4xop:
-	mov %rsp,%r9 # frame pointer
-.cfi_def_cfa_register %r9
+	lea 8(%rsp),%r10 # frame pointer
+.cfi_def_cfa_register %r10
 	sub \$0x140+$xframe,%rsp
+	and \$-16,%rsp
 ___
 ################ stack layout
 # +0x00 SIMD equivalent of @x[8-12]
 # ...
 # +0x40 constant copy
 # ...
 # +0x100 SIMD counters (xor-ed with zero round[12])
@@ -1533,16 +1532,16 @@ ___
 # ...
 # +0x140
 $code.=<<___ if ($win64);
-	movaps %xmm6,-0xa8(%r9)
-	movaps %xmm7,-0x98(%r9)
-	movaps %xmm8,-0x88(%r9)
-	movaps %xmm9,-0x78(%r9)
-	movaps %xmm10,-0x68(%r9)
-	movaps %xmm11,-0x58(%r9)
-	movaps %xmm12,-0x48(%r9)
-	movaps %xmm13,-0x38(%r9)
-	movaps %xmm14,-0x28(%r9)
-	movaps %xmm15,-0x18(%r9)
+	movaps %xmm6,-0xa0(%r10)
+	movaps %xmm7,-0x90(%r10)
+	movaps %xmm8,-0x80(%r10)
+	movaps %xmm9,-0x70(%r10)
+	movaps %xmm10,-0x60(%r10)
+	movaps %xmm11,-0x50(%r10)
+	movaps %xmm12,-0x40(%r10)
+	movaps %xmm13,-0x30(%r10)
+	movaps %xmm14,-0x20(%r10)
+	movaps %xmm15,-0x10(%r10)
 .L4xop_body:
 ___
 $code.=<<___;
@@ -1758,7 +1757,7 @@ $code.=<<___;
 	cmp \$64,$len
 	jae .L64_or_more4xop
 
-	xor %r10,%r10
+	xor %r9,%r9
 	vmovdqa $xa0,0x00(%rsp)
 	vmovdqa $xb0,0x10(%rsp)
 	vmovdqa $xc0,0x20(%rsp)
@@ -1779,7 +1778,7 @@ $code.=<<___;
 
 	lea 0x40($inp),$inp # inp+=64*1
 	vmovdqa $xa1,0x00(%rsp)
-	xor %r10,%r10
+	xor %r9,%r9
 	vmovdqa $xb1,0x10(%rsp)
 	lea 0x40($out),$out # out+=64*1
 	vmovdqa $xc1,0x20(%rsp)
@@ -1810,7 +1809,7 @@ $code.=<<___;
 
 	lea 0x80($inp),$inp # inp+=64*2
 	vmovdqa $xa2,0x00(%rsp)
-	xor %r10,%r10
+	xor %r9,%r9
 	vmovdqa $xb2,0x10(%rsp)
 	lea 0x80($out),$out # out+=64*2
 	vmovdqa $xc2,0x20(%rsp)
@@ -1851,7 +1850,7 @@ $code.=<<___;
 
 	lea 0x40($inp),$inp # inp+=64*3
 	vmovdqa $xa3,0x00(%rsp)
-	xor %r10,%r10
+	xor %r9,%r9
 	vmovdqa $xb3,0x10(%rsp)
 	lea 0x40($out),$out # out+=64*3
 	vmovdqa $xc3,0x20(%rsp)
@@ -1859,11 +1858,11 @@ $code.=<<___;
 	vmovdqa $xd3,0x30(%rsp)
 
 .Loop_tail4xop:
-	movzb ($inp,%r10),%eax
-	movzb (%rsp,%r10),%ecx
-	lea 1(%r10),%r10
+	movzb ($inp,%r9),%eax
+	movzb (%rsp,%r9),%ecx
+	lea 1(%r9),%r9
 	xor %ecx,%eax
-	mov %al,-1($out,%r10)
+	mov %al,-1($out,%r9)
 	dec $len
 	jnz .Loop_tail4xop
 
@@ -1871,25 +1870,25 @@ $code.=<<___;
 	vzeroupper
 ___
 $code.=<<___ if ($win64);
-	movaps -0xa8(%r9),%xmm6
-	movaps -0x98(%r9),%xmm7
-	movaps -0x88(%r9),%xmm8
-	movaps -0x78(%r9),%xmm9
-	movaps -0x68(%r9),%xmm10
-	movaps -0x58(%r9),%xmm11
-	movaps -0x48(%r9),%xmm12
-	movaps -0x38(%r9),%xmm13
-	movaps -0x28(%r9),%xmm14
-	movaps -0x18(%r9),%xmm15
-___
-$code.=<<___;
-	lea (%r9),%rsp
+	movaps -0xa0(%r10),%xmm6
+	movaps -0x90(%r10),%xmm7
+	movaps -0x80(%r10),%xmm8
+	movaps -0x70(%r10),%xmm9
+	movaps -0x60(%r10),%xmm10
+	movaps -0x50(%r10),%xmm11
+	movaps -0x40(%r10),%xmm12
+	movaps -0x30(%r10),%xmm13
+	movaps -0x20(%r10),%xmm14
+	movaps -0x10(%r10),%xmm15
+___
+$code.=<<___;
+	lea -8(%r10),%rsp
 .cfi_def_cfa_register %rsp
 .L4xop_epilogue:
 	ret
 .cfi_endproc
-.size ChaCha20_4xop,.-ChaCha20_4xop
 ___
+&end_function("chacha20_xop");
 }
 
 ########################################################################
@@ -1967,7 +1966,7 @@ my @x=map("\"$_\"",@xx);
 	"&vpslld ($t1,@x[$b0],7)",
 	"&vpsrld (@x[$b0],@x[$b0],25)",
 	"&vpor (@x[$b0],$t1,@x[$b0])",
-	"&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
+	"&vbroadcasti128($t1,'(%r9)')", # .Lrot16(%rip)
 	"&vpaddd ($xc_,$xc_,@x[$d1])",
 	"&vpxor (@x[$b1],$xc_,@x[$b1])",
 	"&vpslld ($t0,@x[$b1],7)",
@@ -2010,7 +2009,7 @@ my @x=map("\"$_\"",@xx);
 	"&vpslld ($t1,@x[$b2],7)",
 	"&vpsrld (@x[$b2],@x[$b2],25)",
 	"&vpor (@x[$b2],$t1,@x[$b2])",
-	"&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
+	"&vbroadcasti128($t1,'(%r9)')", # .Lrot16(%rip)
 	"&vpaddd ($xc_,$xc_,@x[$d3])",
 	"&vpxor (@x[$b3],$xc_,@x[$b3])",
 	"&vpslld ($t0,@x[$b3],7)",
@@ -2025,22 +2024,22 @@ my $xframe = $win64 ? 0xa8 : 8;
 $code.=<<___;
 .cfi_startproc
 .LChaCha20_8x:
-	mov %rsp,%r9 # frame register
-.cfi_def_cfa_register %r9
+	lea 8(%rsp),%r10 # frame register
+.cfi_def_cfa_register %r10
 	sub \$0x280+$xframe,%rsp
 	and \$-32,%rsp
 ___
 $code.=<<___ if ($win64);
-	movaps %xmm6,-0xa8(%r9)
-	movaps %xmm7,-0x98(%r9)
-	movaps %xmm8,-0x88(%r9)
-	movaps %xmm9,-0x78(%r9)
-	movaps %xmm10,-0x68(%r9)
-	movaps %xmm11,-0x58(%r9)
-	movaps %xmm12,-0x48(%r9)
-	movaps %xmm13,-0x38(%r9)
-	movaps %xmm14,-0x28(%r9)
-	movaps %xmm15,-0x18(%r9)
+	movaps %xmm6,-0xa0(%r10)
+	movaps %xmm7,-0x90(%r10)
+	movaps %xmm8,-0x80(%r10)
+	movaps %xmm9,-0x70(%r10)
+	movaps %xmm10,-0x60(%r10)
+	movaps %xmm11,-0x50(%r10)
+	movaps %xmm12,-0x40(%r10)
+	movaps %xmm13,-0x30(%r10)
+	movaps %xmm14,-0x20(%r10)
+	movaps %xmm15,-0x10(%r10)
 .L8x_body:
 ___
 $code.=<<___;
@@ -2061,7 +2060,7 @@ $code.=<<___;
 	vbroadcasti128 ($counter),$xd3 # key[3]
 	lea 0x100(%rsp),%rcx # size optimization
 	lea 0x200(%rsp),%rax # size optimization
-	lea .Lrot16(%rip),%r10
+	lea .Lrot16(%rip),%r9
 	lea .Lrot24(%rip),%r11
 
 	vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
@@ -2125,7 +2124,7 @@ $code.=<<___;
 .Loop_enter8x:
 	vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
 	vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
-	vbroadcasti128 (%r10),$xt3
+	vbroadcasti128 (%r9),$xt3
 	vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
 	mov \$10,%eax
 	jmp .Loop8x
@@ -2306,7 +2305,7 @@ $code.=<<___;
 	cmp \$64,$len
 	jae .L64_or_more8x
 
-	xor %r10,%r10
+	xor %r9,%r9
 	vmovdqa $xa0,0x00(%rsp)
 	vmovdqa $xb0,0x20(%rsp)
 	jmp .Loop_tail8x
@@ -2320,7 +2319,7 @@ $code.=<<___;
 	je .Ldone8x
 
 	lea 0x40($inp),$inp # inp+=64*1
-	xor %r10,%r10
+	xor %r9,%r9
 	vmovdqa $xc0,0x00(%rsp)
 	lea 0x40($out),$out # out+=64*1
 	sub \$64,$len # len-=64*1
@@ -2340,7 +2339,7 @@ $code.=<<___;
 	je .Ldone8x
 
 	lea 0x80($inp),$inp # inp+=64*2
-	xor %r10,%r10
+	xor %r9,%r9
 	vmovdqa $xa1,0x00(%rsp)
 	lea 0x80($out),$out # out+=64*2
 	sub \$128,$len # len-=64*2
@@ -2364,7 +2363,7 @@ $code.=<<___;
 	je .Ldone8x
 
 	lea 0xc0($inp),$inp # inp+=64*3
-	xor %r10,%r10
+	xor %r9,%r9
 	vmovdqa $xc1,0x00(%rsp)
 	lea 0xc0($out),$out # out+=64*3
 	sub \$192,$len # len-=64*3
@@ -2392,7 +2391,7 @@ $code.=<<___;
 	je .Ldone8x
 
 	lea 0x100($inp),$inp # inp+=64*4
-	xor %r10,%r10
+	xor %r9,%r9
 	vmovdqa $xa2,0x00(%rsp)
 	lea 0x100($out),$out # out+=64*4
 	sub \$256,$len # len-=64*4
@@ -2424,7 +2423,7 @@ $code.=<<___;
 	je .Ldone8x
 
 	lea 0x140($inp),$inp # inp+=64*5
-	xor %r10,%r10
+	xor %r9,%r9
 	vmovdqa $xc2,0x00(%rsp)
 	lea 0x140($out),$out # out+=64*5
 	sub \$320,$len # len-=64*5
@@ -2460,7 +2459,7 @@ $code.=<<___;
 	je .Ldone8x
 
 	lea 0x180($inp),$inp # inp+=64*6
-	xor %r10,%r10
+	xor %r9,%r9
 	vmovdqa $xa3,0x00(%rsp)
 	lea 0x180($out),$out # out+=64*6
 	sub \$384,$len # len-=64*6
@@ -2500,18 +2499,18 @@ $code.=<<___;
 	je .Ldone8x
 
 	lea 0x1c0($inp),$inp # inp+=64*7
-	xor %r10,%r10
+	xor %r9,%r9
 	vmovdqa $xc3,0x00(%rsp)
 	lea 0x1c0($out),$out # out+=64*7
 	sub \$448,$len # len-=64*7
 	vmovdqa $xd3,0x20(%rsp)
 
 .Loop_tail8x:
-	movzb ($inp,%r10),%eax
-	movzb (%rsp,%r10),%ecx
-	lea 1(%r10),%r10
+	movzb ($inp,%r9),%eax
+	movzb (%rsp,%r9),%ecx
+	lea 1(%r9),%r9
 	xor %ecx,%eax
-	mov %al,-1($out,%r10)
+	mov %al,-1($out,%r9)
 	dec $len
 	jnz .Loop_tail8x
 
@@ -2519,19 +2518,19 @@ $code.=<<___;
 	vzeroall
 ___
 $code.=<<___ if ($win64);
-	movaps -0xa8(%r9),%xmm6
-	movaps -0x98(%r9),%xmm7
-	movaps -0x88(%r9),%xmm8
-	movaps -0x78(%r9),%xmm9
-	movaps -0x68(%r9),%xmm10
-	movaps -0x58(%r9),%xmm11
-	movaps -0x48(%r9),%xmm12
-	movaps -0x38(%r9),%xmm13
-	movaps -0x28(%r9),%xmm14
-	movaps -0x18(%r9),%xmm15
-___
-$code.=<<___;
-	lea (%r9),%rsp
+	movaps -0xa0(%r10),%xmm6
+	movaps -0x90(%r10),%xmm7
+	movaps -0x80(%r10),%xmm8
+	movaps -0x70(%r10),%xmm9
+	movaps -0x60(%r10),%xmm10
+	movaps -0x50(%r10),%xmm11
+	movaps -0x40(%r10),%xmm12
+	movaps -0x30(%r10),%xmm13
+	movaps -0x20(%r10),%xmm14
+	movaps -0x10(%r10),%xmm15
+___
+$code.=<<___;
+	lea -8(%r10),%rsp
 .cfi_def_cfa_register %rsp
 .L8x_epilogue:
 	ret
@@ -2591,8 +2590,8 @@ my $xframe = $win64 ? 32+8 : 8;
 $code.=<<___;
 .cfi_startproc
 .LChaCha20_avx512:
-	mov %rsp,%r9 # frame pointer
-.cfi_def_cfa_register %r9
+	lea 8(%rsp),%r10 # frame pointer
+.cfi_def_cfa_register %r10
 	cmp \$512,$len
 	ja .LChaCha20_16x
 
@@ -2600,8 +2599,8 @@ $code.=<<___;
 	sub \$64+$xframe,%rsp
 	and \$-64,%rsp
 ___
 $code.=<<___ if ($win64);
-	movaps %xmm6,-0x28(%r9)
-	movaps %xmm7,-0x18(%r9)
+	movaps %xmm6,-0x20(%r10)
+	movaps %xmm7,-0x10(%r10)
 .Lavx512_body:
 ___
@@ -2768,11 +2767,11 @@ $code.=<<___;
 	vzeroall
 ___
 $code.=<<___ if ($win64);
-	movaps -0x28(%r9),%xmm6
-	movaps -0x18(%r9),%xmm7
+	movaps -0x20(%r10),%xmm6
+	movaps -0x10(%r10),%xmm7
 ___
 $code.=<<___;
-	lea (%r9),%rsp
+	lea -8(%r10),%rsp
 .cfi_def_cfa_register %rsp
 .Lavx512_epilogue:
 	ret
@@ -2786,8 +2785,8 @@ map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);
 $code.=<<___;
 .cfi_startproc
 .LChaCha20_avx512vl:
-	mov %rsp,%r9 # frame pointer
-.cfi_def_cfa_register %r9
+	lea 8(%rsp),%r10 # frame pointer
+.cfi_def_cfa_register %r10
 	cmp \$128,$len
 	ja .LChaCha20_8xvl
 
@@ -2795,8 +2794,8 @@ $code.=<<___;
 	sub \$64+$xframe,%rsp
 	and \$-32,%rsp
 ___
 $code.=<<___ if ($win64);
-	movaps %xmm6,-0x28(%r9)
-	movaps %xmm7,-0x18(%r9)
+	movaps %xmm6,-0x20(%r10)
+	movaps %xmm7,-0x10(%r10)
 .Lavx512vl_body:
 ___
@@ -2920,11 +2919,11 @@ $code.=<<___;
 	vzeroall
 ___
 $code.=<<___ if ($win64);
-	movaps -0x28(%r9),%xmm6
-	movaps -0x18(%r9),%xmm7
+	movaps -0x20(%r10),%xmm6
+	movaps -0x10(%r10),%xmm7
 ___
 $code.=<<___;
-	lea (%r9),%rsp
+	lea -8(%r10),%rsp
 .cfi_def_cfa_register %rsp
 .Lavx512vl_epilogue:
 	ret
@@ -3011,29 +3010,29 @@ $code.=<<___;
 ChaCha20_16x:
 .cfi_startproc
 .LChaCha20_16x:
-	mov %rsp,%r9 # frame register
-.cfi_def_cfa_register %r9
+	lea 8(%rsp),%r10 # frame register
+.cfi_def_cfa_register %r10
 	sub \$64+$xframe,%rsp
 	and \$-64,%rsp
 ___
 $code.=<<___ if ($win64);
-	movaps %xmm6,-0xa8(%r9)
-	movaps %xmm7,-0x98(%r9)
-	movaps %xmm8,-0x88(%r9)
-	movaps %xmm9,-0x78(%r9)
-	movaps %xmm10,-0x68(%r9)
-	movaps %xmm11,-0x58(%r9)
-	movaps %xmm12,-0x48(%r9)
-	movaps %xmm13,-0x38(%r9)
-	movaps %xmm14,-0x28(%r9)
-	movaps %xmm15,-0x18(%r9)
+	movaps %xmm6,-0xa0(%r10)
+	movaps %xmm7,-0x90(%r10)
+	movaps %xmm8,-0x80(%r10)
+	movaps %xmm9,-0x70(%r10)
+	movaps %xmm10,-0x60(%r10)
+	movaps %xmm11,-0x50(%r10)
+	movaps %xmm12,-0x40(%r10)
+	movaps %xmm13,-0x30(%r10)
+	movaps %xmm14,-0x20(%r10)
+	movaps %xmm15,-0x10(%r10)
 .L16x_body:
 ___
 $code.=<<___;
 	vzeroupper
 
-	lea .Lsigma(%rip),%r10
-	vbroadcasti32x4 (%r10),$xa3 # key[0]
+	lea .Lsigma(%rip),%r9
+	vbroadcasti32x4 (%r9),$xa3 # key[0]
 	vbroadcasti32x4 ($key),$xb3 # key[1]
 	vbroadcasti32x4 16($key),$xc3 # key[2]
 	vbroadcasti32x4 ($counter),$xd3 # key[3]
@@ -3080,10 +3079,10 @@ $code.=<<___;
 
 .align 32
 .Loop_outer16x:
-	vpbroadcastd 0(%r10),$xa0 # reload key
-	vpbroadcastd 4(%r10),$xa1
-	vpbroadcastd 8(%r10),$xa2
-	vpbroadcastd 12(%r10),$xa3
+	vpbroadcastd 0(%r9),$xa0 # reload key
+	vpbroadcastd 4(%r9),$xa1
+	vpbroadcastd 8(%r9),$xa2
+	vpbroadcastd 12(%r9),$xa3
 	vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
 	vmovdqa64 @key[4],$xb0
 	vmovdqa64 @key[5],$xb1
@@ -3274,7 +3273,7 @@ $code.=<<___;
 
 .align 32
 .Ltail16x:
-	xor %r10,%r10
+	xor %r9,%r9
 	sub $inp,$out
 	cmp \$64*1,$len
 	jb .Less_than_64_16x
@@ -3402,11 +3401,11 @@ $code.=<<___;
 	and \$63,$len
 
 .Loop_tail16x:
-	movzb ($inp,%r10),%eax
-	movzb (%rsp,%r10),%ecx
-	lea 1(%r10),%r10
+	movzb ($inp,%r9),%eax
+	movzb (%rsp,%r9),%ecx
+	lea 1(%r9),%r9
 	xor %ecx,%eax
-	mov %al,-1($out,%r10)
+	mov %al,-1($out,%r9)
 	dec $len
 	jnz .Loop_tail16x
 
@@ -3417,19 +3416,19 @@ $code.=<<___;
 	vzeroall
 ___
 $code.=<<___ if ($win64);
-	movaps -0xa8(%r9),%xmm6
-	movaps -0x98(%r9),%xmm7
-	movaps -0x88(%r9),%xmm8
-	movaps -0x78(%r9),%xmm9
-	movaps -0x68(%r9),%xmm10
-	movaps -0x58(%r9),%xmm11
-	movaps -0x48(%r9),%xmm12
-	movaps -0x38(%r9),%xmm13
-	movaps -0x28(%r9),%xmm14
-	movaps -0x18(%r9),%xmm15
-___
-$code.=<<___;
-	lea (%r9),%rsp
+	movaps -0xa0(%r10),%xmm6
+	movaps -0x90(%r10),%xmm7
+	movaps -0x80(%r10),%xmm8
+	movaps -0x70(%r10),%xmm9
+	movaps -0x60(%r10),%xmm10
+	movaps -0x50(%r10),%xmm11
+	movaps -0x40(%r10),%xmm12
+	movaps -0x30(%r10),%xmm13
+	movaps -0x20(%r10),%xmm14
+	movaps -0x10(%r10),%xmm15
+___
+$code.=<<___;
+	lea -8(%r10),%rsp
 .cfi_def_cfa_register %rsp
 .L16x_epilogue:
 	ret
@@ -3451,29 +3450,29 @@ $code.=<<___;
 ChaCha20_8xvl:
 .cfi_startproc
 .LChaCha20_8xvl:
-	mov %rsp,%r9 # frame register
-.cfi_def_cfa_register %r9
+	lea 8(%rsp),%r10 # frame register
+.cfi_def_cfa_register %r10
 	sub \$64+$xframe,%rsp
 	and \$-64,%rsp
 ___
 $code.=<<___ if ($win64);
-	movaps %xmm6,-0xa8(%r9)
-	movaps %xmm7,-0x98(%r9)
-	movaps %xmm8,-0x88(%r9)
-	movaps %xmm9,-0x78(%r9)
-	movaps %xmm10,-0x68(%r9)
-	movaps %xmm11,-0x58(%r9)
-	movaps %xmm12,-0x48(%r9)
-	movaps %xmm13,-0x38(%r9)
-	movaps %xmm14,-0x28(%r9)
-	movaps %xmm15,-0x18(%r9)
+	movaps %xmm6,-0xa0(%r10)
+	movaps %xmm7,-0x90(%r10)
+	movaps %xmm8,-0x80(%r10)
+	movaps %xmm9,-0x70(%r10)
+	movaps %xmm10,-0x60(%r10)
+	movaps %xmm11,-0x50(%r10)
+	movaps %xmm12,-0x40(%r10)
+	movaps %xmm13,-0x30(%r10)
+	movaps %xmm14,-0x20(%r10)
+	movaps %xmm15,-0x10(%r10)
 .L8xvl_body:
 ___
 $code.=<<___;
 	vzeroupper
 
-	lea .Lsigma(%rip),%r10
-	vbroadcasti128 (%r10),$xa3 # key[0]
+	lea .Lsigma(%rip),%r9
+	vbroadcasti128 (%r9),$xa3 # key[0]
 	vbroadcasti128 ($key),$xb3 # key[1]
 	vbroadcasti128 16($key),$xc3 # key[2]
 	vbroadcasti128 ($counter),$xd3 # key[3]
@@ -3520,10 +3519,10 @@ $code.=<<___;
 
 .align 32
 .Loop_outer8xvl:
-	#vpbroadcastd 0(%r10),$xa0 # reload key
-	#vpbroadcastd 4(%r10),$xa1
-	vpbroadcastd 8(%r10),$xa2
-	vpbroadcastd 12(%r10),$xa3
+	#vpbroadcastd 0(%r9),$xa0 # reload key
+	#vpbroadcastd 4(%r9),$xa1
+	vpbroadcastd 8(%r9),$xa2
+	vpbroadcastd 12(%r9),$xa3
 	vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters
 	vmovdqa64 @key[4],$xb0
 	vmovdqa64 @key[5],$xb1
@@ -3691,8 +3690,8 @@ $code.=<<___;
 	vmovdqu $xd3,0x60($out)
 	lea ($out,%rax),$out # size optimization
 
-	vpbroadcastd 0(%r10),%ymm0 # reload key
-	vpbroadcastd 4(%r10),%ymm1
+	vpbroadcastd 0(%r9),%ymm0 # reload key
+	vpbroadcastd 4(%r9),%ymm1
 	sub \$64*8,$len
 	jnz .Loop_outer8xvl
 
@@ -3705,7 +3704,7 @@ $code.=<<___;
 ___
 $xa0 = "%ymm8";
 $code.=<<___;
-	xor %r10,%r10
+	xor %r9,%r9
 	sub $inp,$out
 	cmp \$64*1,$len
 	jb .Less_than_64_8xvl
@@ -3791,11 +3790,11 @@ $code.=<<___;
 	and \$63,$len
 
 .Loop_tail8xvl:
-	movzb ($inp,%r10),%eax
-	movzb (%rsp,%r10),%ecx
-	lea 1(%r10),%r10
+	movzb ($inp,%r9),%eax
+	movzb (%rsp,%r9),%ecx
+	lea 1(%r9),%r9
 	xor %ecx,%eax
-	mov %al,-1($out,%r10)
+	mov %al,-1($out,%r9)
 	dec $len
 	jnz .Loop_tail8xvl
 
@@ -3807,19 +3806,19 @@ $code.=<<___;
 	vzeroall
 ___
 $code.=<<___ if ($win64);
-	movaps -0xa8(%r9),%xmm6
-	movaps -0x98(%r9),%xmm7
-	movaps -0x88(%r9),%xmm8
-	movaps -0x78(%r9),%xmm9
-	movaps -0x68(%r9),%xmm10
-	movaps -0x58(%r9),%xmm11
-	movaps -0x48(%r9),%xmm12
-	movaps -0x38(%r9),%xmm13
-	movaps -0x28(%r9),%xmm14
-	movaps -0x18(%r9),%xmm15
-___
-$code.=<<___;
-	lea (%r9),%rsp
+	movaps -0xa0(%r10),%xmm6
+	movaps -0x90(%r10),%xmm7
+	movaps -0x80(%r10),%xmm8
+	movaps -0x70(%r10),%xmm9
+	movaps -0x60(%r10),%xmm10
+	movaps -0x50(%r10),%xmm11
+	movaps -0x40(%r10),%xmm12
+	movaps -0x30(%r10),%xmm13
+	movaps -0x20(%r10),%xmm14
+	movaps -0x10(%r10),%xmm15
+___
+$code.=<<___;
+	lea -8(%r10),%rsp
 .cfi_def_cfa_register %rsp
 .L8xvl_epilogue:
 	ret
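
For reference, the two frame-pointer idioms side by side. This is a
standalone sketch, not taken from chacha20-x86_64.pl; the 0x40 frame
size and the func_old/func_new labels are made up for the example. It
assumes a function entered by call, so (%rsp) holds the return address:

	# old: save %rsp itself in a scratch register
	func_old:
		mov  %rsp,%r9        # %r9 = entry %rsp = &retaddr
		sub  $0x40,%rsp      # allocate frame
		and  $-16,%rsp       # realign; old %rsp survives in %r9
		# ... frame slots addressed as imm(%r9) ...
		lea  (%r9),%rsp      # one lea undoes both sub and and
		ret

	# new: DRAP, keep the caller's stack pointer in %r10
	func_new:
		lea  8(%rsp),%r10    # %r10 = entry %rsp + 8 = caller's %rsp
		sub  $0x40,%rsp      # allocate frame
		and  $-16,%rsp       # realign; caller's %rsp survives in %r10
		# ... frame slots addressed as imm(%r10) ...
		lea  -8(%r10),%rsp   # %r10 - 8 = entry %rsp, same as old %r9
		ret

Since %r10 here equals the old %r9 plus 8, both epilogues restore the
identical %rsp. The difference is that %r10 now holds the CFA (the
caller's stack pointer at the call site), which is why the
.cfi_def_cfa_register annotations switch to %r10 and why every other
use of %r10 in the file migrates to %r9.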