Diffstat (limited to '')
-rw-r--r--	src/crypto/zinc/chacha20/chacha20-x86_64.pl	471
1 file changed, 235 insertions, 236 deletions
diff --git a/src/crypto/zinc/chacha20/chacha20-x86_64.pl b/src/crypto/zinc/chacha20/chacha20-x86_64.pl
index e6f8d71..49e698c 100644
--- a/src/crypto/zinc/chacha20/chacha20-x86_64.pl
+++ b/src/crypto/zinc/chacha20/chacha20-x86_64.pl
@@ -281,16 +281,16 @@ $code.=<<___;
___
if(!$kernel) {
$code.=<<___;
- mov OPENSSL_ia32cap_P+4(%rip),%r10
+ mov OPENSSL_ia32cap_P+4(%rip),%r9
___
$code.=<<___ if ($avx>2);
- bt \$48,%r10 # check for AVX512F
+ bt \$48,%r9 # check for AVX512F
jc .LChaCha20_avx512
- test %r10,%r10 # check for AVX512VL
+ test %r9,%r9 # check for AVX512VL
js .LChaCha20_avx512vl
___
$code.=<<___;
- test \$`1<<(41-32)`,%r10d
+ test \$`1<<(41-32)`,%r9d
jnz .LChaCha20_ssse3
___
}
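A note on the dispatch hunk above, since the shifted bit numbers are easy to misread: the qword load from OPENSSL_ia32cap_P+4 windows bits 32-95 of OpenSSL's capability vector into the scratch register, so a capability bit N is tested as register bit N-32. A minimal sketch with that arithmetic spelled out (bit meanings are taken from the comments in the hunk; the capability-vector layout itself is OpenSSL's, not defined in this file):

	mov	OPENSSL_ia32cap_P+4(%rip),%r9	# %r9 bit k == capability bit k+32
	bt	$48,%r9				# capability bit 80: AVX512F
	jc	.LChaCha20_avx512
	test	%r9,%r9				# sign bit == capability bit 95: AVX512VL
	js	.LChaCha20_avx512vl
	test	$0x200,%r9d			# 1<<(41-32): capability bit 41, SSSE3
	jnz	.LChaCha20_ssse3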
@@ -499,7 +499,7 @@ sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
&por ($b,$t);
}
-my $xframe = $win64 ? 32+8 : 8;
+my $xframe = $win64 ? 32 : 0;
if($kernel) {
$code .= "#ifdef CONFIG_AS_SSSE3\n";
@@ -546,8 +546,8 @@ ___
&declare_function("chacha20_ssse3", 32);
$code.=<<___;
.cfi_startproc
- mov %rsp,%r9 # frame pointer
-.cfi_def_cfa_register %r9
+ lea 8(%rsp),%r10 # frame pointer
+.cfi_def_cfa_register %r10
___
$code.=<<___ if ($avx && !$kernel);
- test \$`1<<(43-32)`,%r10d
+ test \$`1<<(43-32)`,%r9d
@@ -563,8 +563,8 @@ $code.=<<___;
and \$-16,%rsp
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0x28(%r9)
- movaps %xmm7,-0x18(%r9)
+ movaps %xmm6,-0x20(%r10)
+ movaps %xmm7,-0x10(%r10)
.Lssse3_body:
___
$code.=<<___;
@@ -660,11 +660,11 @@ $code.=<<___;
.Ldone_ssse3:
___
$code.=<<___ if ($win64);
- movaps -0x28(%r9),%xmm6
- movaps -0x18(%r9),%xmm7
+ movaps -0x20(%r10),%xmm6
+ movaps -0x10(%r10),%xmm7
___
$code.=<<___;
- lea (%r9),%rsp
+ lea -8(%r10),%rsp
.cfi_def_cfa_register %rsp
.Lssse3_epilogue:
ret
@@ -721,7 +721,7 @@ sub SSSE3ROUND_2x {
&por ($b1,$t1);
}
-my $xframe = $win64 ? 0x68 : 8;
+my $xframe = $win64 ? 0x60 : 0;
$code.=<<___;
.type ChaCha20_128,\@function,5
@@ -729,18 +729,18 @@ $code.=<<___;
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
- mov %rsp,%r9 # frame pointer
-.cfi_def_cfa_register %r9
+ lea 8(%rsp),%r10 # frame pointer
+.cfi_def_cfa_register %r10
sub \$64+$xframe,%rsp
and \$-16,%rsp
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0x68(%r9)
- movaps %xmm7,-0x58(%r9)
- movaps %xmm8,-0x48(%r9)
- movaps %xmm9,-0x38(%r9)
- movaps %xmm10,-0x28(%r9)
- movaps %xmm11,-0x18(%r9)
+ movaps %xmm6,-0x60(%r10)
+ movaps %xmm7,-0x50(%r10)
+ movaps %xmm8,-0x40(%r10)
+ movaps %xmm9,-0x30(%r10)
+ movaps %xmm10,-0x20(%r10)
+ movaps %xmm11,-0x10(%r10)
.L128_body:
___
$code.=<<___;
@@ -823,15 +823,15 @@ $code.=<<___;
movdqu $d1,0x70($out)
___
$code.=<<___ if ($win64);
- movaps -0x68(%r9),%xmm6
- movaps -0x58(%r9),%xmm7
- movaps -0x48(%r9),%xmm8
- movaps -0x38(%r9),%xmm9
- movaps -0x28(%r9),%xmm10
- movaps -0x18(%r9),%xmm11
+ movaps -0x60(%r10),%xmm6
+ movaps -0x50(%r10),%xmm7
+ movaps -0x40(%r10),%xmm8
+ movaps -0x30(%r10),%xmm9
+ movaps -0x20(%r10),%xmm10
+ movaps -0x10(%r10),%xmm11
___
$code.=<<___;
- lea (%r9),%rsp
+ lea -8(%r10),%rsp
.cfi_def_cfa_register %rsp
.L128_epilogue:
ret
@@ -919,7 +919,7 @@ my @x=map("\"$_\"",@xx);
"&pslld (@x[$b1],7)",
"&por (@x[$b0],$t1)",
"&psrld ($t0,25)",
- "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
+ "&movdqa ($t1,'(%r9)')", # .Lrot16(%rip)
"&por (@x[$b1],$t0)",
"&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's
@@ -966,12 +966,12 @@ my @x=map("\"$_\"",@xx);
"&pslld (@x[$b3],7)",
"&por (@x[$b2],$t1)",
"&psrld ($t0,25)",
- "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip)
+ "&movdqa ($t1,'(%r9)')", # .Lrot16(%rip)
"&por (@x[$b3],$t0)"
);
}
-my $xframe = $win64 ? 0xa8 : 8;
+my $xframe = $win64 ? 0xa0 : 0;
$code.=<<___;
.type ChaCha20_4x,\@function,5
@@ -979,15 +979,15 @@ $code.=<<___;
ChaCha20_4x:
.cfi_startproc
.LChaCha20_4x:
- mov %rsp,%r9 # frame pointer
-.cfi_def_cfa_register %r9
+ lea 8(%rsp),%r10 # frame pointer
+.cfi_def_cfa_register %r10
___
$code.=<<___ if (!$kernel);
- mov %r10,%r11
+ mov %r9,%r11
___
$code.=<<___ if ($avx>1 && !$kernel);
- shr \$32,%r10 # OPENSSL_ia32cap_P+8
- test \$`1<<5`,%r10 # test AVX2
+ shr \$32,%r9 # OPENSSL_ia32cap_P+8
+ test \$`1<<5`,%r9 # test AVX2
jnz .LChaCha20_8x
___
$code.=<<___;
@@ -1013,16 +1013,16 @@ ___
# ...
# +0x140
$code.=<<___ if ($win64);
- movaps %xmm6,-0xa8(%r9)
- movaps %xmm7,-0x98(%r9)
- movaps %xmm8,-0x88(%r9)
- movaps %xmm9,-0x78(%r9)
- movaps %xmm10,-0x68(%r9)
- movaps %xmm11,-0x58(%r9)
- movaps %xmm12,-0x48(%r9)
- movaps %xmm13,-0x38(%r9)
- movaps %xmm14,-0x28(%r9)
- movaps %xmm15,-0x18(%r9)
+ movaps %xmm6,-0xa0(%r10)
+ movaps %xmm7,-0x90(%r10)
+ movaps %xmm8,-0x80(%r10)
+ movaps %xmm9,-0x70(%r10)
+ movaps %xmm10,-0x60(%r10)
+ movaps %xmm11,-0x50(%r10)
+ movaps %xmm12,-0x40(%r10)
+ movaps %xmm13,-0x30(%r10)
+ movaps %xmm14,-0x20(%r10)
+ movaps %xmm15,-0x10(%r10)
.L4x_body:
___
$code.=<<___;
@@ -1031,7 +1031,7 @@ $code.=<<___;
movdqu 16($key),$xt3 # key[2]
movdqu ($counter),$xd3 # key[3]
lea 0x100(%rsp),%rcx # size optimization
- lea .Lrot16(%rip),%r10
+ lea .Lrot16(%rip),%r9
lea .Lrot24(%rip),%r11
pshufd \$0x00,$xa3,$xa0 # smash key by lanes...
@@ -1095,7 +1095,7 @@ $code.=<<___;
.Loop_enter4x:
movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]"
movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]"
- movdqa (%r10),$xt3 # .Lrot16(%rip)
+ movdqa (%r9),$xt3 # .Lrot16(%rip)
mov \$10,%eax
movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters
jmp .Loop4x
@@ -1272,7 +1272,7 @@ $code.=<<___;
jae .L64_or_more4x
#movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember?
- xor %r10,%r10
+ xor %r9,%r9
#movdqa $xt0,0x00(%rsp)
movdqa $xb0,0x10(%rsp)
movdqa $xc0,0x20(%rsp)
@@ -1297,7 +1297,7 @@ $code.=<<___;
movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember?
lea 0x40($inp),$inp # inp+=64*1
- xor %r10,%r10
+ xor %r9,%r9
movdqa $xt0,0x00(%rsp)
movdqa $xb1,0x10(%rsp)
lea 0x40($out),$out # out+=64*1
@@ -1337,7 +1337,7 @@ $code.=<<___;
movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember?
lea 0x80($inp),$inp # inp+=64*2
- xor %r10,%r10
+ xor %r9,%r9
movdqa $xt0,0x00(%rsp)
movdqa $xb2,0x10(%rsp)
lea 0x80($out),$out # out+=64*2
@@ -1392,7 +1392,7 @@ $code.=<<___;
movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember?
lea 0x40($inp),$inp # inp+=64*3
- xor %r10,%r10
+ xor %r9,%r9
movdqa $xt0,0x00(%rsp)
movdqa $xb3,0x10(%rsp)
lea 0x40($out),$out # out+=64*3
@@ -1401,30 +1401,30 @@ $code.=<<___;
movdqa $xd3,0x30(%rsp)
.Loop_tail4x:
- movzb ($inp,%r10),%eax
- movzb (%rsp,%r10),%ecx
- lea 1(%r10),%r10
+ movzb ($inp,%r9),%eax
+ movzb (%rsp,%r9),%ecx
+ lea 1(%r9),%r9
xor %ecx,%eax
- mov %al,-1($out,%r10)
+ mov %al,-1($out,%r9)
dec $len
jnz .Loop_tail4x
.Ldone4x:
___
$code.=<<___ if ($win64);
- movaps -0xa8(%r9),%xmm6
- movaps -0x98(%r9),%xmm7
- movaps -0x88(%r9),%xmm8
- movaps -0x78(%r9),%xmm9
- movaps -0x68(%r9),%xmm10
- movaps -0x58(%r9),%xmm11
- movaps -0x48(%r9),%xmm12
- movaps -0x38(%r9),%xmm13
- movaps -0x28(%r9),%xmm14
- movaps -0x18(%r9),%xmm15
-___
-$code.=<<___;
- lea (%r9),%rsp
+ movaps -0xa0(%r10),%xmm6
+ movaps -0x90(%r10),%xmm7
+ movaps -0x80(%r10),%xmm8
+ movaps -0x70(%r10),%xmm9
+ movaps -0x60(%r10),%xmm10
+ movaps -0x50(%r10),%xmm11
+ movaps -0x40(%r10),%xmm12
+ movaps -0x30(%r10),%xmm13
+ movaps -0x20(%r10),%xmm14
+ movaps -0x10(%r10),%xmm15
+___
+$code.=<<___;
+ lea -8(%r10),%rsp
.cfi_def_cfa_register %rsp
.L4x_epilogue:
ret
@@ -1512,17 +1512,16 @@ my @x=map("\"$_\"",@xx);
);
}
-my $xframe = $win64 ? 0xa8 : 8;
+my $xframe = $win64 ? 0xa0 : 0;
+&declare_function("chacha20_xop", 32);
$code.=<<___;
-.type ChaCha20_4xop,\@function,5
-.align 32
-ChaCha20_4xop:
.cfi_startproc
.LChaCha20_4xop:
- mov %rsp,%r9 # frame pointer
-.cfi_def_cfa_register %r9
+ lea 8(%rsp),%r10 # frame pointer
+.cfi_def_cfa_register %r10
sub \$0x140+$xframe,%rsp
+ and \$-16,%rsp
___
################ stack layout
# +0x00 SIMD equivalent of @x[8-12]
@@ -1533,16 +1532,16 @@ ___
# ...
# +0x140
$code.=<<___ if ($win64);
- movaps %xmm6,-0xa8(%r9)
- movaps %xmm7,-0x98(%r9)
- movaps %xmm8,-0x88(%r9)
- movaps %xmm9,-0x78(%r9)
- movaps %xmm10,-0x68(%r9)
- movaps %xmm11,-0x58(%r9)
- movaps %xmm12,-0x48(%r9)
- movaps %xmm13,-0x38(%r9)
- movaps %xmm14,-0x28(%r9)
- movaps %xmm15,-0x18(%r9)
+ movaps %xmm6,-0xa0(%r10)
+ movaps %xmm7,-0x90(%r10)
+ movaps %xmm8,-0x80(%r10)
+ movaps %xmm9,-0x70(%r10)
+ movaps %xmm10,-0x60(%r10)
+ movaps %xmm11,-0x50(%r10)
+ movaps %xmm12,-0x40(%r10)
+ movaps %xmm13,-0x30(%r10)
+ movaps %xmm14,-0x20(%r10)
+ movaps %xmm15,-0x10(%r10)
.L4xop_body:
___
$code.=<<___;
@@ -1758,7 +1757,7 @@ $code.=<<___;
cmp \$64,$len
jae .L64_or_more4xop
- xor %r10,%r10
+ xor %r9,%r9
vmovdqa $xa0,0x00(%rsp)
vmovdqa $xb0,0x10(%rsp)
vmovdqa $xc0,0x20(%rsp)
@@ -1779,7 +1778,7 @@ $code.=<<___;
lea 0x40($inp),$inp # inp+=64*1
vmovdqa $xa1,0x00(%rsp)
- xor %r10,%r10
+ xor %r9,%r9
vmovdqa $xb1,0x10(%rsp)
lea 0x40($out),$out # out+=64*1
vmovdqa $xc1,0x20(%rsp)
@@ -1810,7 +1809,7 @@ $code.=<<___;
lea 0x80($inp),$inp # inp+=64*2
vmovdqa $xa2,0x00(%rsp)
- xor %r10,%r10
+ xor %r9,%r9
vmovdqa $xb2,0x10(%rsp)
lea 0x80($out),$out # out+=64*2
vmovdqa $xc2,0x20(%rsp)
@@ -1851,7 +1850,7 @@ $code.=<<___;
lea 0x40($inp),$inp # inp+=64*3
vmovdqa $xa3,0x00(%rsp)
- xor %r10,%r10
+ xor %r9,%r9
vmovdqa $xb3,0x10(%rsp)
lea 0x40($out),$out # out+=64*3
vmovdqa $xc3,0x20(%rsp)
@@ -1859,11 +1858,11 @@ $code.=<<___;
vmovdqa $xd3,0x30(%rsp)
.Loop_tail4xop:
- movzb ($inp,%r10),%eax
- movzb (%rsp,%r10),%ecx
- lea 1(%r10),%r10
+ movzb ($inp,%r9),%eax
+ movzb (%rsp,%r9),%ecx
+ lea 1(%r9),%r9
xor %ecx,%eax
- mov %al,-1($out,%r10)
+ mov %al,-1($out,%r9)
dec $len
jnz .Loop_tail4xop
@@ -1871,25 +1870,25 @@ $code.=<<___;
vzeroupper
___
$code.=<<___ if ($win64);
- movaps -0xa8(%r9),%xmm6
- movaps -0x98(%r9),%xmm7
- movaps -0x88(%r9),%xmm8
- movaps -0x78(%r9),%xmm9
- movaps -0x68(%r9),%xmm10
- movaps -0x58(%r9),%xmm11
- movaps -0x48(%r9),%xmm12
- movaps -0x38(%r9),%xmm13
- movaps -0x28(%r9),%xmm14
- movaps -0x18(%r9),%xmm15
-___
-$code.=<<___;
- lea (%r9),%rsp
+ movaps -0xa0(%r10),%xmm6
+ movaps -0x90(%r10),%xmm7
+ movaps -0x80(%r10),%xmm8
+ movaps -0x70(%r10),%xmm9
+ movaps -0x60(%r10),%xmm10
+ movaps -0x50(%r10),%xmm11
+ movaps -0x40(%r10),%xmm12
+ movaps -0x30(%r10),%xmm13
+ movaps -0x20(%r10),%xmm14
+ movaps -0x10(%r10),%xmm15
+___
+$code.=<<___;
+ lea -8(%r10),%rsp
.cfi_def_cfa_register %rsp
.L4xop_epilogue:
ret
.cfi_endproc
-.size ChaCha20_4xop,.-ChaCha20_4xop
___
+&end_function("chacha20_xop");
}
########################################################################
@@ -1967,7 +1966,7 @@ my @x=map("\"$_\"",@xx);
"&vpslld ($t1,@x[$b0],7)",
"&vpsrld (@x[$b0],@x[$b0],25)",
"&vpor (@x[$b0],$t1,@x[$b0])",
- "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
+ "&vbroadcasti128($t1,'(%r9)')", # .Lrot16(%rip)
"&vpaddd ($xc_,$xc_,@x[$d1])",
"&vpxor (@x[$b1],$xc_,@x[$b1])",
"&vpslld ($t0,@x[$b1],7)",
@@ -2010,7 +2009,7 @@ my @x=map("\"$_\"",@xx);
"&vpslld ($t1,@x[$b2],7)",
"&vpsrld (@x[$b2],@x[$b2],25)",
"&vpor (@x[$b2],$t1,@x[$b2])",
- "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip)
+ "&vbroadcasti128($t1,'(%r9)')", # .Lrot16(%rip)
"&vpaddd ($xc_,$xc_,@x[$d3])",
"&vpxor (@x[$b3],$xc_,@x[$b3])",
"&vpslld ($t0,@x[$b3],7)",
@@ -2025,22 +2024,22 @@ my $xframe = $win64 ? 0xa8 : 8;
$code.=<<___;
.cfi_startproc
.LChaCha20_8x:
- mov %rsp,%r9 # frame register
-.cfi_def_cfa_register %r9
+ lea 8(%rsp),%r10 # frame register
+.cfi_def_cfa_register %r10
sub \$0x280+$xframe,%rsp
and \$-32,%rsp
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0xa8(%r9)
- movaps %xmm7,-0x98(%r9)
- movaps %xmm8,-0x88(%r9)
- movaps %xmm9,-0x78(%r9)
- movaps %xmm10,-0x68(%r9)
- movaps %xmm11,-0x58(%r9)
- movaps %xmm12,-0x48(%r9)
- movaps %xmm13,-0x38(%r9)
- movaps %xmm14,-0x28(%r9)
- movaps %xmm15,-0x18(%r9)
+ movaps %xmm6,-0xa0(%r10)
+ movaps %xmm7,-0x90(%r10)
+ movaps %xmm8,-0x80(%r10)
+ movaps %xmm9,-0x70(%r10)
+ movaps %xmm10,-0x60(%r10)
+ movaps %xmm11,-0x50(%r10)
+ movaps %xmm12,-0x40(%r10)
+ movaps %xmm13,-0x30(%r10)
+ movaps %xmm14,-0x20(%r10)
+ movaps %xmm15,-0x10(%r10)
.L8x_body:
___
$code.=<<___;
@@ -2061,7 +2060,7 @@ $code.=<<___;
vbroadcasti128 ($counter),$xd3 # key[3]
lea 0x100(%rsp),%rcx # size optimization
lea 0x200(%rsp),%rax # size optimization
- lea .Lrot16(%rip),%r10
+ lea .Lrot16(%rip),%r9
lea .Lrot24(%rip),%r11
vpshufd \$0x00,$xa3,$xa0 # smash key by lanes...
@@ -2125,7 +2124,7 @@ $code.=<<___;
.Loop_enter8x:
vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]"
vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]"
- vbroadcasti128 (%r10),$xt3
+ vbroadcasti128 (%r9),$xt3
vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters
mov \$10,%eax
jmp .Loop8x
@@ -2306,7 +2305,7 @@ $code.=<<___;
cmp \$64,$len
jae .L64_or_more8x
- xor %r10,%r10
+ xor %r9,%r9
vmovdqa $xa0,0x00(%rsp)
vmovdqa $xb0,0x20(%rsp)
jmp .Loop_tail8x
@@ -2320,7 +2319,7 @@ $code.=<<___;
je .Ldone8x
lea 0x40($inp),$inp # inp+=64*1
- xor %r10,%r10
+ xor %r9,%r9
vmovdqa $xc0,0x00(%rsp)
lea 0x40($out),$out # out+=64*1
sub \$64,$len # len-=64*1
@@ -2340,7 +2339,7 @@ $code.=<<___;
je .Ldone8x
lea 0x80($inp),$inp # inp+=64*2
- xor %r10,%r10
+ xor %r9,%r9
vmovdqa $xa1,0x00(%rsp)
lea 0x80($out),$out # out+=64*2
sub \$128,$len # len-=64*2
@@ -2364,7 +2363,7 @@ $code.=<<___;
je .Ldone8x
lea 0xc0($inp),$inp # inp+=64*3
- xor %r10,%r10
+ xor %r9,%r9
vmovdqa $xc1,0x00(%rsp)
lea 0xc0($out),$out # out+=64*3
sub \$192,$len # len-=64*3
@@ -2392,7 +2391,7 @@ $code.=<<___;
je .Ldone8x
lea 0x100($inp),$inp # inp+=64*4
- xor %r10,%r10
+ xor %r9,%r9
vmovdqa $xa2,0x00(%rsp)
lea 0x100($out),$out # out+=64*4
sub \$256,$len # len-=64*4
@@ -2424,7 +2423,7 @@ $code.=<<___;
je .Ldone8x
lea 0x140($inp),$inp # inp+=64*5
- xor %r10,%r10
+ xor %r9,%r9
vmovdqa $xc2,0x00(%rsp)
lea 0x140($out),$out # out+=64*5
sub \$320,$len # len-=64*5
@@ -2460,7 +2459,7 @@ $code.=<<___;
je .Ldone8x
lea 0x180($inp),$inp # inp+=64*6
- xor %r10,%r10
+ xor %r9,%r9
vmovdqa $xa3,0x00(%rsp)
lea 0x180($out),$out # out+=64*6
sub \$384,$len # len-=64*6
@@ -2500,18 +2499,18 @@ $code.=<<___;
je .Ldone8x
lea 0x1c0($inp),$inp # inp+=64*7
- xor %r10,%r10
+ xor %r9,%r9
vmovdqa $xc3,0x00(%rsp)
lea 0x1c0($out),$out # out+=64*7
sub \$448,$len # len-=64*7
vmovdqa $xd3,0x20(%rsp)
.Loop_tail8x:
- movzb ($inp,%r10),%eax
- movzb (%rsp,%r10),%ecx
- lea 1(%r10),%r10
+ movzb ($inp,%r9),%eax
+ movzb (%rsp,%r9),%ecx
+ lea 1(%r9),%r9
xor %ecx,%eax
- mov %al,-1($out,%r10)
+ mov %al,-1($out,%r9)
dec $len
jnz .Loop_tail8x
@@ -2519,19 +2518,19 @@ $code.=<<___;
vzeroall
___
$code.=<<___ if ($win64);
- movaps -0xa8(%r9),%xmm6
- movaps -0x98(%r9),%xmm7
- movaps -0x88(%r9),%xmm8
- movaps -0x78(%r9),%xmm9
- movaps -0x68(%r9),%xmm10
- movaps -0x58(%r9),%xmm11
- movaps -0x48(%r9),%xmm12
- movaps -0x38(%r9),%xmm13
- movaps -0x28(%r9),%xmm14
- movaps -0x18(%r9),%xmm15
-___
-$code.=<<___;
- lea (%r9),%rsp
+ movaps -0xa0(%r10),%xmm6
+ movaps -0x90(%r10),%xmm7
+ movaps -0x80(%r10),%xmm8
+ movaps -0x70(%r10),%xmm9
+ movaps -0x60(%r10),%xmm10
+ movaps -0x50(%r10),%xmm11
+ movaps -0x40(%r10),%xmm12
+ movaps -0x30(%r10),%xmm13
+ movaps -0x20(%r10),%xmm14
+ movaps -0x10(%r10),%xmm15
+___
+$code.=<<___;
+ lea -8(%r10),%rsp
.cfi_def_cfa_register %rsp
.L8x_epilogue:
ret
@@ -2591,8 +2590,8 @@ my $xframe = $win64 ? 32+8 : 8;
$code.=<<___;
.cfi_startproc
.LChaCha20_avx512:
- mov %rsp,%r9 # frame pointer
-.cfi_def_cfa_register %r9
+ lea 8(%rsp),%r10 # frame pointer
+.cfi_def_cfa_register %r10
cmp \$512,$len
ja .LChaCha20_16x
@@ -2600,8 +2599,8 @@ $code.=<<___;
and \$-64,%rsp
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0x28(%r9)
- movaps %xmm7,-0x18(%r9)
+ movaps %xmm6,-0x20(%r10)
+ movaps %xmm7,-0x10(%r10)
.Lavx512_body:
___
$code.=<<___;
@@ -2768,11 +2767,11 @@ $code.=<<___;
vzeroall
___
$code.=<<___ if ($win64);
- movaps -0x28(%r9),%xmm6
- movaps -0x18(%r9),%xmm7
+ movaps -0x20(%r10),%xmm6
+ movaps -0x10(%r10),%xmm7
___
$code.=<<___;
- lea (%r9),%rsp
+ lea -8(%r10),%rsp
.cfi_def_cfa_register %rsp
.Lavx512_epilogue:
ret
@@ -2786,8 +2785,8 @@ map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);
$code.=<<___;
.cfi_startproc
.LChaCha20_avx512vl:
- mov %rsp,%r9 # frame pointer
-.cfi_def_cfa_register %r9
+ lea 8(%rsp),%r10 # frame pointer
+.cfi_def_cfa_register %r10
cmp \$128,$len
ja .LChaCha20_8xvl
@@ -2795,8 +2794,8 @@ $code.=<<___;
and \$-32,%rsp
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0x28(%r9)
- movaps %xmm7,-0x18(%r9)
+ movaps %xmm6,-0x20(%r10)
+ movaps %xmm7,-0x10(%r10)
.Lavx512vl_body:
___
$code.=<<___;
@@ -2920,11 +2919,11 @@ $code.=<<___;
vzeroall
___
$code.=<<___ if ($win64);
- movaps -0x28(%r9),%xmm6
- movaps -0x18(%r9),%xmm7
+ movaps -0x20(%r10),%xmm6
+ movaps -0x10(%r10),%xmm7
___
$code.=<<___;
- lea (%r9),%rsp
+ lea -8(%r10),%rsp
.cfi_def_cfa_register %rsp
.Lavx512vl_epilogue:
ret
@@ -3011,29 +3010,29 @@ $code.=<<___;
ChaCha20_16x:
.cfi_startproc
.LChaCha20_16x:
- mov %rsp,%r9 # frame register
-.cfi_def_cfa_register %r9
+ lea 8(%rsp),%r10 # frame register
+.cfi_def_cfa_register %r10
sub \$64+$xframe,%rsp
and \$-64,%rsp
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0xa8(%r9)
- movaps %xmm7,-0x98(%r9)
- movaps %xmm8,-0x88(%r9)
- movaps %xmm9,-0x78(%r9)
- movaps %xmm10,-0x68(%r9)
- movaps %xmm11,-0x58(%r9)
- movaps %xmm12,-0x48(%r9)
- movaps %xmm13,-0x38(%r9)
- movaps %xmm14,-0x28(%r9)
- movaps %xmm15,-0x18(%r9)
+ movaps %xmm6,-0xa0(%r10)
+ movaps %xmm7,-0x90(%r10)
+ movaps %xmm8,-0x80(%r10)
+ movaps %xmm9,-0x70(%r10)
+ movaps %xmm10,-0x60(%r10)
+ movaps %xmm11,-0x50(%r10)
+ movaps %xmm12,-0x40(%r10)
+ movaps %xmm13,-0x30(%r10)
+ movaps %xmm14,-0x20(%r10)
+ movaps %xmm15,-0x10(%r10)
.L16x_body:
___
$code.=<<___;
vzeroupper
- lea .Lsigma(%rip),%r10
- vbroadcasti32x4 (%r10),$xa3 # key[0]
+ lea .Lsigma(%rip),%r9
+ vbroadcasti32x4 (%r9),$xa3 # key[0]
vbroadcasti32x4 ($key),$xb3 # key[1]
vbroadcasti32x4 16($key),$xc3 # key[2]
vbroadcasti32x4 ($counter),$xd3 # key[3]
@@ -3080,10 +3079,10 @@ $code.=<<___;
.align 32
.Loop_outer16x:
- vpbroadcastd 0(%r10),$xa0 # reload key
- vpbroadcastd 4(%r10),$xa1
- vpbroadcastd 8(%r10),$xa2
- vpbroadcastd 12(%r10),$xa3
+ vpbroadcastd 0(%r9),$xa0 # reload key
+ vpbroadcastd 4(%r9),$xa1
+ vpbroadcastd 8(%r9),$xa2
+ vpbroadcastd 12(%r9),$xa3
vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters
vmovdqa64 @key[4],$xb0
vmovdqa64 @key[5],$xb1
@@ -3274,7 +3273,7 @@ $code.=<<___;
.align 32
.Ltail16x:
- xor %r10,%r10
+ xor %r9,%r9
sub $inp,$out
cmp \$64*1,$len
jb .Less_than_64_16x
@@ -3402,11 +3401,11 @@ $code.=<<___;
and \$63,$len
.Loop_tail16x:
- movzb ($inp,%r10),%eax
- movzb (%rsp,%r10),%ecx
- lea 1(%r10),%r10
+ movzb ($inp,%r9),%eax
+ movzb (%rsp,%r9),%ecx
+ lea 1(%r9),%r9
xor %ecx,%eax
- mov %al,-1($out,%r10)
+ mov %al,-1($out,%r9)
dec $len
jnz .Loop_tail16x
@@ -3417,19 +3416,19 @@ $code.=<<___;
vzeroall
___
$code.=<<___ if ($win64);
- movaps -0xa8(%r9),%xmm6
- movaps -0x98(%r9),%xmm7
- movaps -0x88(%r9),%xmm8
- movaps -0x78(%r9),%xmm9
- movaps -0x68(%r9),%xmm10
- movaps -0x58(%r9),%xmm11
- movaps -0x48(%r9),%xmm12
- movaps -0x38(%r9),%xmm13
- movaps -0x28(%r9),%xmm14
- movaps -0x18(%r9),%xmm15
-___
-$code.=<<___;
- lea (%r9),%rsp
+ movaps -0xa0(%r10),%xmm6
+ movaps -0x90(%r10),%xmm7
+ movaps -0x80(%r10),%xmm8
+ movaps -0x70(%r10),%xmm9
+ movaps -0x60(%r10),%xmm10
+ movaps -0x50(%r10),%xmm11
+ movaps -0x40(%r10),%xmm12
+ movaps -0x30(%r10),%xmm13
+ movaps -0x20(%r10),%xmm14
+ movaps -0x10(%r10),%xmm15
+___
+$code.=<<___;
+ lea -8(%r10),%rsp
.cfi_def_cfa_register %rsp
.L16x_epilogue:
ret
@@ -3451,29 +3450,29 @@ $code.=<<___;
ChaCha20_8xvl:
.cfi_startproc
.LChaCha20_8xvl:
- mov %rsp,%r9 # frame register
-.cfi_def_cfa_register %r9
+ lea 8(%rsp),%r10 # frame register
+.cfi_def_cfa_register %r10
sub \$64+$xframe,%rsp
and \$-64,%rsp
___
$code.=<<___ if ($win64);
- movaps %xmm6,-0xa8(%r9)
- movaps %xmm7,-0x98(%r9)
- movaps %xmm8,-0x88(%r9)
- movaps %xmm9,-0x78(%r9)
- movaps %xmm10,-0x68(%r9)
- movaps %xmm11,-0x58(%r9)
- movaps %xmm12,-0x48(%r9)
- movaps %xmm13,-0x38(%r9)
- movaps %xmm14,-0x28(%r9)
- movaps %xmm15,-0x18(%r9)
+ movaps %xmm6,-0xa0(%r10)
+ movaps %xmm7,-0x90(%r10)
+ movaps %xmm8,-0x80(%r10)
+ movaps %xmm9,-0x70(%r10)
+ movaps %xmm10,-0x60(%r10)
+ movaps %xmm11,-0x50(%r10)
+ movaps %xmm12,-0x40(%r10)
+ movaps %xmm13,-0x30(%r10)
+ movaps %xmm14,-0x20(%r10)
+ movaps %xmm15,-0x10(%r10)
.L8xvl_body:
___
$code.=<<___;
vzeroupper
- lea .Lsigma(%rip),%r10
- vbroadcasti128 (%r10),$xa3 # key[0]
+ lea .Lsigma(%rip),%r9
+ vbroadcasti128 (%r9),$xa3 # key[0]
vbroadcasti128 ($key),$xb3 # key[1]
vbroadcasti128 16($key),$xc3 # key[2]
vbroadcasti128 ($counter),$xd3 # key[3]
@@ -3520,10 +3519,10 @@ $code.=<<___;
.align 32
.Loop_outer8xvl:
- #vpbroadcastd 0(%r10),$xa0 # reload key
- #vpbroadcastd 4(%r10),$xa1
- vpbroadcastd 8(%r10),$xa2
- vpbroadcastd 12(%r10),$xa3
+ #vpbroadcastd 0(%r9),$xa0 # reload key
+ #vpbroadcastd 4(%r9),$xa1
+ vpbroadcastd 8(%r9),$xa2
+ vpbroadcastd 12(%r9),$xa3
vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters
vmovdqa64 @key[4],$xb0
vmovdqa64 @key[5],$xb1
@@ -3691,8 +3690,8 @@ $code.=<<___;
vmovdqu $xd3,0x60($out)
lea ($out,%rax),$out # size optimization
- vpbroadcastd 0(%r10),%ymm0 # reload key
- vpbroadcastd 4(%r10),%ymm1
+ vpbroadcastd 0(%r9),%ymm0 # reload key
+ vpbroadcastd 4(%r9),%ymm1
sub \$64*8,$len
jnz .Loop_outer8xvl
@@ -3705,7 +3704,7 @@ $code.=<<___;
___
$xa0 = "%ymm8";
$code.=<<___;
- xor %r10,%r10
+ xor %r9,%r9
sub $inp,$out
cmp \$64*1,$len
jb .Less_than_64_8xvl
@@ -3791,11 +3790,11 @@ $code.=<<___;
and \$63,$len
.Loop_tail8xvl:
- movzb ($inp,%r10),%eax
- movzb (%rsp,%r10),%ecx
- lea 1(%r10),%r10
+ movzb ($inp,%r9),%eax
+ movzb (%rsp,%r9),%ecx
+ lea 1(%r9),%r9
xor %ecx,%eax
- mov %al,-1($out,%r10)
+ mov %al,-1($out,%r9)
dec $len
jnz .Loop_tail8xvl
@@ -3807,19 +3806,19 @@ $code.=<<___;
vzeroall
___
$code.=<<___ if ($win64);
- movaps -0xa8(%r9),%xmm6
- movaps -0x98(%r9),%xmm7
- movaps -0x88(%r9),%xmm8
- movaps -0x78(%r9),%xmm9
- movaps -0x68(%r9),%xmm10
- movaps -0x58(%r9),%xmm11
- movaps -0x48(%r9),%xmm12
- movaps -0x38(%r9),%xmm13
- movaps -0x28(%r9),%xmm14
- movaps -0x18(%r9),%xmm15
-___
-$code.=<<___;
- lea (%r9),%rsp
+ movaps -0xa0(%r10),%xmm6
+ movaps -0x90(%r10),%xmm7
+ movaps -0x80(%r10),%xmm8
+ movaps -0x70(%r10),%xmm9
+ movaps -0x60(%r10),%xmm10
+ movaps -0x50(%r10),%xmm11
+ movaps -0x40(%r10),%xmm12
+ movaps -0x30(%r10),%xmm13
+ movaps -0x20(%r10),%xmm14
+ movaps -0x10(%r10),%xmm15
+___
+$code.=<<___;
+ lea -8(%r10),%rsp
.cfi_def_cfa_register %rsp
.L8xvl_epilogue:
ret
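
Every hunk above applies the same mechanical convention change, condensed here into one before/after sketch (illustrative only; function bodies are elided, labels omitted, and the two-slot Win64 case of chacha20_ssse3 is used): the frame register moves from %r9 holding the entry %rsp to %r10 holding entry %rsp + 8, i.e. the CFA; the Win64 XMM save slots and $xframe shift by 8 to match; and the epilogue changes from "lea (%r9),%rsp" to "lea -8(%r10),%rsp".

	# old convention: frame register = entry %rsp (CFA - 8)
	mov	%rsp,%r9
	.cfi_def_cfa_register	%r9
	sub	$64+40,%rsp		# $xframe = 32+8 on win64
	and	$-16,%rsp
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
	# ... body ...
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
	lea	(%r9),%rsp		# back to entry %rsp
	ret

	# new convention: frame register = entry %rsp + 8 (the CFA)
	lea	8(%rsp),%r10
	.cfi_def_cfa_register	%r10
	sub	$64+32,%rsp		# $xframe drops by 8 to match
	and	$-16,%rsp
	movaps	%xmm6,-0x20(%r10)
	movaps	%xmm7,-0x10(%r10)
	# ... body ...
	movaps	-0x20(%r10),%xmm6
	movaps	-0x10(%r10),%xmm7
	lea	-8(%r10),%rsp		# back to entry %rsp
	ret

With %r10 taken for the frame, %r9 and %r10 effectively swap duties: every prior use of %r10 as scratch (the OPENSSL_ia32cap_P word, the .Lrot16/.Lsigma pointers, and the .Loop_tail* byte index) becomes %r9, which is why the bulk of the hunks are one-for-one register renames.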