From 488c43da50d43432ef1bb49a11411bc124dc5e4c Mon Sep 17 00:00:00 2001 From: Samuel Neves Date: Mon, 12 Nov 2018 08:14:51 +0000 Subject: chacha20,poly1305: switch to perlasm originals on x86_64 Signed-off-by: Samuel Neves --- src/crypto/zinc/chacha20/chacha20-x86_64.S | 2632 ----------------- src/crypto/zinc/chacha20/chacha20-x86_64.pl | 4005 ++++++++++++++++++++++++++ src/crypto/zinc/perlasm/x86_64-xlate.pl | 1432 +++++++++ src/crypto/zinc/poly1305/poly1305-x86_64.S | 2792 ------------------ src/crypto/zinc/poly1305/poly1305-x86_64.pl | 4159 +++++++++++++++++++++++++++ 5 files changed, 9596 insertions(+), 5424 deletions(-) delete mode 100644 src/crypto/zinc/chacha20/chacha20-x86_64.S create mode 100644 src/crypto/zinc/chacha20/chacha20-x86_64.pl create mode 100644 src/crypto/zinc/perlasm/x86_64-xlate.pl delete mode 100644 src/crypto/zinc/poly1305/poly1305-x86_64.S create mode 100644 src/crypto/zinc/poly1305/poly1305-x86_64.pl (limited to 'src') diff --git a/src/crypto/zinc/chacha20/chacha20-x86_64.S b/src/crypto/zinc/chacha20/chacha20-x86_64.S deleted file mode 100644 index 3d10c7f..0000000 --- a/src/crypto/zinc/chacha20/chacha20-x86_64.S +++ /dev/null @@ -1,2632 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* - * Copyright (C) 2017 Samuel Neves . All Rights Reserved. - * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. - * Copyright (C) 2006-2017 CRYPTOGAMS by . All Rights Reserved. - * - * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS. 
- */ - -#include - -.section .rodata.cst16.Lzero, "aM", @progbits, 16 -.align 16 -.Lzero: -.long 0,0,0,0 -.section .rodata.cst16.Lone, "aM", @progbits, 16 -.align 16 -.Lone: -.long 1,0,0,0 -.section .rodata.cst16.Linc, "aM", @progbits, 16 -.align 16 -.Linc: -.long 0,1,2,3 -.section .rodata.cst16.Lfour, "aM", @progbits, 16 -.align 16 -.Lfour: -.long 4,4,4,4 -.section .rodata.cst32.Lincy, "aM", @progbits, 32 -.align 32 -.Lincy: -.long 0,2,4,6,1,3,5,7 -.section .rodata.cst32.Leight, "aM", @progbits, 32 -.align 32 -.Leight: -.long 8,8,8,8,8,8,8,8 -.section .rodata.cst16.Lrot16, "aM", @progbits, 16 -.align 16 -.Lrot16: -.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd -.section .rodata.cst16.Lrot24, "aM", @progbits, 16 -.align 16 -.Lrot24: -.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe -.section .rodata.cst16.Lsigma, "aM", @progbits, 16 -.align 16 -.Lsigma: -.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 -.section .rodata.cst64.Lzeroz, "aM", @progbits, 64 -.align 64 -.Lzeroz: -.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 -.section .rodata.cst64.Lfourz, "aM", @progbits, 64 -.align 64 -.Lfourz: -.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 -.section .rodata.cst64.Lincz, "aM", @progbits, 64 -.align 64 -.Lincz: -.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 -.section .rodata.cst64.Lsixteen, "aM", @progbits, 64 -.align 64 -.Lsixteen: -.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 -.section .rodata.cst32.Ltwoy, "aM", @progbits, 32 -.align 64 -.Ltwoy: -.long 2,0,0,0, 2,0,0,0 - -.text - -#ifdef CONFIG_AS_SSSE3 -.align 32 -ENTRY(hchacha20_ssse3) - movdqa .Lsigma(%rip),%xmm0 - movdqu (%rdx),%xmm1 - movdqu 16(%rdx),%xmm2 - movdqu (%rsi),%xmm3 - movdqa .Lrot16(%rip),%xmm6 - movdqa .Lrot24(%rip),%xmm7 - movq $10,%r8 - .align 32 -.Loop_hssse3: - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm6,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - 
paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm7,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $78,%xmm2,%xmm2 - pshufd $57,%xmm1,%xmm1 - pshufd $147,%xmm3,%xmm3 - nop - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm6,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm7,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $78,%xmm2,%xmm2 - pshufd $147,%xmm1,%xmm1 - pshufd $57,%xmm3,%xmm3 - decq %r8 - jnz .Loop_hssse3 - movdqu %xmm0,0(%rdi) - movdqu %xmm3,16(%rdi) - ret -ENDPROC(hchacha20_ssse3) - -.align 32 -ENTRY(chacha20_ssse3) -.Lchacha20_ssse3: - cmpq $0,%rdx - je .Lssse3_epilogue - leaq 8(%rsp),%r10 - - cmpq $128,%rdx - ja .Lchacha20_4x - -.Ldo_sse3_after_all: - subq $64+8,%rsp - andq $-32,%rsp - movdqa .Lsigma(%rip),%xmm0 - movdqu (%rcx),%xmm1 - movdqu 16(%rcx),%xmm2 - movdqu (%r8),%xmm3 - movdqa .Lrot16(%rip),%xmm6 - movdqa .Lrot24(%rip),%xmm7 - - movdqa %xmm0,0(%rsp) - movdqa %xmm1,16(%rsp) - movdqa %xmm2,32(%rsp) - movdqa %xmm3,48(%rsp) - movq $10,%r8 - jmp .Loop_ssse3 - -.align 32 -.Loop_outer_ssse3: - movdqa .Lone(%rip),%xmm3 - movdqa 0(%rsp),%xmm0 - movdqa 16(%rsp),%xmm1 - movdqa 32(%rsp),%xmm2 - paddd 48(%rsp),%xmm3 - movq $10,%r8 - movdqa %xmm3,48(%rsp) - jmp .Loop_ssse3 - -.align 32 -.Loop_ssse3: - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm6,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm7,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $78,%xmm2,%xmm2 - pshufd $57,%xmm1,%xmm1 - pshufd $147,%xmm3,%xmm3 - nop - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb 
%xmm6,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 - pshufb %xmm7,%xmm3 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $78,%xmm2,%xmm2 - pshufd $147,%xmm1,%xmm1 - pshufd $57,%xmm3,%xmm3 - decq %r8 - jnz .Loop_ssse3 - paddd 0(%rsp),%xmm0 - paddd 16(%rsp),%xmm1 - paddd 32(%rsp),%xmm2 - paddd 48(%rsp),%xmm3 - - cmpq $64,%rdx - jb .Ltail_ssse3 - - movdqu 0(%rsi),%xmm4 - movdqu 16(%rsi),%xmm5 - pxor %xmm4,%xmm0 - movdqu 32(%rsi),%xmm4 - pxor %xmm5,%xmm1 - movdqu 48(%rsi),%xmm5 - leaq 64(%rsi),%rsi - pxor %xmm4,%xmm2 - pxor %xmm5,%xmm3 - - movdqu %xmm0,0(%rdi) - movdqu %xmm1,16(%rdi) - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - leaq 64(%rdi),%rdi - - subq $64,%rdx - jnz .Loop_outer_ssse3 - - jmp .Ldone_ssse3 - -.align 16 -.Ltail_ssse3: - movdqa %xmm0,0(%rsp) - movdqa %xmm1,16(%rsp) - movdqa %xmm2,32(%rsp) - movdqa %xmm3,48(%rsp) - xorq %r8,%r8 - -.Loop_tail_ssse3: - movzbl (%rsi,%r8,1),%eax - movzbl (%rsp,%r8,1),%ecx - leaq 1(%r8),%r8 - xorl %ecx,%eax - movb %al,-1(%rdi,%r8,1) - decq %rdx - jnz .Loop_tail_ssse3 - -.Ldone_ssse3: - leaq -8(%r10),%rsp - -.Lssse3_epilogue: - ret - -.align 32 -.Lchacha20_4x: - leaq 8(%rsp),%r10 - -.Lproceed4x: - subq $0x140+8,%rsp - andq $-32,%rsp - movdqa .Lsigma(%rip),%xmm11 - movdqu (%rcx),%xmm15 - movdqu 16(%rcx),%xmm7 - movdqu (%r8),%xmm3 - leaq 256(%rsp),%rcx - leaq .Lrot16(%rip),%r9 - leaq .Lrot24(%rip),%r11 - - pshufd $0x00,%xmm11,%xmm8 - pshufd $0x55,%xmm11,%xmm9 - movdqa %xmm8,64(%rsp) - pshufd $0xaa,%xmm11,%xmm10 - movdqa %xmm9,80(%rsp) - pshufd $0xff,%xmm11,%xmm11 - movdqa %xmm10,96(%rsp) - movdqa %xmm11,112(%rsp) - - pshufd $0x00,%xmm15,%xmm12 - pshufd $0x55,%xmm15,%xmm13 - movdqa %xmm12,128-256(%rcx) - pshufd $0xaa,%xmm15,%xmm14 - movdqa %xmm13,144-256(%rcx) - pshufd $0xff,%xmm15,%xmm15 - movdqa %xmm14,160-256(%rcx) - movdqa 
%xmm15,176-256(%rcx) - - pshufd $0x00,%xmm7,%xmm4 - pshufd $0x55,%xmm7,%xmm5 - movdqa %xmm4,192-256(%rcx) - pshufd $0xaa,%xmm7,%xmm6 - movdqa %xmm5,208-256(%rcx) - pshufd $0xff,%xmm7,%xmm7 - movdqa %xmm6,224-256(%rcx) - movdqa %xmm7,240-256(%rcx) - - pshufd $0x00,%xmm3,%xmm0 - pshufd $0x55,%xmm3,%xmm1 - paddd .Linc(%rip),%xmm0 - pshufd $0xaa,%xmm3,%xmm2 - movdqa %xmm1,272-256(%rcx) - pshufd $0xff,%xmm3,%xmm3 - movdqa %xmm2,288-256(%rcx) - movdqa %xmm3,304-256(%rcx) - - jmp .Loop_enter4x - -.align 32 -.Loop_outer4x: - movdqa 64(%rsp),%xmm8 - movdqa 80(%rsp),%xmm9 - movdqa 96(%rsp),%xmm10 - movdqa 112(%rsp),%xmm11 - movdqa 128-256(%rcx),%xmm12 - movdqa 144-256(%rcx),%xmm13 - movdqa 160-256(%rcx),%xmm14 - movdqa 176-256(%rcx),%xmm15 - movdqa 192-256(%rcx),%xmm4 - movdqa 208-256(%rcx),%xmm5 - movdqa 224-256(%rcx),%xmm6 - movdqa 240-256(%rcx),%xmm7 - movdqa 256-256(%rcx),%xmm0 - movdqa 272-256(%rcx),%xmm1 - movdqa 288-256(%rcx),%xmm2 - movdqa 304-256(%rcx),%xmm3 - paddd .Lfour(%rip),%xmm0 - -.Loop_enter4x: - movdqa %xmm6,32(%rsp) - movdqa %xmm7,48(%rsp) - movdqa (%r9),%xmm7 - movl $10,%eax - movdqa %xmm0,256-256(%rcx) - jmp .Loop4x - -.align 32 -.Loop4x: - paddd %xmm12,%xmm8 - paddd %xmm13,%xmm9 - pxor %xmm8,%xmm0 - pxor %xmm9,%xmm1 - pshufb %xmm7,%xmm0 - pshufb %xmm7,%xmm1 - paddd %xmm0,%xmm4 - paddd %xmm1,%xmm5 - pxor %xmm4,%xmm12 - pxor %xmm5,%xmm13 - movdqa %xmm12,%xmm6 - pslld $12,%xmm12 - psrld $20,%xmm6 - movdqa %xmm13,%xmm7 - pslld $12,%xmm13 - por %xmm6,%xmm12 - psrld $20,%xmm7 - movdqa (%r11),%xmm6 - por %xmm7,%xmm13 - paddd %xmm12,%xmm8 - paddd %xmm13,%xmm9 - pxor %xmm8,%xmm0 - pxor %xmm9,%xmm1 - pshufb %xmm6,%xmm0 - pshufb %xmm6,%xmm1 - paddd %xmm0,%xmm4 - paddd %xmm1,%xmm5 - pxor %xmm4,%xmm12 - pxor %xmm5,%xmm13 - movdqa %xmm12,%xmm7 - pslld $7,%xmm12 - psrld $25,%xmm7 - movdqa %xmm13,%xmm6 - pslld $7,%xmm13 - por %xmm7,%xmm12 - psrld $25,%xmm6 - movdqa (%r9),%xmm7 - por %xmm6,%xmm13 - movdqa %xmm4,0(%rsp) - movdqa %xmm5,16(%rsp) - movdqa 32(%rsp),%xmm4 - 
movdqa 48(%rsp),%xmm5 - paddd %xmm14,%xmm10 - paddd %xmm15,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm3 - pshufb %xmm7,%xmm2 - pshufb %xmm7,%xmm3 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - pxor %xmm4,%xmm14 - pxor %xmm5,%xmm15 - movdqa %xmm14,%xmm6 - pslld $12,%xmm14 - psrld $20,%xmm6 - movdqa %xmm15,%xmm7 - pslld $12,%xmm15 - por %xmm6,%xmm14 - psrld $20,%xmm7 - movdqa (%r11),%xmm6 - por %xmm7,%xmm15 - paddd %xmm14,%xmm10 - paddd %xmm15,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm3 - pshufb %xmm6,%xmm2 - pshufb %xmm6,%xmm3 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - pxor %xmm4,%xmm14 - pxor %xmm5,%xmm15 - movdqa %xmm14,%xmm7 - pslld $7,%xmm14 - psrld $25,%xmm7 - movdqa %xmm15,%xmm6 - pslld $7,%xmm15 - por %xmm7,%xmm14 - psrld $25,%xmm6 - movdqa (%r9),%xmm7 - por %xmm6,%xmm15 - paddd %xmm13,%xmm8 - paddd %xmm14,%xmm9 - pxor %xmm8,%xmm3 - pxor %xmm9,%xmm0 - pshufb %xmm7,%xmm3 - pshufb %xmm7,%xmm0 - paddd %xmm3,%xmm4 - paddd %xmm0,%xmm5 - pxor %xmm4,%xmm13 - pxor %xmm5,%xmm14 - movdqa %xmm13,%xmm6 - pslld $12,%xmm13 - psrld $20,%xmm6 - movdqa %xmm14,%xmm7 - pslld $12,%xmm14 - por %xmm6,%xmm13 - psrld $20,%xmm7 - movdqa (%r11),%xmm6 - por %xmm7,%xmm14 - paddd %xmm13,%xmm8 - paddd %xmm14,%xmm9 - pxor %xmm8,%xmm3 - pxor %xmm9,%xmm0 - pshufb %xmm6,%xmm3 - pshufb %xmm6,%xmm0 - paddd %xmm3,%xmm4 - paddd %xmm0,%xmm5 - pxor %xmm4,%xmm13 - pxor %xmm5,%xmm14 - movdqa %xmm13,%xmm7 - pslld $7,%xmm13 - psrld $25,%xmm7 - movdqa %xmm14,%xmm6 - pslld $7,%xmm14 - por %xmm7,%xmm13 - psrld $25,%xmm6 - movdqa (%r9),%xmm7 - por %xmm6,%xmm14 - movdqa %xmm4,32(%rsp) - movdqa %xmm5,48(%rsp) - movdqa 0(%rsp),%xmm4 - movdqa 16(%rsp),%xmm5 - paddd %xmm15,%xmm10 - paddd %xmm12,%xmm11 - pxor %xmm10,%xmm1 - pxor %xmm11,%xmm2 - pshufb %xmm7,%xmm1 - pshufb %xmm7,%xmm2 - paddd %xmm1,%xmm4 - paddd %xmm2,%xmm5 - pxor %xmm4,%xmm15 - pxor %xmm5,%xmm12 - movdqa %xmm15,%xmm6 - pslld $12,%xmm15 - psrld $20,%xmm6 - movdqa %xmm12,%xmm7 - pslld $12,%xmm12 - por %xmm6,%xmm15 - psrld $20,%xmm7 - movdqa 
(%r11),%xmm6 - por %xmm7,%xmm12 - paddd %xmm15,%xmm10 - paddd %xmm12,%xmm11 - pxor %xmm10,%xmm1 - pxor %xmm11,%xmm2 - pshufb %xmm6,%xmm1 - pshufb %xmm6,%xmm2 - paddd %xmm1,%xmm4 - paddd %xmm2,%xmm5 - pxor %xmm4,%xmm15 - pxor %xmm5,%xmm12 - movdqa %xmm15,%xmm7 - pslld $7,%xmm15 - psrld $25,%xmm7 - movdqa %xmm12,%xmm6 - pslld $7,%xmm12 - por %xmm7,%xmm15 - psrld $25,%xmm6 - movdqa (%r9),%xmm7 - por %xmm6,%xmm12 - decl %eax - jnz .Loop4x - - paddd 64(%rsp),%xmm8 - paddd 80(%rsp),%xmm9 - paddd 96(%rsp),%xmm10 - paddd 112(%rsp),%xmm11 - - movdqa %xmm8,%xmm6 - punpckldq %xmm9,%xmm8 - movdqa %xmm10,%xmm7 - punpckldq %xmm11,%xmm10 - punpckhdq %xmm9,%xmm6 - punpckhdq %xmm11,%xmm7 - movdqa %xmm8,%xmm9 - punpcklqdq %xmm10,%xmm8 - movdqa %xmm6,%xmm11 - punpcklqdq %xmm7,%xmm6 - punpckhqdq %xmm10,%xmm9 - punpckhqdq %xmm7,%xmm11 - paddd 128-256(%rcx),%xmm12 - paddd 144-256(%rcx),%xmm13 - paddd 160-256(%rcx),%xmm14 - paddd 176-256(%rcx),%xmm15 - - movdqa %xmm8,0(%rsp) - movdqa %xmm9,16(%rsp) - movdqa 32(%rsp),%xmm8 - movdqa 48(%rsp),%xmm9 - - movdqa %xmm12,%xmm10 - punpckldq %xmm13,%xmm12 - movdqa %xmm14,%xmm7 - punpckldq %xmm15,%xmm14 - punpckhdq %xmm13,%xmm10 - punpckhdq %xmm15,%xmm7 - movdqa %xmm12,%xmm13 - punpcklqdq %xmm14,%xmm12 - movdqa %xmm10,%xmm15 - punpcklqdq %xmm7,%xmm10 - punpckhqdq %xmm14,%xmm13 - punpckhqdq %xmm7,%xmm15 - paddd 192-256(%rcx),%xmm4 - paddd 208-256(%rcx),%xmm5 - paddd 224-256(%rcx),%xmm8 - paddd 240-256(%rcx),%xmm9 - - movdqa %xmm6,32(%rsp) - movdqa %xmm11,48(%rsp) - - movdqa %xmm4,%xmm14 - punpckldq %xmm5,%xmm4 - movdqa %xmm8,%xmm7 - punpckldq %xmm9,%xmm8 - punpckhdq %xmm5,%xmm14 - punpckhdq %xmm9,%xmm7 - movdqa %xmm4,%xmm5 - punpcklqdq %xmm8,%xmm4 - movdqa %xmm14,%xmm9 - punpcklqdq %xmm7,%xmm14 - punpckhqdq %xmm8,%xmm5 - punpckhqdq %xmm7,%xmm9 - paddd 256-256(%rcx),%xmm0 - paddd 272-256(%rcx),%xmm1 - paddd 288-256(%rcx),%xmm2 - paddd 304-256(%rcx),%xmm3 - - movdqa %xmm0,%xmm8 - punpckldq %xmm1,%xmm0 - movdqa %xmm2,%xmm7 - punpckldq %xmm3,%xmm2 - 
punpckhdq %xmm1,%xmm8 - punpckhdq %xmm3,%xmm7 - movdqa %xmm0,%xmm1 - punpcklqdq %xmm2,%xmm0 - movdqa %xmm8,%xmm3 - punpcklqdq %xmm7,%xmm8 - punpckhqdq %xmm2,%xmm1 - punpckhqdq %xmm7,%xmm3 - cmpq $256,%rdx - jb .Ltail4x - - movdqu 0(%rsi),%xmm6 - movdqu 16(%rsi),%xmm11 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm7 - pxor 0(%rsp),%xmm6 - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - - movdqu %xmm6,0(%rdi) - movdqu 64(%rsi),%xmm6 - movdqu %xmm11,16(%rdi) - movdqu 80(%rsi),%xmm11 - movdqu %xmm2,32(%rdi) - movdqu 96(%rsi),%xmm2 - movdqu %xmm7,48(%rdi) - movdqu 112(%rsi),%xmm7 - leaq 128(%rsi),%rsi - pxor 16(%rsp),%xmm6 - pxor %xmm13,%xmm11 - pxor %xmm5,%xmm2 - pxor %xmm1,%xmm7 - - movdqu %xmm6,64(%rdi) - movdqu 0(%rsi),%xmm6 - movdqu %xmm11,80(%rdi) - movdqu 16(%rsi),%xmm11 - movdqu %xmm2,96(%rdi) - movdqu 32(%rsi),%xmm2 - movdqu %xmm7,112(%rdi) - leaq 128(%rdi),%rdi - movdqu 48(%rsi),%xmm7 - pxor 32(%rsp),%xmm6 - pxor %xmm10,%xmm11 - pxor %xmm14,%xmm2 - pxor %xmm8,%xmm7 - - movdqu %xmm6,0(%rdi) - movdqu 64(%rsi),%xmm6 - movdqu %xmm11,16(%rdi) - movdqu 80(%rsi),%xmm11 - movdqu %xmm2,32(%rdi) - movdqu 96(%rsi),%xmm2 - movdqu %xmm7,48(%rdi) - movdqu 112(%rsi),%xmm7 - leaq 128(%rsi),%rsi - pxor 48(%rsp),%xmm6 - pxor %xmm15,%xmm11 - pxor %xmm9,%xmm2 - pxor %xmm3,%xmm7 - movdqu %xmm6,64(%rdi) - movdqu %xmm11,80(%rdi) - movdqu %xmm2,96(%rdi) - movdqu %xmm7,112(%rdi) - leaq 128(%rdi),%rdi - - subq $256,%rdx - jnz .Loop_outer4x - - jmp .Ldone4x - -.Ltail4x: - cmpq $192,%rdx - jae .L192_or_more4x - cmpq $128,%rdx - jae .L128_or_more4x - cmpq $64,%rdx - jae .L64_or_more4x - - - xorq %r9,%r9 - - movdqa %xmm12,16(%rsp) - movdqa %xmm4,32(%rsp) - movdqa %xmm0,48(%rsp) - jmp .Loop_tail4x - -.align 32 -.L64_or_more4x: - movdqu 0(%rsi),%xmm6 - movdqu 16(%rsi),%xmm11 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm7 - pxor 0(%rsp),%xmm6 - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - movdqu %xmm6,0(%rdi) - movdqu %xmm11,16(%rdi) - movdqu %xmm2,32(%rdi) - movdqu 
%xmm7,48(%rdi) - je .Ldone4x - - movdqa 16(%rsp),%xmm6 - leaq 64(%rsi),%rsi - xorq %r9,%r9 - movdqa %xmm6,0(%rsp) - movdqa %xmm13,16(%rsp) - leaq 64(%rdi),%rdi - movdqa %xmm5,32(%rsp) - subq $64,%rdx - movdqa %xmm1,48(%rsp) - jmp .Loop_tail4x - -.align 32 -.L128_or_more4x: - movdqu 0(%rsi),%xmm6 - movdqu 16(%rsi),%xmm11 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm7 - pxor 0(%rsp),%xmm6 - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - - movdqu %xmm6,0(%rdi) - movdqu 64(%rsi),%xmm6 - movdqu %xmm11,16(%rdi) - movdqu 80(%rsi),%xmm11 - movdqu %xmm2,32(%rdi) - movdqu 96(%rsi),%xmm2 - movdqu %xmm7,48(%rdi) - movdqu 112(%rsi),%xmm7 - pxor 16(%rsp),%xmm6 - pxor %xmm13,%xmm11 - pxor %xmm5,%xmm2 - pxor %xmm1,%xmm7 - movdqu %xmm6,64(%rdi) - movdqu %xmm11,80(%rdi) - movdqu %xmm2,96(%rdi) - movdqu %xmm7,112(%rdi) - je .Ldone4x - - movdqa 32(%rsp),%xmm6 - leaq 128(%rsi),%rsi - xorq %r9,%r9 - movdqa %xmm6,0(%rsp) - movdqa %xmm10,16(%rsp) - leaq 128(%rdi),%rdi - movdqa %xmm14,32(%rsp) - subq $128,%rdx - movdqa %xmm8,48(%rsp) - jmp .Loop_tail4x - -.align 32 -.L192_or_more4x: - movdqu 0(%rsi),%xmm6 - movdqu 16(%rsi),%xmm11 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm7 - pxor 0(%rsp),%xmm6 - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - - movdqu %xmm6,0(%rdi) - movdqu 64(%rsi),%xmm6 - movdqu %xmm11,16(%rdi) - movdqu 80(%rsi),%xmm11 - movdqu %xmm2,32(%rdi) - movdqu 96(%rsi),%xmm2 - movdqu %xmm7,48(%rdi) - movdqu 112(%rsi),%xmm7 - leaq 128(%rsi),%rsi - pxor 16(%rsp),%xmm6 - pxor %xmm13,%xmm11 - pxor %xmm5,%xmm2 - pxor %xmm1,%xmm7 - - movdqu %xmm6,64(%rdi) - movdqu 0(%rsi),%xmm6 - movdqu %xmm11,80(%rdi) - movdqu 16(%rsi),%xmm11 - movdqu %xmm2,96(%rdi) - movdqu 32(%rsi),%xmm2 - movdqu %xmm7,112(%rdi) - leaq 128(%rdi),%rdi - movdqu 48(%rsi),%xmm7 - pxor 32(%rsp),%xmm6 - pxor %xmm10,%xmm11 - pxor %xmm14,%xmm2 - pxor %xmm8,%xmm7 - movdqu %xmm6,0(%rdi) - movdqu %xmm11,16(%rdi) - movdqu %xmm2,32(%rdi) - movdqu %xmm7,48(%rdi) - je .Ldone4x - - movdqa 48(%rsp),%xmm6 
- leaq 64(%rsi),%rsi - xorq %r9,%r9 - movdqa %xmm6,0(%rsp) - movdqa %xmm15,16(%rsp) - leaq 64(%rdi),%rdi - movdqa %xmm9,32(%rsp) - subq $192,%rdx - movdqa %xmm3,48(%rsp) - -.Loop_tail4x: - movzbl (%rsi,%r9,1),%eax - movzbl (%rsp,%r9,1),%ecx - leaq 1(%r9),%r9 - xorl %ecx,%eax - movb %al,-1(%rdi,%r9,1) - decq %rdx - jnz .Loop_tail4x - -.Ldone4x: - leaq -8(%r10),%rsp - -.L4x_epilogue: - ret -ENDPROC(chacha20_ssse3) -#endif /* CONFIG_AS_SSSE3 */ - -#ifdef CONFIG_AS_AVX2 -.align 32 -ENTRY(chacha20_avx2) -.Lchacha20_avx2: - cmpq $0,%rdx - je .L8x_epilogue - leaq 8(%rsp),%r10 - - subq $0x280+8,%rsp - andq $-32,%rsp - vzeroupper - - vbroadcasti128 .Lsigma(%rip),%ymm11 - vbroadcasti128 (%rcx),%ymm3 - vbroadcasti128 16(%rcx),%ymm15 - vbroadcasti128 (%r8),%ymm7 - leaq 256(%rsp),%rcx - leaq 512(%rsp),%rax - leaq .Lrot16(%rip),%r9 - leaq .Lrot24(%rip),%r11 - - vpshufd $0x00,%ymm11,%ymm8 - vpshufd $0x55,%ymm11,%ymm9 - vmovdqa %ymm8,128-256(%rcx) - vpshufd $0xaa,%ymm11,%ymm10 - vmovdqa %ymm9,160-256(%rcx) - vpshufd $0xff,%ymm11,%ymm11 - vmovdqa %ymm10,192-256(%rcx) - vmovdqa %ymm11,224-256(%rcx) - - vpshufd $0x00,%ymm3,%ymm0 - vpshufd $0x55,%ymm3,%ymm1 - vmovdqa %ymm0,256-256(%rcx) - vpshufd $0xaa,%ymm3,%ymm2 - vmovdqa %ymm1,288-256(%rcx) - vpshufd $0xff,%ymm3,%ymm3 - vmovdqa %ymm2,320-256(%rcx) - vmovdqa %ymm3,352-256(%rcx) - - vpshufd $0x00,%ymm15,%ymm12 - vpshufd $0x55,%ymm15,%ymm13 - vmovdqa %ymm12,384-512(%rax) - vpshufd $0xaa,%ymm15,%ymm14 - vmovdqa %ymm13,416-512(%rax) - vpshufd $0xff,%ymm15,%ymm15 - vmovdqa %ymm14,448-512(%rax) - vmovdqa %ymm15,480-512(%rax) - - vpshufd $0x00,%ymm7,%ymm4 - vpshufd $0x55,%ymm7,%ymm5 - vpaddd .Lincy(%rip),%ymm4,%ymm4 - vpshufd $0xaa,%ymm7,%ymm6 - vmovdqa %ymm5,544-512(%rax) - vpshufd $0xff,%ymm7,%ymm7 - vmovdqa %ymm6,576-512(%rax) - vmovdqa %ymm7,608-512(%rax) - - jmp .Loop_enter8x - -.align 32 -.Loop_outer8x: - vmovdqa 128-256(%rcx),%ymm8 - vmovdqa 160-256(%rcx),%ymm9 - vmovdqa 192-256(%rcx),%ymm10 - vmovdqa 224-256(%rcx),%ymm11 - vmovdqa 
256-256(%rcx),%ymm0 - vmovdqa 288-256(%rcx),%ymm1 - vmovdqa 320-256(%rcx),%ymm2 - vmovdqa 352-256(%rcx),%ymm3 - vmovdqa 384-512(%rax),%ymm12 - vmovdqa 416-512(%rax),%ymm13 - vmovdqa 448-512(%rax),%ymm14 - vmovdqa 480-512(%rax),%ymm15 - vmovdqa 512-512(%rax),%ymm4 - vmovdqa 544-512(%rax),%ymm5 - vmovdqa 576-512(%rax),%ymm6 - vmovdqa 608-512(%rax),%ymm7 - vpaddd .Leight(%rip),%ymm4,%ymm4 - -.Loop_enter8x: - vmovdqa %ymm14,64(%rsp) - vmovdqa %ymm15,96(%rsp) - vbroadcasti128 (%r9),%ymm15 - vmovdqa %ymm4,512-512(%rax) - movl $10,%eax - jmp .Loop8x - -.align 32 -.Loop8x: - vpaddd %ymm0,%ymm8,%ymm8 - vpxor %ymm4,%ymm8,%ymm4 - vpshufb %ymm15,%ymm4,%ymm4 - vpaddd %ymm1,%ymm9,%ymm9 - vpxor %ymm5,%ymm9,%ymm5 - vpshufb %ymm15,%ymm5,%ymm5 - vpaddd %ymm4,%ymm12,%ymm12 - vpxor %ymm0,%ymm12,%ymm0 - vpslld $12,%ymm0,%ymm14 - vpsrld $20,%ymm0,%ymm0 - vpor %ymm0,%ymm14,%ymm0 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm5,%ymm13,%ymm13 - vpxor %ymm1,%ymm13,%ymm1 - vpslld $12,%ymm1,%ymm15 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm1,%ymm15,%ymm1 - vpaddd %ymm0,%ymm8,%ymm8 - vpxor %ymm4,%ymm8,%ymm4 - vpshufb %ymm14,%ymm4,%ymm4 - vpaddd %ymm1,%ymm9,%ymm9 - vpxor %ymm5,%ymm9,%ymm5 - vpshufb %ymm14,%ymm5,%ymm5 - vpaddd %ymm4,%ymm12,%ymm12 - vpxor %ymm0,%ymm12,%ymm0 - vpslld $7,%ymm0,%ymm15 - vpsrld $25,%ymm0,%ymm0 - vpor %ymm0,%ymm15,%ymm0 - vbroadcasti128 (%r9),%ymm15 - vpaddd %ymm5,%ymm13,%ymm13 - vpxor %ymm1,%ymm13,%ymm1 - vpslld $7,%ymm1,%ymm14 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm1,%ymm14,%ymm1 - vmovdqa %ymm12,0(%rsp) - vmovdqa %ymm13,32(%rsp) - vmovdqa 64(%rsp),%ymm12 - vmovdqa 96(%rsp),%ymm13 - vpaddd %ymm2,%ymm10,%ymm10 - vpxor %ymm6,%ymm10,%ymm6 - vpshufb %ymm15,%ymm6,%ymm6 - vpaddd %ymm3,%ymm11,%ymm11 - vpxor %ymm7,%ymm11,%ymm7 - vpshufb %ymm15,%ymm7,%ymm7 - vpaddd %ymm6,%ymm12,%ymm12 - vpxor %ymm2,%ymm12,%ymm2 - vpslld $12,%ymm2,%ymm14 - vpsrld $20,%ymm2,%ymm2 - vpor %ymm2,%ymm14,%ymm2 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm7,%ymm13,%ymm13 - vpxor %ymm3,%ymm13,%ymm3 - vpslld 
$12,%ymm3,%ymm15 - vpsrld $20,%ymm3,%ymm3 - vpor %ymm3,%ymm15,%ymm3 - vpaddd %ymm2,%ymm10,%ymm10 - vpxor %ymm6,%ymm10,%ymm6 - vpshufb %ymm14,%ymm6,%ymm6 - vpaddd %ymm3,%ymm11,%ymm11 - vpxor %ymm7,%ymm11,%ymm7 - vpshufb %ymm14,%ymm7,%ymm7 - vpaddd %ymm6,%ymm12,%ymm12 - vpxor %ymm2,%ymm12,%ymm2 - vpslld $7,%ymm2,%ymm15 - vpsrld $25,%ymm2,%ymm2 - vpor %ymm2,%ymm15,%ymm2 - vbroadcasti128 (%r9),%ymm15 - vpaddd %ymm7,%ymm13,%ymm13 - vpxor %ymm3,%ymm13,%ymm3 - vpslld $7,%ymm3,%ymm14 - vpsrld $25,%ymm3,%ymm3 - vpor %ymm3,%ymm14,%ymm3 - vpaddd %ymm1,%ymm8,%ymm8 - vpxor %ymm7,%ymm8,%ymm7 - vpshufb %ymm15,%ymm7,%ymm7 - vpaddd %ymm2,%ymm9,%ymm9 - vpxor %ymm4,%ymm9,%ymm4 - vpshufb %ymm15,%ymm4,%ymm4 - vpaddd %ymm7,%ymm12,%ymm12 - vpxor %ymm1,%ymm12,%ymm1 - vpslld $12,%ymm1,%ymm14 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm1,%ymm14,%ymm1 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm4,%ymm13,%ymm13 - vpxor %ymm2,%ymm13,%ymm2 - vpslld $12,%ymm2,%ymm15 - vpsrld $20,%ymm2,%ymm2 - vpor %ymm2,%ymm15,%ymm2 - vpaddd %ymm1,%ymm8,%ymm8 - vpxor %ymm7,%ymm8,%ymm7 - vpshufb %ymm14,%ymm7,%ymm7 - vpaddd %ymm2,%ymm9,%ymm9 - vpxor %ymm4,%ymm9,%ymm4 - vpshufb %ymm14,%ymm4,%ymm4 - vpaddd %ymm7,%ymm12,%ymm12 - vpxor %ymm1,%ymm12,%ymm1 - vpslld $7,%ymm1,%ymm15 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm1,%ymm15,%ymm1 - vbroadcasti128 (%r9),%ymm15 - vpaddd %ymm4,%ymm13,%ymm13 - vpxor %ymm2,%ymm13,%ymm2 - vpslld $7,%ymm2,%ymm14 - vpsrld $25,%ymm2,%ymm2 - vpor %ymm2,%ymm14,%ymm2 - vmovdqa %ymm12,64(%rsp) - vmovdqa %ymm13,96(%rsp) - vmovdqa 0(%rsp),%ymm12 - vmovdqa 32(%rsp),%ymm13 - vpaddd %ymm3,%ymm10,%ymm10 - vpxor %ymm5,%ymm10,%ymm5 - vpshufb %ymm15,%ymm5,%ymm5 - vpaddd %ymm0,%ymm11,%ymm11 - vpxor %ymm6,%ymm11,%ymm6 - vpshufb %ymm15,%ymm6,%ymm6 - vpaddd %ymm5,%ymm12,%ymm12 - vpxor %ymm3,%ymm12,%ymm3 - vpslld $12,%ymm3,%ymm14 - vpsrld $20,%ymm3,%ymm3 - vpor %ymm3,%ymm14,%ymm3 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm6,%ymm13,%ymm13 - vpxor %ymm0,%ymm13,%ymm0 - vpslld $12,%ymm0,%ymm15 - vpsrld $20,%ymm0,%ymm0 - 
vpor %ymm0,%ymm15,%ymm0 - vpaddd %ymm3,%ymm10,%ymm10 - vpxor %ymm5,%ymm10,%ymm5 - vpshufb %ymm14,%ymm5,%ymm5 - vpaddd %ymm0,%ymm11,%ymm11 - vpxor %ymm6,%ymm11,%ymm6 - vpshufb %ymm14,%ymm6,%ymm6 - vpaddd %ymm5,%ymm12,%ymm12 - vpxor %ymm3,%ymm12,%ymm3 - vpslld $7,%ymm3,%ymm15 - vpsrld $25,%ymm3,%ymm3 - vpor %ymm3,%ymm15,%ymm3 - vbroadcasti128 (%r9),%ymm15 - vpaddd %ymm6,%ymm13,%ymm13 - vpxor %ymm0,%ymm13,%ymm0 - vpslld $7,%ymm0,%ymm14 - vpsrld $25,%ymm0,%ymm0 - vpor %ymm0,%ymm14,%ymm0 - decl %eax - jnz .Loop8x - - leaq 512(%rsp),%rax - vpaddd 128-256(%rcx),%ymm8,%ymm8 - vpaddd 160-256(%rcx),%ymm9,%ymm9 - vpaddd 192-256(%rcx),%ymm10,%ymm10 - vpaddd 224-256(%rcx),%ymm11,%ymm11 - - vpunpckldq %ymm9,%ymm8,%ymm14 - vpunpckldq %ymm11,%ymm10,%ymm15 - vpunpckhdq %ymm9,%ymm8,%ymm8 - vpunpckhdq %ymm11,%ymm10,%ymm10 - vpunpcklqdq %ymm15,%ymm14,%ymm9 - vpunpckhqdq %ymm15,%ymm14,%ymm14 - vpunpcklqdq %ymm10,%ymm8,%ymm11 - vpunpckhqdq %ymm10,%ymm8,%ymm8 - vpaddd 256-256(%rcx),%ymm0,%ymm0 - vpaddd 288-256(%rcx),%ymm1,%ymm1 - vpaddd 320-256(%rcx),%ymm2,%ymm2 - vpaddd 352-256(%rcx),%ymm3,%ymm3 - - vpunpckldq %ymm1,%ymm0,%ymm10 - vpunpckldq %ymm3,%ymm2,%ymm15 - vpunpckhdq %ymm1,%ymm0,%ymm0 - vpunpckhdq %ymm3,%ymm2,%ymm2 - vpunpcklqdq %ymm15,%ymm10,%ymm1 - vpunpckhqdq %ymm15,%ymm10,%ymm10 - vpunpcklqdq %ymm2,%ymm0,%ymm3 - vpunpckhqdq %ymm2,%ymm0,%ymm0 - vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 - vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 - vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 - vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 - vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 - vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 - vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 - vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 - vmovdqa %ymm15,0(%rsp) - vmovdqa %ymm9,32(%rsp) - vmovdqa 64(%rsp),%ymm15 - vmovdqa 96(%rsp),%ymm9 - - vpaddd 384-512(%rax),%ymm12,%ymm12 - vpaddd 416-512(%rax),%ymm13,%ymm13 - vpaddd 448-512(%rax),%ymm15,%ymm15 - vpaddd 480-512(%rax),%ymm9,%ymm9 - - vpunpckldq %ymm13,%ymm12,%ymm2 - vpunpckldq %ymm9,%ymm15,%ymm8 - vpunpckhdq 
%ymm13,%ymm12,%ymm12 - vpunpckhdq %ymm9,%ymm15,%ymm15 - vpunpcklqdq %ymm8,%ymm2,%ymm13 - vpunpckhqdq %ymm8,%ymm2,%ymm2 - vpunpcklqdq %ymm15,%ymm12,%ymm9 - vpunpckhqdq %ymm15,%ymm12,%ymm12 - vpaddd 512-512(%rax),%ymm4,%ymm4 - vpaddd 544-512(%rax),%ymm5,%ymm5 - vpaddd 576-512(%rax),%ymm6,%ymm6 - vpaddd 608-512(%rax),%ymm7,%ymm7 - - vpunpckldq %ymm5,%ymm4,%ymm15 - vpunpckldq %ymm7,%ymm6,%ymm8 - vpunpckhdq %ymm5,%ymm4,%ymm4 - vpunpckhdq %ymm7,%ymm6,%ymm6 - vpunpcklqdq %ymm8,%ymm15,%ymm5 - vpunpckhqdq %ymm8,%ymm15,%ymm15 - vpunpcklqdq %ymm6,%ymm4,%ymm7 - vpunpckhqdq %ymm6,%ymm4,%ymm4 - vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 - vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 - vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 - vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 - vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 - vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 - vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 - vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 - vmovdqa 0(%rsp),%ymm6 - vmovdqa 32(%rsp),%ymm12 - - cmpq $512,%rdx - jb .Ltail8x - - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - leaq 128(%rsi),%rsi - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - leaq 128(%rdi),%rdi - - vpxor 0(%rsi),%ymm12,%ymm12 - vpxor 32(%rsi),%ymm13,%ymm13 - vpxor 64(%rsi),%ymm10,%ymm10 - vpxor 96(%rsi),%ymm15,%ymm15 - leaq 128(%rsi),%rsi - vmovdqu %ymm12,0(%rdi) - vmovdqu %ymm13,32(%rdi) - vmovdqu %ymm10,64(%rdi) - vmovdqu %ymm15,96(%rdi) - leaq 128(%rdi),%rdi - - vpxor 0(%rsi),%ymm14,%ymm14 - vpxor 32(%rsi),%ymm2,%ymm2 - vpxor 64(%rsi),%ymm3,%ymm3 - vpxor 96(%rsi),%ymm7,%ymm7 - leaq 128(%rsi),%rsi - vmovdqu %ymm14,0(%rdi) - vmovdqu %ymm2,32(%rdi) - vmovdqu %ymm3,64(%rdi) - vmovdqu %ymm7,96(%rdi) - leaq 128(%rdi),%rdi - - vpxor 0(%rsi),%ymm11,%ymm11 - vpxor 32(%rsi),%ymm9,%ymm9 - vpxor 64(%rsi),%ymm0,%ymm0 - vpxor 96(%rsi),%ymm4,%ymm4 - leaq 128(%rsi),%rsi - vmovdqu %ymm11,0(%rdi) - vmovdqu %ymm9,32(%rdi) - vmovdqu %ymm0,64(%rdi) - vmovdqu 
%ymm4,96(%rdi) - leaq 128(%rdi),%rdi - - subq $512,%rdx - jnz .Loop_outer8x - - jmp .Ldone8x - -.Ltail8x: - cmpq $448,%rdx - jae .L448_or_more8x - cmpq $384,%rdx - jae .L384_or_more8x - cmpq $320,%rdx - jae .L320_or_more8x - cmpq $256,%rdx - jae .L256_or_more8x - cmpq $192,%rdx - jae .L192_or_more8x - cmpq $128,%rdx - jae .L128_or_more8x - cmpq $64,%rdx - jae .L64_or_more8x - - xorq %r9,%r9 - vmovdqa %ymm6,0(%rsp) - vmovdqa %ymm8,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L64_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - je .Ldone8x - - leaq 64(%rsi),%rsi - xorq %r9,%r9 - vmovdqa %ymm1,0(%rsp) - leaq 64(%rdi),%rdi - subq $64,%rdx - vmovdqa %ymm5,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L128_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - je .Ldone8x - - leaq 128(%rsi),%rsi - xorq %r9,%r9 - vmovdqa %ymm12,0(%rsp) - leaq 128(%rdi),%rdi - subq $128,%rdx - vmovdqa %ymm13,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L192_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - je .Ldone8x - - leaq 192(%rsi),%rsi - xorq %r9,%r9 - vmovdqa %ymm10,0(%rsp) - leaq 192(%rdi),%rdi - subq $192,%rdx - vmovdqa %ymm15,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L256_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - vpxor 224(%rsi),%ymm15,%ymm15 - vmovdqu %ymm6,0(%rdi) - vmovdqu 
%ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - je .Ldone8x - - leaq 256(%rsi),%rsi - xorq %r9,%r9 - vmovdqa %ymm14,0(%rsp) - leaq 256(%rdi),%rdi - subq $256,%rdx - vmovdqa %ymm2,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L320_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - vpxor 224(%rsi),%ymm15,%ymm15 - vpxor 256(%rsi),%ymm14,%ymm14 - vpxor 288(%rsi),%ymm2,%ymm2 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - vmovdqu %ymm14,256(%rdi) - vmovdqu %ymm2,288(%rdi) - je .Ldone8x - - leaq 320(%rsi),%rsi - xorq %r9,%r9 - vmovdqa %ymm3,0(%rsp) - leaq 320(%rdi),%rdi - subq $320,%rdx - vmovdqa %ymm7,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L384_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - vpxor 224(%rsi),%ymm15,%ymm15 - vpxor 256(%rsi),%ymm14,%ymm14 - vpxor 288(%rsi),%ymm2,%ymm2 - vpxor 320(%rsi),%ymm3,%ymm3 - vpxor 352(%rsi),%ymm7,%ymm7 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - vmovdqu %ymm14,256(%rdi) - vmovdqu %ymm2,288(%rdi) - vmovdqu %ymm3,320(%rdi) - vmovdqu %ymm7,352(%rdi) - je .Ldone8x - - leaq 384(%rsi),%rsi - xorq %r9,%r9 - vmovdqa %ymm11,0(%rsp) - leaq 384(%rdi),%rdi - subq $384,%rdx - vmovdqa %ymm9,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L448_or_more8x: - vpxor 
0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - vpxor 224(%rsi),%ymm15,%ymm15 - vpxor 256(%rsi),%ymm14,%ymm14 - vpxor 288(%rsi),%ymm2,%ymm2 - vpxor 320(%rsi),%ymm3,%ymm3 - vpxor 352(%rsi),%ymm7,%ymm7 - vpxor 384(%rsi),%ymm11,%ymm11 - vpxor 416(%rsi),%ymm9,%ymm9 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - vmovdqu %ymm14,256(%rdi) - vmovdqu %ymm2,288(%rdi) - vmovdqu %ymm3,320(%rdi) - vmovdqu %ymm7,352(%rdi) - vmovdqu %ymm11,384(%rdi) - vmovdqu %ymm9,416(%rdi) - je .Ldone8x - - leaq 448(%rsi),%rsi - xorq %r9,%r9 - vmovdqa %ymm0,0(%rsp) - leaq 448(%rdi),%rdi - subq $448,%rdx - vmovdqa %ymm4,32(%rsp) - -.Loop_tail8x: - movzbl (%rsi,%r9,1),%eax - movzbl (%rsp,%r9,1),%ecx - leaq 1(%r9),%r9 - xorl %ecx,%eax - movb %al,-1(%rdi,%r9,1) - decq %rdx - jnz .Loop_tail8x - -.Ldone8x: - vzeroall - leaq -8(%r10),%rsp - -.L8x_epilogue: - ret -ENDPROC(chacha20_avx2) -#endif /* CONFIG_AS_AVX2 */ - -#ifdef CONFIG_AS_AVX512 -.align 32 -ENTRY(chacha20_avx512) -.Lchacha20_avx512: - cmpq $0,%rdx - je .Lavx512_epilogue - leaq 8(%rsp),%r10 - - cmpq $512,%rdx - ja .Lchacha20_16x - - subq $64+8,%rsp - andq $-64,%rsp - vbroadcasti32x4 .Lsigma(%rip),%zmm0 - vbroadcasti32x4 (%rcx),%zmm1 - vbroadcasti32x4 16(%rcx),%zmm2 - vbroadcasti32x4 (%r8),%zmm3 - - vmovdqa32 %zmm0,%zmm16 - vmovdqa32 %zmm1,%zmm17 - vmovdqa32 %zmm2,%zmm18 - vpaddd .Lzeroz(%rip),%zmm3,%zmm3 - vmovdqa32 .Lfourz(%rip),%zmm20 - movq $10,%r8 - vmovdqa32 %zmm3,%zmm19 - jmp .Loop_avx512 - -.align 16 -.Loop_outer_avx512: - vmovdqa32 %zmm16,%zmm0 - vmovdqa32 %zmm17,%zmm1 - vmovdqa32 %zmm18,%zmm2 - vpaddd %zmm20,%zmm19,%zmm3 - movq $10,%r8 - vmovdqa32 %zmm3,%zmm19 - jmp .Loop_avx512 - -.align 32 -.Loop_avx512: - vpaddd 
%zmm1,%zmm0,%zmm0 - vpxord %zmm0,%zmm3,%zmm3 - vprold $16,%zmm3,%zmm3 - vpaddd %zmm3,%zmm2,%zmm2 - vpxord %zmm2,%zmm1,%zmm1 - vprold $12,%zmm1,%zmm1 - vpaddd %zmm1,%zmm0,%zmm0 - vpxord %zmm0,%zmm3,%zmm3 - vprold $8,%zmm3,%zmm3 - vpaddd %zmm3,%zmm2,%zmm2 - vpxord %zmm2,%zmm1,%zmm1 - vprold $7,%zmm1,%zmm1 - vpshufd $78,%zmm2,%zmm2 - vpshufd $57,%zmm1,%zmm1 - vpshufd $147,%zmm3,%zmm3 - vpaddd %zmm1,%zmm0,%zmm0 - vpxord %zmm0,%zmm3,%zmm3 - vprold $16,%zmm3,%zmm3 - vpaddd %zmm3,%zmm2,%zmm2 - vpxord %zmm2,%zmm1,%zmm1 - vprold $12,%zmm1,%zmm1 - vpaddd %zmm1,%zmm0,%zmm0 - vpxord %zmm0,%zmm3,%zmm3 - vprold $8,%zmm3,%zmm3 - vpaddd %zmm3,%zmm2,%zmm2 - vpxord %zmm2,%zmm1,%zmm1 - vprold $7,%zmm1,%zmm1 - vpshufd $78,%zmm2,%zmm2 - vpshufd $147,%zmm1,%zmm1 - vpshufd $57,%zmm3,%zmm3 - decq %r8 - jnz .Loop_avx512 - vpaddd %zmm16,%zmm0,%zmm0 - vpaddd %zmm17,%zmm1,%zmm1 - vpaddd %zmm18,%zmm2,%zmm2 - vpaddd %zmm19,%zmm3,%zmm3 - - subq $64,%rdx - jb .Ltail64_avx512 - - vpxor 0(%rsi),%xmm0,%xmm4 - vpxor 16(%rsi),%xmm1,%xmm5 - vpxor 32(%rsi),%xmm2,%xmm6 - vpxor 48(%rsi),%xmm3,%xmm7 - leaq 64(%rsi),%rsi - - vmovdqu %xmm4,0(%rdi) - vmovdqu %xmm5,16(%rdi) - vmovdqu %xmm6,32(%rdi) - vmovdqu %xmm7,48(%rdi) - leaq 64(%rdi),%rdi - - jz .Ldone_avx512 - - vextracti32x4 $1,%zmm0,%xmm4 - vextracti32x4 $1,%zmm1,%xmm5 - vextracti32x4 $1,%zmm2,%xmm6 - vextracti32x4 $1,%zmm3,%xmm7 - - subq $64,%rdx - jb .Ltail_avx512 - - vpxor 0(%rsi),%xmm4,%xmm4 - vpxor 16(%rsi),%xmm5,%xmm5 - vpxor 32(%rsi),%xmm6,%xmm6 - vpxor 48(%rsi),%xmm7,%xmm7 - leaq 64(%rsi),%rsi - - vmovdqu %xmm4,0(%rdi) - vmovdqu %xmm5,16(%rdi) - vmovdqu %xmm6,32(%rdi) - vmovdqu %xmm7,48(%rdi) - leaq 64(%rdi),%rdi - - jz .Ldone_avx512 - - vextracti32x4 $2,%zmm0,%xmm4 - vextracti32x4 $2,%zmm1,%xmm5 - vextracti32x4 $2,%zmm2,%xmm6 - vextracti32x4 $2,%zmm3,%xmm7 - - subq $64,%rdx - jb .Ltail_avx512 - - vpxor 0(%rsi),%xmm4,%xmm4 - vpxor 16(%rsi),%xmm5,%xmm5 - vpxor 32(%rsi),%xmm6,%xmm6 - vpxor 48(%rsi),%xmm7,%xmm7 - leaq 64(%rsi),%rsi - - vmovdqu 
%xmm4,0(%rdi) - vmovdqu %xmm5,16(%rdi) - vmovdqu %xmm6,32(%rdi) - vmovdqu %xmm7,48(%rdi) - leaq 64(%rdi),%rdi - - jz .Ldone_avx512 - - vextracti32x4 $3,%zmm0,%xmm4 - vextracti32x4 $3,%zmm1,%xmm5 - vextracti32x4 $3,%zmm2,%xmm6 - vextracti32x4 $3,%zmm3,%xmm7 - - subq $64,%rdx - jb .Ltail_avx512 - - vpxor 0(%rsi),%xmm4,%xmm4 - vpxor 16(%rsi),%xmm5,%xmm5 - vpxor 32(%rsi),%xmm6,%xmm6 - vpxor 48(%rsi),%xmm7,%xmm7 - leaq 64(%rsi),%rsi - - vmovdqu %xmm4,0(%rdi) - vmovdqu %xmm5,16(%rdi) - vmovdqu %xmm6,32(%rdi) - vmovdqu %xmm7,48(%rdi) - leaq 64(%rdi),%rdi - - jnz .Loop_outer_avx512 - - jmp .Ldone_avx512 - -.align 16 -.Ltail64_avx512: - vmovdqa %xmm0,0(%rsp) - vmovdqa %xmm1,16(%rsp) - vmovdqa %xmm2,32(%rsp) - vmovdqa %xmm3,48(%rsp) - addq $64,%rdx - jmp .Loop_tail_avx512 - -.align 16 -.Ltail_avx512: - vmovdqa %xmm4,0(%rsp) - vmovdqa %xmm5,16(%rsp) - vmovdqa %xmm6,32(%rsp) - vmovdqa %xmm7,48(%rsp) - addq $64,%rdx - -.Loop_tail_avx512: - movzbl (%rsi,%r8,1),%eax - movzbl (%rsp,%r8,1),%ecx - leaq 1(%r8),%r8 - xorl %ecx,%eax - movb %al,-1(%rdi,%r8,1) - decq %rdx - jnz .Loop_tail_avx512 - - vmovdqa32 %zmm16,0(%rsp) - -.Ldone_avx512: - vzeroall - leaq -8(%r10),%rsp - -.Lavx512_epilogue: - ret - -.align 32 -.Lchacha20_16x: - leaq 8(%rsp),%r10 - - subq $64+8,%rsp - andq $-64,%rsp - vzeroupper - - leaq .Lsigma(%rip),%r9 - vbroadcasti32x4 (%r9),%zmm3 - vbroadcasti32x4 (%rcx),%zmm7 - vbroadcasti32x4 16(%rcx),%zmm11 - vbroadcasti32x4 (%r8),%zmm15 - - vpshufd $0x00,%zmm3,%zmm0 - vpshufd $0x55,%zmm3,%zmm1 - vpshufd $0xaa,%zmm3,%zmm2 - vpshufd $0xff,%zmm3,%zmm3 - vmovdqa64 %zmm0,%zmm16 - vmovdqa64 %zmm1,%zmm17 - vmovdqa64 %zmm2,%zmm18 - vmovdqa64 %zmm3,%zmm19 - - vpshufd $0x00,%zmm7,%zmm4 - vpshufd $0x55,%zmm7,%zmm5 - vpshufd $0xaa,%zmm7,%zmm6 - vpshufd $0xff,%zmm7,%zmm7 - vmovdqa64 %zmm4,%zmm20 - vmovdqa64 %zmm5,%zmm21 - vmovdqa64 %zmm6,%zmm22 - vmovdqa64 %zmm7,%zmm23 - - vpshufd $0x00,%zmm11,%zmm8 - vpshufd $0x55,%zmm11,%zmm9 - vpshufd $0xaa,%zmm11,%zmm10 - vpshufd $0xff,%zmm11,%zmm11 - 
vmovdqa64 %zmm8,%zmm24 - vmovdqa64 %zmm9,%zmm25 - vmovdqa64 %zmm10,%zmm26 - vmovdqa64 %zmm11,%zmm27 - - vpshufd $0x00,%zmm15,%zmm12 - vpshufd $0x55,%zmm15,%zmm13 - vpshufd $0xaa,%zmm15,%zmm14 - vpshufd $0xff,%zmm15,%zmm15 - vpaddd .Lincz(%rip),%zmm12,%zmm12 - vmovdqa64 %zmm12,%zmm28 - vmovdqa64 %zmm13,%zmm29 - vmovdqa64 %zmm14,%zmm30 - vmovdqa64 %zmm15,%zmm31 - - movl $10,%eax - jmp .Loop16x - -.align 32 -.Loop_outer16x: - vpbroadcastd 0(%r9),%zmm0 - vpbroadcastd 4(%r9),%zmm1 - vpbroadcastd 8(%r9),%zmm2 - vpbroadcastd 12(%r9),%zmm3 - vpaddd .Lsixteen(%rip),%zmm28,%zmm28 - vmovdqa64 %zmm20,%zmm4 - vmovdqa64 %zmm21,%zmm5 - vmovdqa64 %zmm22,%zmm6 - vmovdqa64 %zmm23,%zmm7 - vmovdqa64 %zmm24,%zmm8 - vmovdqa64 %zmm25,%zmm9 - vmovdqa64 %zmm26,%zmm10 - vmovdqa64 %zmm27,%zmm11 - vmovdqa64 %zmm28,%zmm12 - vmovdqa64 %zmm29,%zmm13 - vmovdqa64 %zmm30,%zmm14 - vmovdqa64 %zmm31,%zmm15 - - vmovdqa64 %zmm0,%zmm16 - vmovdqa64 %zmm1,%zmm17 - vmovdqa64 %zmm2,%zmm18 - vmovdqa64 %zmm3,%zmm19 - - movl $10,%eax - jmp .Loop16x - -.align 32 -.Loop16x: - vpaddd %zmm4,%zmm0,%zmm0 - vpaddd %zmm5,%zmm1,%zmm1 - vpaddd %zmm6,%zmm2,%zmm2 - vpaddd %zmm7,%zmm3,%zmm3 - vpxord %zmm0,%zmm12,%zmm12 - vpxord %zmm1,%zmm13,%zmm13 - vpxord %zmm2,%zmm14,%zmm14 - vpxord %zmm3,%zmm15,%zmm15 - vprold $16,%zmm12,%zmm12 - vprold $16,%zmm13,%zmm13 - vprold $16,%zmm14,%zmm14 - vprold $16,%zmm15,%zmm15 - vpaddd %zmm12,%zmm8,%zmm8 - vpaddd %zmm13,%zmm9,%zmm9 - vpaddd %zmm14,%zmm10,%zmm10 - vpaddd %zmm15,%zmm11,%zmm11 - vpxord %zmm8,%zmm4,%zmm4 - vpxord %zmm9,%zmm5,%zmm5 - vpxord %zmm10,%zmm6,%zmm6 - vpxord %zmm11,%zmm7,%zmm7 - vprold $12,%zmm4,%zmm4 - vprold $12,%zmm5,%zmm5 - vprold $12,%zmm6,%zmm6 - vprold $12,%zmm7,%zmm7 - vpaddd %zmm4,%zmm0,%zmm0 - vpaddd %zmm5,%zmm1,%zmm1 - vpaddd %zmm6,%zmm2,%zmm2 - vpaddd %zmm7,%zmm3,%zmm3 - vpxord %zmm0,%zmm12,%zmm12 - vpxord %zmm1,%zmm13,%zmm13 - vpxord %zmm2,%zmm14,%zmm14 - vpxord %zmm3,%zmm15,%zmm15 - vprold $8,%zmm12,%zmm12 - vprold $8,%zmm13,%zmm13 - vprold 
$8,%zmm14,%zmm14 - vprold $8,%zmm15,%zmm15 - vpaddd %zmm12,%zmm8,%zmm8 - vpaddd %zmm13,%zmm9,%zmm9 - vpaddd %zmm14,%zmm10,%zmm10 - vpaddd %zmm15,%zmm11,%zmm11 - vpxord %zmm8,%zmm4,%zmm4 - vpxord %zmm9,%zmm5,%zmm5 - vpxord %zmm10,%zmm6,%zmm6 - vpxord %zmm11,%zmm7,%zmm7 - vprold $7,%zmm4,%zmm4 - vprold $7,%zmm5,%zmm5 - vprold $7,%zmm6,%zmm6 - vprold $7,%zmm7,%zmm7 - vpaddd %zmm5,%zmm0,%zmm0 - vpaddd %zmm6,%zmm1,%zmm1 - vpaddd %zmm7,%zmm2,%zmm2 - vpaddd %zmm4,%zmm3,%zmm3 - vpxord %zmm0,%zmm15,%zmm15 - vpxord %zmm1,%zmm12,%zmm12 - vpxord %zmm2,%zmm13,%zmm13 - vpxord %zmm3,%zmm14,%zmm14 - vprold $16,%zmm15,%zmm15 - vprold $16,%zmm12,%zmm12 - vprold $16,%zmm13,%zmm13 - vprold $16,%zmm14,%zmm14 - vpaddd %zmm15,%zmm10,%zmm10 - vpaddd %zmm12,%zmm11,%zmm11 - vpaddd %zmm13,%zmm8,%zmm8 - vpaddd %zmm14,%zmm9,%zmm9 - vpxord %zmm10,%zmm5,%zmm5 - vpxord %zmm11,%zmm6,%zmm6 - vpxord %zmm8,%zmm7,%zmm7 - vpxord %zmm9,%zmm4,%zmm4 - vprold $12,%zmm5,%zmm5 - vprold $12,%zmm6,%zmm6 - vprold $12,%zmm7,%zmm7 - vprold $12,%zmm4,%zmm4 - vpaddd %zmm5,%zmm0,%zmm0 - vpaddd %zmm6,%zmm1,%zmm1 - vpaddd %zmm7,%zmm2,%zmm2 - vpaddd %zmm4,%zmm3,%zmm3 - vpxord %zmm0,%zmm15,%zmm15 - vpxord %zmm1,%zmm12,%zmm12 - vpxord %zmm2,%zmm13,%zmm13 - vpxord %zmm3,%zmm14,%zmm14 - vprold $8,%zmm15,%zmm15 - vprold $8,%zmm12,%zmm12 - vprold $8,%zmm13,%zmm13 - vprold $8,%zmm14,%zmm14 - vpaddd %zmm15,%zmm10,%zmm10 - vpaddd %zmm12,%zmm11,%zmm11 - vpaddd %zmm13,%zmm8,%zmm8 - vpaddd %zmm14,%zmm9,%zmm9 - vpxord %zmm10,%zmm5,%zmm5 - vpxord %zmm11,%zmm6,%zmm6 - vpxord %zmm8,%zmm7,%zmm7 - vpxord %zmm9,%zmm4,%zmm4 - vprold $7,%zmm5,%zmm5 - vprold $7,%zmm6,%zmm6 - vprold $7,%zmm7,%zmm7 - vprold $7,%zmm4,%zmm4 - decl %eax - jnz .Loop16x - - vpaddd %zmm16,%zmm0,%zmm0 - vpaddd %zmm17,%zmm1,%zmm1 - vpaddd %zmm18,%zmm2,%zmm2 - vpaddd %zmm19,%zmm3,%zmm3 - - vpunpckldq %zmm1,%zmm0,%zmm18 - vpunpckldq %zmm3,%zmm2,%zmm19 - vpunpckhdq %zmm1,%zmm0,%zmm0 - vpunpckhdq %zmm3,%zmm2,%zmm2 - vpunpcklqdq %zmm19,%zmm18,%zmm1 - vpunpckhqdq 
%zmm19,%zmm18,%zmm18 - vpunpcklqdq %zmm2,%zmm0,%zmm3 - vpunpckhqdq %zmm2,%zmm0,%zmm0 - vpaddd %zmm20,%zmm4,%zmm4 - vpaddd %zmm21,%zmm5,%zmm5 - vpaddd %zmm22,%zmm6,%zmm6 - vpaddd %zmm23,%zmm7,%zmm7 - - vpunpckldq %zmm5,%zmm4,%zmm2 - vpunpckldq %zmm7,%zmm6,%zmm19 - vpunpckhdq %zmm5,%zmm4,%zmm4 - vpunpckhdq %zmm7,%zmm6,%zmm6 - vpunpcklqdq %zmm19,%zmm2,%zmm5 - vpunpckhqdq %zmm19,%zmm2,%zmm2 - vpunpcklqdq %zmm6,%zmm4,%zmm7 - vpunpckhqdq %zmm6,%zmm4,%zmm4 - vshufi32x4 $0x44,%zmm5,%zmm1,%zmm19 - vshufi32x4 $0xee,%zmm5,%zmm1,%zmm5 - vshufi32x4 $0x44,%zmm2,%zmm18,%zmm1 - vshufi32x4 $0xee,%zmm2,%zmm18,%zmm2 - vshufi32x4 $0x44,%zmm7,%zmm3,%zmm18 - vshufi32x4 $0xee,%zmm7,%zmm3,%zmm7 - vshufi32x4 $0x44,%zmm4,%zmm0,%zmm3 - vshufi32x4 $0xee,%zmm4,%zmm0,%zmm4 - vpaddd %zmm24,%zmm8,%zmm8 - vpaddd %zmm25,%zmm9,%zmm9 - vpaddd %zmm26,%zmm10,%zmm10 - vpaddd %zmm27,%zmm11,%zmm11 - - vpunpckldq %zmm9,%zmm8,%zmm6 - vpunpckldq %zmm11,%zmm10,%zmm0 - vpunpckhdq %zmm9,%zmm8,%zmm8 - vpunpckhdq %zmm11,%zmm10,%zmm10 - vpunpcklqdq %zmm0,%zmm6,%zmm9 - vpunpckhqdq %zmm0,%zmm6,%zmm6 - vpunpcklqdq %zmm10,%zmm8,%zmm11 - vpunpckhqdq %zmm10,%zmm8,%zmm8 - vpaddd %zmm28,%zmm12,%zmm12 - vpaddd %zmm29,%zmm13,%zmm13 - vpaddd %zmm30,%zmm14,%zmm14 - vpaddd %zmm31,%zmm15,%zmm15 - - vpunpckldq %zmm13,%zmm12,%zmm10 - vpunpckldq %zmm15,%zmm14,%zmm0 - vpunpckhdq %zmm13,%zmm12,%zmm12 - vpunpckhdq %zmm15,%zmm14,%zmm14 - vpunpcklqdq %zmm0,%zmm10,%zmm13 - vpunpckhqdq %zmm0,%zmm10,%zmm10 - vpunpcklqdq %zmm14,%zmm12,%zmm15 - vpunpckhqdq %zmm14,%zmm12,%zmm12 - vshufi32x4 $0x44,%zmm13,%zmm9,%zmm0 - vshufi32x4 $0xee,%zmm13,%zmm9,%zmm13 - vshufi32x4 $0x44,%zmm10,%zmm6,%zmm9 - vshufi32x4 $0xee,%zmm10,%zmm6,%zmm10 - vshufi32x4 $0x44,%zmm15,%zmm11,%zmm6 - vshufi32x4 $0xee,%zmm15,%zmm11,%zmm15 - vshufi32x4 $0x44,%zmm12,%zmm8,%zmm11 - vshufi32x4 $0xee,%zmm12,%zmm8,%zmm12 - vshufi32x4 $0x88,%zmm0,%zmm19,%zmm16 - vshufi32x4 $0xdd,%zmm0,%zmm19,%zmm19 - vshufi32x4 $0x88,%zmm13,%zmm5,%zmm0 - vshufi32x4 $0xdd,%zmm13,%zmm5,%zmm13 - 
vshufi32x4 $0x88,%zmm9,%zmm1,%zmm17 - vshufi32x4 $0xdd,%zmm9,%zmm1,%zmm1 - vshufi32x4 $0x88,%zmm10,%zmm2,%zmm9 - vshufi32x4 $0xdd,%zmm10,%zmm2,%zmm10 - vshufi32x4 $0x88,%zmm6,%zmm18,%zmm14 - vshufi32x4 $0xdd,%zmm6,%zmm18,%zmm18 - vshufi32x4 $0x88,%zmm15,%zmm7,%zmm6 - vshufi32x4 $0xdd,%zmm15,%zmm7,%zmm15 - vshufi32x4 $0x88,%zmm11,%zmm3,%zmm8 - vshufi32x4 $0xdd,%zmm11,%zmm3,%zmm3 - vshufi32x4 $0x88,%zmm12,%zmm4,%zmm11 - vshufi32x4 $0xdd,%zmm12,%zmm4,%zmm12 - cmpq $1024,%rdx - jb .Ltail16x - - vpxord 0(%rsi),%zmm16,%zmm16 - vpxord 64(%rsi),%zmm17,%zmm17 - vpxord 128(%rsi),%zmm14,%zmm14 - vpxord 192(%rsi),%zmm8,%zmm8 - vmovdqu32 %zmm16,0(%rdi) - vmovdqu32 %zmm17,64(%rdi) - vmovdqu32 %zmm14,128(%rdi) - vmovdqu32 %zmm8,192(%rdi) - - vpxord 256(%rsi),%zmm19,%zmm19 - vpxord 320(%rsi),%zmm1,%zmm1 - vpxord 384(%rsi),%zmm18,%zmm18 - vpxord 448(%rsi),%zmm3,%zmm3 - vmovdqu32 %zmm19,256(%rdi) - vmovdqu32 %zmm1,320(%rdi) - vmovdqu32 %zmm18,384(%rdi) - vmovdqu32 %zmm3,448(%rdi) - - vpxord 512(%rsi),%zmm0,%zmm0 - vpxord 576(%rsi),%zmm9,%zmm9 - vpxord 640(%rsi),%zmm6,%zmm6 - vpxord 704(%rsi),%zmm11,%zmm11 - vmovdqu32 %zmm0,512(%rdi) - vmovdqu32 %zmm9,576(%rdi) - vmovdqu32 %zmm6,640(%rdi) - vmovdqu32 %zmm11,704(%rdi) - - vpxord 768(%rsi),%zmm13,%zmm13 - vpxord 832(%rsi),%zmm10,%zmm10 - vpxord 896(%rsi),%zmm15,%zmm15 - vpxord 960(%rsi),%zmm12,%zmm12 - leaq 1024(%rsi),%rsi - vmovdqu32 %zmm13,768(%rdi) - vmovdqu32 %zmm10,832(%rdi) - vmovdqu32 %zmm15,896(%rdi) - vmovdqu32 %zmm12,960(%rdi) - leaq 1024(%rdi),%rdi - - subq $1024,%rdx - jnz .Loop_outer16x - - jmp .Ldone16x - -.align 32 -.Ltail16x: - xorq %r9,%r9 - subq %rsi,%rdi - cmpq $64,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm16,%zmm16 - vmovdqu32 %zmm16,(%rdi,%rsi,1) - je .Ldone16x - vmovdqa32 %zmm17,%zmm16 - leaq 64(%rsi),%rsi - - cmpq $128,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm17,%zmm17 - vmovdqu32 %zmm17,(%rdi,%rsi,1) - je .Ldone16x - vmovdqa32 %zmm14,%zmm16 - leaq 64(%rsi),%rsi - - cmpq $192,%rdx - jb 
.Less_than_64_16x - vpxord (%rsi),%zmm14,%zmm14 - vmovdqu32 %zmm14,(%rdi,%rsi,1) - je .Ldone16x - vmovdqa32 %zmm8,%zmm16 - leaq 64(%rsi),%rsi - - cmpq $256,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm8,%zmm8 - vmovdqu32 %zmm8,(%rdi,%rsi,1) - je .Ldone16x - vmovdqa32 %zmm19,%zmm16 - leaq 64(%rsi),%rsi - - cmpq $320,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm19,%zmm19 - vmovdqu32 %zmm19,(%rdi,%rsi,1) - je .Ldone16x - vmovdqa32 %zmm1,%zmm16 - leaq 64(%rsi),%rsi - - cmpq $384,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm1,%zmm1 - vmovdqu32 %zmm1,(%rdi,%rsi,1) - je .Ldone16x - vmovdqa32 %zmm18,%zmm16 - leaq 64(%rsi),%rsi - - cmpq $448,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm18,%zmm18 - vmovdqu32 %zmm18,(%rdi,%rsi,1) - je .Ldone16x - vmovdqa32 %zmm3,%zmm16 - leaq 64(%rsi),%rsi - - cmpq $512,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm3,%zmm3 - vmovdqu32 %zmm3,(%rdi,%rsi,1) - je .Ldone16x - vmovdqa32 %zmm0,%zmm16 - leaq 64(%rsi),%rsi - - cmpq $576,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm0,%zmm0 - vmovdqu32 %zmm0,(%rdi,%rsi,1) - je .Ldone16x - vmovdqa32 %zmm9,%zmm16 - leaq 64(%rsi),%rsi - - cmpq $640,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm9,%zmm9 - vmovdqu32 %zmm9,(%rdi,%rsi,1) - je .Ldone16x - vmovdqa32 %zmm6,%zmm16 - leaq 64(%rsi),%rsi - - cmpq $704,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm6,%zmm6 - vmovdqu32 %zmm6,(%rdi,%rsi,1) - je .Ldone16x - vmovdqa32 %zmm11,%zmm16 - leaq 64(%rsi),%rsi - - cmpq $768,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm11,%zmm11 - vmovdqu32 %zmm11,(%rdi,%rsi,1) - je .Ldone16x - vmovdqa32 %zmm13,%zmm16 - leaq 64(%rsi),%rsi - - cmpq $832,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm13,%zmm13 - vmovdqu32 %zmm13,(%rdi,%rsi,1) - je .Ldone16x - vmovdqa32 %zmm10,%zmm16 - leaq 64(%rsi),%rsi - - cmpq $896,%rdx - jb .Less_than_64_16x - vpxord (%rsi),%zmm10,%zmm10 - vmovdqu32 %zmm10,(%rdi,%rsi,1) - je .Ldone16x - vmovdqa32 %zmm15,%zmm16 - leaq 64(%rsi),%rsi - - cmpq $960,%rdx - jb .Less_than_64_16x - 
vpxord (%rsi),%zmm15,%zmm15 - vmovdqu32 %zmm15,(%rdi,%rsi,1) - je .Ldone16x - vmovdqa32 %zmm12,%zmm16 - leaq 64(%rsi),%rsi - -.Less_than_64_16x: - vmovdqa32 %zmm16,0(%rsp) - leaq (%rdi,%rsi,1),%rdi - andq $63,%rdx - -.Loop_tail16x: - movzbl (%rsi,%r9,1),%eax - movzbl (%rsp,%r9,1),%ecx - leaq 1(%r9),%r9 - xorl %ecx,%eax - movb %al,-1(%rdi,%r9,1) - decq %rdx - jnz .Loop_tail16x - - vpxord %zmm16,%zmm16,%zmm16 - vmovdqa32 %zmm16,0(%rsp) - -.Ldone16x: - vzeroall - leaq -8(%r10),%rsp - -.L16x_epilogue: - ret -ENDPROC(chacha20_avx512) - -.align 32 -ENTRY(chacha20_avx512vl) - cmpq $0,%rdx - je .Lavx512vl_epilogue - - leaq 8(%rsp),%r10 - - cmpq $128,%rdx - ja .Lchacha20_8xvl - - subq $64+8,%rsp - andq $-64,%rsp - vbroadcasti128 .Lsigma(%rip),%ymm0 - vbroadcasti128 (%rcx),%ymm1 - vbroadcasti128 16(%rcx),%ymm2 - vbroadcasti128 (%r8),%ymm3 - - vmovdqa32 %ymm0,%ymm16 - vmovdqa32 %ymm1,%ymm17 - vmovdqa32 %ymm2,%ymm18 - vpaddd .Lzeroz(%rip),%ymm3,%ymm3 - vmovdqa32 .Ltwoy(%rip),%ymm20 - movq $10,%r8 - vmovdqa32 %ymm3,%ymm19 - jmp .Loop_avx512vl - -.align 16 -.Loop_outer_avx512vl: - vmovdqa32 %ymm18,%ymm2 - vpaddd %ymm20,%ymm19,%ymm3 - movq $10,%r8 - vmovdqa32 %ymm3,%ymm19 - jmp .Loop_avx512vl - -.align 32 -.Loop_avx512vl: - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - vpshufd $78,%ymm2,%ymm2 - vpshufd $57,%ymm1,%ymm1 - vpshufd $147,%ymm3,%ymm3 - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vprold $16,%ymm3,%ymm3 - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vprold $12,%ymm1,%ymm1 - vpaddd %ymm1,%ymm0,%ymm0 - vpxor %ymm0,%ymm3,%ymm3 - vprold $8,%ymm3,%ymm3 - vpaddd %ymm3,%ymm2,%ymm2 - vpxor %ymm2,%ymm1,%ymm1 - vprold $7,%ymm1,%ymm1 - vpshufd $78,%ymm2,%ymm2 - vpshufd $147,%ymm1,%ymm1 - vpshufd 
$57,%ymm3,%ymm3 - decq %r8 - jnz .Loop_avx512vl - vpaddd %ymm16,%ymm0,%ymm0 - vpaddd %ymm17,%ymm1,%ymm1 - vpaddd %ymm18,%ymm2,%ymm2 - vpaddd %ymm19,%ymm3,%ymm3 - - subq $64,%rdx - jb .Ltail64_avx512vl - - vpxor 0(%rsi),%xmm0,%xmm4 - vpxor 16(%rsi),%xmm1,%xmm5 - vpxor 32(%rsi),%xmm2,%xmm6 - vpxor 48(%rsi),%xmm3,%xmm7 - leaq 64(%rsi),%rsi - - vmovdqu %xmm4,0(%rdi) - vmovdqu %xmm5,16(%rdi) - vmovdqu %xmm6,32(%rdi) - vmovdqu %xmm7,48(%rdi) - leaq 64(%rdi),%rdi - - jz .Ldone_avx512vl - - vextracti128 $1,%ymm0,%xmm4 - vextracti128 $1,%ymm1,%xmm5 - vextracti128 $1,%ymm2,%xmm6 - vextracti128 $1,%ymm3,%xmm7 - - subq $64,%rdx - jb .Ltail_avx512vl - - vpxor 0(%rsi),%xmm4,%xmm4 - vpxor 16(%rsi),%xmm5,%xmm5 - vpxor 32(%rsi),%xmm6,%xmm6 - vpxor 48(%rsi),%xmm7,%xmm7 - leaq 64(%rsi),%rsi - - vmovdqu %xmm4,0(%rdi) - vmovdqu %xmm5,16(%rdi) - vmovdqu %xmm6,32(%rdi) - vmovdqu %xmm7,48(%rdi) - leaq 64(%rdi),%rdi - - vmovdqa32 %ymm16,%ymm0 - vmovdqa32 %ymm17,%ymm1 - jnz .Loop_outer_avx512vl - - jmp .Ldone_avx512vl - -.align 16 -.Ltail64_avx512vl: - vmovdqa %xmm0,0(%rsp) - vmovdqa %xmm1,16(%rsp) - vmovdqa %xmm2,32(%rsp) - vmovdqa %xmm3,48(%rsp) - addq $64,%rdx - jmp .Loop_tail_avx512vl - -.align 16 -.Ltail_avx512vl: - vmovdqa %xmm4,0(%rsp) - vmovdqa %xmm5,16(%rsp) - vmovdqa %xmm6,32(%rsp) - vmovdqa %xmm7,48(%rsp) - addq $64,%rdx - -.Loop_tail_avx512vl: - movzbl (%rsi,%r8,1),%eax - movzbl (%rsp,%r8,1),%ecx - leaq 1(%r8),%r8 - xorl %ecx,%eax - movb %al,-1(%rdi,%r8,1) - decq %rdx - jnz .Loop_tail_avx512vl - - vmovdqa32 %ymm16,0(%rsp) - vmovdqa32 %ymm16,32(%rsp) - -.Ldone_avx512vl: - vzeroall - leaq -8(%r10),%rsp -.Lavx512vl_epilogue: - ret - -.align 32 -.Lchacha20_8xvl: - leaq 8(%rsp),%r10 - subq $64+8,%rsp - andq $-64,%rsp - vzeroupper - - leaq .Lsigma(%rip),%r9 - vbroadcasti128 (%r9),%ymm3 - vbroadcasti128 (%rcx),%ymm7 - vbroadcasti128 16(%rcx),%ymm11 - vbroadcasti128 (%r8),%ymm15 - - vpshufd $0x00,%ymm3,%ymm0 - vpshufd $0x55,%ymm3,%ymm1 - vpshufd $0xaa,%ymm3,%ymm2 - vpshufd 
$0xff,%ymm3,%ymm3 - vmovdqa64 %ymm0,%ymm16 - vmovdqa64 %ymm1,%ymm17 - vmovdqa64 %ymm2,%ymm18 - vmovdqa64 %ymm3,%ymm19 - - vpshufd $0x00,%ymm7,%ymm4 - vpshufd $0x55,%ymm7,%ymm5 - vpshufd $0xaa,%ymm7,%ymm6 - vpshufd $0xff,%ymm7,%ymm7 - vmovdqa64 %ymm4,%ymm20 - vmovdqa64 %ymm5,%ymm21 - vmovdqa64 %ymm6,%ymm22 - vmovdqa64 %ymm7,%ymm23 - - vpshufd $0x00,%ymm11,%ymm8 - vpshufd $0x55,%ymm11,%ymm9 - vpshufd $0xaa,%ymm11,%ymm10 - vpshufd $0xff,%ymm11,%ymm11 - vmovdqa64 %ymm8,%ymm24 - vmovdqa64 %ymm9,%ymm25 - vmovdqa64 %ymm10,%ymm26 - vmovdqa64 %ymm11,%ymm27 - - vpshufd $0x00,%ymm15,%ymm12 - vpshufd $0x55,%ymm15,%ymm13 - vpshufd $0xaa,%ymm15,%ymm14 - vpshufd $0xff,%ymm15,%ymm15 - vpaddd .Lincy(%rip),%ymm12,%ymm12 - vmovdqa64 %ymm12,%ymm28 - vmovdqa64 %ymm13,%ymm29 - vmovdqa64 %ymm14,%ymm30 - vmovdqa64 %ymm15,%ymm31 - - movl $10,%eax - jmp .Loop8xvl - -.align 32 -.Loop_outer8xvl: - - - vpbroadcastd 8(%r9),%ymm2 - vpbroadcastd 12(%r9),%ymm3 - vpaddd .Leight(%rip),%ymm28,%ymm28 - vmovdqa64 %ymm20,%ymm4 - vmovdqa64 %ymm21,%ymm5 - vmovdqa64 %ymm22,%ymm6 - vmovdqa64 %ymm23,%ymm7 - vmovdqa64 %ymm24,%ymm8 - vmovdqa64 %ymm25,%ymm9 - vmovdqa64 %ymm26,%ymm10 - vmovdqa64 %ymm27,%ymm11 - vmovdqa64 %ymm28,%ymm12 - vmovdqa64 %ymm29,%ymm13 - vmovdqa64 %ymm30,%ymm14 - vmovdqa64 %ymm31,%ymm15 - - vmovdqa64 %ymm0,%ymm16 - vmovdqa64 %ymm1,%ymm17 - vmovdqa64 %ymm2,%ymm18 - vmovdqa64 %ymm3,%ymm19 - - movl $10,%eax - jmp .Loop8xvl - -.align 32 -.Loop8xvl: - vpaddd %ymm4,%ymm0,%ymm0 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm7,%ymm3,%ymm3 - vpxor %ymm0,%ymm12,%ymm12 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm3,%ymm15,%ymm15 - vprold $16,%ymm12,%ymm12 - vprold $16,%ymm13,%ymm13 - vprold $16,%ymm14,%ymm14 - vprold $16,%ymm15,%ymm15 - vpaddd %ymm12,%ymm8,%ymm8 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm8,%ymm4,%ymm4 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm11,%ymm7,%ymm7 - 
vprold $12,%ymm4,%ymm4 - vprold $12,%ymm5,%ymm5 - vprold $12,%ymm6,%ymm6 - vprold $12,%ymm7,%ymm7 - vpaddd %ymm4,%ymm0,%ymm0 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm7,%ymm3,%ymm3 - vpxor %ymm0,%ymm12,%ymm12 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm3,%ymm15,%ymm15 - vprold $8,%ymm12,%ymm12 - vprold $8,%ymm13,%ymm13 - vprold $8,%ymm14,%ymm14 - vprold $8,%ymm15,%ymm15 - vpaddd %ymm12,%ymm8,%ymm8 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm15,%ymm11,%ymm11 - vpxor %ymm8,%ymm4,%ymm4 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm11,%ymm7,%ymm7 - vprold $7,%ymm4,%ymm4 - vprold $7,%ymm5,%ymm5 - vprold $7,%ymm6,%ymm6 - vprold $7,%ymm7,%ymm7 - vpaddd %ymm5,%ymm0,%ymm0 - vpaddd %ymm6,%ymm1,%ymm1 - vpaddd %ymm7,%ymm2,%ymm2 - vpaddd %ymm4,%ymm3,%ymm3 - vpxor %ymm0,%ymm15,%ymm15 - vpxor %ymm1,%ymm12,%ymm12 - vpxor %ymm2,%ymm13,%ymm13 - vpxor %ymm3,%ymm14,%ymm14 - vprold $16,%ymm15,%ymm15 - vprold $16,%ymm12,%ymm12 - vprold $16,%ymm13,%ymm13 - vprold $16,%ymm14,%ymm14 - vpaddd %ymm15,%ymm10,%ymm10 - vpaddd %ymm12,%ymm11,%ymm11 - vpaddd %ymm13,%ymm8,%ymm8 - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm10,%ymm5,%ymm5 - vpxor %ymm11,%ymm6,%ymm6 - vpxor %ymm8,%ymm7,%ymm7 - vpxor %ymm9,%ymm4,%ymm4 - vprold $12,%ymm5,%ymm5 - vprold $12,%ymm6,%ymm6 - vprold $12,%ymm7,%ymm7 - vprold $12,%ymm4,%ymm4 - vpaddd %ymm5,%ymm0,%ymm0 - vpaddd %ymm6,%ymm1,%ymm1 - vpaddd %ymm7,%ymm2,%ymm2 - vpaddd %ymm4,%ymm3,%ymm3 - vpxor %ymm0,%ymm15,%ymm15 - vpxor %ymm1,%ymm12,%ymm12 - vpxor %ymm2,%ymm13,%ymm13 - vpxor %ymm3,%ymm14,%ymm14 - vprold $8,%ymm15,%ymm15 - vprold $8,%ymm12,%ymm12 - vprold $8,%ymm13,%ymm13 - vprold $8,%ymm14,%ymm14 - vpaddd %ymm15,%ymm10,%ymm10 - vpaddd %ymm12,%ymm11,%ymm11 - vpaddd %ymm13,%ymm8,%ymm8 - vpaddd %ymm14,%ymm9,%ymm9 - vpxor %ymm10,%ymm5,%ymm5 - vpxor %ymm11,%ymm6,%ymm6 - vpxor %ymm8,%ymm7,%ymm7 - vpxor %ymm9,%ymm4,%ymm4 - vprold $7,%ymm5,%ymm5 - vprold $7,%ymm6,%ymm6 - vprold 
$7,%ymm7,%ymm7 - vprold $7,%ymm4,%ymm4 - decl %eax - jnz .Loop8xvl - - vpaddd %ymm16,%ymm0,%ymm0 - vpaddd %ymm17,%ymm1,%ymm1 - vpaddd %ymm18,%ymm2,%ymm2 - vpaddd %ymm19,%ymm3,%ymm3 - - vpunpckldq %ymm1,%ymm0,%ymm18 - vpunpckldq %ymm3,%ymm2,%ymm19 - vpunpckhdq %ymm1,%ymm0,%ymm0 - vpunpckhdq %ymm3,%ymm2,%ymm2 - vpunpcklqdq %ymm19,%ymm18,%ymm1 - vpunpckhqdq %ymm19,%ymm18,%ymm18 - vpunpcklqdq %ymm2,%ymm0,%ymm3 - vpunpckhqdq %ymm2,%ymm0,%ymm0 - vpaddd %ymm20,%ymm4,%ymm4 - vpaddd %ymm21,%ymm5,%ymm5 - vpaddd %ymm22,%ymm6,%ymm6 - vpaddd %ymm23,%ymm7,%ymm7 - - vpunpckldq %ymm5,%ymm4,%ymm2 - vpunpckldq %ymm7,%ymm6,%ymm19 - vpunpckhdq %ymm5,%ymm4,%ymm4 - vpunpckhdq %ymm7,%ymm6,%ymm6 - vpunpcklqdq %ymm19,%ymm2,%ymm5 - vpunpckhqdq %ymm19,%ymm2,%ymm2 - vpunpcklqdq %ymm6,%ymm4,%ymm7 - vpunpckhqdq %ymm6,%ymm4,%ymm4 - vshufi32x4 $0,%ymm5,%ymm1,%ymm19 - vshufi32x4 $3,%ymm5,%ymm1,%ymm5 - vshufi32x4 $0,%ymm2,%ymm18,%ymm1 - vshufi32x4 $3,%ymm2,%ymm18,%ymm2 - vshufi32x4 $0,%ymm7,%ymm3,%ymm18 - vshufi32x4 $3,%ymm7,%ymm3,%ymm7 - vshufi32x4 $0,%ymm4,%ymm0,%ymm3 - vshufi32x4 $3,%ymm4,%ymm0,%ymm4 - vpaddd %ymm24,%ymm8,%ymm8 - vpaddd %ymm25,%ymm9,%ymm9 - vpaddd %ymm26,%ymm10,%ymm10 - vpaddd %ymm27,%ymm11,%ymm11 - - vpunpckldq %ymm9,%ymm8,%ymm6 - vpunpckldq %ymm11,%ymm10,%ymm0 - vpunpckhdq %ymm9,%ymm8,%ymm8 - vpunpckhdq %ymm11,%ymm10,%ymm10 - vpunpcklqdq %ymm0,%ymm6,%ymm9 - vpunpckhqdq %ymm0,%ymm6,%ymm6 - vpunpcklqdq %ymm10,%ymm8,%ymm11 - vpunpckhqdq %ymm10,%ymm8,%ymm8 - vpaddd %ymm28,%ymm12,%ymm12 - vpaddd %ymm29,%ymm13,%ymm13 - vpaddd %ymm30,%ymm14,%ymm14 - vpaddd %ymm31,%ymm15,%ymm15 - - vpunpckldq %ymm13,%ymm12,%ymm10 - vpunpckldq %ymm15,%ymm14,%ymm0 - vpunpckhdq %ymm13,%ymm12,%ymm12 - vpunpckhdq %ymm15,%ymm14,%ymm14 - vpunpcklqdq %ymm0,%ymm10,%ymm13 - vpunpckhqdq %ymm0,%ymm10,%ymm10 - vpunpcklqdq %ymm14,%ymm12,%ymm15 - vpunpckhqdq %ymm14,%ymm12,%ymm12 - vperm2i128 $0x20,%ymm13,%ymm9,%ymm0 - vperm2i128 $0x31,%ymm13,%ymm9,%ymm13 - vperm2i128 $0x20,%ymm10,%ymm6,%ymm9 - vperm2i128 
$0x31,%ymm10,%ymm6,%ymm10 - vperm2i128 $0x20,%ymm15,%ymm11,%ymm6 - vperm2i128 $0x31,%ymm15,%ymm11,%ymm15 - vperm2i128 $0x20,%ymm12,%ymm8,%ymm11 - vperm2i128 $0x31,%ymm12,%ymm8,%ymm12 - cmpq $512,%rdx - jb .Ltail8xvl - - movl $0x80,%eax - vpxord 0(%rsi),%ymm19,%ymm19 - vpxor 32(%rsi),%ymm0,%ymm0 - vpxor 64(%rsi),%ymm5,%ymm5 - vpxor 96(%rsi),%ymm13,%ymm13 - leaq (%rsi,%rax,1),%rsi - vmovdqu32 %ymm19,0(%rdi) - vmovdqu %ymm0,32(%rdi) - vmovdqu %ymm5,64(%rdi) - vmovdqu %ymm13,96(%rdi) - leaq (%rdi,%rax,1),%rdi - - vpxor 0(%rsi),%ymm1,%ymm1 - vpxor 32(%rsi),%ymm9,%ymm9 - vpxor 64(%rsi),%ymm2,%ymm2 - vpxor 96(%rsi),%ymm10,%ymm10 - leaq (%rsi,%rax,1),%rsi - vmovdqu %ymm1,0(%rdi) - vmovdqu %ymm9,32(%rdi) - vmovdqu %ymm2,64(%rdi) - vmovdqu %ymm10,96(%rdi) - leaq (%rdi,%rax,1),%rdi - - vpxord 0(%rsi),%ymm18,%ymm18 - vpxor 32(%rsi),%ymm6,%ymm6 - vpxor 64(%rsi),%ymm7,%ymm7 - vpxor 96(%rsi),%ymm15,%ymm15 - leaq (%rsi,%rax,1),%rsi - vmovdqu32 %ymm18,0(%rdi) - vmovdqu %ymm6,32(%rdi) - vmovdqu %ymm7,64(%rdi) - vmovdqu %ymm15,96(%rdi) - leaq (%rdi,%rax,1),%rdi - - vpxor 0(%rsi),%ymm3,%ymm3 - vpxor 32(%rsi),%ymm11,%ymm11 - vpxor 64(%rsi),%ymm4,%ymm4 - vpxor 96(%rsi),%ymm12,%ymm12 - leaq (%rsi,%rax,1),%rsi - vmovdqu %ymm3,0(%rdi) - vmovdqu %ymm11,32(%rdi) - vmovdqu %ymm4,64(%rdi) - vmovdqu %ymm12,96(%rdi) - leaq (%rdi,%rax,1),%rdi - - vpbroadcastd 0(%r9),%ymm0 - vpbroadcastd 4(%r9),%ymm1 - - subq $512,%rdx - jnz .Loop_outer8xvl - - jmp .Ldone8xvl - -.align 32 -.Ltail8xvl: - vmovdqa64 %ymm19,%ymm8 - xorq %r9,%r9 - subq %rsi,%rdi - cmpq $64,%rdx - jb .Less_than_64_8xvl - vpxor 0(%rsi),%ymm8,%ymm8 - vpxor 32(%rsi),%ymm0,%ymm0 - vmovdqu %ymm8,0(%rdi,%rsi,1) - vmovdqu %ymm0,32(%rdi,%rsi,1) - je .Ldone8xvl - vmovdqa %ymm5,%ymm8 - vmovdqa %ymm13,%ymm0 - leaq 64(%rsi),%rsi - - cmpq $128,%rdx - jb .Less_than_64_8xvl - vpxor 0(%rsi),%ymm5,%ymm5 - vpxor 32(%rsi),%ymm13,%ymm13 - vmovdqu %ymm5,0(%rdi,%rsi,1) - vmovdqu %ymm13,32(%rdi,%rsi,1) - je .Ldone8xvl - vmovdqa %ymm1,%ymm8 - vmovdqa 
%ymm9,%ymm0 - leaq 64(%rsi),%rsi - - cmpq $192,%rdx - jb .Less_than_64_8xvl - vpxor 0(%rsi),%ymm1,%ymm1 - vpxor 32(%rsi),%ymm9,%ymm9 - vmovdqu %ymm1,0(%rdi,%rsi,1) - vmovdqu %ymm9,32(%rdi,%rsi,1) - je .Ldone8xvl - vmovdqa %ymm2,%ymm8 - vmovdqa %ymm10,%ymm0 - leaq 64(%rsi),%rsi - - cmpq $256,%rdx - jb .Less_than_64_8xvl - vpxor 0(%rsi),%ymm2,%ymm2 - vpxor 32(%rsi),%ymm10,%ymm10 - vmovdqu %ymm2,0(%rdi,%rsi,1) - vmovdqu %ymm10,32(%rdi,%rsi,1) - je .Ldone8xvl - vmovdqa32 %ymm18,%ymm8 - vmovdqa %ymm6,%ymm0 - leaq 64(%rsi),%rsi - - cmpq $320,%rdx - jb .Less_than_64_8xvl - vpxord 0(%rsi),%ymm18,%ymm18 - vpxor 32(%rsi),%ymm6,%ymm6 - vmovdqu32 %ymm18,0(%rdi,%rsi,1) - vmovdqu %ymm6,32(%rdi,%rsi,1) - je .Ldone8xvl - vmovdqa %ymm7,%ymm8 - vmovdqa %ymm15,%ymm0 - leaq 64(%rsi),%rsi - - cmpq $384,%rdx - jb .Less_than_64_8xvl - vpxor 0(%rsi),%ymm7,%ymm7 - vpxor 32(%rsi),%ymm15,%ymm15 - vmovdqu %ymm7,0(%rdi,%rsi,1) - vmovdqu %ymm15,32(%rdi,%rsi,1) - je .Ldone8xvl - vmovdqa %ymm3,%ymm8 - vmovdqa %ymm11,%ymm0 - leaq 64(%rsi),%rsi - - cmpq $448,%rdx - jb .Less_than_64_8xvl - vpxor 0(%rsi),%ymm3,%ymm3 - vpxor 32(%rsi),%ymm11,%ymm11 - vmovdqu %ymm3,0(%rdi,%rsi,1) - vmovdqu %ymm11,32(%rdi,%rsi,1) - je .Ldone8xvl - vmovdqa %ymm4,%ymm8 - vmovdqa %ymm12,%ymm0 - leaq 64(%rsi),%rsi - -.Less_than_64_8xvl: - vmovdqa %ymm8,0(%rsp) - vmovdqa %ymm0,32(%rsp) - leaq (%rdi,%rsi,1),%rdi - andq $63,%rdx - -.Loop_tail8xvl: - movzbl (%rsi,%r9,1),%eax - movzbl (%rsp,%r9,1),%ecx - leaq 1(%r9),%r9 - xorl %ecx,%eax - movb %al,-1(%rdi,%r9,1) - decq %rdx - jnz .Loop_tail8xvl - - vpxor %ymm8,%ymm8,%ymm8 - vmovdqa %ymm8,0(%rsp) - vmovdqa %ymm8,32(%rsp) - -.Ldone8xvl: - vzeroall - leaq -8(%r10),%rsp -.L8xvl_epilogue: - ret -ENDPROC(chacha20_avx512vl) - -#endif /* CONFIG_AS_AVX512 */ diff --git a/src/crypto/zinc/chacha20/chacha20-x86_64.pl b/src/crypto/zinc/chacha20/chacha20-x86_64.pl new file mode 100644 index 0000000..b54f3b1 --- /dev/null +++ b/src/crypto/zinc/chacha20/chacha20-x86_64.pl @@ -0,0 +1,4005 @@ +#! 
/usr/bin/env perl +# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# November 2014 +# +# ChaCha20 for x86_64. +# +# December 2016 +# +# Add AVX512F code path. +# +# December 2017 +# +# Add AVX512VL code path. +# +# Performance in cycles per byte out of large buffer. +# +# IALU/gcc 4.8(i) 1x/2xSSSE3(ii) 4xSSSE3 NxAVX(v) +# +# P4 9.48/+99% - - +# Core2 7.83/+55% 7.90/5.76 4.35 +# Westmere 7.19/+50% 5.60/4.50 3.00 +# Sandy Bridge 8.31/+42% 5.45/4.00 2.72 +# Ivy Bridge 6.71/+46% 5.40/? 2.41 +# Haswell 5.92/+43% 5.20/3.45 2.42 1.23 +# Skylake[-X] 5.87/+39% 4.70/3.22 2.31 1.19[0.80(vi)] +# Silvermont 12.0/+33% 7.75/6.90 7.03(iii) +# Knights L 11.7/- ? 
9.60(iii) 0.80 +# Goldmont 10.6/+17% 5.10/3.52 3.28 +# Sledgehammer 7.28/+52% - - +# Bulldozer 9.66/+28% 9.85/5.35(iv) 3.06(iv) +# Ryzen 5.96/+50% 5.19/3.00 2.40 2.09 +# VIA Nano 10.5/+46% 6.72/6.88 6.05 +# +# (i) compared to older gcc 3.x one can observe >2x improvement on +# most platforms; +# (ii) 2xSSSE3 is code path optimized specifically for 128 bytes used +# by chacha20_poly1305_tls_cipher, results are EVP-free; +# (iii) this is not optimal result for Atom because of MSROM +# limitations, SSE2 can do better, but gain is considered too +# low to justify the [maintenance] effort; +# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20 +# and 4.85 for 128-byte inputs; +# (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable; +# (vi) even though Skylake-X can execute AVX512F code and deliver 0.57 +# cpb in single thread, the corresponding capability is suppressed; + +$flavour = shift; # optional assembler flavour, forwarded to x86_64-xlate.pl +$output = shift; # output file name +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # a lone "file.ext" argument is the output, not a flavour + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; +# Probe the toolchain: each version threshold that is met adds one to $avx; later code emits the AVX512 branches only when $avx>2. +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { + $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); + $avx += 1 if ($1==2.11 && $2>=8); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} +# clang/LLVM: capture the whole major.minor so two-digit majors (clang 10.0 and later) are recognised; a single-digit [3-9] major could never match them and left $avx at 0. +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +open OUT,"| \"$^X\" \"$xlate\" 
$flavour \"$output\""; +*STDOUT=*OUT; + +# input parameter block +($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8"); + +$code.=<<___; +.text + +.extern OPENSSL_ia32cap_P + +.align 64 +.Lzero: +.long 0,0,0,0 +.Lone: +.long 1,0,0,0 +.Linc: +.long 0,1,2,3 +.Lfour: +.long 4,4,4,4 +.Lincy: +.long 0,2,4,6,1,3,5,7 +.Leight: +.long 8,8,8,8,8,8,8,8 +.Lrot16: +.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd +.Lrot24: +.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe +.Ltwoy: +.long 2,0,0,0, 2,0,0,0 +.align 64 +.Lzeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +.Lfourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +.Lincz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +.Lsixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 +.Lsigma: +.asciz "expand 32-byte k" +.asciz "ChaCha20 for x86_64, CRYPTOGAMS by " +___ + +sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; + my $arg = pop; + $arg = "\$$arg" if ($arg*1 eq $arg); + $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; +} + +@x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)), + "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15))); +@t=("%esi","%edi"); + +sub ROUND { # critical path is 24 cycles per round +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my ($xc,$xc_)=map("\"$_\"",@t); +my @x=map("\"$_\"",@x); + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' and 'd's are permanently allocated in registers, + # @x[0..7,12..15], while 'c's are maintained in memory. 
If + # you observe the 'c' column, you'll notice that a pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # a bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. + + # Normally instructions would be interleaved to favour in-order + # execution. Generally out-of-order cores manage it gracefully, + # but not this time for some reason. As in-order execution + # cores are a dying breed, old Atom is the only one around, + # instructions are left uninterleaved. Besides, Atom is better + # off executing 1xSSSE3 code anyway... + + ( + "&add (@x[$a0],@x[$b0])", # Q1 + "&xor (@x[$d0],@x[$a0])", + "&rol (@x[$d0],16)", + "&add (@x[$a1],@x[$b1])", # Q2 + "&xor (@x[$d1],@x[$a1])", + "&rol (@x[$d1],16)", + + "&add ($xc,@x[$d0])", + "&xor (@x[$b0],$xc)", + "&rol (@x[$b0],12)", + "&add ($xc_,@x[$d1])", + "&xor (@x[$b1],$xc_)", + "&rol (@x[$b1],12)", + + "&add (@x[$a0],@x[$b0])", + "&xor (@x[$d0],@x[$a0])", + "&rol (@x[$d0],8)", + "&add (@x[$a1],@x[$b1])", + "&xor (@x[$d1],@x[$a1])", + "&rol (@x[$d1],8)", + + "&add ($xc,@x[$d0])", + "&xor (@x[$b0],$xc)", + "&rol (@x[$b0],7)", + "&add ($xc_,@x[$d1])", + "&xor (@x[$b1],$xc_)", + "&rol (@x[$b1],7)", + + "&mov (\"4*$c0(%rsp)\",$xc)", # spill current pair of 'c's, then load the other pair + "&mov (\"4*$c1(%rsp)\",$xc_)", + "&mov ($xc,\"4*$c2(%rsp)\")", + "&mov ($xc_,\"4*$c3(%rsp)\")", + + "&add (@x[$a2],@x[$b2])", # Q3 + "&xor (@x[$d2],@x[$a2])", + "&rol (@x[$d2],16)", + "&add (@x[$a3],@x[$b3])", # Q4 + "&xor (@x[$d3],@x[$a3])", + "&rol (@x[$d3],16)", + + "&add ($xc,@x[$d2])", + "&xor (@x[$b2],$xc)", + "&rol (@x[$b2],12)", + "&add ($xc_,@x[$d3])", + "&xor (@x[$b3],$xc_)", + "&rol (@x[$b3],12)", + + "&add (@x[$a2],@x[$b2])", + "&xor (@x[$d2],@x[$a2])", + "&rol (@x[$d2],8)", + "&add (@x[$a3],@x[$b3])", + "&xor (@x[$d3],@x[$a3])", + "&rol (@x[$d3],8)", + + "&add ($xc,@x[$d2])", + "&xor (@x[$b2],$xc)", + "&rol (@x[$b2],7)", + "&add ($xc_,@x[$d3])", + "&xor 
(@x[$b3],$xc_)", + "&rol (@x[$b3],7)" + ); +} + +######################################################################## +# Generic code path that handles all lengths on pre-SSSE3 processors. +$code.=<<___; +.globl ChaCha20_ctr32 +.type ChaCha20_ctr32,\@function,5 +.align 64 +ChaCha20_ctr32: +.cfi_startproc + cmp \$0,$len + je .Lno_data + mov OPENSSL_ia32cap_P+4(%rip),%r10 +___ +$code.=<<___ if ($avx>2); + bt \$48,%r10 # check for AVX512F + jc .LChaCha20_avx512 + test %r10,%r10 # check for AVX512VL + js .LChaCha20_avx512vl +___ +$code.=<<___; + test \$`1<<(41-32)`,%r10d + jnz .LChaCha20_ssse3 + + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$64+24,%rsp +.cfi_adjust_cfa_offset 64+24 +.Lctr32_body: + + #movdqa .Lsigma(%rip),%xmm0 + movdqu ($key),%xmm1 + movdqu 16($key),%xmm2 + movdqu ($counter),%xmm3 + movdqa .Lone(%rip),%xmm4 + + #movdqa %xmm0,4*0(%rsp) # key[0] + movdqa %xmm1,4*4(%rsp) # key[1] + movdqa %xmm2,4*8(%rsp) # key[2] + movdqa %xmm3,4*12(%rsp) # key[3] + mov $len,%rbp # reassign $len + jmp .Loop_outer + +.align 32 +.Loop_outer: + mov \$0x61707865,@x[0] # 'expa' + mov \$0x3320646e,@x[1] # 'nd 3' + mov \$0x79622d32,@x[2] # '2-by' + mov \$0x6b206574,@x[3] # 'te k' + mov 4*4(%rsp),@x[4] + mov 4*5(%rsp),@x[5] + mov 4*6(%rsp),@x[6] + mov 4*7(%rsp),@x[7] + movd %xmm3,@x[12] + mov 4*13(%rsp),@x[13] + mov 4*14(%rsp),@x[14] + mov 4*15(%rsp),@x[15] + + mov %rbp,64+0(%rsp) # save len + mov \$10,%ebp + mov $inp,64+8(%rsp) # save inp + movq %xmm2,%rsi # "@x[8]" + mov $out,64+16(%rsp) # save out + mov %rsi,%rdi + shr \$32,%rdi # "@x[9]" + jmp .Loop + +.align 32 +.Loop: +___ + foreach (&ROUND (0, 4, 8,12)) { eval; } + foreach (&ROUND (0, 5,10,15)) { eval; } + &dec ("%ebp"); + &jnz (".Loop"); + +$code.=<<___; + mov @t[1],4*9(%rsp) # modulo-scheduled + mov @t[0],4*8(%rsp) + mov 64(%rsp),%rbp # load len + movdqa %xmm2,%xmm1 + mov 64+8(%rsp),$inp # 
load inp + paddd %xmm4,%xmm3 # increment counter + mov 64+16(%rsp),$out # load out + + add \$0x61707865,@x[0] # 'expa' + add \$0x3320646e,@x[1] # 'nd 3' + add \$0x79622d32,@x[2] # '2-by' + add \$0x6b206574,@x[3] # 'te k' + add 4*4(%rsp),@x[4] + add 4*5(%rsp),@x[5] + add 4*6(%rsp),@x[6] + add 4*7(%rsp),@x[7] + add 4*12(%rsp),@x[12] + add 4*13(%rsp),@x[13] + add 4*14(%rsp),@x[14] + add 4*15(%rsp),@x[15] + paddd 4*8(%rsp),%xmm1 + + cmp \$64,%rbp + jb .Ltail + + xor 4*0($inp),@x[0] # xor with input + xor 4*1($inp),@x[1] + xor 4*2($inp),@x[2] + xor 4*3($inp),@x[3] + xor 4*4($inp),@x[4] + xor 4*5($inp),@x[5] + xor 4*6($inp),@x[6] + xor 4*7($inp),@x[7] + movdqu 4*8($inp),%xmm0 + xor 4*12($inp),@x[12] + xor 4*13($inp),@x[13] + xor 4*14($inp),@x[14] + xor 4*15($inp),@x[15] + lea 4*16($inp),$inp # inp+=64 + pxor %xmm1,%xmm0 + + movdqa %xmm2,4*8(%rsp) + movd %xmm3,4*12(%rsp) + + mov @x[0],4*0($out) # write output + mov @x[1],4*1($out) + mov @x[2],4*2($out) + mov @x[3],4*3($out) + mov @x[4],4*4($out) + mov @x[5],4*5($out) + mov @x[6],4*6($out) + mov @x[7],4*7($out) + movdqu %xmm0,4*8($out) + mov @x[12],4*12($out) + mov @x[13],4*13($out) + mov @x[14],4*14($out) + mov @x[15],4*15($out) + lea 4*16($out),$out # out+=64 + + sub \$64,%rbp + jnz .Loop_outer + + jmp .Ldone + +.align 16 +.Ltail: + mov @x[0],4*0(%rsp) + mov @x[1],4*1(%rsp) + xor %rbx,%rbx + mov @x[2],4*2(%rsp) + mov @x[3],4*3(%rsp) + mov @x[4],4*4(%rsp) + mov @x[5],4*5(%rsp) + mov @x[6],4*6(%rsp) + mov @x[7],4*7(%rsp) + movdqa %xmm1,4*8(%rsp) + mov @x[12],4*12(%rsp) + mov @x[13],4*13(%rsp) + mov @x[14],4*14(%rsp) + mov @x[15],4*15(%rsp) + +.Loop_tail: + movzb ($inp,%rbx),%eax + movzb (%rsp,%rbx),%edx + lea 1(%rbx),%rbx + xor %edx,%eax + mov %al,-1($out,%rbx) + dec %rbp + jnz .Loop_tail + +.Ldone: + lea 64+24+48(%rsp),%rsi +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp 
+.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lno_data: + ret +.cfi_endproc +.size ChaCha20_ctr32,.-ChaCha20_ctr32 +___ + +######################################################################## +# SSSE3 code path that handles shorter lengths +{ +my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7)); + +sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round + &paddd ($a,$b); + &pxor ($d,$a); + &pshufb ($d,$rot16); + + &paddd ($c,$d); + &pxor ($b,$c); + &movdqa ($t,$b); + &psrld ($b,20); + &pslld ($t,12); + &por ($b,$t); + + &paddd ($a,$b); + &pxor ($d,$a); + &pshufb ($d,$rot24); + + &paddd ($c,$d); + &pxor ($b,$c); + &movdqa ($t,$b); + &psrld ($b,25); + &pslld ($t,7); + &por ($b,$t); +} + +my $xframe = $win64 ? 32+8 : 8; + +$code.=<<___; +.type ChaCha20_ssse3,\@function,5 +.align 32 +ChaCha20_ssse3: +.cfi_startproc +.LChaCha20_ssse3: + mov %rsp,%r9 # frame pointer +.cfi_def_cfa_register %r9 +___ +$code.=<<___ if ($avx); + test \$`1<<(43-32)`,%r10d + jnz .LChaCha20_4xop # XOP is fastest even if we use 1/4 +___ +$code.=<<___; + cmp \$128,$len # we might throw away some data, + je .LChaCha20_128 + ja .LChaCha20_4x # but overall it won't be slower + +.Ldo_sse3_after_all: + sub \$64+$xframe,%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0x28(%r9) + movaps %xmm7,-0x18(%r9) +.Lssse3_body: +___ +$code.=<<___; + movdqa .Lsigma(%rip),$a + movdqu ($key),$b + movdqu 16($key),$c + movdqu ($counter),$d + movdqa .Lrot16(%rip),$rot16 + movdqa .Lrot24(%rip),$rot24 + + movdqa $a,0x00(%rsp) + movdqa $b,0x10(%rsp) + movdqa $c,0x20(%rsp) + movdqa $d,0x30(%rsp) + mov \$10,$counter # reuse $counter + jmp .Loop_ssse3 + +.align 32 +.Loop_outer_ssse3: + movdqa .Lone(%rip),$d + movdqa 0x00(%rsp),$a + movdqa 0x10(%rsp),$b + movdqa 0x20(%rsp),$c + paddd 0x30(%rsp),$d + mov \$10,$counter + movdqa $d,0x30(%rsp) + jmp .Loop_ssse3 + +.align 32 +.Loop_ssse3: +___ + &SSSE3ROUND(); + &pshufd ($c,$c,0b01001110); + &pshufd 
($b,$b,0b00111001); + &pshufd ($d,$d,0b10010011); + &nop (); + + &SSSE3ROUND(); + &pshufd ($c,$c,0b01001110); + &pshufd ($b,$b,0b10010011); + &pshufd ($d,$d,0b00111001); + + &dec ($counter); + &jnz (".Loop_ssse3"); + +$code.=<<___; + paddd 0x00(%rsp),$a + paddd 0x10(%rsp),$b + paddd 0x20(%rsp),$c + paddd 0x30(%rsp),$d + + cmp \$64,$len + jb .Ltail_ssse3 + + movdqu 0x00($inp),$t + movdqu 0x10($inp),$t1 + pxor $t,$a # xor with input + movdqu 0x20($inp),$t + pxor $t1,$b + movdqu 0x30($inp),$t1 + lea 0x40($inp),$inp # inp+=64 + pxor $t,$c + pxor $t1,$d + + movdqu $a,0x00($out) # write output + movdqu $b,0x10($out) + movdqu $c,0x20($out) + movdqu $d,0x30($out) + lea 0x40($out),$out # out+=64 + + sub \$64,$len + jnz .Loop_outer_ssse3 + + jmp .Ldone_ssse3 + +.align 16 +.Ltail_ssse3: + movdqa $a,0x00(%rsp) + movdqa $b,0x10(%rsp) + movdqa $c,0x20(%rsp) + movdqa $d,0x30(%rsp) + xor $counter,$counter + +.Loop_tail_ssse3: + movzb ($inp,$counter),%eax + movzb (%rsp,$counter),%ecx + lea 1($counter),$counter + xor %ecx,%eax + mov %al,-1($out,$counter) + dec $len + jnz .Loop_tail_ssse3 + +.Ldone_ssse3: +___ +$code.=<<___ if ($win64); + movaps -0x28(%r9),%xmm6 + movaps -0x18(%r9),%xmm7 +___ +$code.=<<___; + lea (%r9),%rsp +.cfi_def_cfa_register %rsp +.Lssse3_epilogue: + ret +.cfi_endproc +.size ChaCha20_ssse3,.-ChaCha20_ssse3 +___ +} + +######################################################################## +# SSSE3 code path that handles 128-byte inputs +{ +my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7)); +my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1)); + +sub SSSE3ROUND_2x { + &paddd ($a,$b); + &pxor ($d,$a); + &paddd ($a1,$b1); + &pxor ($d1,$a1); + &pshufb ($d,$rot16); + &pshufb($d1,$rot16); + + &paddd ($c,$d); + &paddd ($c1,$d1); + &pxor ($b,$c); + &pxor ($b1,$c1); + &movdqa ($t,$b); + &psrld ($b,20); + &movdqa($t1,$b1); + &pslld ($t,12); + &psrld ($b1,20); + &por ($b,$t); + &pslld ($t1,12); + &por ($b1,$t1); + + &paddd ($a,$b); + &pxor ($d,$a); + &paddd 
($a1,$b1); + &pxor ($d1,$a1); + &pshufb ($d,$rot24); + &pshufb($d1,$rot24); + + &paddd ($c,$d); + &paddd ($c1,$d1); + &pxor ($b,$c); + &pxor ($b1,$c1); + &movdqa ($t,$b); + &psrld ($b,25); + &movdqa($t1,$b1); + &pslld ($t,7); + &psrld ($b1,25); + &por ($b,$t); + &pslld ($t1,7); + &por ($b1,$t1); +} + +my $xframe = $win64 ? 0x68 : 8; + +$code.=<<___; +.type ChaCha20_128,\@function,5 +.align 32 +ChaCha20_128: +.cfi_startproc +.LChaCha20_128: + mov %rsp,%r9 # frame pointer +.cfi_def_cfa_register %r9 + sub \$64+$xframe,%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0x68(%r9) + movaps %xmm7,-0x58(%r9) + movaps %xmm8,-0x48(%r9) + movaps %xmm9,-0x38(%r9) + movaps %xmm10,-0x28(%r9) + movaps %xmm11,-0x18(%r9) +.L128_body: +___ +$code.=<<___; + movdqa .Lsigma(%rip),$a + movdqu ($key),$b + movdqu 16($key),$c + movdqu ($counter),$d + movdqa .Lone(%rip),$d1 + movdqa .Lrot16(%rip),$rot16 + movdqa .Lrot24(%rip),$rot24 + + movdqa $a,$a1 + movdqa $a,0x00(%rsp) + movdqa $b,$b1 + movdqa $b,0x10(%rsp) + movdqa $c,$c1 + movdqa $c,0x20(%rsp) + paddd $d,$d1 + movdqa $d,0x30(%rsp) + mov \$10,$counter # reuse $counter + jmp .Loop_128 + +.align 32 +.Loop_128: +___ + &SSSE3ROUND_2x(); + &pshufd ($c,$c,0b01001110); + &pshufd ($b,$b,0b00111001); + &pshufd ($d,$d,0b10010011); + &pshufd ($c1,$c1,0b01001110); + &pshufd ($b1,$b1,0b00111001); + &pshufd ($d1,$d1,0b10010011); + + &SSSE3ROUND_2x(); + &pshufd ($c,$c,0b01001110); + &pshufd ($b,$b,0b10010011); + &pshufd ($d,$d,0b00111001); + &pshufd ($c1,$c1,0b01001110); + &pshufd ($b1,$b1,0b10010011); + &pshufd ($d1,$d1,0b00111001); + + &dec ($counter); + &jnz (".Loop_128"); + +$code.=<<___; + paddd 0x00(%rsp),$a + paddd 0x10(%rsp),$b + paddd 0x20(%rsp),$c + paddd 0x30(%rsp),$d + paddd .Lone(%rip),$d1 + paddd 0x00(%rsp),$a1 + paddd 0x10(%rsp),$b1 + paddd 0x20(%rsp),$c1 + paddd 0x30(%rsp),$d1 + + movdqu 0x00($inp),$t + movdqu 0x10($inp),$t1 + pxor $t,$a # xor with input + movdqu 0x20($inp),$t + pxor $t1,$b + movdqu 0x30($inp),$t1 + pxor $t,$c + 
movdqu 0x40($inp),$t + pxor $t1,$d + movdqu 0x50($inp),$t1 + pxor $t,$a1 + movdqu 0x60($inp),$t + pxor $t1,$b1 + movdqu 0x70($inp),$t1 + pxor $t,$c1 + pxor $t1,$d1 + + movdqu $a,0x00($out) # write output + movdqu $b,0x10($out) + movdqu $c,0x20($out) + movdqu $d,0x30($out) + movdqu $a1,0x40($out) + movdqu $b1,0x50($out) + movdqu $c1,0x60($out) + movdqu $d1,0x70($out) +___ +$code.=<<___ if ($win64); + movaps -0x68(%r9),%xmm6 + movaps -0x58(%r9),%xmm7 + movaps -0x48(%r9),%xmm8 + movaps -0x38(%r9),%xmm9 + movaps -0x28(%r9),%xmm10 + movaps -0x18(%r9),%xmm11 +___ +$code.=<<___; + lea (%r9),%rsp +.cfi_def_cfa_register %rsp +.L128_epilogue: + ret +.cfi_endproc +.size ChaCha20_128,.-ChaCha20_128 +___ +} + +######################################################################## +# SSSE3 code path that handles longer messages. +{ +# assign variables to favor Atom front-end +my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3, + $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15)); +my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); + +sub SSSE3_lane_ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); +my @x=map("\"$_\"",@xx); + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' and 'd's are permanently allocated in registers, + # @x[0..7,12..15], while 'c's are maintained in memory. If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. 
This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. + + ( + "&paddd (@x[$a0],@x[$b0])", # Q1 + "&paddd (@x[$a1],@x[$b1])", # Q2 + "&pxor (@x[$d0],@x[$a0])", + "&pxor (@x[$d1],@x[$a1])", + "&pshufb (@x[$d0],$t1)", + "&pshufb (@x[$d1],$t1)", + + "&paddd ($xc,@x[$d0])", + "&paddd ($xc_,@x[$d1])", + "&pxor (@x[$b0],$xc)", + "&pxor (@x[$b1],$xc_)", + "&movdqa ($t0,@x[$b0])", + "&pslld (@x[$b0],12)", + "&psrld ($t0,20)", + "&movdqa ($t1,@x[$b1])", + "&pslld (@x[$b1],12)", + "&por (@x[$b0],$t0)", + "&psrld ($t1,20)", + "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) + "&por (@x[$b1],$t1)", + + "&paddd (@x[$a0],@x[$b0])", + "&paddd (@x[$a1],@x[$b1])", + "&pxor (@x[$d0],@x[$a0])", + "&pxor (@x[$d1],@x[$a1])", + "&pshufb (@x[$d0],$t0)", + "&pshufb (@x[$d1],$t0)", + + "&paddd ($xc,@x[$d0])", + "&paddd ($xc_,@x[$d1])", + "&pxor (@x[$b0],$xc)", + "&pxor (@x[$b1],$xc_)", + "&movdqa ($t1,@x[$b0])", + "&pslld (@x[$b0],7)", + "&psrld ($t1,25)", + "&movdqa ($t0,@x[$b1])", + "&pslld (@x[$b1],7)", + "&por (@x[$b0],$t1)", + "&psrld ($t0,25)", + "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) + "&por (@x[$b1],$t0)", + + "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's + "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)", + "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")", + "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")", + + "&paddd (@x[$a2],@x[$b2])", # Q3 + "&paddd (@x[$a3],@x[$b3])", # Q4 + "&pxor (@x[$d2],@x[$a2])", + "&pxor (@x[$d3],@x[$a3])", + "&pshufb (@x[$d2],$t1)", + "&pshufb (@x[$d3],$t1)", + + "&paddd ($xc,@x[$d2])", + "&paddd ($xc_,@x[$d3])", + "&pxor (@x[$b2],$xc)", + "&pxor (@x[$b3],$xc_)", + "&movdqa ($t0,@x[$b2])", + "&pslld (@x[$b2],12)", + "&psrld ($t0,20)", + "&movdqa ($t1,@x[$b3])", + "&pslld (@x[$b3],12)", + "&por (@x[$b2],$t0)", + "&psrld ($t1,20)", + "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) + "&por (@x[$b3],$t1)", + + "&paddd (@x[$a2],@x[$b2])", + "&paddd (@x[$a3],@x[$b3])", + "&pxor (@x[$d2],@x[$a2])", + "&pxor (@x[$d3],@x[$a3])", 
+ "&pshufb (@x[$d2],$t0)", + "&pshufb (@x[$d3],$t0)", + + "&paddd ($xc,@x[$d2])", + "&paddd ($xc_,@x[$d3])", + "&pxor (@x[$b2],$xc)", + "&pxor (@x[$b3],$xc_)", + "&movdqa ($t1,@x[$b2])", + "&pslld (@x[$b2],7)", + "&psrld ($t1,25)", + "&movdqa ($t0,@x[$b3])", + "&pslld (@x[$b3],7)", + "&por (@x[$b2],$t1)", + "&psrld ($t0,25)", + "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) + "&por (@x[$b3],$t0)" + ); +} + +my $xframe = $win64 ? 0xa8 : 8; + +$code.=<<___; +.type ChaCha20_4x,\@function,5 +.align 32 +ChaCha20_4x: +.cfi_startproc +.LChaCha20_4x: + mov %rsp,%r9 # frame pointer +.cfi_def_cfa_register %r9 + mov %r10,%r11 +___ +$code.=<<___ if ($avx>1); + shr \$32,%r10 # OPENSSL_ia32cap_P+8 + test \$`1<<5`,%r10 # test AVX2 + jnz .LChaCha20_8x +___ +$code.=<<___; + cmp \$192,$len + ja .Lproceed4x + + and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE + cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE + je .Ldo_sse3_after_all # to detect Atom + +.Lproceed4x: + sub \$0x140+$xframe,%rsp +___ + ################ stack layout + # +0x00 SIMD equivalent of @x[8-11] (area is reused to offload $xaN after the rounds) + # ... + # +0x40 constant copy of key[0-2] smashed by lanes + # ... + # +0x100 SIMD counters (with nonce smashed by lanes) + # ... + # +0x140 +$code.=<<___ if ($win64); + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L4x_body: +___ +$code.=<<___; + movdqa .Lsigma(%rip),$xa3 # key[0] + movdqu ($key),$xb3 # key[1] + movdqu 16($key),$xt3 # key[2] + movdqu ($counter),$xd3 # key[3] + lea 0x100(%rsp),%rcx # size optimization + lea .Lrot16(%rip),%r10 + lea .Lrot24(%rip),%r11 + + pshufd \$0x00,$xa3,$xa0 # smash key by lanes... + pshufd \$0x55,$xa3,$xa1 + movdqa $xa0,0x40(%rsp) # ... 
and offload + pshufd \$0xaa,$xa3,$xa2 + movdqa $xa1,0x50(%rsp) + pshufd \$0xff,$xa3,$xa3 + movdqa $xa2,0x60(%rsp) + movdqa $xa3,0x70(%rsp) + + pshufd \$0x00,$xb3,$xb0 + pshufd \$0x55,$xb3,$xb1 + movdqa $xb0,0x80-0x100(%rcx) + pshufd \$0xaa,$xb3,$xb2 + movdqa $xb1,0x90-0x100(%rcx) + pshufd \$0xff,$xb3,$xb3 + movdqa $xb2,0xa0-0x100(%rcx) + movdqa $xb3,0xb0-0x100(%rcx) + + pshufd \$0x00,$xt3,$xt0 # "$xc0" + pshufd \$0x55,$xt3,$xt1 # "$xc1" + movdqa $xt0,0xc0-0x100(%rcx) + pshufd \$0xaa,$xt3,$xt2 # "$xc2" + movdqa $xt1,0xd0-0x100(%rcx) + pshufd \$0xff,$xt3,$xt3 # "$xc3" + movdqa $xt2,0xe0-0x100(%rcx) + movdqa $xt3,0xf0-0x100(%rcx) + + pshufd \$0x00,$xd3,$xd0 + pshufd \$0x55,$xd3,$xd1 + paddd .Linc(%rip),$xd0 # don't save counters yet + pshufd \$0xaa,$xd3,$xd2 + movdqa $xd1,0x110-0x100(%rcx) + pshufd \$0xff,$xd3,$xd3 + movdqa $xd2,0x120-0x100(%rcx) + movdqa $xd3,0x130-0x100(%rcx) + + jmp .Loop_enter4x + +.align 32 +.Loop_outer4x: + movdqa 0x40(%rsp),$xa0 # re-load smashed key + movdqa 0x50(%rsp),$xa1 + movdqa 0x60(%rsp),$xa2 + movdqa 0x70(%rsp),$xa3 + movdqa 0x80-0x100(%rcx),$xb0 + movdqa 0x90-0x100(%rcx),$xb1 + movdqa 0xa0-0x100(%rcx),$xb2 + movdqa 0xb0-0x100(%rcx),$xb3 + movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" + movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" + movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" + movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" + movdqa 0x100-0x100(%rcx),$xd0 + movdqa 0x110-0x100(%rcx),$xd1 + movdqa 0x120-0x100(%rcx),$xd2 + movdqa 0x130-0x100(%rcx),$xd3 + paddd .Lfour(%rip),$xd0 # next SIMD counters + +.Loop_enter4x: + movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]" + movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]" + movdqa (%r10),$xt3 # .Lrot16(%rip) + mov \$10,%eax + movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters + jmp .Loop4x + +.align 32 +.Loop4x: +___ + foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; } + foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + dec %eax + jnz .Loop4x + + paddd 0x40(%rsp),$xa0 # accumulate key material + paddd 
0x50(%rsp),$xa1 + paddd 0x60(%rsp),$xa2 + paddd 0x70(%rsp),$xa3 + + movdqa $xa0,$xt2 # "de-interlace" data + punpckldq $xa1,$xa0 + movdqa $xa2,$xt3 + punpckldq $xa3,$xa2 + punpckhdq $xa1,$xt2 + punpckhdq $xa3,$xt3 + movdqa $xa0,$xa1 + punpcklqdq $xa2,$xa0 # "a0" + movdqa $xt2,$xa3 + punpcklqdq $xt3,$xt2 # "a2" + punpckhqdq $xa2,$xa1 # "a1" + punpckhqdq $xt3,$xa3 # "a3" +___ + ($xa2,$xt2)=($xt2,$xa2); +$code.=<<___; + paddd 0x80-0x100(%rcx),$xb0 + paddd 0x90-0x100(%rcx),$xb1 + paddd 0xa0-0x100(%rcx),$xb2 + paddd 0xb0-0x100(%rcx),$xb3 + + movdqa $xa0,0x00(%rsp) # offload $xaN + movdqa $xa1,0x10(%rsp) + movdqa 0x20(%rsp),$xa0 # "xc2" + movdqa 0x30(%rsp),$xa1 # "xc3" + + movdqa $xb0,$xt2 + punpckldq $xb1,$xb0 + movdqa $xb2,$xt3 + punpckldq $xb3,$xb2 + punpckhdq $xb1,$xt2 + punpckhdq $xb3,$xt3 + movdqa $xb0,$xb1 + punpcklqdq $xb2,$xb0 # "b0" + movdqa $xt2,$xb3 + punpcklqdq $xt3,$xt2 # "b2" + punpckhqdq $xb2,$xb1 # "b1" + punpckhqdq $xt3,$xb3 # "b3" +___ + ($xb2,$xt2)=($xt2,$xb2); + my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); +$code.=<<___; + paddd 0xc0-0x100(%rcx),$xc0 + paddd 0xd0-0x100(%rcx),$xc1 + paddd 0xe0-0x100(%rcx),$xc2 + paddd 0xf0-0x100(%rcx),$xc3 + + movdqa $xa2,0x20(%rsp) # keep offloading $xaN + movdqa $xa3,0x30(%rsp) + + movdqa $xc0,$xt2 + punpckldq $xc1,$xc0 + movdqa $xc2,$xt3 + punpckldq $xc3,$xc2 + punpckhdq $xc1,$xt2 + punpckhdq $xc3,$xt3 + movdqa $xc0,$xc1 + punpcklqdq $xc2,$xc0 # "c0" + movdqa $xt2,$xc3 + punpcklqdq $xt3,$xt2 # "c2" + punpckhqdq $xc2,$xc1 # "c1" + punpckhqdq $xt3,$xc3 # "c3" +___ + ($xc2,$xt2)=($xt2,$xc2); + ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary +$code.=<<___; + paddd 0x100-0x100(%rcx),$xd0 + paddd 0x110-0x100(%rcx),$xd1 + paddd 0x120-0x100(%rcx),$xd2 + paddd 0x130-0x100(%rcx),$xd3 + + movdqa $xd0,$xt2 + punpckldq $xd1,$xd0 + movdqa $xd2,$xt3 + punpckldq $xd3,$xd2 + punpckhdq $xd1,$xt2 + punpckhdq $xd3,$xt3 + movdqa $xd0,$xd1 + punpcklqdq $xd2,$xd0 # "d0" + movdqa $xt2,$xd3 + punpcklqdq $xt3,$xt2 # "d2" + punpckhqdq 
$xd2,$xd1 # "d1" + punpckhqdq $xt3,$xd3 # "d3" +___ + ($xd2,$xt2)=($xt2,$xd2); +$code.=<<___; + cmp \$64*4,$len + jb .Ltail4x + + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? + pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + lea 0x80($inp),$inp # size optimization + pxor 0x10(%rsp),$xt0 + pxor $xb1,$xt1 + pxor $xc1,$xt2 + pxor $xd1,$xt3 + + movdqu $xt0,0x40($out) + movdqu 0x00($inp),$xt0 + movdqu $xt1,0x50($out) + movdqu 0x10($inp),$xt1 + movdqu $xt2,0x60($out) + movdqu 0x20($inp),$xt2 + movdqu $xt3,0x70($out) + lea 0x80($out),$out # size optimization + movdqu 0x30($inp),$xt3 + pxor 0x20(%rsp),$xt0 + pxor $xb2,$xt1 + pxor $xc2,$xt2 + pxor $xd2,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + lea 0x80($inp),$inp # inp+=64*4 + pxor 0x30(%rsp),$xt0 + pxor $xb3,$xt1 + pxor $xc3,$xt2 + pxor $xd3,$xt3 + movdqu $xt0,0x40($out) + movdqu $xt1,0x50($out) + movdqu $xt2,0x60($out) + movdqu $xt3,0x70($out) + lea 0x80($out),$out # out+=64*4 + + sub \$64*4,$len + jnz .Loop_outer4x + + jmp .Ldone4x + +.Ltail4x: + cmp \$192,$len + jae .L192_or_more4x + cmp \$128,$len + jae .L128_or_more4x + cmp \$64,$len + jae .L64_or_more4x + + #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 
+ xor %r10,%r10 + #movdqa $xt0,0x00(%rsp) + movdqa $xb0,0x10(%rsp) + movdqa $xc0,0x20(%rsp) + movdqa $xd0,0x30(%rsp) + jmp .Loop_tail4x + +.align 32 +.L64_or_more4x: + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? + pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + movdqu $xt0,0x00($out) + movdqu $xt1,0x10($out) + movdqu $xt2,0x20($out) + movdqu $xt3,0x30($out) + je .Ldone4x # len was exactly 64: flags are still those of the cmp in .Ltail4x + + movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember? + lea 0x40($inp),$inp # inp+=64*1 + xor %r10,%r10 + movdqa $xt0,0x00(%rsp) + movdqa $xb1,0x10(%rsp) + lea 0x40($out),$out # out+=64*1 + movdqa $xc1,0x20(%rsp) + sub \$64,$len # len-=64*1 + movdqa $xd1,0x30(%rsp) + jmp .Loop_tail4x + +.align 32 +.L128_or_more4x: + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? + pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + pxor 0x10(%rsp),$xt0 + pxor $xb1,$xt1 + pxor $xc1,$xt2 + pxor $xd1,$xt3 + movdqu $xt0,0x40($out) + movdqu $xt1,0x50($out) + movdqu $xt2,0x60($out) + movdqu $xt3,0x70($out) + je .Ldone4x # len was exactly 128: flags are still those of the cmp in .Ltail4x + + movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember? + lea 0x80($inp),$inp # inp+=64*2 + xor %r10,%r10 + movdqa $xt0,0x00(%rsp) + movdqa $xb2,0x10(%rsp) + lea 0x80($out),$out # out+=64*2 + movdqa $xc2,0x20(%rsp) + sub \$128,$len # len-=64*2 + movdqa $xd2,0x30(%rsp) + jmp .Loop_tail4x + +.align 32 +.L192_or_more4x: + movdqu 0x00($inp),$xt0 # xor with input + movdqu 0x10($inp),$xt1 + movdqu 0x20($inp),$xt2 + movdqu 0x30($inp),$xt3 + pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 
+ pxor $xb0,$xt1 + pxor $xc0,$xt2 + pxor $xd0,$xt3 + + movdqu $xt0,0x00($out) + movdqu 0x40($inp),$xt0 + movdqu $xt1,0x10($out) + movdqu 0x50($inp),$xt1 + movdqu $xt2,0x20($out) + movdqu 0x60($inp),$xt2 + movdqu $xt3,0x30($out) + movdqu 0x70($inp),$xt3 + lea 0x80($inp),$inp # size optimization + pxor 0x10(%rsp),$xt0 + pxor $xb1,$xt1 + pxor $xc1,$xt2 + pxor $xd1,$xt3 + + movdqu $xt0,0x40($out) + movdqu 0x00($inp),$xt0 + movdqu $xt1,0x50($out) + movdqu 0x10($inp),$xt1 + movdqu $xt2,0x60($out) + movdqu 0x20($inp),$xt2 + movdqu $xt3,0x70($out) + lea 0x80($out),$out # size optimization + movdqu 0x30($inp),$xt3 + pxor 0x20(%rsp),$xt0 + pxor $xb2,$xt1 + pxor $xc2,$xt2 + pxor $xd2,$xt3 + movdqu $xt0,0x00($out) + movdqu $xt1,0x10($out) + movdqu $xt2,0x20($out) + movdqu $xt3,0x30($out) + je .Ldone4x + + movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember? + lea 0x40($inp),$inp # inp+=64*3 + xor %r10,%r10 + movdqa $xt0,0x00(%rsp) + movdqa $xb3,0x10(%rsp) + lea 0x40($out),$out # out+=64*3 + movdqa $xc3,0x20(%rsp) + sub \$192,$len # len-=64*3 + movdqa $xd3,0x30(%rsp) + +.Loop_tail4x: + movzb ($inp,%r10),%eax + movzb (%rsp,%r10),%ecx + lea 1(%r10),%r10 + xor %ecx,%eax + mov %al,-1($out,%r10) + dec $len + jnz .Loop_tail4x + +.Ldone4x: +___ +$code.=<<___ if ($win64); + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 +___ +$code.=<<___; + lea (%r9),%rsp +.cfi_def_cfa_register %rsp +.L4x_epilogue: + ret +.cfi_endproc +.size ChaCha20_4x,.-ChaCha20_4x +___ +} + +######################################################################## +# XOP code path that handles all lengths. +if ($avx) { +# There is some "anomaly" observed depending on instructions' size or +# alignment. If you look closely at below code you'll notice that +# sometimes argument order varies. 
The order affects instruction +# encoding by making it larger, and such fiddling gives 5% performance +# improvement. This is on FX-4100... + +my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, + $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15)); +my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3); + +sub XOP_lane_ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my @x=map("\"$_\"",@xx); + + ( + "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 + "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 + "&vpxor (@x[$d0],@x[$a0],@x[$d0])", + "&vpxor (@x[$d1],@x[$a1],@x[$d1])", + "&vpxor (@x[$d2],@x[$a2],@x[$d2])", + "&vpxor (@x[$d3],@x[$a3],@x[$d3])", + "&vprotd (@x[$d0],@x[$d0],16)", + "&vprotd (@x[$d1],@x[$d1],16)", + "&vprotd (@x[$d2],@x[$d2],16)", + "&vprotd (@x[$d3],@x[$d3],16)", + + "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", + "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", + "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", + "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", + "&vpxor (@x[$b0],@x[$c0],@x[$b0])", + "&vpxor (@x[$b1],@x[$c1],@x[$b1])", + "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip + "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip + "&vprotd (@x[$b0],@x[$b0],12)", + "&vprotd (@x[$b1],@x[$b1],12)", + "&vprotd (@x[$b2],@x[$b2],12)", + "&vprotd (@x[$b3],@x[$b3],12)", + + "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip + "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", + "&vpxor (@x[$d0],@x[$a0],@x[$d0])", + "&vpxor (@x[$d1],@x[$a1],@x[$d1])", + "&vpxor (@x[$d2],@x[$a2],@x[$d2])", + "&vpxor (@x[$d3],@x[$a3],@x[$d3])", + "&vprotd (@x[$d0],@x[$d0],8)", + "&vprotd (@x[$d1],@x[$d1],8)", + "&vprotd (@x[$d2],@x[$d2],8)", + "&vprotd 
(@x[$d3],@x[$d3],8)", + + "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", + "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", + "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", + "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", + "&vpxor (@x[$b0],@x[$c0],@x[$b0])", + "&vpxor (@x[$b1],@x[$c1],@x[$b1])", + "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip + "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip + "&vprotd (@x[$b0],@x[$b0],7)", + "&vprotd (@x[$b1],@x[$b1],7)", + "&vprotd (@x[$b2],@x[$b2],7)", + "&vprotd (@x[$b3],@x[$b3],7)" + ); +} + +my $xframe = $win64 ? 0xa8 : 8; + +$code.=<<___; +.type ChaCha20_4xop,\@function,5 +.align 32 +ChaCha20_4xop: +.cfi_startproc +.LChaCha20_4xop: + mov %rsp,%r9 # frame pointer +.cfi_def_cfa_register %r9 + sub \$0x140+$xframe,%rsp +___ + ################ stack layout + # +0x00 SIMD equivalent of @x[8-12] + # ... + # +0x40 constant copy of key[0-2] smashed by lanes + # ... + # +0x100 SIMD counters (with nonce smashed by lanes) + # ... + # +0x140 +$code.=<<___ if ($win64); + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L4xop_body: +___ +$code.=<<___; + vzeroupper + + vmovdqa .Lsigma(%rip),$xa3 # key[0] + vmovdqu ($key),$xb3 # key[1] + vmovdqu 16($key),$xt3 # key[2] + vmovdqu ($counter),$xd3 # key[3] + lea 0x100(%rsp),%rcx # size optimization + + vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... + vpshufd \$0x55,$xa3,$xa1 + vmovdqa $xa0,0x40(%rsp) # ... 
and offload + vpshufd \$0xaa,$xa3,$xa2 + vmovdqa $xa1,0x50(%rsp) + vpshufd \$0xff,$xa3,$xa3 + vmovdqa $xa2,0x60(%rsp) + vmovdqa $xa3,0x70(%rsp) + + vpshufd \$0x00,$xb3,$xb0 + vpshufd \$0x55,$xb3,$xb1 + vmovdqa $xb0,0x80-0x100(%rcx) + vpshufd \$0xaa,$xb3,$xb2 + vmovdqa $xb1,0x90-0x100(%rcx) + vpshufd \$0xff,$xb3,$xb3 + vmovdqa $xb2,0xa0-0x100(%rcx) + vmovdqa $xb3,0xb0-0x100(%rcx) + + vpshufd \$0x00,$xt3,$xt0 # "$xc0" + vpshufd \$0x55,$xt3,$xt1 # "$xc1" + vmovdqa $xt0,0xc0-0x100(%rcx) + vpshufd \$0xaa,$xt3,$xt2 # "$xc2" + vmovdqa $xt1,0xd0-0x100(%rcx) + vpshufd \$0xff,$xt3,$xt3 # "$xc3" + vmovdqa $xt2,0xe0-0x100(%rcx) + vmovdqa $xt3,0xf0-0x100(%rcx) + + vpshufd \$0x00,$xd3,$xd0 + vpshufd \$0x55,$xd3,$xd1 + vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet + vpshufd \$0xaa,$xd3,$xd2 + vmovdqa $xd1,0x110-0x100(%rcx) + vpshufd \$0xff,$xd3,$xd3 + vmovdqa $xd2,0x120-0x100(%rcx) + vmovdqa $xd3,0x130-0x100(%rcx) + + jmp .Loop_enter4xop + +.align 32 +.Loop_outer4xop: + vmovdqa 0x40(%rsp),$xa0 # re-load smashed key + vmovdqa 0x50(%rsp),$xa1 + vmovdqa 0x60(%rsp),$xa2 + vmovdqa 0x70(%rsp),$xa3 + vmovdqa 0x80-0x100(%rcx),$xb0 + vmovdqa 0x90-0x100(%rcx),$xb1 + vmovdqa 0xa0-0x100(%rcx),$xb2 + vmovdqa 0xb0-0x100(%rcx),$xb3 + vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" + vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" + vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" + vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" + vmovdqa 0x100-0x100(%rcx),$xd0 + vmovdqa 0x110-0x100(%rcx),$xd1 + vmovdqa 0x120-0x100(%rcx),$xd2 + vmovdqa 0x130-0x100(%rcx),$xd3 + vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters + +.Loop_enter4xop: + mov \$10,%eax + vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters + jmp .Loop4xop + +.align 32 +.Loop4xop: +___ + foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; } + foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + dec %eax + jnz .Loop4xop + + vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material + vpaddd 0x50(%rsp),$xa1,$xa1 + vpaddd 0x60(%rsp),$xa2,$xa2 + vpaddd 
0x70(%rsp),$xa3,$xa3 + + vmovdqa $xt2,0x20(%rsp) # offload $xc2,3 + vmovdqa $xt3,0x30(%rsp) + + vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data + vpunpckldq $xa3,$xa2,$xt3 + vpunpckhdq $xa1,$xa0,$xa0 + vpunpckhdq $xa3,$xa2,$xa2 + vpunpcklqdq $xt3,$xt2,$xa1 # "a0" + vpunpckhqdq $xt3,$xt2,$xt2 # "a1" + vpunpcklqdq $xa2,$xa0,$xa3 # "a2" + vpunpckhqdq $xa2,$xa0,$xa0 # "a3" +___ + ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); +$code.=<<___; + vpaddd 0x80-0x100(%rcx),$xb0,$xb0 + vpaddd 0x90-0x100(%rcx),$xb1,$xb1 + vpaddd 0xa0-0x100(%rcx),$xb2,$xb2 + vpaddd 0xb0-0x100(%rcx),$xb3,$xb3 + + vmovdqa $xa0,0x00(%rsp) # offload $xa0,1 + vmovdqa $xa1,0x10(%rsp) + vmovdqa 0x20(%rsp),$xa0 # "xc2" + vmovdqa 0x30(%rsp),$xa1 # "xc3" + + vpunpckldq $xb1,$xb0,$xt2 + vpunpckldq $xb3,$xb2,$xt3 + vpunpckhdq $xb1,$xb0,$xb0 + vpunpckhdq $xb3,$xb2,$xb2 + vpunpcklqdq $xt3,$xt2,$xb1 # "b0" + vpunpckhqdq $xt3,$xt2,$xt2 # "b1" + vpunpcklqdq $xb2,$xb0,$xb3 # "b2" + vpunpckhqdq $xb2,$xb0,$xb0 # "b3" +___ + ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); + my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); +$code.=<<___; + vpaddd 0xc0-0x100(%rcx),$xc0,$xc0 + vpaddd 0xd0-0x100(%rcx),$xc1,$xc1 + vpaddd 0xe0-0x100(%rcx),$xc2,$xc2 + vpaddd 0xf0-0x100(%rcx),$xc3,$xc3 + + vpunpckldq $xc1,$xc0,$xt2 + vpunpckldq $xc3,$xc2,$xt3 + vpunpckhdq $xc1,$xc0,$xc0 + vpunpckhdq $xc3,$xc2,$xc2 + vpunpcklqdq $xt3,$xt2,$xc1 # "c0" + vpunpckhqdq $xt3,$xt2,$xt2 # "c1" + vpunpcklqdq $xc2,$xc0,$xc3 # "c2" + vpunpckhqdq $xc2,$xc0,$xc0 # "c3" +___ + ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); +$code.=<<___; + vpaddd 0x100-0x100(%rcx),$xd0,$xd0 + vpaddd 0x110-0x100(%rcx),$xd1,$xd1 + vpaddd 0x120-0x100(%rcx),$xd2,$xd2 + vpaddd 0x130-0x100(%rcx),$xd3,$xd3 + + vpunpckldq $xd1,$xd0,$xt2 + vpunpckldq $xd3,$xd2,$xt3 + vpunpckhdq $xd1,$xd0,$xd0 + vpunpckhdq $xd3,$xd2,$xd2 + vpunpcklqdq $xt3,$xt2,$xd1 # "d0" + vpunpckhqdq $xt3,$xt2,$xt2 # "d1" + vpunpcklqdq $xd2,$xd0,$xd3 # "d2" + vpunpckhqdq $xd2,$xd0,$xd0 # 
"d3" +___ + ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); + ($xa0,$xa1)=($xt2,$xt3); +$code.=<<___; + vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1 + vmovdqa 0x10(%rsp),$xa1 + + cmp \$64*4,$len + jb .Ltail4xop + + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x10($inp),$xb0,$xb0 + vpxor 0x20($inp),$xc0,$xc0 + vpxor 0x30($inp),$xd0,$xd0 + vpxor 0x40($inp),$xa1,$xa1 + vpxor 0x50($inp),$xb1,$xb1 + vpxor 0x60($inp),$xc1,$xc1 + vpxor 0x70($inp),$xd1,$xd1 + lea 0x80($inp),$inp # size optimization + vpxor 0x00($inp),$xa2,$xa2 + vpxor 0x10($inp),$xb2,$xb2 + vpxor 0x20($inp),$xc2,$xc2 + vpxor 0x30($inp),$xd2,$xd2 + vpxor 0x40($inp),$xa3,$xa3 + vpxor 0x50($inp),$xb3,$xb3 + vpxor 0x60($inp),$xc3,$xc3 + vpxor 0x70($inp),$xd3,$xd3 + lea 0x80($inp),$inp # inp+=64*4 + + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x10($out) + vmovdqu $xc0,0x20($out) + vmovdqu $xd0,0x30($out) + vmovdqu $xa1,0x40($out) + vmovdqu $xb1,0x50($out) + vmovdqu $xc1,0x60($out) + vmovdqu $xd1,0x70($out) + lea 0x80($out),$out # size optimization + vmovdqu $xa2,0x00($out) + vmovdqu $xb2,0x10($out) + vmovdqu $xc2,0x20($out) + vmovdqu $xd2,0x30($out) + vmovdqu $xa3,0x40($out) + vmovdqu $xb3,0x50($out) + vmovdqu $xc3,0x60($out) + vmovdqu $xd3,0x70($out) + lea 0x80($out),$out # out+=64*4 + + sub \$64*4,$len + jnz .Loop_outer4xop + + jmp .Ldone4xop + +.align 32 +.Ltail4xop: + cmp \$192,$len + jae .L192_or_more4xop + cmp \$128,$len + jae .L128_or_more4xop + cmp \$64,$len + jae .L64_or_more4xop + + xor %r10,%r10 + vmovdqa $xa0,0x00(%rsp) + vmovdqa $xb0,0x10(%rsp) + vmovdqa $xc0,0x20(%rsp) + vmovdqa $xd0,0x30(%rsp) + jmp .Loop_tail4xop + +.align 32 +.L64_or_more4xop: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x10($inp),$xb0,$xb0 + vpxor 0x20($inp),$xc0,$xc0 + vpxor 0x30($inp),$xd0,$xd0 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x10($out) + vmovdqu $xc0,0x20($out) + vmovdqu $xd0,0x30($out) + je .Ldone4xop + + lea 0x40($inp),$inp # inp+=64*1 + vmovdqa $xa1,0x00(%rsp) + xor %r10,%r10 + vmovdqa 
$xb1,0x10(%rsp) + lea 0x40($out),$out # out+=64*1 + vmovdqa $xc1,0x20(%rsp) + sub \$64,$len # len-=64*1 + vmovdqa $xd1,0x30(%rsp) + jmp .Loop_tail4xop + +.align 32 +.L128_or_more4xop: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x10($inp),$xb0,$xb0 + vpxor 0x20($inp),$xc0,$xc0 + vpxor 0x30($inp),$xd0,$xd0 + vpxor 0x40($inp),$xa1,$xa1 + vpxor 0x50($inp),$xb1,$xb1 + vpxor 0x60($inp),$xc1,$xc1 + vpxor 0x70($inp),$xd1,$xd1 + + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x10($out) + vmovdqu $xc0,0x20($out) + vmovdqu $xd0,0x30($out) + vmovdqu $xa1,0x40($out) + vmovdqu $xb1,0x50($out) + vmovdqu $xc1,0x60($out) + vmovdqu $xd1,0x70($out) + je .Ldone4xop + + lea 0x80($inp),$inp # inp+=64*2 + vmovdqa $xa2,0x00(%rsp) + xor %r10,%r10 + vmovdqa $xb2,0x10(%rsp) + lea 0x80($out),$out # out+=64*2 + vmovdqa $xc2,0x20(%rsp) + sub \$128,$len # len-=64*2 + vmovdqa $xd2,0x30(%rsp) + jmp .Loop_tail4xop + +.align 32 +.L192_or_more4xop: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x10($inp),$xb0,$xb0 + vpxor 0x20($inp),$xc0,$xc0 + vpxor 0x30($inp),$xd0,$xd0 + vpxor 0x40($inp),$xa1,$xa1 + vpxor 0x50($inp),$xb1,$xb1 + vpxor 0x60($inp),$xc1,$xc1 + vpxor 0x70($inp),$xd1,$xd1 + lea 0x80($inp),$inp # size optimization + vpxor 0x00($inp),$xa2,$xa2 + vpxor 0x10($inp),$xb2,$xb2 + vpxor 0x20($inp),$xc2,$xc2 + vpxor 0x30($inp),$xd2,$xd2 + + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x10($out) + vmovdqu $xc0,0x20($out) + vmovdqu $xd0,0x30($out) + vmovdqu $xa1,0x40($out) + vmovdqu $xb1,0x50($out) + vmovdqu $xc1,0x60($out) + vmovdqu $xd1,0x70($out) + lea 0x80($out),$out # size optimization + vmovdqu $xa2,0x00($out) + vmovdqu $xb2,0x10($out) + vmovdqu $xc2,0x20($out) + vmovdqu $xd2,0x30($out) + je .Ldone4xop + + lea 0x40($inp),$inp # inp+=64*3 + vmovdqa $xa3,0x00(%rsp) + xor %r10,%r10 + vmovdqa $xb3,0x10(%rsp) + lea 0x40($out),$out # out+=64*3 + vmovdqa $xc3,0x20(%rsp) + sub \$192,$len # len-=64*3 + vmovdqa $xd3,0x30(%rsp) + +.Loop_tail4xop: + movzb ($inp,%r10),%eax + movzb (%rsp,%r10),%ecx 
+ lea 1(%r10),%r10 + xor %ecx,%eax + mov %al,-1($out,%r10) + dec $len + jnz .Loop_tail4xop + +.Ldone4xop: + vzeroupper +___ +$code.=<<___ if ($win64); + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 +___ +$code.=<<___; + lea (%r9),%rsp +.cfi_def_cfa_register %rsp +.L4xop_epilogue: + ret +.cfi_endproc +.size ChaCha20_4xop,.-ChaCha20_4xop +___ +} + +######################################################################## +# AVX2 code path +if ($avx>1) { +my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, + $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15)); +my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); + +sub AVX2_lane_ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); +my @x=map("\"$_\"",@xx); + + # Consider order in which variables are addressed by their + # index: + # + # a b c d + # + # 0 4 8 12 < even round + # 1 5 9 13 + # 2 6 10 14 + # 3 7 11 15 + # 0 5 10 15 < odd round + # 1 6 11 12 + # 2 7 8 13 + # 3 4 9 14 + # + # 'a', 'b' and 'd's are permanently allocated in registers, + # @x[0..7,12..15], while 'c's are maintained in memory. If + # you observe 'c' column, you'll notice that pair of 'c's is + # invariant between rounds. This means that we have to reload + # them once per round, in the middle. This is why you'll see + # bunch of 'c' stores and loads in the middle, but none in + # the beginning or end. 
+ + ( + "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 + "&vpxor (@x[$d0],@x[$a0],@x[$d0])", + "&vpshufb (@x[$d0],@x[$d0],$t1)", + "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 + "&vpxor (@x[$d1],@x[$a1],@x[$d1])", + "&vpshufb (@x[$d1],@x[$d1],$t1)", + + "&vpaddd ($xc,$xc,@x[$d0])", + "&vpxor (@x[$b0],$xc,@x[$b0])", + "&vpslld ($t0,@x[$b0],12)", + "&vpsrld (@x[$b0],@x[$b0],20)", + "&vpor (@x[$b0],$t0,@x[$b0])", + "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) + "&vpaddd ($xc_,$xc_,@x[$d1])", + "&vpxor (@x[$b1],$xc_,@x[$b1])", + "&vpslld ($t1,@x[$b1],12)", + "&vpsrld (@x[$b1],@x[$b1],20)", + "&vpor (@x[$b1],$t1,@x[$b1])", + + "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", + "&vpxor (@x[$d0],@x[$a0],@x[$d0])", + "&vpshufb (@x[$d0],@x[$d0],$t0)", + "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", + "&vpxor (@x[$d1],@x[$a1],@x[$d1])", + "&vpshufb (@x[$d1],@x[$d1],$t0)", + + "&vpaddd ($xc,$xc,@x[$d0])", + "&vpxor (@x[$b0],$xc,@x[$b0])", + "&vpslld ($t1,@x[$b0],7)", + "&vpsrld (@x[$b0],@x[$b0],25)", + "&vpor (@x[$b0],$t1,@x[$b0])", + "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) + "&vpaddd ($xc_,$xc_,@x[$d1])", + "&vpxor (@x[$b1],$xc_,@x[$b1])", + "&vpslld ($t0,@x[$b1],7)", + "&vpsrld (@x[$b1],@x[$b1],25)", + "&vpor (@x[$b1],$t0,@x[$b1])", + + "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's + "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)", + "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")", + "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")", + + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 + "&vpxor (@x[$d2],@x[$a2],@x[$d2])", + "&vpshufb (@x[$d2],@x[$d2],$t1)", + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 + "&vpxor (@x[$d3],@x[$a3],@x[$d3])", + "&vpshufb (@x[$d3],@x[$d3],$t1)", + + "&vpaddd ($xc,$xc,@x[$d2])", + "&vpxor (@x[$b2],$xc,@x[$b2])", + "&vpslld ($t0,@x[$b2],12)", + "&vpsrld (@x[$b2],@x[$b2],20)", + "&vpor (@x[$b2],$t0,@x[$b2])", + "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) + "&vpaddd ($xc_,$xc_,@x[$d3])", + "&vpxor (@x[$b3],$xc_,@x[$b3])", + "&vpslld ($t1,@x[$b3],12)", + "&vpsrld 
(@x[$b3],@x[$b3],20)", + "&vpor (@x[$b3],$t1,@x[$b3])", + + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", + "&vpxor (@x[$d2],@x[$a2],@x[$d2])", + "&vpshufb (@x[$d2],@x[$d2],$t0)", + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", + "&vpxor (@x[$d3],@x[$a3],@x[$d3])", + "&vpshufb (@x[$d3],@x[$d3],$t0)", + + "&vpaddd ($xc,$xc,@x[$d2])", + "&vpxor (@x[$b2],$xc,@x[$b2])", + "&vpslld ($t1,@x[$b2],7)", + "&vpsrld (@x[$b2],@x[$b2],25)", + "&vpor (@x[$b2],$t1,@x[$b2])", + "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) + "&vpaddd ($xc_,$xc_,@x[$d3])", + "&vpxor (@x[$b3],$xc_,@x[$b3])", + "&vpslld ($t0,@x[$b3],7)", + "&vpsrld (@x[$b3],@x[$b3],25)", + "&vpor (@x[$b3],$t0,@x[$b3])" + ); +} + +my $xframe = $win64 ? 0xa8 : 8; + +$code.=<<___; +.type ChaCha20_8x,\@function,5 +.align 32 +ChaCha20_8x: +.cfi_startproc +.LChaCha20_8x: + mov %rsp,%r9 # frame register +.cfi_def_cfa_register %r9 + sub \$0x280+$xframe,%rsp + and \$-32,%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L8x_body: +___ +$code.=<<___; + vzeroupper + + ################ stack layout + # +0x00 SIMD equivalent of @x[8-12] + # ... + # +0x80 constant copy of key[0-2] smashed by lanes + # ... + # +0x200 SIMD counters (with nonce smashed by lanes) + # ... + # +0x280 + + vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] + vbroadcasti128 ($key),$xb3 # key[1] + vbroadcasti128 16($key),$xt3 # key[2] + vbroadcasti128 ($counter),$xd3 # key[3] + lea 0x100(%rsp),%rcx # size optimization + lea 0x200(%rsp),%rax # size optimization + lea .Lrot16(%rip),%r10 + lea .Lrot24(%rip),%r11 + + vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... + vpshufd \$0x55,$xa3,$xa1 + vmovdqa $xa0,0x80-0x100(%rcx) # ... 
and offload + vpshufd \$0xaa,$xa3,$xa2 + vmovdqa $xa1,0xa0-0x100(%rcx) + vpshufd \$0xff,$xa3,$xa3 + vmovdqa $xa2,0xc0-0x100(%rcx) + vmovdqa $xa3,0xe0-0x100(%rcx) + + vpshufd \$0x00,$xb3,$xb0 + vpshufd \$0x55,$xb3,$xb1 + vmovdqa $xb0,0x100-0x100(%rcx) + vpshufd \$0xaa,$xb3,$xb2 + vmovdqa $xb1,0x120-0x100(%rcx) + vpshufd \$0xff,$xb3,$xb3 + vmovdqa $xb2,0x140-0x100(%rcx) + vmovdqa $xb3,0x160-0x100(%rcx) + + vpshufd \$0x00,$xt3,$xt0 # "xc0" + vpshufd \$0x55,$xt3,$xt1 # "xc1" + vmovdqa $xt0,0x180-0x200(%rax) + vpshufd \$0xaa,$xt3,$xt2 # "xc2" + vmovdqa $xt1,0x1a0-0x200(%rax) + vpshufd \$0xff,$xt3,$xt3 # "xc3" + vmovdqa $xt2,0x1c0-0x200(%rax) + vmovdqa $xt3,0x1e0-0x200(%rax) + + vpshufd \$0x00,$xd3,$xd0 + vpshufd \$0x55,$xd3,$xd1 + vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet + vpshufd \$0xaa,$xd3,$xd2 + vmovdqa $xd1,0x220-0x200(%rax) + vpshufd \$0xff,$xd3,$xd3 + vmovdqa $xd2,0x240-0x200(%rax) + vmovdqa $xd3,0x260-0x200(%rax) + + jmp .Loop_enter8x + +.align 32 +.Loop_outer8x: + vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key + vmovdqa 0xa0-0x100(%rcx),$xa1 + vmovdqa 0xc0-0x100(%rcx),$xa2 + vmovdqa 0xe0-0x100(%rcx),$xa3 + vmovdqa 0x100-0x100(%rcx),$xb0 + vmovdqa 0x120-0x100(%rcx),$xb1 + vmovdqa 0x140-0x100(%rcx),$xb2 + vmovdqa 0x160-0x100(%rcx),$xb3 + vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" + vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" + vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2" + vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" + vmovdqa 0x200-0x200(%rax),$xd0 + vmovdqa 0x220-0x200(%rax),$xd1 + vmovdqa 0x240-0x200(%rax),$xd2 + vmovdqa 0x260-0x200(%rax),$xd3 + vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters + +.Loop_enter8x: + vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" + vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" + vbroadcasti128 (%r10),$xt3 + vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters + mov \$10,%eax + jmp .Loop8x + +.align 32 +.Loop8x: +___ + foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } + foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; 
} +$code.=<<___; + dec %eax + jnz .Loop8x + + lea 0x200(%rsp),%rax # size optimization + vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key + vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 + vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 + vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 + + vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data + vpunpckldq $xa3,$xa2,$xt3 + vpunpckhdq $xa1,$xa0,$xa0 + vpunpckhdq $xa3,$xa2,$xa2 + vpunpcklqdq $xt3,$xt2,$xa1 # "a0" + vpunpckhqdq $xt3,$xt2,$xt2 # "a1" + vpunpcklqdq $xa2,$xa0,$xa3 # "a2" + vpunpckhqdq $xa2,$xa0,$xa0 # "a3" +___ + ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); +$code.=<<___; + vpaddd 0x100-0x100(%rcx),$xb0,$xb0 + vpaddd 0x120-0x100(%rcx),$xb1,$xb1 + vpaddd 0x140-0x100(%rcx),$xb2,$xb2 + vpaddd 0x160-0x100(%rcx),$xb3,$xb3 + + vpunpckldq $xb1,$xb0,$xt2 + vpunpckldq $xb3,$xb2,$xt3 + vpunpckhdq $xb1,$xb0,$xb0 + vpunpckhdq $xb3,$xb2,$xb2 + vpunpcklqdq $xt3,$xt2,$xb1 # "b0" + vpunpckhqdq $xt3,$xt2,$xt2 # "b1" + vpunpcklqdq $xb2,$xb0,$xb3 # "b2" + vpunpckhqdq $xb2,$xb0,$xb0 # "b3" +___ + ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); +$code.=<<___; + vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further + vperm2i128 \$0x31,$xb0,$xa0,$xb0 + vperm2i128 \$0x20,$xb1,$xa1,$xa0 + vperm2i128 \$0x31,$xb1,$xa1,$xb1 + vperm2i128 \$0x20,$xb2,$xa2,$xa1 + vperm2i128 \$0x31,$xb2,$xa2,$xb2 + vperm2i128 \$0x20,$xb3,$xa3,$xa2 + vperm2i128 \$0x31,$xb3,$xa3,$xb3 +___ + ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); + my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); +$code.=<<___; + vmovdqa $xa0,0x00(%rsp) # offload $xaN + vmovdqa $xa1,0x20(%rsp) + vmovdqa 0x40(%rsp),$xc2 # $xa0 + vmovdqa 0x60(%rsp),$xc3 # $xa1 + + vpaddd 0x180-0x200(%rax),$xc0,$xc0 + vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 + vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 + vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 + + vpunpckldq $xc1,$xc0,$xt2 + vpunpckldq $xc3,$xc2,$xt3 + vpunpckhdq $xc1,$xc0,$xc0 + vpunpckhdq $xc3,$xc2,$xc2 + vpunpcklqdq $xt3,$xt2,$xc1 # "c0" + vpunpckhqdq $xt3,$xt2,$xt2 # "c1" + 
vpunpcklqdq $xc2,$xc0,$xc3 # "c2" + vpunpckhqdq $xc2,$xc0,$xc0 # "c3" +___ + ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); +$code.=<<___; + vpaddd 0x200-0x200(%rax),$xd0,$xd0 + vpaddd 0x220-0x200(%rax),$xd1,$xd1 + vpaddd 0x240-0x200(%rax),$xd2,$xd2 + vpaddd 0x260-0x200(%rax),$xd3,$xd3 + + vpunpckldq $xd1,$xd0,$xt2 + vpunpckldq $xd3,$xd2,$xt3 + vpunpckhdq $xd1,$xd0,$xd0 + vpunpckhdq $xd3,$xd2,$xd2 + vpunpcklqdq $xt3,$xt2,$xd1 # "d0" + vpunpckhqdq $xt3,$xt2,$xt2 # "d1" + vpunpcklqdq $xd2,$xd0,$xd3 # "d2" + vpunpckhqdq $xd2,$xd0,$xd0 # "d3" +___ + ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); +$code.=<<___; + vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further + vperm2i128 \$0x31,$xd0,$xc0,$xd0 + vperm2i128 \$0x20,$xd1,$xc1,$xc0 + vperm2i128 \$0x31,$xd1,$xc1,$xd1 + vperm2i128 \$0x20,$xd2,$xc2,$xc1 + vperm2i128 \$0x31,$xd2,$xc2,$xd2 + vperm2i128 \$0x20,$xd3,$xc3,$xc2 + vperm2i128 \$0x31,$xd3,$xc3,$xd3 +___ + ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); + ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= + ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); + ($xa0,$xa1)=($xt2,$xt3); +$code.=<<___; + vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? 
+ vmovdqa 0x20(%rsp),$xa1 + + cmp \$64*8,$len + jb .Ltail8x + + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + lea 0x80($out),$out # size optimization + + vpxor 0x00($inp),$xa1,$xa1 + vpxor 0x20($inp),$xb1,$xb1 + vpxor 0x40($inp),$xc1,$xc1 + vpxor 0x60($inp),$xd1,$xd1 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa1,0x00($out) + vmovdqu $xb1,0x20($out) + vmovdqu $xc1,0x40($out) + vmovdqu $xd1,0x60($out) + lea 0x80($out),$out # size optimization + + vpxor 0x00($inp),$xa2,$xa2 + vpxor 0x20($inp),$xb2,$xb2 + vpxor 0x40($inp),$xc2,$xc2 + vpxor 0x60($inp),$xd2,$xd2 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa2,0x00($out) + vmovdqu $xb2,0x20($out) + vmovdqu $xc2,0x40($out) + vmovdqu $xd2,0x60($out) + lea 0x80($out),$out # size optimization + + vpxor 0x00($inp),$xa3,$xa3 + vpxor 0x20($inp),$xb3,$xb3 + vpxor 0x40($inp),$xc3,$xc3 + vpxor 0x60($inp),$xd3,$xd3 + lea 0x80($inp),$inp # size optimization + vmovdqu $xa3,0x00($out) + vmovdqu $xb3,0x20($out) + vmovdqu $xc3,0x40($out) + vmovdqu $xd3,0x60($out) + lea 0x80($out),$out # size optimization + + sub \$64*8,$len + jnz .Loop_outer8x + + jmp .Ldone8x + +.Ltail8x: + cmp \$448,$len + jae .L448_or_more8x + cmp \$384,$len + jae .L384_or_more8x + cmp \$320,$len + jae .L320_or_more8x + cmp \$256,$len + jae .L256_or_more8x + cmp \$192,$len + jae .L192_or_more8x + cmp \$128,$len + jae .L128_or_more8x + cmp \$64,$len + jae .L64_or_more8x + + xor %r10,%r10 + vmovdqa $xa0,0x00(%rsp) + vmovdqa $xb0,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L64_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + je .Ldone8x + + lea 0x40($inp),$inp # inp+=64*1 + xor %r10,%r10 + vmovdqa $xc0,0x00(%rsp) + lea 
0x40($out),$out # out+=64*1 + sub \$64,$len # len-=64*1 + vmovdqa $xd0,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L128_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + je .Ldone8x + + lea 0x80($inp),$inp # inp+=64*2 + xor %r10,%r10 + vmovdqa $xa1,0x00(%rsp) + lea 0x80($out),$out # out+=64*2 + sub \$128,$len # len-=64*2 + vmovdqa $xb1,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L192_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + je .Ldone8x + + lea 0xc0($inp),$inp # inp+=64*3 + xor %r10,%r10 + vmovdqa $xc1,0x00(%rsp) + lea 0xc0($out),$out # out+=64*3 + sub \$192,$len # len-=64*3 + vmovdqa $xd1,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L256_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + je .Ldone8x + + lea 0x100($inp),$inp # inp+=64*4 + xor %r10,%r10 + vmovdqa $xa2,0x00(%rsp) + lea 0x100($out),$out # out+=64*4 + sub \$256,$len # len-=64*4 + vmovdqa $xb2,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L320_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 
0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vpxor 0x100($inp),$xa2,$xa2 + vpxor 0x120($inp),$xb2,$xb2 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + vmovdqu $xa2,0x100($out) + vmovdqu $xb2,0x120($out) + je .Ldone8x + + lea 0x140($inp),$inp # inp+=64*5 + xor %r10,%r10 + vmovdqa $xc2,0x00(%rsp) + lea 0x140($out),$out # out+=64*5 + sub \$320,$len # len-=64*5 + vmovdqa $xd2,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L384_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vpxor 0x100($inp),$xa2,$xa2 + vpxor 0x120($inp),$xb2,$xb2 + vpxor 0x140($inp),$xc2,$xc2 + vpxor 0x160($inp),$xd2,$xd2 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + vmovdqu $xa2,0x100($out) + vmovdqu $xb2,0x120($out) + vmovdqu $xc2,0x140($out) + vmovdqu $xd2,0x160($out) + je .Ldone8x + + lea 0x180($inp),$inp # inp+=64*6 + xor %r10,%r10 + vmovdqa $xa3,0x00(%rsp) + lea 0x180($out),$out # out+=64*6 + sub \$384,$len # len-=64*6 + vmovdqa $xb3,0x20(%rsp) + jmp .Loop_tail8x + +.align 32 +.L448_or_more8x: + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + vpxor 0x80($inp),$xa1,$xa1 + vpxor 0xa0($inp),$xb1,$xb1 + vpxor 0xc0($inp),$xc1,$xc1 + vpxor 0xe0($inp),$xd1,$xd1 + vpxor 0x100($inp),$xa2,$xa2 + vpxor 0x120($inp),$xb2,$xb2 + vpxor 0x140($inp),$xc2,$xc2 + vpxor 0x160($inp),$xd2,$xd2 + vpxor 
0x180($inp),$xa3,$xa3 + vpxor 0x1a0($inp),$xb3,$xb3 + vmovdqu $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + vmovdqu $xa1,0x80($out) + vmovdqu $xb1,0xa0($out) + vmovdqu $xc1,0xc0($out) + vmovdqu $xd1,0xe0($out) + vmovdqu $xa2,0x100($out) + vmovdqu $xb2,0x120($out) + vmovdqu $xc2,0x140($out) + vmovdqu $xd2,0x160($out) + vmovdqu $xa3,0x180($out) + vmovdqu $xb3,0x1a0($out) + je .Ldone8x + + lea 0x1c0($inp),$inp # inp+=64*7 + xor %r10,%r10 + vmovdqa $xc3,0x00(%rsp) + lea 0x1c0($out),$out # out+=64*7 + sub \$448,$len # len-=64*7 + vmovdqa $xd3,0x20(%rsp) + +.Loop_tail8x: + movzb ($inp,%r10),%eax + movzb (%rsp,%r10),%ecx + lea 1(%r10),%r10 + xor %ecx,%eax + mov %al,-1($out,%r10) + dec $len + jnz .Loop_tail8x + +.Ldone8x: + vzeroall +___ +$code.=<<___ if ($win64); + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 +___ +$code.=<<___; + lea (%r9),%rsp +.cfi_def_cfa_register %rsp +.L8x_epilogue: + ret +.cfi_endproc +.size ChaCha20_8x,.-ChaCha20_8x +___ +} + +######################################################################## +# AVX512 code paths +if ($avx>2) { +# This one handles shorter inputs... 
# Register allocation for the short-input AVX512 paths.
#   $a..$d     working state rows (%zmm0-3), one 128-bit lane per block
#   $a_..$d_   pristine copies of the rows (%zmm16-19), added back after
#              the rounds and used to restart the outer loop
#   $fourz     per-lane counter increment (%zmm20)
#   $t0..$t3   128-bit scratch (%xmm4-7) holding one 64-byte keystream block
my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

# vpxord(dst, src1, src2) - append one three-operand XOR to $code.
#
# Arguments are passed destination-first and written out reversed, i.e. in
# AT&T operand order.  The shorter VEX mnemonic "vpxor" is used whenever
# every operand can be encoded with it; otherwise the EVEX form "vpxord"
# is emitted.  VEX can only name registers 0-15, so the EVEX form is needed
# for any %zmm operand and for any register numbered 16-31 regardless of
# width (%xmm16+ included, which the previous pattern missed).
#
# No prototype: the sub takes arguments and is always invoked as &vpxord().
sub vpxord		# size optimization
{ my $opcode = "vpxor";	# adhere to vpxor when possible

    foreach (@_) {
	if (/%([xyz])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) {
	    $opcode = "vpxord";
	    last;
	}
    }

    $code .= "\t$opcode\t".join(',',reverse @_)."\n";
}

# Emit one ChaCha quarter-round step sequence on whole rows $a,$b,$c,$d:
# add / xor / rotate with rotation counts 16, 12, 8 and 7.  The register
# names are read at call time, so the same sub emits %zmm code first and
# %ymm code after the names are rewritten further down.
sub AVX512ROUND {	# critical path is 14 "SIMD ticks" per round
	&vpaddd	($a,$a,$b);
	&vpxord	($d,$d,$a);
	&vprold	($d,$d,16);

	&vpaddd	($c,$c,$d);
	&vpxord	($b,$b,$c);
	&vprold	($b,$b,12);

	&vpaddd	($a,$a,$b);
	&vpxord	($d,$d,$a);
	&vprold	($d,$d,8);

	&vpaddd	($c,$c,$d);
	&vpxord	($b,$b,$c);
	&vprold	($b,$b,7);
}

# Bytes reserved on top of the 64-byte keystream spill area.  On Win64 the
# extra 32 bytes hold %xmm6/%xmm7 ($t2/$t3), which are callee-saved there.
# NOTE(review): the trailing 8 presumably restores 16-byte stack alignment
# after the return address push, as the vmovdqa spills below require.
my $xframe = $win64 ? 32+8 : 8;

# ChaCha20_avx512: inputs of at most 512 bytes; anything longer is handed
# to .LChaCha20_16x.  %r9 keeps the caller's stack pointer for the epilogue.
$code.=<<___;
.type	ChaCha20_avx512,\@function,5
.align	32
ChaCha20_avx512:
.cfi_startproc
.LChaCha20_avx512:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	cmp	\$512,$len
	ja	.LChaCha20_16x

	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
.Lavx512_body:
___
# Broadcast sigma, both key halves and the counter/nonce row to all four
# lanes, keep pristine copies, and give each lane its own block counter by
# adding .Lzeroz.  The counter pointer register is then reused as the
# round-loop counter (10 iterations of two rounds each).
$code.=<<___;
	vbroadcasti32x4	.Lsigma(%rip),$a
	vbroadcasti32x4	($key),$b
	vbroadcasti32x4	16($key),$c
	vbroadcasti32x4	($counter),$d

	vmovdqa32	$a,$a_
	vmovdqa32	$b,$b_
	vmovdqa32	$c,$c_
	vpaddd		.Lzeroz(%rip),$d,$d
	vmovdqa32	.Lfourz(%rip),$fourz
	mov		\$10,$counter	# reuse $counter
	vmovdqa32	$d,$d_
	jmp		.Loop_avx512

.align	16
.Loop_outer_avx512:
	vmovdqa32	$a_,$a
	vmovdqa32	$b_,$b
	vmovdqa32	$c_,$c
	vpaddd		$fourz,$d_,$d
	mov		\$10,$counter
	vmovdqa32	$d,$d_
	jmp		.Loop_avx512

.align	32
.Loop_avx512:
___
	# First round on the rows as they stand, then rotate the elements of
	# $b/$c/$d within each lane so the second round pairs different words.
	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b00111001);
	&vpshufd	($d,$d,0b10010011);

	# Second round, followed by the inverse rotations of $b and $d.
	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b10010011);
	&vpshufd	($d,$d,0b00111001);

	&dec		($counter);
	&jnz		(".Loop_avx512");

# Add the saved input rows back in, then consume the keystream 64 bytes at
# a time: lane 0 directly through the low 128 bits, lanes 1-3 after
# vextracti32x4.  Each "sub 64" both counts down the length and provides
# the flags for the jb (partial block) and the later jz/jnz (exactly
# finished); the vector moves and lea in between leave flags untouched.
$code.=<<___;
	vpaddd		$a_,$a,$a
	vpaddd		$b_,$b,$b
	vpaddd		$c_,$c,$c
	vpaddd		$d_,$d,$d

	sub		\$64,$len
	jb		.Ltail64_avx512

	vpxor		0x00($inp),%x#$a,$t0	# xor with input
	vpxor		0x10($inp),%x#$b,$t1
	vpxor		0x20($inp),%x#$c,$t2
	vpxor		0x30($inp),%x#$d,$t3
	lea		0x40($inp),$inp		# inp+=64

	vmovdqu		$t0,0x00($out)		# write output
	vmovdqu		$t1,0x10($out)
	vmovdqu		$t2,0x20($out)
	vmovdqu		$t3,0x30($out)
	lea		0x40($out),$out		# out+=64

	jz		.Ldone_avx512

	vextracti32x4	\$1,$a,$t0
	vextracti32x4	\$1,$b,$t1
	vextracti32x4	\$1,$c,$t2
	vextracti32x4	\$1,$d,$t3

	sub		\$64,$len
	jb		.Ltail_avx512

	vpxor		0x00($inp),$t0,$t0	# xor with input
	vpxor		0x10($inp),$t1,$t1
	vpxor		0x20($inp),$t2,$t2
	vpxor		0x30($inp),$t3,$t3
	lea		0x40($inp),$inp		# inp+=64

	vmovdqu		$t0,0x00($out)		# write output
	vmovdqu		$t1,0x10($out)
	vmovdqu		$t2,0x20($out)
	vmovdqu		$t3,0x30($out)
	lea		0x40($out),$out		# out+=64

	jz		.Ldone_avx512

	vextracti32x4	\$2,$a,$t0
	vextracti32x4	\$2,$b,$t1
	vextracti32x4	\$2,$c,$t2
	vextracti32x4	\$2,$d,$t3

	sub		\$64,$len
	jb		.Ltail_avx512

	vpxor		0x00($inp),$t0,$t0	# xor with input
	vpxor		0x10($inp),$t1,$t1
	vpxor		0x20($inp),$t2,$t2
	vpxor		0x30($inp),$t3,$t3
	lea		0x40($inp),$inp		# inp+=64

	vmovdqu		$t0,0x00($out)		# write output
	vmovdqu		$t1,0x10($out)
	vmovdqu		$t2,0x20($out)
	vmovdqu		$t3,0x30($out)
	lea		0x40($out),$out		# out+=64

	jz		.Ldone_avx512

	vextracti32x4	\$3,$a,$t0
	vextracti32x4	\$3,$b,$t1
	vextracti32x4	\$3,$c,$t2
	vextracti32x4	\$3,$d,$t3

	sub		\$64,$len
	jb		.Ltail_avx512

	vpxor		0x00($inp),$t0,$t0	# xor with input
	vpxor		0x10($inp),$t1,$t1
	vpxor		0x20($inp),$t2,$t2
	vpxor		0x30($inp),$t3,$t3
	lea		0x40($inp),$inp		# inp+=64

	vmovdqu		$t0,0x00($out)		# write output
	vmovdqu		$t1,0x10($out)
	vmovdqu		$t2,0x20($out)
	vmovdqu		$t3,0x30($out)
	lea		0x40($out),$out		# out+=64

	jnz		.Loop_outer_avx512

	jmp		.Ldone_avx512

.align	16
.Ltail64_avx512:
	vmovdqa		%x#$a,0x00(%rsp)
	vmovdqa		%x#$b,0x10(%rsp)
	vmovdqa		%x#$c,0x20(%rsp)
	vmovdqa		%x#$d,0x30(%rsp)
	add		\$64,$len
	jmp		.Loop_tail_avx512

.align	16
.Ltail_avx512:
	vmovdqa		$t0,0x00(%rsp)
	vmovdqa		$t1,0x10(%rsp)
	vmovdqa		$t2,0x20(%rsp)
	vmovdqa		$t3,0x30(%rsp)
	add		\$64,$len

	# Partial block: the keystream block was spilled to the stack and is
	# xored byte by byte.  The index register is zero here because the
	# round loop has just counted it down.
.Loop_tail_avx512:
	movzb		($inp,$counter),%eax
	movzb		(%rsp,$counter),%ecx
	lea		1($counter),$counter
	xor		%ecx,%eax
	mov		%al,-1($out,$counter)
	dec		$len
	jnz		.Loop_tail_avx512

	vmovdqu32	$a_,0x00(%rsp)		# overwrite spilled keystream

.Ldone_avx512:
	vzeroall
___
$code.=<<___	if ($win64);
	movaps		-0x28(%r9),%xmm6
	movaps		-0x18(%r9),%xmm7
___
$code.=<<___;
	lea		(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lavx512_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_avx512,.-ChaCha20_avx512
___

# Rewrite the register names in place from %zmm to %ymm: everything below,
# including AVX512ROUND, now emits 256-bit (two blocks per register) code.
map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);

# ChaCha20_avx512vl: same scheme with 256-bit registers, for inputs of at
# most 128 bytes; anything longer is handed to .LChaCha20_8xvl.
$code.=<<___;
.type	ChaCha20_avx512vl,\@function,5
.align	32
ChaCha20_avx512vl:
.cfi_startproc
.LChaCha20_avx512vl:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	cmp	\$128,$len
	ja	.LChaCha20_8xvl

	sub	\$64+$xframe,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
.Lavx512vl_body:
___
# Note that $fourz keeps its name but is loaded from .Ltwoy here, since
# each pass now produces two blocks.
$code.=<<___;
	vbroadcasti128	.Lsigma(%rip),$a
	vbroadcasti128	($key),$b
	vbroadcasti128	16($key),$c
	vbroadcasti128	($counter),$d

	vmovdqa32	$a,$a_
	vmovdqa32	$b,$b_
	vmovdqa32	$c,$c_
	vpaddd		.Lzeroz(%rip),$d,$d
	vmovdqa32	.Ltwoy(%rip),$fourz
	mov		\$10,$counter	# reuse $counter
	vmovdqa32	$d,$d_
	jmp		.Loop_avx512vl

.align	16
.Loop_outer_avx512vl:
	vmovdqa32	$c_,$c
	vpaddd		$fourz,$d_,$d
	mov		\$10,$counter
	vmovdqa32	$d,$d_
	jmp		.Loop_avx512vl

.align	32
.Loop_avx512vl:
___
	# Same two-round body as the 512-bit path, now on %ymm names.
	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b00111001);
	&vpshufd	($d,$d,0b10010011);

	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b10010011);
	&vpshufd	($d,$d,0b00111001);

	&dec		($counter);
	&jnz		(".Loop_avx512vl");

# Output stage for two lanes.  The first two rows are reloaded just before
# the jnz back to the outer loop (which itself only reloads the third row
# and the counters); those moves do not disturb the flags set by "sub".
$code.=<<___;
	vpaddd		$a_,$a,$a
	vpaddd		$b_,$b,$b
	vpaddd		$c_,$c,$c
	vpaddd		$d_,$d,$d

	sub		\$64,$len
	jb		.Ltail64_avx512vl

	vpxor		0x00($inp),%x#$a,$t0	# xor with input
	vpxor		0x10($inp),%x#$b,$t1
	vpxor		0x20($inp),%x#$c,$t2
	vpxor		0x30($inp),%x#$d,$t3
	lea		0x40($inp),$inp		# inp+=64

	vmovdqu		$t0,0x00($out)		# write output
	vmovdqu		$t1,0x10($out)
	vmovdqu		$t2,0x20($out)
	vmovdqu		$t3,0x30($out)
	lea		0x40($out),$out		# out+=64

	jz		.Ldone_avx512vl

	vextracti128	\$1,$a,$t0
	vextracti128	\$1,$b,$t1
	vextracti128	\$1,$c,$t2
	vextracti128	\$1,$d,$t3

	sub		\$64,$len
	jb		.Ltail_avx512vl

	vpxor		0x00($inp),$t0,$t0	# xor with input
	vpxor		0x10($inp),$t1,$t1
	vpxor		0x20($inp),$t2,$t2
	vpxor		0x30($inp),$t3,$t3
	lea		0x40($inp),$inp		# inp+=64

	vmovdqu		$t0,0x00($out)		# write output
	vmovdqu		$t1,0x10($out)
	vmovdqu		$t2,0x20($out)
	vmovdqu		$t3,0x30($out)
	lea		0x40($out),$out		# out+=64

	vmovdqa32	$a_,$a
	vmovdqa32	$b_,$b
	jnz		.Loop_outer_avx512vl

	jmp		.Ldone_avx512vl

.align	16
.Ltail64_avx512vl:
	vmovdqa		%x#$a,0x00(%rsp)
	vmovdqa		%x#$b,0x10(%rsp)
	vmovdqa		%x#$c,0x20(%rsp)
	vmovdqa		%x#$d,0x30(%rsp)
	add		\$64,$len
	jmp		.Loop_tail_avx512vl

.align	16
.Ltail_avx512vl:
	vmovdqa		$t0,0x00(%rsp)
	vmovdqa		$t1,0x10(%rsp)
	vmovdqa		$t2,0x20(%rsp)
	vmovdqa		$t3,0x30(%rsp)
	add		\$64,$len

.Loop_tail_avx512vl:
	movzb		($inp,$counter),%eax
	movzb		(%rsp,$counter),%ecx
	lea		1($counter),$counter
	xor		%ecx,%eax
	mov		%al,-1($out,$counter)
	dec		$len
	jnz		.Loop_tail_avx512vl

	vmovdqu32	$a_,0x00(%rsp)		# overwrite spilled keystream,
	vmovdqu32	$a_,0x20(%rsp)		# 32 bytes per store

.Ldone_avx512vl:
	vzeroall
___
$code.=<<___	if ($win64);
	movaps		-0x28(%r9),%xmm6
	movaps		-0x18(%r9),%xmm7
___
$code.=<<___;
	lea		(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lavx512vl_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_avx512vl,.-ChaCha20_avx512vl
___
}
if ($avx>2) {
# This one handles longer inputs...
+ +my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15)); +my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); +my @key=map("%zmm$_",(16..31)); +my ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; + +sub AVX512_lane_ROUND { +my ($a0,$b0,$c0,$d0)=@_; +my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); +my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); +my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); +my @x=map("\"$_\"",@xx); + + ( + "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 + "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 + "&vpxord (@x[$d0],@x[$d0],@x[$a0])", + "&vpxord (@x[$d1],@x[$d1],@x[$a1])", + "&vpxord (@x[$d2],@x[$d2],@x[$a2])", + "&vpxord (@x[$d3],@x[$d3],@x[$a3])", + "&vprold (@x[$d0],@x[$d0],16)", + "&vprold (@x[$d1],@x[$d1],16)", + "&vprold (@x[$d2],@x[$d2],16)", + "&vprold (@x[$d3],@x[$d3],16)", + + "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", + "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", + "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", + "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", + "&vpxord (@x[$b0],@x[$b0],@x[$c0])", + "&vpxord (@x[$b1],@x[$b1],@x[$c1])", + "&vpxord (@x[$b2],@x[$b2],@x[$c2])", + "&vpxord (@x[$b3],@x[$b3],@x[$c3])", + "&vprold (@x[$b0],@x[$b0],12)", + "&vprold (@x[$b1],@x[$b1],12)", + "&vprold (@x[$b2],@x[$b2],12)", + "&vprold (@x[$b3],@x[$b3],12)", + + "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", + "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", + "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", + "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", + "&vpxord (@x[$d0],@x[$d0],@x[$a0])", + "&vpxord (@x[$d1],@x[$d1],@x[$a1])", + "&vpxord (@x[$d2],@x[$d2],@x[$a2])", + "&vpxord (@x[$d3],@x[$d3],@x[$a3])", + "&vprold (@x[$d0],@x[$d0],8)", + "&vprold (@x[$d1],@x[$d1],8)", + "&vprold (@x[$d2],@x[$d2],8)", + "&vprold (@x[$d3],@x[$d3],8)", + + "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", + "&vpaddd 
(@x[$c1],@x[$c1],@x[$d1])", + "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", + "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", + "&vpxord (@x[$b0],@x[$b0],@x[$c0])", + "&vpxord (@x[$b1],@x[$b1],@x[$c1])", + "&vpxord (@x[$b2],@x[$b2],@x[$c2])", + "&vpxord (@x[$b3],@x[$b3],@x[$c3])", + "&vprold (@x[$b0],@x[$b0],7)", + "&vprold (@x[$b1],@x[$b1],7)", + "&vprold (@x[$b2],@x[$b2],7)", + "&vprold (@x[$b3],@x[$b3],7)" + ); +} + +my $xframe = $win64 ? 0xa8 : 8; + +$code.=<<___; +.type ChaCha20_16x,\@function,5 +.align 32 +ChaCha20_16x: +.cfi_startproc +.LChaCha20_16x: + mov %rsp,%r9 # frame register +.cfi_def_cfa_register %r9 + sub \$64+$xframe,%rsp + and \$-64,%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L16x_body: +___ +$code.=<<___; + vzeroupper + + lea .Lsigma(%rip),%r10 + vbroadcasti32x4 (%r10),$xa3 # key[0] + vbroadcasti32x4 ($key),$xb3 # key[1] + vbroadcasti32x4 16($key),$xc3 # key[2] + vbroadcasti32x4 ($counter),$xd3 # key[3] + + vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 
+ vpshufd \$0x55,$xa3,$xa1 + vpshufd \$0xaa,$xa3,$xa2 + vpshufd \$0xff,$xa3,$xa3 + vmovdqa64 $xa0,@key[0] + vmovdqa64 $xa1,@key[1] + vmovdqa64 $xa2,@key[2] + vmovdqa64 $xa3,@key[3] + + vpshufd \$0x00,$xb3,$xb0 + vpshufd \$0x55,$xb3,$xb1 + vpshufd \$0xaa,$xb3,$xb2 + vpshufd \$0xff,$xb3,$xb3 + vmovdqa64 $xb0,@key[4] + vmovdqa64 $xb1,@key[5] + vmovdqa64 $xb2,@key[6] + vmovdqa64 $xb3,@key[7] + + vpshufd \$0x00,$xc3,$xc0 + vpshufd \$0x55,$xc3,$xc1 + vpshufd \$0xaa,$xc3,$xc2 + vpshufd \$0xff,$xc3,$xc3 + vmovdqa64 $xc0,@key[8] + vmovdqa64 $xc1,@key[9] + vmovdqa64 $xc2,@key[10] + vmovdqa64 $xc3,@key[11] + + vpshufd \$0x00,$xd3,$xd0 + vpshufd \$0x55,$xd3,$xd1 + vpshufd \$0xaa,$xd3,$xd2 + vpshufd \$0xff,$xd3,$xd3 + vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet + vmovdqa64 $xd0,@key[12] + vmovdqa64 $xd1,@key[13] + vmovdqa64 $xd2,@key[14] + vmovdqa64 $xd3,@key[15] + + mov \$10,%eax + jmp .Loop16x + +.align 32 +.Loop_outer16x: + vpbroadcastd 0(%r10),$xa0 # reload key + vpbroadcastd 4(%r10),$xa1 + vpbroadcastd 8(%r10),$xa2 + vpbroadcastd 12(%r10),$xa3 + vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters + vmovdqa64 @key[4],$xb0 + vmovdqa64 @key[5],$xb1 + vmovdqa64 @key[6],$xb2 + vmovdqa64 @key[7],$xb3 + vmovdqa64 @key[8],$xc0 + vmovdqa64 @key[9],$xc1 + vmovdqa64 @key[10],$xc2 + vmovdqa64 @key[11],$xc3 + vmovdqa64 @key[12],$xd0 + vmovdqa64 @key[13],$xd1 + vmovdqa64 @key[14],$xd2 + vmovdqa64 @key[15],$xd3 + + vmovdqa64 $xa0,@key[0] + vmovdqa64 $xa1,@key[1] + vmovdqa64 $xa2,@key[2] + vmovdqa64 $xa3,@key[3] + + mov \$10,%eax + jmp .Loop16x + +.align 32 +.Loop16x: +___ + foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } + foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + dec %eax + jnz .Loop16x + + vpaddd @key[0],$xa0,$xa0 # accumulate key + vpaddd @key[1],$xa1,$xa1 + vpaddd @key[2],$xa2,$xa2 + vpaddd @key[3],$xa3,$xa3 + + vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data + vpunpckldq $xa3,$xa2,$xt3 + vpunpckhdq $xa1,$xa0,$xa0 + vpunpckhdq 
$xa3,$xa2,$xa2 + vpunpcklqdq $xt3,$xt2,$xa1 # "a0" + vpunpckhqdq $xt3,$xt2,$xt2 # "a1" + vpunpcklqdq $xa2,$xa0,$xa3 # "a2" + vpunpckhqdq $xa2,$xa0,$xa0 # "a3" +___ + ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); +$code.=<<___; + vpaddd @key[4],$xb0,$xb0 + vpaddd @key[5],$xb1,$xb1 + vpaddd @key[6],$xb2,$xb2 + vpaddd @key[7],$xb3,$xb3 + + vpunpckldq $xb1,$xb0,$xt2 + vpunpckldq $xb3,$xb2,$xt3 + vpunpckhdq $xb1,$xb0,$xb0 + vpunpckhdq $xb3,$xb2,$xb2 + vpunpcklqdq $xt3,$xt2,$xb1 # "b0" + vpunpckhqdq $xt3,$xt2,$xt2 # "b1" + vpunpcklqdq $xb2,$xb0,$xb3 # "b2" + vpunpckhqdq $xb2,$xb0,$xb0 # "b3" +___ + ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); +$code.=<<___; + vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further + vshufi32x4 \$0xee,$xb0,$xa0,$xb0 + vshufi32x4 \$0x44,$xb1,$xa1,$xa0 + vshufi32x4 \$0xee,$xb1,$xa1,$xb1 + vshufi32x4 \$0x44,$xb2,$xa2,$xa1 + vshufi32x4 \$0xee,$xb2,$xa2,$xb2 + vshufi32x4 \$0x44,$xb3,$xa3,$xa2 + vshufi32x4 \$0xee,$xb3,$xa3,$xb3 +___ + ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); +$code.=<<___; + vpaddd @key[8],$xc0,$xc0 + vpaddd @key[9],$xc1,$xc1 + vpaddd @key[10],$xc2,$xc2 + vpaddd @key[11],$xc3,$xc3 + + vpunpckldq $xc1,$xc0,$xt2 + vpunpckldq $xc3,$xc2,$xt3 + vpunpckhdq $xc1,$xc0,$xc0 + vpunpckhdq $xc3,$xc2,$xc2 + vpunpcklqdq $xt3,$xt2,$xc1 # "c0" + vpunpckhqdq $xt3,$xt2,$xt2 # "c1" + vpunpcklqdq $xc2,$xc0,$xc3 # "c2" + vpunpckhqdq $xc2,$xc0,$xc0 # "c3" +___ + ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); +$code.=<<___; + vpaddd @key[12],$xd0,$xd0 + vpaddd @key[13],$xd1,$xd1 + vpaddd @key[14],$xd2,$xd2 + vpaddd @key[15],$xd3,$xd3 + + vpunpckldq $xd1,$xd0,$xt2 + vpunpckldq $xd3,$xd2,$xt3 + vpunpckhdq $xd1,$xd0,$xd0 + vpunpckhdq $xd3,$xd2,$xd2 + vpunpcklqdq $xt3,$xt2,$xd1 # "d0" + vpunpckhqdq $xt3,$xt2,$xt2 # "d1" + vpunpcklqdq $xd2,$xd0,$xd3 # "d2" + vpunpckhqdq $xd2,$xd0,$xd0 # "d3" +___ + ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); +$code.=<<___; + vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # 
"de-interlace" further + vshufi32x4 \$0xee,$xd0,$xc0,$xd0 + vshufi32x4 \$0x44,$xd1,$xc1,$xc0 + vshufi32x4 \$0xee,$xd1,$xc1,$xd1 + vshufi32x4 \$0x44,$xd2,$xc2,$xc1 + vshufi32x4 \$0xee,$xd2,$xc2,$xd2 + vshufi32x4 \$0x44,$xd3,$xc3,$xc2 + vshufi32x4 \$0xee,$xd3,$xc3,$xd3 +___ + ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); +$code.=<<___; + vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further + vshufi32x4 \$0xdd,$xc0,$xa0,$xa0 + vshufi32x4 \$0x88,$xd0,$xb0,$xc0 + vshufi32x4 \$0xdd,$xd0,$xb0,$xd0 + vshufi32x4 \$0x88,$xc1,$xa1,$xt1 + vshufi32x4 \$0xdd,$xc1,$xa1,$xa1 + vshufi32x4 \$0x88,$xd1,$xb1,$xc1 + vshufi32x4 \$0xdd,$xd1,$xb1,$xd1 + vshufi32x4 \$0x88,$xc2,$xa2,$xt2 + vshufi32x4 \$0xdd,$xc2,$xa2,$xa2 + vshufi32x4 \$0x88,$xd2,$xb2,$xc2 + vshufi32x4 \$0xdd,$xd2,$xb2,$xd2 + vshufi32x4 \$0x88,$xc3,$xa3,$xt3 + vshufi32x4 \$0xdd,$xc3,$xa3,$xa3 + vshufi32x4 \$0x88,$xd3,$xb3,$xc3 + vshufi32x4 \$0xdd,$xd3,$xb3,$xd3 +___ + ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)= + ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3); + + ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1, + $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) = + ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); +$code.=<<___; + cmp \$64*16,$len + jb .Ltail16x + + vpxord 0x00($inp),$xa0,$xa0 # xor with input + vpxord 0x40($inp),$xb0,$xb0 + vpxord 0x80($inp),$xc0,$xc0 + vpxord 0xc0($inp),$xd0,$xd0 + vmovdqu32 $xa0,0x00($out) + vmovdqu32 $xb0,0x40($out) + vmovdqu32 $xc0,0x80($out) + vmovdqu32 $xd0,0xc0($out) + + vpxord 0x100($inp),$xa1,$xa1 + vpxord 0x140($inp),$xb1,$xb1 + vpxord 0x180($inp),$xc1,$xc1 + vpxord 0x1c0($inp),$xd1,$xd1 + vmovdqu32 $xa1,0x100($out) + vmovdqu32 $xb1,0x140($out) + vmovdqu32 $xc1,0x180($out) + vmovdqu32 $xd1,0x1c0($out) + + vpxord 0x200($inp),$xa2,$xa2 + vpxord 0x240($inp),$xb2,$xb2 + vpxord 0x280($inp),$xc2,$xc2 + vpxord 0x2c0($inp),$xd2,$xd2 + vmovdqu32 $xa2,0x200($out) + vmovdqu32 $xb2,0x240($out) + vmovdqu32 $xc2,0x280($out) + vmovdqu32 $xd2,0x2c0($out) + + 
vpxord 0x300($inp),$xa3,$xa3 + vpxord 0x340($inp),$xb3,$xb3 + vpxord 0x380($inp),$xc3,$xc3 + vpxord 0x3c0($inp),$xd3,$xd3 + lea 0x400($inp),$inp + vmovdqu32 $xa3,0x300($out) + vmovdqu32 $xb3,0x340($out) + vmovdqu32 $xc3,0x380($out) + vmovdqu32 $xd3,0x3c0($out) + lea 0x400($out),$out + + sub \$64*16,$len + jnz .Loop_outer16x + + jmp .Ldone16x + +.align 32 +.Ltail16x: + xor %r10,%r10 + sub $inp,$out + cmp \$64*1,$len + jb .Less_than_64_16x + vpxord ($inp),$xa0,$xa0 # xor with input + vmovdqu32 $xa0,($out,$inp) + je .Ldone16x + vmovdqa32 $xb0,$xa0 + lea 64($inp),$inp + + cmp \$64*2,$len + jb .Less_than_64_16x + vpxord ($inp),$xb0,$xb0 + vmovdqu32 $xb0,($out,$inp) + je .Ldone16x + vmovdqa32 $xc0,$xa0 + lea 64($inp),$inp + + cmp \$64*3,$len + jb .Less_than_64_16x + vpxord ($inp),$xc0,$xc0 + vmovdqu32 $xc0,($out,$inp) + je .Ldone16x + vmovdqa32 $xd0,$xa0 + lea 64($inp),$inp + + cmp \$64*4,$len + jb .Less_than_64_16x + vpxord ($inp),$xd0,$xd0 + vmovdqu32 $xd0,($out,$inp) + je .Ldone16x + vmovdqa32 $xa1,$xa0 + lea 64($inp),$inp + + cmp \$64*5,$len + jb .Less_than_64_16x + vpxord ($inp),$xa1,$xa1 + vmovdqu32 $xa1,($out,$inp) + je .Ldone16x + vmovdqa32 $xb1,$xa0 + lea 64($inp),$inp + + cmp \$64*6,$len + jb .Less_than_64_16x + vpxord ($inp),$xb1,$xb1 + vmovdqu32 $xb1,($out,$inp) + je .Ldone16x + vmovdqa32 $xc1,$xa0 + lea 64($inp),$inp + + cmp \$64*7,$len + jb .Less_than_64_16x + vpxord ($inp),$xc1,$xc1 + vmovdqu32 $xc1,($out,$inp) + je .Ldone16x + vmovdqa32 $xd1,$xa0 + lea 64($inp),$inp + + cmp \$64*8,$len + jb .Less_than_64_16x + vpxord ($inp),$xd1,$xd1 + vmovdqu32 $xd1,($out,$inp) + je .Ldone16x + vmovdqa32 $xa2,$xa0 + lea 64($inp),$inp + + cmp \$64*9,$len + jb .Less_than_64_16x + vpxord ($inp),$xa2,$xa2 + vmovdqu32 $xa2,($out,$inp) + je .Ldone16x + vmovdqa32 $xb2,$xa0 + lea 64($inp),$inp + + cmp \$64*10,$len + jb .Less_than_64_16x + vpxord ($inp),$xb2,$xb2 + vmovdqu32 $xb2,($out,$inp) + je .Ldone16x + vmovdqa32 $xc2,$xa0 + lea 64($inp),$inp + + cmp \$64*11,$len + jb 
.Less_than_64_16x + vpxord ($inp),$xc2,$xc2 + vmovdqu32 $xc2,($out,$inp) + je .Ldone16x + vmovdqa32 $xd2,$xa0 + lea 64($inp),$inp + + cmp \$64*12,$len + jb .Less_than_64_16x + vpxord ($inp),$xd2,$xd2 + vmovdqu32 $xd2,($out,$inp) + je .Ldone16x + vmovdqa32 $xa3,$xa0 + lea 64($inp),$inp + + cmp \$64*13,$len + jb .Less_than_64_16x + vpxord ($inp),$xa3,$xa3 + vmovdqu32 $xa3,($out,$inp) + je .Ldone16x + vmovdqa32 $xb3,$xa0 + lea 64($inp),$inp + + cmp \$64*14,$len + jb .Less_than_64_16x + vpxord ($inp),$xb3,$xb3 + vmovdqu32 $xb3,($out,$inp) + je .Ldone16x + vmovdqa32 $xc3,$xa0 + lea 64($inp),$inp + + cmp \$64*15,$len + jb .Less_than_64_16x + vpxord ($inp),$xc3,$xc3 + vmovdqu32 $xc3,($out,$inp) + je .Ldone16x + vmovdqa32 $xd3,$xa0 + lea 64($inp),$inp + +.Less_than_64_16x: + vmovdqa32 $xa0,0x00(%rsp) + lea ($out,$inp),$out + and \$63,$len + +.Loop_tail16x: + movzb ($inp,%r10),%eax + movzb (%rsp,%r10),%ecx + lea 1(%r10),%r10 + xor %ecx,%eax + mov %al,-1($out,%r10) + dec $len + jnz .Loop_tail16x + + vpxord $xa0,$xa0,$xa0 + vmovdqa32 $xa0,0(%rsp) + +.Ldone16x: + vzeroall +___ +$code.=<<___ if ($win64); + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 +___ +$code.=<<___; + lea (%r9),%rsp +.cfi_def_cfa_register %rsp +.L16x_epilogue: + ret +.cfi_endproc +.size ChaCha20_16x,.-ChaCha20_16x +___ + +# switch to %ymm domain +($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15)); +@xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, + $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); +@key=map("%ymm$_",(16..31)); +($xt0,$xt1,$xt2,$xt3)=@key[0..3]; + +$code.=<<___; +.type ChaCha20_8xvl,\@function,5 +.align 32 +ChaCha20_8xvl: +.cfi_startproc +.LChaCha20_8xvl: + mov %rsp,%r9 # frame register +.cfi_def_cfa_register %r9 + sub 
\$64+$xframe,%rsp + and \$-64,%rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,-0xa8(%r9) + movaps %xmm7,-0x98(%r9) + movaps %xmm8,-0x88(%r9) + movaps %xmm9,-0x78(%r9) + movaps %xmm10,-0x68(%r9) + movaps %xmm11,-0x58(%r9) + movaps %xmm12,-0x48(%r9) + movaps %xmm13,-0x38(%r9) + movaps %xmm14,-0x28(%r9) + movaps %xmm15,-0x18(%r9) +.L8xvl_body: +___ +$code.=<<___; + vzeroupper + + lea .Lsigma(%rip),%r10 + vbroadcasti128 (%r10),$xa3 # key[0] + vbroadcasti128 ($key),$xb3 # key[1] + vbroadcasti128 16($key),$xc3 # key[2] + vbroadcasti128 ($counter),$xd3 # key[3] + + vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... + vpshufd \$0x55,$xa3,$xa1 + vpshufd \$0xaa,$xa3,$xa2 + vpshufd \$0xff,$xa3,$xa3 + vmovdqa64 $xa0,@key[0] + vmovdqa64 $xa1,@key[1] + vmovdqa64 $xa2,@key[2] + vmovdqa64 $xa3,@key[3] + + vpshufd \$0x00,$xb3,$xb0 + vpshufd \$0x55,$xb3,$xb1 + vpshufd \$0xaa,$xb3,$xb2 + vpshufd \$0xff,$xb3,$xb3 + vmovdqa64 $xb0,@key[4] + vmovdqa64 $xb1,@key[5] + vmovdqa64 $xb2,@key[6] + vmovdqa64 $xb3,@key[7] + + vpshufd \$0x00,$xc3,$xc0 + vpshufd \$0x55,$xc3,$xc1 + vpshufd \$0xaa,$xc3,$xc2 + vpshufd \$0xff,$xc3,$xc3 + vmovdqa64 $xc0,@key[8] + vmovdqa64 $xc1,@key[9] + vmovdqa64 $xc2,@key[10] + vmovdqa64 $xc3,@key[11] + + vpshufd \$0x00,$xd3,$xd0 + vpshufd \$0x55,$xd3,$xd1 + vpshufd \$0xaa,$xd3,$xd2 + vpshufd \$0xff,$xd3,$xd3 + vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet + vmovdqa64 $xd0,@key[12] + vmovdqa64 $xd1,@key[13] + vmovdqa64 $xd2,@key[14] + vmovdqa64 $xd3,@key[15] + + mov \$10,%eax + jmp .Loop8xvl + +.align 32 +.Loop_outer8xvl: + #vpbroadcastd 0(%r10),$xa0 # reload key + #vpbroadcastd 4(%r10),$xa1 + vpbroadcastd 8(%r10),$xa2 + vpbroadcastd 12(%r10),$xa3 + vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters + vmovdqa64 @key[4],$xb0 + vmovdqa64 @key[5],$xb1 + vmovdqa64 @key[6],$xb2 + vmovdqa64 @key[7],$xb3 + vmovdqa64 @key[8],$xc0 + vmovdqa64 @key[9],$xc1 + vmovdqa64 @key[10],$xc2 + vmovdqa64 @key[11],$xc3 + vmovdqa64 @key[12],$xd0 + vmovdqa64 
@key[13],$xd1 + vmovdqa64 @key[14],$xd2 + vmovdqa64 @key[15],$xd3 + + vmovdqa64 $xa0,@key[0] + vmovdqa64 $xa1,@key[1] + vmovdqa64 $xa2,@key[2] + vmovdqa64 $xa3,@key[3] + + mov \$10,%eax + jmp .Loop8xvl + +.align 32 +.Loop8xvl: +___ + foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } + foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } +$code.=<<___; + dec %eax + jnz .Loop8xvl + + vpaddd @key[0],$xa0,$xa0 # accumulate key + vpaddd @key[1],$xa1,$xa1 + vpaddd @key[2],$xa2,$xa2 + vpaddd @key[3],$xa3,$xa3 + + vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data + vpunpckldq $xa3,$xa2,$xt3 + vpunpckhdq $xa1,$xa0,$xa0 + vpunpckhdq $xa3,$xa2,$xa2 + vpunpcklqdq $xt3,$xt2,$xa1 # "a0" + vpunpckhqdq $xt3,$xt2,$xt2 # "a1" + vpunpcklqdq $xa2,$xa0,$xa3 # "a2" + vpunpckhqdq $xa2,$xa0,$xa0 # "a3" +___ + ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); +$code.=<<___; + vpaddd @key[4],$xb0,$xb0 + vpaddd @key[5],$xb1,$xb1 + vpaddd @key[6],$xb2,$xb2 + vpaddd @key[7],$xb3,$xb3 + + vpunpckldq $xb1,$xb0,$xt2 + vpunpckldq $xb3,$xb2,$xt3 + vpunpckhdq $xb1,$xb0,$xb0 + vpunpckhdq $xb3,$xb2,$xb2 + vpunpcklqdq $xt3,$xt2,$xb1 # "b0" + vpunpckhqdq $xt3,$xt2,$xt2 # "b1" + vpunpcklqdq $xb2,$xb0,$xb3 # "b2" + vpunpckhqdq $xb2,$xb0,$xb0 # "b3" +___ + ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); +$code.=<<___; + vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further + vshufi32x4 \$3,$xb0,$xa0,$xb0 + vshufi32x4 \$0,$xb1,$xa1,$xa0 + vshufi32x4 \$3,$xb1,$xa1,$xb1 + vshufi32x4 \$0,$xb2,$xa2,$xa1 + vshufi32x4 \$3,$xb2,$xa2,$xb2 + vshufi32x4 \$0,$xb3,$xa3,$xa2 + vshufi32x4 \$3,$xb3,$xa3,$xb3 +___ + ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); +$code.=<<___; + vpaddd @key[8],$xc0,$xc0 + vpaddd @key[9],$xc1,$xc1 + vpaddd @key[10],$xc2,$xc2 + vpaddd @key[11],$xc3,$xc3 + + vpunpckldq $xc1,$xc0,$xt2 + vpunpckldq $xc3,$xc2,$xt3 + vpunpckhdq $xc1,$xc0,$xc0 + vpunpckhdq $xc3,$xc2,$xc2 + vpunpcklqdq $xt3,$xt2,$xc1 # "c0" + vpunpckhqdq $xt3,$xt2,$xt2 # "c1" + vpunpcklqdq $xc2,$xc0,$xc3 # "c2" + 
vpunpckhqdq $xc2,$xc0,$xc0 # "c3" +___ + ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); +$code.=<<___; + vpaddd @key[12],$xd0,$xd0 + vpaddd @key[13],$xd1,$xd1 + vpaddd @key[14],$xd2,$xd2 + vpaddd @key[15],$xd3,$xd3 + + vpunpckldq $xd1,$xd0,$xt2 + vpunpckldq $xd3,$xd2,$xt3 + vpunpckhdq $xd1,$xd0,$xd0 + vpunpckhdq $xd3,$xd2,$xd2 + vpunpcklqdq $xt3,$xt2,$xd1 # "d0" + vpunpckhqdq $xt3,$xt2,$xt2 # "d1" + vpunpcklqdq $xd2,$xd0,$xd3 # "d2" + vpunpckhqdq $xd2,$xd0,$xd0 # "d3" +___ + ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); +$code.=<<___; + vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further + vperm2i128 \$0x31,$xd0,$xc0,$xd0 + vperm2i128 \$0x20,$xd1,$xc1,$xc0 + vperm2i128 \$0x31,$xd1,$xc1,$xd1 + vperm2i128 \$0x20,$xd2,$xc2,$xc1 + vperm2i128 \$0x31,$xd2,$xc2,$xd2 + vperm2i128 \$0x20,$xd3,$xc3,$xc2 + vperm2i128 \$0x31,$xd3,$xc3,$xd3 +___ + ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); + ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= + ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); +$code.=<<___; + cmp \$64*8,$len + jb .Ltail8xvl + + mov \$0x80,%eax # size optimization + vpxord 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vpxor 0x40($inp),$xc0,$xc0 + vpxor 0x60($inp),$xd0,$xd0 + lea ($inp,%rax),$inp # size optimization + vmovdqu32 $xa0,0x00($out) + vmovdqu $xb0,0x20($out) + vmovdqu $xc0,0x40($out) + vmovdqu $xd0,0x60($out) + lea ($out,%rax),$out # size optimization + + vpxor 0x00($inp),$xa1,$xa1 + vpxor 0x20($inp),$xb1,$xb1 + vpxor 0x40($inp),$xc1,$xc1 + vpxor 0x60($inp),$xd1,$xd1 + lea ($inp,%rax),$inp # size optimization + vmovdqu $xa1,0x00($out) + vmovdqu $xb1,0x20($out) + vmovdqu $xc1,0x40($out) + vmovdqu $xd1,0x60($out) + lea ($out,%rax),$out # size optimization + + vpxord 0x00($inp),$xa2,$xa2 + vpxor 0x20($inp),$xb2,$xb2 + vpxor 0x40($inp),$xc2,$xc2 + vpxor 0x60($inp),$xd2,$xd2 + lea ($inp,%rax),$inp # size optimization + vmovdqu32 $xa2,0x00($out) + vmovdqu $xb2,0x20($out) + vmovdqu $xc2,0x40($out) + vmovdqu 
$xd2,0x60($out) + lea ($out,%rax),$out # size optimization + + vpxor 0x00($inp),$xa3,$xa3 + vpxor 0x20($inp),$xb3,$xb3 + vpxor 0x40($inp),$xc3,$xc3 + vpxor 0x60($inp),$xd3,$xd3 + lea ($inp,%rax),$inp # size optimization + vmovdqu $xa3,0x00($out) + vmovdqu $xb3,0x20($out) + vmovdqu $xc3,0x40($out) + vmovdqu $xd3,0x60($out) + lea ($out,%rax),$out # size optimization + + vpbroadcastd 0(%r10),%ymm0 # reload key + vpbroadcastd 4(%r10),%ymm1 + + sub \$64*8,$len + jnz .Loop_outer8xvl + + jmp .Ldone8xvl + +.align 32 +.Ltail8xvl: + vmovdqa64 $xa0,%ymm8 # size optimization +___ +$xa0 = "%ymm8"; +$code.=<<___; + xor %r10,%r10 + sub $inp,$out + cmp \$64*1,$len + jb .Less_than_64_8xvl + vpxor 0x00($inp),$xa0,$xa0 # xor with input + vpxor 0x20($inp),$xb0,$xb0 + vmovdqu $xa0,0x00($out,$inp) + vmovdqu $xb0,0x20($out,$inp) + je .Ldone8xvl + vmovdqa $xc0,$xa0 + vmovdqa $xd0,$xb0 + lea 64($inp),$inp + + cmp \$64*2,$len + jb .Less_than_64_8xvl + vpxor 0x00($inp),$xc0,$xc0 + vpxor 0x20($inp),$xd0,$xd0 + vmovdqu $xc0,0x00($out,$inp) + vmovdqu $xd0,0x20($out,$inp) + je .Ldone8xvl + vmovdqa $xa1,$xa0 + vmovdqa $xb1,$xb0 + lea 64($inp),$inp + + cmp \$64*3,$len + jb .Less_than_64_8xvl + vpxor 0x00($inp),$xa1,$xa1 + vpxor 0x20($inp),$xb1,$xb1 + vmovdqu $xa1,0x00($out,$inp) + vmovdqu $xb1,0x20($out,$inp) + je .Ldone8xvl + vmovdqa $xc1,$xa0 + vmovdqa $xd1,$xb0 + lea 64($inp),$inp + + cmp \$64*4,$len + jb .Less_than_64_8xvl + vpxor 0x00($inp),$xc1,$xc1 + vpxor 0x20($inp),$xd1,$xd1 + vmovdqu $xc1,0x00($out,$inp) + vmovdqu $xd1,0x20($out,$inp) + je .Ldone8xvl + vmovdqa32 $xa2,$xa0 + vmovdqa $xb2,$xb0 + lea 64($inp),$inp + + cmp \$64*5,$len + jb .Less_than_64_8xvl + vpxord 0x00($inp),$xa2,$xa2 + vpxor 0x20($inp),$xb2,$xb2 + vmovdqu32 $xa2,0x00($out,$inp) + vmovdqu $xb2,0x20($out,$inp) + je .Ldone8xvl + vmovdqa $xc2,$xa0 + vmovdqa $xd2,$xb0 + lea 64($inp),$inp + + cmp \$64*6,$len + jb .Less_than_64_8xvl + vpxor 0x00($inp),$xc2,$xc2 + vpxor 0x20($inp),$xd2,$xd2 + vmovdqu $xc2,0x00($out,$inp) + 
vmovdqu $xd2,0x20($out,$inp) + je .Ldone8xvl + vmovdqa $xa3,$xa0 + vmovdqa $xb3,$xb0 + lea 64($inp),$inp + + cmp \$64*7,$len + jb .Less_than_64_8xvl + vpxor 0x00($inp),$xa3,$xa3 + vpxor 0x20($inp),$xb3,$xb3 + vmovdqu $xa3,0x00($out,$inp) + vmovdqu $xb3,0x20($out,$inp) + je .Ldone8xvl + vmovdqa $xc3,$xa0 + vmovdqa $xd3,$xb0 + lea 64($inp),$inp + +.Less_than_64_8xvl: + vmovdqa $xa0,0x00(%rsp) + vmovdqa $xb0,0x20(%rsp) + lea ($out,$inp),$out + and \$63,$len + +.Loop_tail8xvl: + movzb ($inp,%r10),%eax + movzb (%rsp,%r10),%ecx + lea 1(%r10),%r10 + xor %ecx,%eax + mov %al,-1($out,%r10) + dec $len + jnz .Loop_tail8xvl + + vpxor $xa0,$xa0,$xa0 + vmovdqa $xa0,0x00(%rsp) + vmovdqa $xa0,0x20(%rsp) + +.Ldone8xvl: + vzeroall +___ +$code.=<<___ if ($win64); + movaps -0xa8(%r9),%xmm6 + movaps -0x98(%r9),%xmm7 + movaps -0x88(%r9),%xmm8 + movaps -0x78(%r9),%xmm9 + movaps -0x68(%r9),%xmm10 + movaps -0x58(%r9),%xmm11 + movaps -0x48(%r9),%xmm12 + movaps -0x38(%r9),%xmm13 + movaps -0x28(%r9),%xmm14 + movaps -0x18(%r9),%xmm15 +___ +$code.=<<___; + lea (%r9),%rsp +.cfi_def_cfa_register %rsp +.L8xvl_epilogue: + ret +.cfi_endproc +.size ChaCha20_8xvl,.-ChaCha20_8xvl +___ +} + +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + lea .Lctr32_body(%rip),%r10 + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + lea .Lno_data(%rip),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + 
jae .Lcommon_seh_tail + + lea 64+24+48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) in quadwords + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size se_handler,.-se_handler + +.type simd_handler,\@abi-omnipotent +.align 16 +simd_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<prologue label + jb .Lcommon_seh_tail # not past the body label: skip XMM restore + + mov 192($context),%rax # pull context->R9 (frame register) + + mov 4(%r11),%r10d # HandlerData[1] + mov
8(%r11),%ecx # HandlerData[2] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=epilogue label + jae .Lcommon_seh_tail + + neg %rcx + lea -8(%rax,%rcx),%rsi + lea 512($context),%rdi # &context.Xmm6 + neg %ecx + shr \$3,%ecx + .long 0xa548f3fc # cld; rep movsq + + jmp .Lcommon_seh_tail +.size simd_handler,.-simd_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_ChaCha20_ctr32 + .rva .LSEH_end_ChaCha20_ctr32 + .rva .LSEH_info_ChaCha20_ctr32 + + .rva .LSEH_begin_ChaCha20_ssse3 + .rva .LSEH_end_ChaCha20_ssse3 + .rva .LSEH_info_ChaCha20_ssse3 + + .rva .LSEH_begin_ChaCha20_128 + .rva .LSEH_end_ChaCha20_128 + .rva .LSEH_info_ChaCha20_128 + + .rva .LSEH_begin_ChaCha20_4x + .rva .LSEH_end_ChaCha20_4x + .rva .LSEH_info_ChaCha20_4x +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_ChaCha20_4xop + .rva .LSEH_end_ChaCha20_4xop + .rva .LSEH_info_ChaCha20_4xop +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_ChaCha20_8x + .rva .LSEH_end_ChaCha20_8x + .rva .LSEH_info_ChaCha20_8x +___ +$code.=<<___ if ($avx>2); + .rva .LSEH_begin_ChaCha20_avx512 + .rva .LSEH_end_ChaCha20_avx512 + .rva .LSEH_info_ChaCha20_avx512 + + .rva .LSEH_begin_ChaCha20_avx512vl + .rva .LSEH_end_ChaCha20_avx512vl + .rva .LSEH_info_ChaCha20_avx512vl + + .rva .LSEH_begin_ChaCha20_16x + .rva .LSEH_end_ChaCha20_16x + .rva .LSEH_info_ChaCha20_16x + + .rva .LSEH_begin_ChaCha20_8xvl + .rva .LSEH_end_ChaCha20_8xvl + .rva .LSEH_info_ChaCha20_8xvl +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_ChaCha20_ctr32: + .byte 9,0,0,0 + .rva se_handler + +.LSEH_info_ChaCha20_ssse3: + .byte 9,0,0,0 + .rva simd_handler + .rva .Lssse3_body,.Lssse3_epilogue + .long 0x20,0 + +.LSEH_info_ChaCha20_128: + .byte 9,0,0,0 + .rva simd_handler + .rva .L128_body,.L128_epilogue + .long 0x60,0 + +.LSEH_info_ChaCha20_4x: + .byte 9,0,0,0 + .rva simd_handler + .rva .L4x_body,.L4x_epilogue + .long 0xa0,0 +___ +$code.=<<___ if ($avx); +.LSEH_info_ChaCha20_4xop: + .byte 9,0,0,0 + .rva simd_handler + .rva 
.L4xop_body,.L4xop_epilogue # HandlerData[] + .long 0xa0,0 +___ +$code.=<<___ if ($avx>1); +.LSEH_info_ChaCha20_8x: + .byte 9,0,0,0 + .rva simd_handler + .rva .L8x_body,.L8x_epilogue # HandlerData[] + .long 0xa0,0 +___ +$code.=<<___ if ($avx>2); +.LSEH_info_ChaCha20_avx512: + .byte 9,0,0,0 + .rva simd_handler + .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[] + .long 0x20,0 + +.LSEH_info_ChaCha20_avx512vl: + .byte 9,0,0,0 + .rva simd_handler + .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[] + .long 0x20,0 + +.LSEH_info_ChaCha20_16x: + .byte 9,0,0,0 + .rva simd_handler + .rva .L16x_body,.L16x_epilogue # HandlerData[] + .long 0xa0,0 + +.LSEH_info_ChaCha20_8xvl: + .byte 9,0,0,0 + .rva simd_handler + .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[] + .long 0xa0,0 +___ +} + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/ge; + + s/%x#%[yz]/%x/g; # "down-shift" + + print $_,"\n"; +} + +close STDOUT; diff --git a/src/crypto/zinc/perlasm/x86_64-xlate.pl b/src/crypto/zinc/perlasm/x86_64-xlate.pl new file mode 100644 index 0000000..f8380f2 --- /dev/null +++ b/src/crypto/zinc/perlasm/x86_64-xlate.pl @@ -0,0 +1,1432 @@ +#! /usr/bin/env perl +# Copyright 2005-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + + +# Ascetic x86_64 AT&T to MASM/NASM assembler translator by . +# +# Why AT&T to MASM and not vice versa? Several reasons. Because AT&T +# format is way easier to parse. Because it's simpler to "gear" from +# Unix ABI to Windows one [see cross-reference "card" at the end of +# file]. Because Linux targets were available first... +# +# In addition the script also "distills" code suitable for GNU +# assembler, so that it can be compiled with more rigid assemblers, +# such as Solaris /usr/ccs/bin/as. 
+# +# This translator is not designed to convert *arbitrary* assembler +# code from AT&T format to MASM format. It's designed to convert just +# enough to provide for dual-ABI OpenSSL modules development... +# There *are* limitations and you might have to modify your assembler +# code or this script to achieve the desired result... +# +# Currently recognized limitations: +# +# - can't use multiple ops per line; +# +# Dual-ABI styling rules. +# +# 1. Adhere to Unix register and stack layout [see cross-reference +# ABI "card" at the end for explanation]. +# 2. Forget about the "red zone," stick to more traditional blended +# stack frame allocation. If volatile storage is actually required, +# that is. If not, just leave the stack as is. +# 3. Functions tagged with ".type name,@function" get crafted with +# unified Win64 prologue and epilogue automatically. If you want +# to take care of ABI differences yourself, tag functions as +# ".type name,@abi-omnipotent" instead. +# 4. To optimize the Win64 prologue you can specify the number of input +# arguments as ".type name,@function,N." Keep in mind that if N is +# larger than 6, then you *have to* write "abi-omnipotent" code, +# because >6 cases can't be addressed with the unified prologue. +# 5. Name local labels as .L*, do *not* use dynamic labels such as 1: +# (sorry about the latter). +# 6. Don't use [or hand-code with .byte] "rep ret." The "ret" mnemonic is +# required to identify the spots where the Win64 epilogue is injected! +# On the plus side, it is then prefixed with rep automatically:-) +# 7. Stick to explicit ip-relative addressing. If you have to use +# GOTPCREL addressing, stick to mov symbol@GOTPCREL(%rip),%r??. +# Both are recognized and translated to proper Win64 addressing +# modes. +# +# 8. In order to provide for structured exception handling, the unified +# Win64 prologue copies the %rsp value to %rax. For further details +# see the SEH paragraph at the end. +# 9. The .init segment is allowed to contain calls to functions only. +# a.
If function accepts more than 4 arguments *and* >4th argument +# is declared as non 64-bit value, do clear its upper part. + + +use strict; + +# Command line: [flavour] [output]. A lone first argument that contains +# a dot is taken to be the output file name rather than a flavour. +my $flavour = shift; +my $output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +# Redirect STDOUT to the output file. The low-precedence "or" is required: +# with "||" the die would bind to the always-true string ">$output", so a +# failed open would go unreported. +open STDOUT,">$output" or die "can't open $output: $!" + if (defined($output)); + +# An *.asm output name selects MASM/NASM syntax instead of GNU as. +my $gas=1; $gas=0 if ($output =~ /\.asm$/); +my $elf=1; $elf=0 if (!$gas); +my $win64=0; +my $prefix=""; +my $decor=".L"; + +my $masmref=8 + 50727*2**-32; # 8.00.50727 shipped with VS2005 +my $masm=0; +my $PTR=" PTR"; + +my $nasmref=2.03; +my $nasm=0; + +if ($flavour eq "mingw64") { $gas=1; $elf=0; $win64=1; + $prefix=`echo __USER_LABEL_PREFIX__ | $ENV{CC} -E -P -`; + $prefix =~ s|\R$||; # Better chomp + } +elsif ($flavour eq "macosx") { $gas=1; $elf=0; $prefix="_"; $decor="L\$"; } +elsif ($flavour eq "masm") { $gas=0; $elf=0; $masm=$masmref; $win64=1; $decor="\$L\$"; } +elsif ($flavour eq "nasm") { $gas=0; $elf=0; $nasm=$nasmref; $win64=1; $decor="\$L\$"; $PTR=""; } +elsif (!$gas) +{ if ($ENV{ASM} =~ m/nasm/ && `nasm -v` =~ m/version ([0-9]+)\.([0-9]+)/i) + { $nasm = $1 + $2*0.01; $PTR=""; } + elsif (`ml64 2>&1` =~ m/Version ([0-9]+)\.([0-9]+)(\.([0-9]+))?/) + { $masm = $1 + $2*2**-16 + $4*2**-32; } + die "no assembler found on %PATH%" if (!($nasm || $masm)); + $win64=1; + $elf=0; + $decor="\$L\$"; +} + +my $current_segment; +my $current_function; +my %globals; + +{ package opcode; # pick up opcodes + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^([a-z][a-z0-9]*)/i) { + bless $self,$class; + $self->{op} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + undef $self->{sz}; + if ($self->{op} =~ /^(movz)x?([bw]).*/) { # movz is pain...
+ $self->{op} = $1; + $self->{sz} = $2; + } elsif ($self->{op} =~ /call|jmp/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /^p/ && $' !~ /^(ush|op|insrw)/) { # SSEn + $self->{sz} = ""; + } elsif ($self->{op} =~ /^[vk]/) { # VEX or k* such as kmov + $self->{sz} = ""; + } elsif ($self->{op} =~ /mov[dq]/ && $$line =~ /%xmm/) { + $self->{sz} = ""; + } elsif ($self->{op} =~ /([a-z]{3,})([qlwb])$/) { + $self->{op} = $1; + $self->{sz} = $2; + } + } + $ret; + } + sub size { + my ($self, $sz) = @_; + $self->{sz} = $sz if (defined($sz) && !defined($self->{sz})); + $self->{sz}; + } + sub out { + my $self = shift; + if ($gas) { + if ($self->{op} eq "movz") { # movz is pain... + sprintf "%s%s%s",$self->{op},$self->{sz},shift; + } elsif ($self->{op} =~ /^set/) { + "$self->{op}"; + } elsif ($self->{op} eq "ret") { + my $epilogue = ""; + if ($win64 && $current_function->{abi} eq "svr4") { + $epilogue = "movq 8(%rsp),%rdi\n\t" . + "movq 16(%rsp),%rsi\n\t"; + } + $epilogue . ".byte 0xf3,0xc3"; + } elsif ($self->{op} eq "call" && !$elf && $current_segment eq ".init") { + ".p2align\t3\n\t.quad"; + } else { + "$self->{op}$self->{sz}"; + } + } else { + $self->{op} =~ s/^movz/movzx/; + if ($self->{op} eq "ret") { + $self->{op} = ""; + if ($win64 && $current_function->{abi} eq "svr4") { + $self->{op} = "mov rdi,QWORD$PTR\[8+rsp\]\t;WIN64 epilogue\n\t". 
+ "mov rsi,QWORD$PTR\[16+rsp\]\n\t"; + } + $self->{op} .= "DB\t0F3h,0C3h\t\t;repret"; + } elsif ($self->{op} =~ /^(pop|push)f/) { + $self->{op} .= $self->{sz}; + } elsif ($self->{op} eq "call" && $current_segment eq ".CRT\$XCU") { + $self->{op} = "\tDQ"; + } + $self->{op}; + } + } + sub mnemonic { + my ($self, $op) = @_; + $self->{op}=$op if (defined($op)); + $self->{op}; + } +} +{ package const; # pick up constants, which start with $ + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /^\$([^,]+)/) { + bless $self, $class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub out { + my $self = shift; + + $self->{value} =~ s/\b(0b[0-1]+)/oct($1)/eig; + if ($gas) { + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{value} + my $value = $self->{value}; + no warnings; # oct might complain about overflow, ignore here... + $value =~ s/(?{value} = $value; + } + sprintf "\$%s",$self->{value}; + } else { + my $value = $self->{value}; + $value =~ s/0x([0-9a-f]+)/0$1h/ig if ($masm); + sprintf "%s",$value; + } + } +} +{ package ea; # pick up effective addresses: expr(%reg,%reg,scale) + + my %szmap = ( b=>"BYTE$PTR", w=>"WORD$PTR", + l=>"DWORD$PTR", d=>"DWORD$PTR", + q=>"QWORD$PTR", o=>"OWORD$PTR", + x=>"XMMWORD$PTR", y=>"YMMWORD$PTR", + z=>"ZMMWORD$PTR" ) if (!$gas); + + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)([^\(,]*)\(([%\w,]+)\)((?:{[^}]+})*)/) { + bless $self, $class; + $self->{asterisk} = $1; + $self->{label} = $2; + ($self->{base},$self->{index},$self->{scale})=split(/,/,$3); + $self->{scale} = 1 if (!defined($self->{scale})); + $self->{opmask} = $4; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + if ($win64 && $self->{label} =~ s/\@GOTPCREL//) { + die if ($opcode->mnemonic() ne "mov"); + $opcode->mnemonic("lea"); + } + 
$self->{base} =~ s/^%//; + $self->{index} =~ s/^%// if (defined($self->{index})); + $self->{opcode} = $opcode; + } + $ret; + } + sub size {} + sub out { + my ($self, $sz) = @_; + + $self->{label} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{label} =~ s/\.L/$decor/g; + + # Silently convert all EAs to 64-bit. This is required for + # elder GNU assembler and results in more compact code, + # *but* most importantly AES module depends on this feature! + $self->{index} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + $self->{base} =~ s/^[er](.?[0-9xpi])[d]?$/r\1/; + + # Solaris /usr/ccs/bin/as can't handle multiplications + # in $self->{label}... + use integer; + $self->{label} =~ s/(?{label} =~ s/\b([0-9]+\s*[\*\/\%]\s*[0-9]+)\b/eval($1)/eg; + + # Some assemblers insist on signed presentation of 32-bit + # offsets, but sign extension is a tricky business in perl... + if ((1<<31)<<1) { + $self->{label} =~ s/\b([0-9]+)\b/$1<<32>>32/eg; + } else { + $self->{label} =~ s/\b([0-9]+)\b/$1>>0/eg; + } + + # if base register is %rbp or %r13, see if it's possible to + # flip base and index registers [for better performance] + if (!$self->{label} && $self->{index} && $self->{scale}==1 && + $self->{base} =~ /(rbp|r13)/) { + $self->{base} = $self->{index}; $self->{index} = $1; + } + + if ($gas) { + $self->{label} =~ s/^___imp_/__imp__/ if ($flavour eq "mingw64"); + + if (defined($self->{index})) { + sprintf "%s%s(%s,%%%s,%d)%s", + $self->{asterisk},$self->{label}, + $self->{base}?"%$self->{base}":"", + $self->{index},$self->{scale}, + $self->{opmask}; + } else { + sprintf "%s%s(%%%s)%s", $self->{asterisk},$self->{label}, + $self->{base},$self->{opmask}; + } + } else { + $self->{label} =~ s/\./\$/g; + $self->{label} =~ s/(?{label} = "($self->{label})" if ($self->{label} =~ /[\*\+\-\/]/); + + my $mnemonic = $self->{opcode}->mnemonic(); + ($self->{asterisk}) && ($sz="q") || + ($mnemonic =~ /^v?mov([qd])$/) && ($sz=$1) || + ($mnemonic =~ /^v?pinsr([qdwb])$/) && ($sz=$1) || + ($mnemonic 
=~ /^vpbroadcast([qdwb])$/) && ($sz=$1) || + ($mnemonic =~ /^v(?!perm)[a-z]+[fi]128$/) && ($sz="x"); + + $self->{opmask} =~ s/%(k[0-7])/$1/; + + if (defined($self->{index})) { + sprintf "%s[%s%s*%d%s]%s",$szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{index},$self->{scale}, + $self->{base}?"+$self->{base}":"", + $self->{opmask}; + } elsif ($self->{base} eq "rip") { + sprintf "%s[%s]",$szmap{$sz},$self->{label}; + } else { + sprintf "%s[%s%s]%s", $szmap{$sz}, + $self->{label}?"$self->{label}+":"", + $self->{base},$self->{opmask}; + } + } + } +} +{ package register; # pick up registers, which start with %. + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + # optional * ----vvv--- appears in indirect jmp/call + if ($$line =~ /^(\*?)%(\w+)((?:{[^}]+})*)/) { + bless $self,$class; + $self->{asterisk} = $1; + $self->{value} = $2; + $self->{opmask} = $3; + $opcode->size($self->size()); + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + } + $ret; + } + sub size { + my $self = shift; + my $ret; + + if ($self->{value} =~ /^r[\d]+b$/i) { $ret="b"; } + elsif ($self->{value} =~ /^r[\d]+w$/i) { $ret="w"; } + elsif ($self->{value} =~ /^r[\d]+d$/i) { $ret="l"; } + elsif ($self->{value} =~ /^r[\w]+$/i) { $ret="q"; } + elsif ($self->{value} =~ /^[a-d][hl]$/i){ $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}l$/i) { $ret="b"; } + elsif ($self->{value} =~ /^[\w]{2}$/i) { $ret="w"; } + elsif ($self->{value} =~ /^e[a-z]{2}$/i){ $ret="l"; } + + $ret; + } + sub out { + my $self = shift; + if ($gas) { sprintf "%s%%%s%s", $self->{asterisk}, + $self->{value}, + $self->{opmask}; } + else { $self->{opmask} =~ s/%(k[0-7])/$1/; + $self->{value}.$self->{opmask}; } + } +} +{ package label; # pick up labels, which end with : + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[\.\w]+)\:/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + 
+ $self->{value} =~ s/^\.L/$decor/; + } + $ret; + } + sub out { + my $self = shift; + + if ($gas) { + my $func = ($globals{$self->{value}} or $self->{value}) . ":"; + if ($win64 && $current_function->{name} eq $self->{value} + && $current_function->{abi} eq "svr4") { + $func .= "\n"; + $func .= " movq %rdi,8(%rsp)\n"; + $func .= " movq %rsi,16(%rsp)\n"; + $func .= " movq %rsp,%rax\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " movq %rcx,%rdi\n" if ($narg>0); + $func .= " movq %rdx,%rsi\n" if ($narg>1); + $func .= " movq %r8,%rdx\n" if ($narg>2); + $func .= " movq %r9,%rcx\n" if ($narg>3); + $func .= " movq 40(%rsp),%r8\n" if ($narg>4); + $func .= " movq 48(%rsp),%r9\n" if ($narg>5); + } + $func; + } elsif ($self->{value} ne "$current_function->{name}") { + # Make all labels in masm global. + $self->{value} .= ":" if ($masm); + $self->{value} . ":"; + } elsif ($win64 && $current_function->{abi} eq "svr4") { + my $func = "$current_function->{name}" . + ($nasm ? ":" : "\tPROC $current_function->{scope}") . + "\n"; + $func .= " mov QWORD$PTR\[8+rsp\],rdi\t;WIN64 prologue\n"; + $func .= " mov QWORD$PTR\[16+rsp\],rsi\n"; + $func .= " mov rax,rsp\n"; + $func .= "${decor}SEH_begin_$current_function->{name}:"; + $func .= ":" if ($masm); + $func .= "\n"; + my $narg = $current_function->{narg}; + $narg=6 if (!defined($narg)); + $func .= " mov rdi,rcx\n" if ($narg>0); + $func .= " mov rsi,rdx\n" if ($narg>1); + $func .= " mov rdx,r8\n" if ($narg>2); + $func .= " mov rcx,r9\n" if ($narg>3); + $func .= " mov r8,QWORD$PTR\[40+rsp\]\n" if ($narg>4); + $func .= " mov r9,QWORD$PTR\[48+rsp\]\n" if ($narg>5); + $func .= "\n"; + } else { + "$current_function->{name}". + ($nasm ? 
":" : "\tPROC $current_function->{scope}"); + } + } +} +{ package expr; # pick up expressions + sub re { + my ($class, $line, $opcode) = @_; + my $self = {}; + my $ret; + + if ($$line =~ /(^[^,]+)/) { + bless $self,$class; + $self->{value} = $1; + $ret = $self; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + $self->{value} =~ s/\@PLT// if (!$elf); + $self->{value} =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $self->{value} =~ s/\.L/$decor/g; + $self->{opcode} = $opcode; + } + $ret; + } + sub out { + my $self = shift; + if ($nasm && $self->{opcode}->mnemonic()=~m/^j(?![re]cxz)/) { + "NEAR ".$self->{value}; + } else { + $self->{value}; + } + } +} +{ package cfi_directive; + # CFI directives annotate instructions that are significant for + # stack unwinding procedure compliant with DWARF specification, + # see http://dwarfstd.org/. Besides naturally expected for this + # script platform-specific filtering function, this module adds + # three auxiliary synthetic directives not recognized by [GNU] + # assembler: + # + # - .cfi_push to annotate push instructions in prologue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_offset; + # - .cfi_pop to annotate pop instructions in epilogue, which + # translates to .cfi_adjust_cfa_offset (if needed) and + # .cfi_restore; + # - [and most notably] .cfi_cfa_expression which encodes + # DW_CFA_def_cfa_expression and passes it to .cfi_escape as + # byte vector; + # + # CFA expressions were introduced in DWARF specification version + # 3 and describe how to deduce CFA, Canonical Frame Address. This + # becomes handy if your stack frame is variable and you can't + # spare register for [previous] frame pointer. Suggested directive + # syntax is made-up mix of DWARF operator suffixes [subset of] + # and references to registers with optional bias. 
Following example + # describes offloaded *original* stack pointer at specific offset + # from *current* stack pointer: + # + # .cfi_cfa_expression %rsp+40,deref,+8 + # + # Final +8 has everything to do with the fact that CFA is defined + # as reference to top of caller's stack, and on x86_64 call to + # subroutine pushes 8-byte return address. In other words original + # stack pointer upon entry to a subroutine is 8 bytes off from CFA. + + # Below constants are taken from "DWARF Expressions" section of the + # DWARF specification, section is numbered 7.7 in versions 3 and 4. + my %DW_OP_simple = ( # no-arg operators, mapped directly + deref => 0x06, dup => 0x12, + drop => 0x13, over => 0x14, + pick => 0x15, swap => 0x16, + rot => 0x17, xderef => 0x18, + + abs => 0x19, and => 0x1a, + div => 0x1b, minus => 0x1c, + mod => 0x1d, mul => 0x1e, + neg => 0x1f, not => 0x20, + or => 0x21, plus => 0x22, + shl => 0x24, shr => 0x25, + shra => 0x26, xor => 0x27, + ); + + my %DW_OP_complex = ( # used in specific subroutines + constu => 0x10, # uleb128 + consts => 0x11, # sleb128 + plus_uconst => 0x23, # uleb128 + lit0 => 0x30, # add 0-31 to opcode + reg0 => 0x50, # add 0-31 to opcode + breg0 => 0x70, # add 0-31 to opcode, sleb128 + regx => 0x90, # uleb128 + fbreg => 0x91, # sleb128 + bregx => 0x92, # uleb128, sleb128 + piece => 0x93, # uleb128 + ); + + # Following constants are defined in x86_64 ABI supplement, for + # example available at https://www.uclibc.org/docs/psABI-x86_64.pdf, + # see section 3.7 "Stack Unwind Algorithm". + my %DW_reg_idx = ( + "%rax"=>0, "%rdx"=>1, "%rcx"=>2, "%rbx"=>3, + "%rsi"=>4, "%rdi"=>5, "%rbp"=>6, "%rsp"=>7, + "%r8" =>8, "%r9" =>9, "%r10"=>10, "%r11"=>11, + "%r12"=>12, "%r13"=>13, "%r14"=>14, "%r15"=>15 + ); + + my ($cfa_reg, $cfa_rsp); + + # [us]leb128 format is variable-length integer representation base + # 128 (2^7), with most significant bit of each byte being 0 denoting + # *last* most significant digit.
See "Variable Length Data" in the + # DWARF specification, numbered 7.6 at least in versions 3 and 4. + sub sleb128 { + use integer; # get right shift extend sign + + my $val = shift; + my $sign = ($val < 0) ? -1 : 0; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if remaining bits are same and equal to most + # significant bit of the current digit, if so, it's + # last digit... + last if (($val>>6) == $sign); + + @ret[-1] |= 0x80; + $val >>= 7; + } + + return @ret; + } + sub uleb128 { + my $val = shift; + my @ret = (); + + while(1) { + push @ret, $val&0x7f; + + # see if it's last significant digit... + last if (($val >>= 7) == 0); + + @ret[-1] |= 0x80; + } + + return @ret; + } + sub const { + my $val = shift; + + if ($val >= 0 && $val < 32) { + return ($DW_OP_complex{lit0}+$val); + } + return ($DW_OP_complex{consts}, sleb128($val)); + } + sub reg { + my $val = shift; + + return if ($val !~ m/^(%r\w+)(?:([\+\-])((?:0x)?[0-9a-f]+))?/); + + my $reg = $DW_reg_idx{$1}; + my $off = eval ("0 $2 $3"); + + return (($DW_OP_complex{breg0} + $reg), sleb128($off)); + # Yes, we use DW_OP_bregX+0 to push register value and not + # DW_OP_regX, because latter would require even DW_OP_piece, + # which would be a waste under the circumstances. If you have + # to use DW_OP_reg, use "regx:N"... + } + sub cfa_expression { + my $line = shift; + my @ret; + + foreach my $token (split(/,\s*/,$line)) { + if ($token =~ /^%r/) { + push @ret,reg($token); + } elsif ($token =~ /((?:0x)?[0-9a-f]+)\((%r\w+)\)/) { + push @ret,reg("$2+$1"); + } elsif ($token =~ /(\w+):(\-?(?:0x)?[0-9a-f]+)(U?)/i) { + my $i = 1*eval($2); + push @ret,$DW_OP_complex{$1}, ($3 ?
uleb128($i) : sleb128($i)); + } elsif (my $i = 1*eval($token) or $token eq "0") { + if ($token =~ /^\+/) { + push @ret,$DW_OP_complex{plus_uconst},uleb128($i); + } else { + push @ret,const($i); + } + } else { + push @ret,$DW_OP_simple{$token}; + } + } + + # Finally we return DW_CFA_def_cfa_expression, 15, followed by + # length of the expression and of course the expression itself. + return (15,scalar(@ret),@ret); + } + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + + if ($$line =~ s/^\s*\.cfi_(\w+)\s*//) { + bless $self,$class; + $ret = $self; + undef $self->{value}; + my $dir = $1; + + SWITCH: for ($dir) { + # What is $cfa_rsp? Effectively it's difference between %rsp + # value and current CFA, Canonical Frame Address, which is + # why it starts with -8. Recall that CFA is top of caller's + # stack... + /startproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", -8); last; }; + /endproc/ && do { ($cfa_reg, $cfa_rsp) = ("%rsp", 0); last; }; + /def_cfa_register/ + && do { $cfa_reg = $$line; last; }; + /def_cfa_offset/ + && do { $cfa_rsp = -1*eval($$line) if ($cfa_reg eq "%rsp"); + last; + }; + /adjust_cfa_offset/ + && do { $cfa_rsp -= 1*eval($$line) if ($cfa_reg eq "%rsp"); + last; + }; + /def_cfa/ && do { if ($$line =~ /(%r\w+)\s*,\s*(.+)/) { + $cfa_reg = $1; + $cfa_rsp = -1*eval($2) if ($cfa_reg eq "%rsp"); + } + last; + }; + /push/ && do { $dir = undef; + $cfa_rsp -= 8; + if ($cfa_reg eq "%rsp") { + $self->{value} = ".cfi_adjust_cfa_offset\t8\n"; + } + $self->{value} .= ".cfi_offset\t$$line,$cfa_rsp"; + last; + }; + /pop/ && do { $dir = undef; + $cfa_rsp += 8; + if ($cfa_reg eq "%rsp") { + $self->{value} = ".cfi_adjust_cfa_offset\t-8\n"; + } + $self->{value} .= ".cfi_restore\t$$line"; + last; + }; + /cfa_expression/ + && do { $dir = undef; + $self->{value} = ".cfi_escape\t" . 
+ join(",", map(sprintf("0x%02x", $_), + cfa_expression($$line))); + last; + }; + } + + $self->{value} = ".cfi_$dir\t$$line" if ($dir); + + $$line = ""; + } + + return $ret; + } + sub out { + my $self = shift; + return ($elf ? $self->{value} : undef); + } +} +{ package directive; # pick up directives, which start with . + sub re { + my ($class, $line) = @_; + my $self = {}; + my $ret; + my $dir; + + # chain-call to cfi_directive + $ret = cfi_directive->re($line) and return $ret; + + if ($$line =~ /^\s*(\.\w+)/) { + bless $self,$class; + $dir = $1; + $ret = $self; + undef $self->{value}; + $$line = substr($$line,@+[0]); $$line =~ s/^\s+//; + + SWITCH: for ($dir) { + /\.global|\.globl|\.extern/ + && do { $globals{$$line} = $prefix . $$line; + $$line = $globals{$$line} if ($prefix); + last; + }; + /\.type/ && do { my ($sym,$type,$narg) = split(',',$$line); + if ($type eq "\@function") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{abi} = "svr4"; + $current_function->{narg} = $narg; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + } elsif ($type eq "\@abi-omnipotent") { + undef $current_function; + $current_function->{name} = $sym; + $current_function->{scope} = defined($globals{$sym})?"PUBLIC":"PRIVATE"; + } + $$line =~ s/\@abi\-omnipotent/\@function/; + $$line =~ s/\@function.*/\@function/; + last; + }; + /\.asciz/ && do { if ($$line =~ /^"(.*)"$/) { + $dir = ".byte"; + $$line = join(",",unpack("C*",$1),0); + } + last; + }; + /\.rva|\.long|\.quad/ + && do { $$line =~ s/([_a-z][_a-z0-9]*)/$globals{$1} or $1/gei; + $$line =~ s/\.L/$decor/g; + last; + }; + } + + if ($gas) { + $self->{value} = $dir . "\t" . $$line; + + if ($dir =~ /\.extern/) { + $self->{value} = ""; # swallow extern + } elsif (!$elf && $dir =~ /\.type/) { + $self->{value} = ""; + $self->{value} = ".def\t" . ($globals{$1} or $1) . ";\t" . + (defined($globals{$1})?".scl 2;":".scl 3;") . 
+ "\t.type 32;\t.endef" + if ($win64 && $$line =~ /([^,]+),\@function/); + } elsif (!$elf && $dir =~ /\.size/) { + $self->{value} = ""; + if (defined($current_function)) { + $self->{value} .= "${decor}SEH_end_$current_function->{name}:" + if ($win64 && $current_function->{abi} eq "svr4"); + undef $current_function; + } + } elsif (!$elf && $dir =~ /\.align/) { + $self->{value} = ".p2align\t" . (log($$line)/log(2)); + } elsif ($dir eq ".section") { + $current_segment=$$line; + if (!$elf && $current_segment eq ".init") { + if ($flavour eq "macosx") { $self->{value} = ".mod_init_func"; } + elsif ($flavour eq "mingw64") { $self->{value} = ".section\t.ctors"; } + } + } elsif ($dir =~ /\.(text|data)/) { + $current_segment=".$1"; + } elsif ($dir =~ /\.hidden/) { + if ($flavour eq "macosx") { $self->{value} = ".private_extern\t$prefix$$line"; } + elsif ($flavour eq "mingw64") { $self->{value} = ""; } + } elsif ($dir =~ /\.comm/) { + $self->{value} = "$dir\t$prefix$$line"; + $self->{value} =~ s|,([0-9]+),([0-9]+)$|",$1,".log($2)/log(2)|e if ($flavour eq "macosx"); + } + $$line = ""; + return $self; + } + + # non-gas case or nasm/masm + SWITCH: for ($dir) { + /\.text/ && do { my $v=undef; + if ($nasm) { + $v="section .text code align=64\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = ".text\$"; + $v.="$current_segment\tSEGMENT "; + $v.=$masm>=$masmref ? "ALIGN(256)" : "PAGE"; + $v.=" 'CODE'"; + } + $self->{value} = $v; + last; + }; + /\.data/ && do { my $v=undef; + if ($nasm) { + $v="section .data data align=8\n"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT"; + } + $self->{value} = $v; + last; + }; + /\.section/ && do { my $v=undef; + $$line =~ s/([^,]*).*/$1/; + $$line = ".CRT\$XCU" if ($$line eq ".init"); + if ($nasm) { + $v="section $$line"; + if ($$line=~/\.([px])data/) { + $v.=" rdata align="; + $v.=$1 eq "p"? 
4 : 8; + } elsif ($$line=~/\.CRT\$/i) { + $v.=" rdata align=8"; + } + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $v.="$$line\tSEGMENT"; + if ($$line=~/\.([px])data/) { + $v.=" READONLY"; + $v.=" ALIGN(".($1 eq "p" ? 4 : 8).")" if ($masm>=$masmref); + } elsif ($$line=~/\.CRT\$/i) { + $v.=" READONLY "; + $v.=$masm>=$masmref ? "ALIGN(8)" : "DWORD"; + } + } + $current_segment = $$line; + $self->{value} = $v; + last; + }; + /\.extern/ && do { $self->{value} = "EXTERN\t".$$line; + $self->{value} .= ":NEAR" if ($masm); + last; + }; + /\.globl|.global/ + && do { $self->{value} = $masm?"PUBLIC":"global"; + $self->{value} .= "\t".$$line; + last; + }; + /\.size/ && do { if (defined($current_function)) { + undef $self->{value}; + if ($current_function->{abi} eq "svr4") { + $self->{value}="${decor}SEH_end_$current_function->{name}:"; + $self->{value}.=":\n" if($masm); + } + $self->{value}.="$current_function->{name}\tENDP" if($masm && $current_function->{name}); + undef $current_function; + } + last; + }; + /\.align/ && do { my $max = ($masm && $masm>=$masmref) ? 
256 : 4096; + $self->{value} = "ALIGN\t".($$line>$max?$max:$$line); + last; + }; + /\.(value|long|rva|quad)/ + && do { my $sz = substr($1,0,1); + my @arr = split(/,\s*/,$$line); + my $last = pop(@arr); + my $conv = sub { my $var=shift; + $var=~s/^(0b[0-1]+)/oct($1)/eig; + $var=~s/^0x([0-9a-f]+)/0$1h/ig if ($masm); + if ($sz eq "D" && ($current_segment=~/.[px]data/ || $dir eq ".rva")) + { $var=~s/^([_a-z\$\@][_a-z0-9\$\@]*)/$nasm?"$1 wrt ..imagebase":"imagerel $1"/egi; } + $var; + }; + + $sz =~ tr/bvlrq/BWDDQ/; + $self->{value} = "\tD$sz\t"; + for (@arr) { $self->{value} .= &$conv($_).","; } + $self->{value} .= &$conv($last); + last; + }; + /\.byte/ && do { my @str=split(/,\s*/,$$line); + map(s/(0b[0-1]+)/oct($1)/eig,@str); + map(s/0x([0-9a-f]+)/0$1h/ig,@str) if ($masm); + while ($#str>15) { + $self->{value}.="DB\t" + .join(",",@str[0..15])."\n"; + foreach (0..15) { shift @str; } + } + $self->{value}.="DB\t" + .join(",",@str) if (@str); + last; + }; + /\.comm/ && do { my @str=split(/,\s*/,$$line); + my $v=undef; + if ($nasm) { + $v.="common $prefix@str[0] @str[1]"; + } else { + $v="$current_segment\tENDS\n" if ($current_segment); + $current_segment = "_DATA"; + $v.="$current_segment\tSEGMENT\n"; + $v.="COMM @str[0]:DWORD:".@str[1]/4; + } + $self->{value} = $v; + last; + }; + } + $$line = ""; + } + + $ret; + } + sub out { + my $self = shift; + $self->{value}; + } +} + +# Upon initial x86_64 introduction SSE>2 extensions were not introduced +# yet. In order not to be bothered by tracing exact assembler versions, +# but at the same time to provide a bare security minimum of AES-NI, we +# hard-code some instructions. Extensions past AES-NI on the other hand +# are traced by examining assembler version in individual perlasm +# modules... 
+ +my %regrm = ( "%eax"=>0, "%ecx"=>1, "%edx"=>2, "%ebx"=>3, + "%esp"=>4, "%ebp"=>5, "%esi"=>6, "%edi"=>7 ); + +sub rex { + my $opcode=shift; + my ($dst,$src,$rex)=@_; + + $rex|=0x04 if($dst>=8); + $rex|=0x01 if($src>=8); + push @$opcode,($rex|0x40) if ($rex); +} + +my $movq = sub { # elderly gas can't handle inter-register movq + my $arg = shift; + my @opcode=(0x66); + if ($arg =~ /%xmm([0-9]+),\s*%r(\w+)/) { + my ($src,$dst)=($1,$2); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x7e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } elsif ($arg =~ /%r(\w+),\s*%xmm([0-9]+)/) { + my ($src,$dst)=($2,$1); + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,$src,$dst,0x8); + push @opcode,0x0f,0x6e; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + @opcode; + } else { + (); + } +}; + +my $pextrd = sub { + if (shift =~ /\$([0-9]+),\s*%xmm([0-9]+),\s*(%\w+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($dst =~ /%r([0-9]+)d/) { $dst = $1; } + elsif ($dst =~ /%e/) { $dst = $regrm{$dst}; } + rex(\@opcode,$src,$dst); + push @opcode,0x0f,0x3a,0x16; + push @opcode,0xc0|(($src&7)<<3)|($dst&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pinsrd = sub { + if (shift =~ /\$([0-9]+),\s*(%\w+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + my $imm=$1; + my $src=$2; + my $dst=$3; + if ($src =~ /%r([0-9]+)/) { $src = $1; } + elsif ($src =~ /%e/) { $src = $regrm{$src}; } + rex(\@opcode,$dst,$src); + push @opcode,0x0f,0x3a,0x22; + push @opcode,0xc0|(($dst&7)<<3)|($src&7); # ModR/M + push @opcode,$imm; + @opcode; + } else { + (); + } +}; + +my $pshufb = sub { + if (shift =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$2,$1); + push @opcode,0x0f,0x38,0x00; + push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M + @opcode; + } else { + (); + } +}; + +my $palignr = sub { + if (shift =~ 
/\$([0-9]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x0f; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + push @opcode,$1; + @opcode; + } else { + (); + } +}; + +my $pclmulqdq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x66); + rex(\@opcode,$3,$2); + push @opcode,0x0f,0x3a,0x44; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $rdrand = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf0|($dst&7); + @opcode; + } else { + (); + } +}; + +my $rdseed = sub { + if (shift =~ /%[er](\w+)/) { + my @opcode=(); + my $dst=$1; + if ($dst !~ /[0-9]+/) { $dst = $regrm{"%e$dst"}; } + rex(\@opcode,0,$dst,8); + push @opcode,0x0f,0xc7,0xf8|($dst&7); + @opcode; + } else { + (); + } +}; + +# Not all AVX-capable assemblers recognize AMD XOP extension. Since we +# are using only two instructions hand-code them in order to be excused +# from chasing assembler versions... 
+ +sub rxb { + my $opcode=shift; + my ($dst,$src1,$src2,$rxb)=@_; + + $rxb|=0x7<<5; + $rxb&=~(0x04<<5) if($dst>=8); + $rxb&=~(0x01<<5) if($src1>=8); + $rxb&=~(0x02<<5) if($src2>=8); + push @$opcode,$rxb; +} + +my $vprotd = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc2; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +my $vprotq = sub { + if (shift =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { + my @opcode=(0x8f); + rxb(\@opcode,$3,$2,-1,0x08); + push @opcode,0x78,0xc3; + push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M + my $c=$1; + push @opcode,$c=~/^0/?oct($c):$c; + @opcode; + } else { + (); + } +}; + +# Intel Control-flow Enforcement Technology extension. All functions and +# indirect branch targets will have to start with this instruction... + +my $endbranch = sub { + (0xf3,0x0f,0x1e,0xfa); +}; + +######################################################################## + +if ($nasm) { + print <<___; +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +___ +} elsif ($masm) { + print <<___; +OPTION DOTNAME +___ +} +while(defined(my $line=<>)) { + + $line =~ s|\R$||; # Better chomp + + $line =~ s|[#!].*$||; # get rid of asm-style comments... + $line =~ s|/\*.*\*/||; # ... and C-style comments... + $line =~ s|^\s+||; # ... and skip white spaces in beginning + $line =~ s|\s+$||; # ... 
and at the end + + if (my $label=label->re(\$line)) { print $label->out(); } + + if (my $directive=directive->re(\$line)) { + printf "%s",$directive->out(); + } elsif (my $opcode=opcode->re(\$line)) { + my $asm = eval("\$".$opcode->mnemonic()); + + if ((ref($asm) eq 'CODE') && scalar(my @bytes=&$asm($line))) { + print $gas?".byte\t":"DB\t",join(',',@bytes),"\n"; + next; + } + + my @args; + ARGUMENT: while (1) { + my $arg; + + ($arg=register->re(\$line, $opcode))|| + ($arg=const->re(\$line)) || + ($arg=ea->re(\$line, $opcode)) || + ($arg=expr->re(\$line, $opcode)) || + last ARGUMENT; + + push @args,$arg; + + last ARGUMENT if ($line !~ /^,/); + + $line =~ s/^,\s*//; + } # ARGUMENT: + + if ($#args>=0) { + my $insn; + my $sz=$opcode->size(); + + if ($gas) { + $insn = $opcode->out($#args>=1?$args[$#args]->size():$sz); + @args = map($_->out($sz),@args); + printf "\t%s\t%s",$insn,join(",",@args); + } else { + $insn = $opcode->out(); + foreach (@args) { + my $arg = $_->out(); + # $insn.=$sz compensates for movq, pinsrw, ... 
+ if ($arg =~ /^xmm[0-9]+$/) { $insn.=$sz; $sz="x" if(!$sz); last; } + if ($arg =~ /^ymm[0-9]+$/) { $insn.=$sz; $sz="y" if(!$sz); last; } + if ($arg =~ /^zmm[0-9]+$/) { $insn.=$sz; $sz="z" if(!$sz); last; } + if ($arg =~ /^mm[0-9]+$/) { $insn.=$sz; $sz="q" if(!$sz); last; } + } + @args = reverse(@args); + undef $sz if ($nasm && $opcode->mnemonic() eq "lea"); + printf "\t%s\t%s",$insn,join(",",map($_->out($sz),@args)); + } + } else { + printf "\t%s",$opcode->out(); + } + } + + print $line,"\n"; +} + +print "\n$current_segment\tENDS\n" if ($current_segment && $masm); +print "END\n" if ($masm); + +close STDOUT; + + ################################################# +# Cross-reference x86_64 ABI "card" +# +# Unix Win64 +# %rax * * +# %rbx - - +# %rcx #4 #1 +# %rdx #3 #2 +# %rsi #2 - +# %rdi #1 - +# %rbp - - +# %rsp - - +# %r8 #5 #3 +# %r9 #6 #4 +# %r10 * * +# %r11 * * +# %r12 - - +# %r13 - - +# %r14 - - +# %r15 - - +# +# (*) volatile register +# (-) preserved by callee +# (#) Nth argument, volatile +# +# In Unix terms top of stack is argument transfer area for arguments +# which could not be accommodated in registers. Or in other words 7th +# [integer] argument resides at 8(%rsp) upon function entry point. +# 128 bytes above %rsp constitute a "red zone" which is not touched +# by signal handlers and can be used as temporal storage without +# allocating a frame. +# +# In Win64 terms N*8 bytes on top of stack is argument transfer area, +# which belongs to/can be overwritten by callee. N is the number of +# arguments passed to callee, *but* not less than 4! This means that +# upon function entry point 5th argument resides at 40(%rsp), as well +# as that 32 bytes from 8(%rsp) can always be used as temporal +# storage [without allocating a frame]. One can actually argue that +# one can assume a "red zone" above stack pointer under Win64 as well. 
+# Point is that at apparently no occasion Windows kernel would alter +# the area above user stack pointer in true asynchronous manner... +# +# All the above means that if assembler programmer adheres to Unix +# register and stack layout, but disregards the "red zone" existence, +# it's possible to use following prologue and epilogue to "gear" from +# Unix to Win64 ABI in leaf functions with not more than 6 arguments. +# +# omnipotent_function: +# ifdef WIN64 +# movq %rdi,8(%rsp) +# movq %rsi,16(%rsp) +# movq %rcx,%rdi ; if 1st argument is actually present +# movq %rdx,%rsi ; if 2nd argument is actually ... +# movq %r8,%rdx ; if 3rd argument is ... +# movq %r9,%rcx ; if 4th argument ... +# movq 40(%rsp),%r8 ; if 5th ... +# movq 48(%rsp),%r9 ; if 6th ... +# endif +# ... +# ifdef WIN64 +# movq 8(%rsp),%rdi +# movq 16(%rsp),%rsi +# endif +# ret +# + ################################################# +# Win64 SEH, Structured Exception Handling. +# +# Unlike on Unix systems(*) lack of Win64 stack unwinding information +# has undesired side-effect at run-time: if an exception is raised in +# assembler subroutine such as those in question (basically we're +# referring to segmentation violations caused by malformed input +# parameters), the application is briskly terminated without invoking +# any exception handlers, most notably without generating memory dump +# or any user notification whatsoever. This poses a problem. It's +# possible to address it by registering custom language-specific +# handler that would restore processor context to the state at +# subroutine entry point and return "exception is not handled, keep +# unwinding" code. Writing such handler can be a challenge... But it's +# doable, though requires certain coding convention. 
Consider the following +# snippet: +# +# .type function,@function +# function: +# movq %rsp,%rax # copy rsp to volatile register +# pushq %r15 # save non-volatile registers +# pushq %rbx +# pushq %rbp +# movq %rsp,%r11 +# subq %rdi,%r11 # prepare [variable] stack frame +# andq $-64,%r11 +# movq %rax,0(%r11) # check for exceptions +# movq %r11,%rsp # allocate [variable] stack frame +# movq %rax,0(%rsp) # save original rsp value +# magic_point: +# ... +# movq 0(%rsp),%rcx # pull original rsp value +# movq -24(%rcx),%rbp # restore non-volatile registers +# movq -16(%rcx),%rbx +# movq -8(%rcx),%r15 +# movq %rcx,%rsp # restore original rsp +# magic_epilogue: +# ret +# .size function,.-function +# +# The key is that up to magic_point copy of original rsp value remains +# in chosen volatile register and no non-volatile register, except for +# rsp, is modified. While past magic_point rsp remains constant till +# the very end of the function. In this case custom language-specific +# exception handler would look like this: +# +# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +# { ULONG64 *rsp = (ULONG64 *)context->Rax; +# ULONG64 rip = context->Rip; +# +# if (rip >= magic_point) +# { rsp = (ULONG64 *)context->Rsp; +# if (rip < magic_epilogue) +# { rsp = (ULONG64 *)rsp[0]; +# context->Rbp = rsp[-3]; +# context->Rbx = rsp[-2]; +# context->R15 = rsp[-1]; +# } +# } +# context->Rsp = (ULONG64)rsp; +# context->Rdi = rsp[1]; +# context->Rsi = rsp[2]; +# +# memcpy (disp->ContextRecord,context,sizeof(CONTEXT)); +# RtlVirtualUnwind(UNW_FLAG_NHANDLER,disp->ImageBase, +# disp->ControlPc,disp->FunctionEntry,disp->ContextRecord, +# &disp->HandlerData,&disp->EstablisherFrame,NULL); +# return ExceptionContinueSearch; +# } +# +# It's appropriate to implement this handler in assembler, directly in +# function's module.
In order to do that one has to know members' +# offsets in CONTEXT and DISPATCHER_CONTEXT structures and some constant +# values. Here they are: +# +# CONTEXT.Rax 120 +# CONTEXT.Rcx 128 +# CONTEXT.Rdx 136 +# CONTEXT.Rbx 144 +# CONTEXT.Rsp 152 +# CONTEXT.Rbp 160 +# CONTEXT.Rsi 168 +# CONTEXT.Rdi 176 +# CONTEXT.R8 184 +# CONTEXT.R9 192 +# CONTEXT.R10 200 +# CONTEXT.R11 208 +# CONTEXT.R12 216 +# CONTEXT.R13 224 +# CONTEXT.R14 232 +# CONTEXT.R15 240 +# CONTEXT.Rip 248 +# CONTEXT.Xmm6 512 +# sizeof(CONTEXT) 1232 +# DISPATCHER_CONTEXT.ControlPc 0 +# DISPATCHER_CONTEXT.ImageBase 8 +# DISPATCHER_CONTEXT.FunctionEntry 16 +# DISPATCHER_CONTEXT.EstablisherFrame 24 +# DISPATCHER_CONTEXT.TargetIp 32 +# DISPATCHER_CONTEXT.ContextRecord 40 +# DISPATCHER_CONTEXT.LanguageHandler 48 +# DISPATCHER_CONTEXT.HandlerData 56 +# UNW_FLAG_NHANDLER 0 +# ExceptionContinueSearch 1 +# +# In order to tie the handler to the function one has to compose +# couple of structures: one for .xdata segment and one for .pdata. +# +# UNWIND_INFO structure for .xdata segment would be +# +# function_unwind_info: +# .byte 9,0,0,0 +# .rva handler +# +# This structure designates exception handler for a function with +# zero-length prologue, no stack frame or frame register. +# +# To facilitate composing of .pdata structures, auto-generated "gear" +# prologue copies rsp value to rax and denotes next instruction with +# .LSEH_begin_{function_name} label. This essentially defines the SEH +# styling rule mentioned in the beginning. Position of this label is +# chosen in such manner that possible exceptions raised in the "gear" +# prologue would be accounted to caller and unwound from latter's frame. +# End of function is marked with respective .LSEH_end_{function_name} +# label. To summarize, .pdata segment would contain +# +# .rva .LSEH_begin_function +# .rva .LSEH_end_function +# .rva function_unwind_info +# +# Reference to function_unwind_info from .xdata segment is the anchor. 
+# In case you wonder why references are 32-bit .rvas and not 64-bit +# .quads: references put into these two segments are required to be +# *relative* to the base address of the current binary module, a.k.a. +# image base. No Win64 module, be it .exe or .dll, can be larger than +# 2GB and thus such relative references can be and are accommodated in +# 32 bits. +# +# Having reviewed the example function code, one can argue that "movq +# %rsp,%rax" above is redundant. It is not! Keep in mind that on Unix +# rax would contain an undefined value. If this "offends" you, use +# another register and refrain from modifying rax till magic_point is +# reached, i.e. as if it was a non-volatile register. If more registers +# are required before [variable] frame setup is completed, note that +# nobody says that you can have only one "magic point." You can +# "liberate" non-volatile registers by denoting last stack off-load +# instruction and reflecting it in finer grade unwind logic in handler. +# After all, isn't that why it's called *language-specific* handler... +# +# SE handlers are also involved in unwinding stack when executable is +# profiled or debugged. Profiling implies additional limitations that +# are too subtle to discuss here. For now it's sufficient to say that +# in order to simplify handlers one should either a) offload original +# %rsp to stack (like discussed above); or b) if you have a register to +# spare for frame pointer, choose volatile one. +# +# (*) Note that we're talking about run-time, not debug-time. Lack of +# unwind information makes debugging hard on both Windows and +# Unix. "Unlike" refers to the fact that on Unix signal handler +# will always be invoked, core dumped and appropriate exit code +# returned to parent (for user notification).
diff --git a/src/crypto/zinc/poly1305/poly1305-x86_64.S b/src/crypto/zinc/poly1305/poly1305-x86_64.S deleted file mode 100644 index 3c3f2b4..0000000 --- a/src/crypto/zinc/poly1305/poly1305-x86_64.S +++ /dev/null @@ -1,2792 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* - * Copyright (C) 2017 Samuel Neves . All Rights Reserved. - * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. - * Copyright (C) 2006-2017 CRYPTOGAMS by . All Rights Reserved. - * - * This is based in part on Andy Polyakov's implementation from CRYPTOGAMS. - */ - -#include - -.section .rodata.cst192.Lconst, "aM", @progbits, 192 -.align 64 -.Lconst: -.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 -.long 16777216,0,16777216,0,16777216,0,16777216,0 -.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 -.long 2,2,2,3,2,0,2,1 -.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 - -.text - -.align 32 -ENTRY(poly1305_init_x86_64) - xorq %rax,%rax - movq %rax,0(%rdi) - movq %rax,8(%rdi) - movq %rax,16(%rdi) - - cmpq $0,%rsi - je .Lno_key - - movq $0x0ffffffc0fffffff,%rax - movq $0x0ffffffc0ffffffc,%rcx - andq 0(%rsi),%rax - andq 8(%rsi),%rcx - movq %rax,24(%rdi) - movq %rcx,32(%rdi) - movl $1,%eax -.Lno_key: - ret -ENDPROC(poly1305_init_x86_64) - -.align 32 -ENTRY(poly1305_blocks_x86_64) -.Lblocks: - shrq $4,%rdx - jz .Lno_data - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rdi - -.Lblocks_body: - - movq %rdx,%r15 - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - movq 0(%rdi),%r14 - movq 8(%rdi),%rbx - movq 16(%rdi),%r10 - - movq %r13,%r12 - shrq $2,%r13 - movq %r12,%rax - addq %r12,%r13 - jmp .Loop - -.align 32 -.Loop: - - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%r10 - mulq %r14 - movq %rax,%r9 - movq %r11,%rax - movq %rdx,%rdi - - mulq %r14 - movq %rax,%r14 - movq %r11,%rax - movq %rdx,%r8 - - mulq %rbx - addq %rax,%r9 - movq %r13,%rax - adcq %rdx,%rdi - - mulq %rbx - movq %r10,%rbx - addq %rax,%r14 - adcq 
%rdx,%r8 - - imulq %r13,%rbx - addq %rbx,%r9 - movq %r8,%rbx - adcq $0,%rdi - - imulq %r11,%r10 - addq %r9,%rbx - movq $-4,%rax - adcq %r10,%rdi - - andq %rdi,%rax - movq %rdi,%r10 - shrq $2,%rdi - andq $3,%r10 - addq %rdi,%rax - addq %rax,%r14 - adcq $0,%rbx - adcq $0,%r10 - - movq %r12,%rax - decq %r15 - jnz .Loop - - movq 0(%rsp),%rdi - - movq %r14,0(%rdi) - movq %rbx,8(%rdi) - movq %r10,16(%rdi) - - movq 8(%rsp),%r15 - movq 16(%rsp),%r14 - movq 24(%rsp),%r13 - movq 32(%rsp),%r12 - movq 40(%rsp),%rbx - leaq 48(%rsp),%rsp -.Lno_data: -.Lblocks_epilogue: - ret -ENDPROC(poly1305_blocks_x86_64) - -.align 32 -ENTRY(poly1305_emit_x86_64) -.Lemit: - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - - movq %r8,%rax - addq $5,%r8 - movq %r9,%rcx - adcq $0,%r9 - adcq $0,%r10 - shrq $2,%r10 - cmovnzq %r8,%rax - cmovnzq %r9,%rcx - - addq 0(%rdx),%rax - adcq 8(%rdx),%rcx - movq %rax,0(%rsi) - movq %rcx,8(%rsi) - - ret -ENDPROC(poly1305_emit_x86_64) - -.macro __poly1305_block - mulq %r14 - movq %rax,%r9 - movq %r11,%rax - movq %rdx,%rdi - - mulq %r14 - movq %rax,%r14 - movq %r11,%rax - movq %rdx,%r8 - - mulq %rbx - addq %rax,%r9 - movq %r13,%rax - adcq %rdx,%rdi - - mulq %rbx - movq %r10,%rbx - addq %rax,%r14 - adcq %rdx,%r8 - - imulq %r13,%rbx - addq %rbx,%r9 - movq %r8,%rbx - adcq $0,%rdi - - imulq %r11,%r10 - addq %r9,%rbx - movq $-4,%rax - adcq %r10,%rdi - - andq %rdi,%rax - movq %rdi,%r10 - shrq $2,%rdi - andq $3,%r10 - addq %rdi,%rax - addq %rax,%r14 - adcq $0,%rbx - adcq $0,%r10 -.endm - -.macro __poly1305_init_avx - movq %r11,%r14 - movq %r12,%rbx - xorq %r10,%r10 - - leaq 48+64(%rdi),%rdi - - movq %r12,%rax - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - - movl $0x3ffffff,%eax - movl $0x3ffffff,%edx - movq %r14,%r8 - andl %r14d,%eax - movq %r11,%r9 - andl %r11d,%edx - movl %eax,-64(%rdi) - shrq $26,%r8 - movl %edx,-60(%rdi) - shrq $26,%r9 - - movl $0x3ffffff,%eax - movl $0x3ffffff,%edx - andl %r8d,%eax - andl %r9d,%edx - movl %eax,-48(%rdi) - 
leal (%rax,%rax,4),%eax - movl %edx,-44(%rdi) - leal (%rdx,%rdx,4),%edx - movl %eax,-32(%rdi) - shrq $26,%r8 - movl %edx,-28(%rdi) - shrq $26,%r9 - - movq %rbx,%rax - movq %r12,%rdx - shlq $12,%rax - shlq $12,%rdx - orq %r8,%rax - orq %r9,%rdx - andl $0x3ffffff,%eax - andl $0x3ffffff,%edx - movl %eax,-16(%rdi) - leal (%rax,%rax,4),%eax - movl %edx,-12(%rdi) - leal (%rdx,%rdx,4),%edx - movl %eax,0(%rdi) - movq %rbx,%r8 - movl %edx,4(%rdi) - movq %r12,%r9 - - movl $0x3ffffff,%eax - movl $0x3ffffff,%edx - shrq $14,%r8 - shrq $14,%r9 - andl %r8d,%eax - andl %r9d,%edx - movl %eax,16(%rdi) - leal (%rax,%rax,4),%eax - movl %edx,20(%rdi) - leal (%rdx,%rdx,4),%edx - movl %eax,32(%rdi) - shrq $26,%r8 - movl %edx,36(%rdi) - shrq $26,%r9 - - movq %r10,%rax - shlq $24,%rax - orq %rax,%r8 - movl %r8d,48(%rdi) - leaq (%r8,%r8,4),%r8 - movl %r9d,52(%rdi) - leaq (%r9,%r9,4),%r9 - movl %r8d,64(%rdi) - movl %r9d,68(%rdi) - - movq %r12,%rax - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - - movl $0x3ffffff,%eax - movq %r14,%r8 - andl %r14d,%eax - shrq $26,%r8 - movl %eax,-52(%rdi) - - movl $0x3ffffff,%edx - andl %r8d,%edx - movl %edx,-36(%rdi) - leal (%rdx,%rdx,4),%edx - shrq $26,%r8 - movl %edx,-20(%rdi) - - movq %rbx,%rax - shlq $12,%rax - orq %r8,%rax - andl $0x3ffffff,%eax - movl %eax,-4(%rdi) - leal (%rax,%rax,4),%eax - movq %rbx,%r8 - movl %eax,12(%rdi) - - movl $0x3ffffff,%edx - shrq $14,%r8 - andl %r8d,%edx - movl %edx,28(%rdi) - leal (%rdx,%rdx,4),%edx - shrq $26,%r8 - movl %edx,44(%rdi) - - movq %r10,%rax - shlq $24,%rax - orq %rax,%r8 - movl %r8d,60(%rdi) - leaq (%r8,%r8,4),%r8 - movl %r8d,76(%rdi) - - movq %r12,%rax - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - - movl $0x3ffffff,%eax - movq %r14,%r8 - andl %r14d,%eax - shrq $26,%r8 - movl %eax,-56(%rdi) - - movl $0x3ffffff,%edx - andl %r8d,%edx - movl %edx,-40(%rdi) - leal (%rdx,%rdx,4),%edx - shrq $26,%r8 - movl %edx,-24(%rdi) - - movq %rbx,%rax - shlq $12,%rax - orq %r8,%rax - andl 
$0x3ffffff,%eax - movl %eax,-8(%rdi) - leal (%rax,%rax,4),%eax - movq %rbx,%r8 - movl %eax,8(%rdi) - - movl $0x3ffffff,%edx - shrq $14,%r8 - andl %r8d,%edx - movl %edx,24(%rdi) - leal (%rdx,%rdx,4),%edx - shrq $26,%r8 - movl %edx,40(%rdi) - - movq %r10,%rax - shlq $24,%rax - orq %rax,%r8 - movl %r8d,56(%rdi) - leaq (%r8,%r8,4),%r8 - movl %r8d,72(%rdi) - - leaq -48-64(%rdi),%rdi -.endm - -#ifdef CONFIG_AS_AVX -.align 32 -ENTRY(poly1305_blocks_avx) - - movl 20(%rdi),%r8d - cmpq $128,%rdx - jae .Lblocks_avx - testl %r8d,%r8d - jz .Lblocks - -.Lblocks_avx: - andq $-16,%rdx - jz .Lno_data_avx - - vzeroupper - - testl %r8d,%r8d - jz .Lbase2_64_avx - - testq $31,%rdx - jz .Leven_avx - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rdi - -.Lblocks_avx_body: - - movq %rdx,%r15 - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movl 16(%rdi),%r10d - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - - movl %r8d,%r14d - andq $-2147483648,%r8 - movq %r9,%r12 - movl %r9d,%ebx - andq $-2147483648,%r9 - - shrq $6,%r8 - shlq $52,%r12 - addq %r8,%r14 - shrq $12,%rbx - shrq $18,%r9 - addq %r12,%r14 - adcq %r9,%rbx - - movq %r10,%r8 - shlq $40,%r8 - shrq $24,%r10 - addq %r8,%rbx - adcq $0,%r10 - - movq $-4,%r9 - movq %r10,%r8 - andq %r10,%r9 - shrq $2,%r8 - andq $3,%r10 - addq %r9,%r8 - addq %r8,%r14 - adcq $0,%rbx - adcq $0,%r10 - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%r10 - - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - - testq %rcx,%rcx - jz .Lstore_base2_64_avx - - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r11 - movq %rbx,%r12 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq $12,%r11 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r11,%r14 - shlq $24,%r10 - andq $0x3ffffff,%r14 - shrq $40,%r12 - andq $0x3ffffff,%rbx - orq %r12,%r10 - - subq $16,%r15 - jz .Lstore_base2_26_avx - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - 
vmovd %ebx,%xmm3 - vmovd %r10d,%xmm4 - jmp .Lproceed_avx - -.align 32 -.Lstore_base2_64_avx: - movq %r14,0(%rdi) - movq %rbx,8(%rdi) - movq %r10,16(%rdi) - jmp .Ldone_avx - -.align 16 -.Lstore_base2_26_avx: - movl %eax,0(%rdi) - movl %edx,4(%rdi) - movl %r14d,8(%rdi) - movl %ebx,12(%rdi) - movl %r10d,16(%rdi) -.align 16 -.Ldone_avx: - movq 8(%rsp),%r15 - movq 16(%rsp),%r14 - movq 24(%rsp),%r13 - movq 32(%rsp),%r12 - movq 40(%rsp),%rbx - leaq 48(%rsp),%rsp - -.Lno_data_avx: -.Lblocks_avx_epilogue: - ret - -.align 32 -.Lbase2_64_avx: - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rdi - -.Lbase2_64_avx_body: - - movq %rdx,%r15 - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - movq 0(%rdi),%r14 - movq 8(%rdi),%rbx - movl 16(%rdi),%r10d - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - - testq $31,%rdx - jz .Linit_avx - - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%r10 - subq $16,%r15 - - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - -.Linit_avx: - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r8 - movq %rbx,%r9 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq $12,%r8 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r8,%r14 - shlq $24,%r10 - andq $0x3ffffff,%r14 - shrq $40,%r9 - andq $0x3ffffff,%rbx - orq %r9,%r10 - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - vmovd %ebx,%xmm3 - vmovd %r10d,%xmm4 - movl $1,20(%rdi) - - __poly1305_init_avx - -.Lproceed_avx: - movq %r15,%rdx - - movq 8(%rsp),%r15 - movq 16(%rsp),%r14 - movq 24(%rsp),%r13 - movq 32(%rsp),%r12 - movq 40(%rsp),%rbx - leaq 48(%rsp),%rax - leaq 48(%rsp),%rsp - -.Lbase2_64_avx_epilogue: - jmp .Ldo_avx - - -.align 32 -.Leven_avx: - vmovd 0(%rdi),%xmm0 - vmovd 4(%rdi),%xmm1 - vmovd 8(%rdi),%xmm2 - vmovd 12(%rdi),%xmm3 - vmovd 16(%rdi),%xmm4 - -.Ldo_avx: - leaq 8(%rsp),%r10 - andq $-32,%rsp - subq $8,%rsp - leaq -88(%rsp),%r11 - subq $0x178,%rsp - subq $64,%rdx - leaq -32(%rsi),%rax - cmovcq %rax,%rsi 
- - vmovdqu 48(%rdi),%xmm14 - leaq 112(%rdi),%rdi - leaq .Lconst(%rip),%rcx - - vmovdqu 32(%rsi),%xmm5 - vmovdqu 48(%rsi),%xmm6 - vmovdqa 64(%rcx),%xmm15 - - vpsrldq $6,%xmm5,%xmm7 - vpsrldq $6,%xmm6,%xmm8 - vpunpckhqdq %xmm6,%xmm5,%xmm9 - vpunpcklqdq %xmm6,%xmm5,%xmm5 - vpunpcklqdq %xmm8,%xmm7,%xmm8 - - vpsrlq $40,%xmm9,%xmm9 - vpsrlq $26,%xmm5,%xmm6 - vpand %xmm15,%xmm5,%xmm5 - vpsrlq $4,%xmm8,%xmm7 - vpand %xmm15,%xmm6,%xmm6 - vpsrlq $30,%xmm8,%xmm8 - vpand %xmm15,%xmm7,%xmm7 - vpand %xmm15,%xmm8,%xmm8 - vpor 32(%rcx),%xmm9,%xmm9 - - jbe .Lskip_loop_avx - - - vmovdqu -48(%rdi),%xmm11 - vmovdqu -32(%rdi),%xmm12 - vpshufd $0xEE,%xmm14,%xmm13 - vpshufd $0x44,%xmm14,%xmm10 - vmovdqa %xmm13,-144(%r11) - vmovdqa %xmm10,0(%rsp) - vpshufd $0xEE,%xmm11,%xmm14 - vmovdqu -16(%rdi),%xmm10 - vpshufd $0x44,%xmm11,%xmm11 - vmovdqa %xmm14,-128(%r11) - vmovdqa %xmm11,16(%rsp) - vpshufd $0xEE,%xmm12,%xmm13 - vmovdqu 0(%rdi),%xmm11 - vpshufd $0x44,%xmm12,%xmm12 - vmovdqa %xmm13,-112(%r11) - vmovdqa %xmm12,32(%rsp) - vpshufd $0xEE,%xmm10,%xmm14 - vmovdqu 16(%rdi),%xmm12 - vpshufd $0x44,%xmm10,%xmm10 - vmovdqa %xmm14,-96(%r11) - vmovdqa %xmm10,48(%rsp) - vpshufd $0xEE,%xmm11,%xmm13 - vmovdqu 32(%rdi),%xmm10 - vpshufd $0x44,%xmm11,%xmm11 - vmovdqa %xmm13,-80(%r11) - vmovdqa %xmm11,64(%rsp) - vpshufd $0xEE,%xmm12,%xmm14 - vmovdqu 48(%rdi),%xmm11 - vpshufd $0x44,%xmm12,%xmm12 - vmovdqa %xmm14,-64(%r11) - vmovdqa %xmm12,80(%rsp) - vpshufd $0xEE,%xmm10,%xmm13 - vmovdqu 64(%rdi),%xmm12 - vpshufd $0x44,%xmm10,%xmm10 - vmovdqa %xmm13,-48(%r11) - vmovdqa %xmm10,96(%rsp) - vpshufd $0xEE,%xmm11,%xmm14 - vpshufd $0x44,%xmm11,%xmm11 - vmovdqa %xmm14,-32(%r11) - vmovdqa %xmm11,112(%rsp) - vpshufd $0xEE,%xmm12,%xmm13 - vmovdqa 0(%rsp),%xmm14 - vpshufd $0x44,%xmm12,%xmm12 - vmovdqa %xmm13,-16(%r11) - vmovdqa %xmm12,128(%rsp) - - jmp .Loop_avx - -.align 32 -.Loop_avx: - - vpmuludq %xmm5,%xmm14,%xmm10 - vpmuludq %xmm6,%xmm14,%xmm11 - vmovdqa %xmm2,32(%r11) - vpmuludq %xmm7,%xmm14,%xmm12 - vmovdqa 
16(%rsp),%xmm2 - vpmuludq %xmm8,%xmm14,%xmm13 - vpmuludq %xmm9,%xmm14,%xmm14 - - vmovdqa %xmm0,0(%r11) - vpmuludq 32(%rsp),%xmm9,%xmm0 - vmovdqa %xmm1,16(%r11) - vpmuludq %xmm8,%xmm2,%xmm1 - vpaddq %xmm0,%xmm10,%xmm10 - vpaddq %xmm1,%xmm14,%xmm14 - vmovdqa %xmm3,48(%r11) - vpmuludq %xmm7,%xmm2,%xmm0 - vpmuludq %xmm6,%xmm2,%xmm1 - vpaddq %xmm0,%xmm13,%xmm13 - vmovdqa 48(%rsp),%xmm3 - vpaddq %xmm1,%xmm12,%xmm12 - vmovdqa %xmm4,64(%r11) - vpmuludq %xmm5,%xmm2,%xmm2 - vpmuludq %xmm7,%xmm3,%xmm0 - vpaddq %xmm2,%xmm11,%xmm11 - - vmovdqa 64(%rsp),%xmm4 - vpaddq %xmm0,%xmm14,%xmm14 - vpmuludq %xmm6,%xmm3,%xmm1 - vpmuludq %xmm5,%xmm3,%xmm3 - vpaddq %xmm1,%xmm13,%xmm13 - vmovdqa 80(%rsp),%xmm2 - vpaddq %xmm3,%xmm12,%xmm12 - vpmuludq %xmm9,%xmm4,%xmm0 - vpmuludq %xmm8,%xmm4,%xmm4 - vpaddq %xmm0,%xmm11,%xmm11 - vmovdqa 96(%rsp),%xmm3 - vpaddq %xmm4,%xmm10,%xmm10 - - vmovdqa 128(%rsp),%xmm4 - vpmuludq %xmm6,%xmm2,%xmm1 - vpmuludq %xmm5,%xmm2,%xmm2 - vpaddq %xmm1,%xmm14,%xmm14 - vpaddq %xmm2,%xmm13,%xmm13 - vpmuludq %xmm9,%xmm3,%xmm0 - vpmuludq %xmm8,%xmm3,%xmm1 - vpaddq %xmm0,%xmm12,%xmm12 - vmovdqu 0(%rsi),%xmm0 - vpaddq %xmm1,%xmm11,%xmm11 - vpmuludq %xmm7,%xmm3,%xmm3 - vpmuludq %xmm7,%xmm4,%xmm7 - vpaddq %xmm3,%xmm10,%xmm10 - - vmovdqu 16(%rsi),%xmm1 - vpaddq %xmm7,%xmm11,%xmm11 - vpmuludq %xmm8,%xmm4,%xmm8 - vpmuludq %xmm9,%xmm4,%xmm9 - vpsrldq $6,%xmm0,%xmm2 - vpaddq %xmm8,%xmm12,%xmm12 - vpaddq %xmm9,%xmm13,%xmm13 - vpsrldq $6,%xmm1,%xmm3 - vpmuludq 112(%rsp),%xmm5,%xmm9 - vpmuludq %xmm6,%xmm4,%xmm5 - vpunpckhqdq %xmm1,%xmm0,%xmm4 - vpaddq %xmm9,%xmm14,%xmm14 - vmovdqa -144(%r11),%xmm9 - vpaddq %xmm5,%xmm10,%xmm10 - - vpunpcklqdq %xmm1,%xmm0,%xmm0 - vpunpcklqdq %xmm3,%xmm2,%xmm3 - - - vpsrldq $5,%xmm4,%xmm4 - vpsrlq $26,%xmm0,%xmm1 - vpand %xmm15,%xmm0,%xmm0 - vpsrlq $4,%xmm3,%xmm2 - vpand %xmm15,%xmm1,%xmm1 - vpand 0(%rcx),%xmm4,%xmm4 - vpsrlq $30,%xmm3,%xmm3 - vpand %xmm15,%xmm2,%xmm2 - vpand %xmm15,%xmm3,%xmm3 - vpor 32(%rcx),%xmm4,%xmm4 - - vpaddq 0(%r11),%xmm0,%xmm0 
- vpaddq 16(%r11),%xmm1,%xmm1 - vpaddq 32(%r11),%xmm2,%xmm2 - vpaddq 48(%r11),%xmm3,%xmm3 - vpaddq 64(%r11),%xmm4,%xmm4 - - leaq 32(%rsi),%rax - leaq 64(%rsi),%rsi - subq $64,%rdx - cmovcq %rax,%rsi - - vpmuludq %xmm0,%xmm9,%xmm5 - vpmuludq %xmm1,%xmm9,%xmm6 - vpaddq %xmm5,%xmm10,%xmm10 - vpaddq %xmm6,%xmm11,%xmm11 - vmovdqa -128(%r11),%xmm7 - vpmuludq %xmm2,%xmm9,%xmm5 - vpmuludq %xmm3,%xmm9,%xmm6 - vpaddq %xmm5,%xmm12,%xmm12 - vpaddq %xmm6,%xmm13,%xmm13 - vpmuludq %xmm4,%xmm9,%xmm9 - vpmuludq -112(%r11),%xmm4,%xmm5 - vpaddq %xmm9,%xmm14,%xmm14 - - vpaddq %xmm5,%xmm10,%xmm10 - vpmuludq %xmm2,%xmm7,%xmm6 - vpmuludq %xmm3,%xmm7,%xmm5 - vpaddq %xmm6,%xmm13,%xmm13 - vmovdqa -96(%r11),%xmm8 - vpaddq %xmm5,%xmm14,%xmm14 - vpmuludq %xmm1,%xmm7,%xmm6 - vpmuludq %xmm0,%xmm7,%xmm7 - vpaddq %xmm6,%xmm12,%xmm12 - vpaddq %xmm7,%xmm11,%xmm11 - - vmovdqa -80(%r11),%xmm9 - vpmuludq %xmm2,%xmm8,%xmm5 - vpmuludq %xmm1,%xmm8,%xmm6 - vpaddq %xmm5,%xmm14,%xmm14 - vpaddq %xmm6,%xmm13,%xmm13 - vmovdqa -64(%r11),%xmm7 - vpmuludq %xmm0,%xmm8,%xmm8 - vpmuludq %xmm4,%xmm9,%xmm5 - vpaddq %xmm8,%xmm12,%xmm12 - vpaddq %xmm5,%xmm11,%xmm11 - vmovdqa -48(%r11),%xmm8 - vpmuludq %xmm3,%xmm9,%xmm9 - vpmuludq %xmm1,%xmm7,%xmm6 - vpaddq %xmm9,%xmm10,%xmm10 - - vmovdqa -16(%r11),%xmm9 - vpaddq %xmm6,%xmm14,%xmm14 - vpmuludq %xmm0,%xmm7,%xmm7 - vpmuludq %xmm4,%xmm8,%xmm5 - vpaddq %xmm7,%xmm13,%xmm13 - vpaddq %xmm5,%xmm12,%xmm12 - vmovdqu 32(%rsi),%xmm5 - vpmuludq %xmm3,%xmm8,%xmm7 - vpmuludq %xmm2,%xmm8,%xmm8 - vpaddq %xmm7,%xmm11,%xmm11 - vmovdqu 48(%rsi),%xmm6 - vpaddq %xmm8,%xmm10,%xmm10 - - vpmuludq %xmm2,%xmm9,%xmm2 - vpmuludq %xmm3,%xmm9,%xmm3 - vpsrldq $6,%xmm5,%xmm7 - vpaddq %xmm2,%xmm11,%xmm11 - vpmuludq %xmm4,%xmm9,%xmm4 - vpsrldq $6,%xmm6,%xmm8 - vpaddq %xmm3,%xmm12,%xmm2 - vpaddq %xmm4,%xmm13,%xmm3 - vpmuludq -32(%r11),%xmm0,%xmm4 - vpmuludq %xmm1,%xmm9,%xmm0 - vpunpckhqdq %xmm6,%xmm5,%xmm9 - vpaddq %xmm4,%xmm14,%xmm4 - vpaddq %xmm0,%xmm10,%xmm0 - - vpunpcklqdq %xmm6,%xmm5,%xmm5 - 
vpunpcklqdq %xmm8,%xmm7,%xmm8 - - - vpsrldq $5,%xmm9,%xmm9 - vpsrlq $26,%xmm5,%xmm6 - vmovdqa 0(%rsp),%xmm14 - vpand %xmm15,%xmm5,%xmm5 - vpsrlq $4,%xmm8,%xmm7 - vpand %xmm15,%xmm6,%xmm6 - vpand 0(%rcx),%xmm9,%xmm9 - vpsrlq $30,%xmm8,%xmm8 - vpand %xmm15,%xmm7,%xmm7 - vpand %xmm15,%xmm8,%xmm8 - vpor 32(%rcx),%xmm9,%xmm9 - - vpsrlq $26,%xmm3,%xmm13 - vpand %xmm15,%xmm3,%xmm3 - vpaddq %xmm13,%xmm4,%xmm4 - - vpsrlq $26,%xmm0,%xmm10 - vpand %xmm15,%xmm0,%xmm0 - vpaddq %xmm10,%xmm11,%xmm1 - - vpsrlq $26,%xmm4,%xmm10 - vpand %xmm15,%xmm4,%xmm4 - - vpsrlq $26,%xmm1,%xmm11 - vpand %xmm15,%xmm1,%xmm1 - vpaddq %xmm11,%xmm2,%xmm2 - - vpaddq %xmm10,%xmm0,%xmm0 - vpsllq $2,%xmm10,%xmm10 - vpaddq %xmm10,%xmm0,%xmm0 - - vpsrlq $26,%xmm2,%xmm12 - vpand %xmm15,%xmm2,%xmm2 - vpaddq %xmm12,%xmm3,%xmm3 - - vpsrlq $26,%xmm0,%xmm10 - vpand %xmm15,%xmm0,%xmm0 - vpaddq %xmm10,%xmm1,%xmm1 - - vpsrlq $26,%xmm3,%xmm13 - vpand %xmm15,%xmm3,%xmm3 - vpaddq %xmm13,%xmm4,%xmm4 - - ja .Loop_avx - -.Lskip_loop_avx: - vpshufd $0x10,%xmm14,%xmm14 - addq $32,%rdx - jnz .Long_tail_avx - - vpaddq %xmm2,%xmm7,%xmm7 - vpaddq %xmm0,%xmm5,%xmm5 - vpaddq %xmm1,%xmm6,%xmm6 - vpaddq %xmm3,%xmm8,%xmm8 - vpaddq %xmm4,%xmm9,%xmm9 - -.Long_tail_avx: - vmovdqa %xmm2,32(%r11) - vmovdqa %xmm0,0(%r11) - vmovdqa %xmm1,16(%r11) - vmovdqa %xmm3,48(%r11) - vmovdqa %xmm4,64(%r11) - - vpmuludq %xmm7,%xmm14,%xmm12 - vpmuludq %xmm5,%xmm14,%xmm10 - vpshufd $0x10,-48(%rdi),%xmm2 - vpmuludq %xmm6,%xmm14,%xmm11 - vpmuludq %xmm8,%xmm14,%xmm13 - vpmuludq %xmm9,%xmm14,%xmm14 - - vpmuludq %xmm8,%xmm2,%xmm0 - vpaddq %xmm0,%xmm14,%xmm14 - vpshufd $0x10,-32(%rdi),%xmm3 - vpmuludq %xmm7,%xmm2,%xmm1 - vpaddq %xmm1,%xmm13,%xmm13 - vpshufd $0x10,-16(%rdi),%xmm4 - vpmuludq %xmm6,%xmm2,%xmm0 - vpaddq %xmm0,%xmm12,%xmm12 - vpmuludq %xmm5,%xmm2,%xmm2 - vpaddq %xmm2,%xmm11,%xmm11 - vpmuludq %xmm9,%xmm3,%xmm3 - vpaddq %xmm3,%xmm10,%xmm10 - - vpshufd $0x10,0(%rdi),%xmm2 - vpmuludq %xmm7,%xmm4,%xmm1 - vpaddq %xmm1,%xmm14,%xmm14 - vpmuludq 
%xmm6,%xmm4,%xmm0 - vpaddq %xmm0,%xmm13,%xmm13 - vpshufd $0x10,16(%rdi),%xmm3 - vpmuludq %xmm5,%xmm4,%xmm4 - vpaddq %xmm4,%xmm12,%xmm12 - vpmuludq %xmm9,%xmm2,%xmm1 - vpaddq %xmm1,%xmm11,%xmm11 - vpshufd $0x10,32(%rdi),%xmm4 - vpmuludq %xmm8,%xmm2,%xmm2 - vpaddq %xmm2,%xmm10,%xmm10 - - vpmuludq %xmm6,%xmm3,%xmm0 - vpaddq %xmm0,%xmm14,%xmm14 - vpmuludq %xmm5,%xmm3,%xmm3 - vpaddq %xmm3,%xmm13,%xmm13 - vpshufd $0x10,48(%rdi),%xmm2 - vpmuludq %xmm9,%xmm4,%xmm1 - vpaddq %xmm1,%xmm12,%xmm12 - vpshufd $0x10,64(%rdi),%xmm3 - vpmuludq %xmm8,%xmm4,%xmm0 - vpaddq %xmm0,%xmm11,%xmm11 - vpmuludq %xmm7,%xmm4,%xmm4 - vpaddq %xmm4,%xmm10,%xmm10 - - vpmuludq %xmm5,%xmm2,%xmm2 - vpaddq %xmm2,%xmm14,%xmm14 - vpmuludq %xmm9,%xmm3,%xmm1 - vpaddq %xmm1,%xmm13,%xmm13 - vpmuludq %xmm8,%xmm3,%xmm0 - vpaddq %xmm0,%xmm12,%xmm12 - vpmuludq %xmm7,%xmm3,%xmm1 - vpaddq %xmm1,%xmm11,%xmm11 - vpmuludq %xmm6,%xmm3,%xmm3 - vpaddq %xmm3,%xmm10,%xmm10 - - jz .Lshort_tail_avx - - vmovdqu 0(%rsi),%xmm0 - vmovdqu 16(%rsi),%xmm1 - - vpsrldq $6,%xmm0,%xmm2 - vpsrldq $6,%xmm1,%xmm3 - vpunpckhqdq %xmm1,%xmm0,%xmm4 - vpunpcklqdq %xmm1,%xmm0,%xmm0 - vpunpcklqdq %xmm3,%xmm2,%xmm3 - - vpsrlq $40,%xmm4,%xmm4 - vpsrlq $26,%xmm0,%xmm1 - vpand %xmm15,%xmm0,%xmm0 - vpsrlq $4,%xmm3,%xmm2 - vpand %xmm15,%xmm1,%xmm1 - vpsrlq $30,%xmm3,%xmm3 - vpand %xmm15,%xmm2,%xmm2 - vpand %xmm15,%xmm3,%xmm3 - vpor 32(%rcx),%xmm4,%xmm4 - - vpshufd $0x32,-64(%rdi),%xmm9 - vpaddq 0(%r11),%xmm0,%xmm0 - vpaddq 16(%r11),%xmm1,%xmm1 - vpaddq 32(%r11),%xmm2,%xmm2 - vpaddq 48(%r11),%xmm3,%xmm3 - vpaddq 64(%r11),%xmm4,%xmm4 - - vpmuludq %xmm0,%xmm9,%xmm5 - vpaddq %xmm5,%xmm10,%xmm10 - vpmuludq %xmm1,%xmm9,%xmm6 - vpaddq %xmm6,%xmm11,%xmm11 - vpmuludq %xmm2,%xmm9,%xmm5 - vpaddq %xmm5,%xmm12,%xmm12 - vpshufd $0x32,-48(%rdi),%xmm7 - vpmuludq %xmm3,%xmm9,%xmm6 - vpaddq %xmm6,%xmm13,%xmm13 - vpmuludq %xmm4,%xmm9,%xmm9 - vpaddq %xmm9,%xmm14,%xmm14 - - vpmuludq %xmm3,%xmm7,%xmm5 - vpaddq %xmm5,%xmm14,%xmm14 - vpshufd $0x32,-32(%rdi),%xmm8 - vpmuludq 
%xmm2,%xmm7,%xmm6 - vpaddq %xmm6,%xmm13,%xmm13 - vpshufd $0x32,-16(%rdi),%xmm9 - vpmuludq %xmm1,%xmm7,%xmm5 - vpaddq %xmm5,%xmm12,%xmm12 - vpmuludq %xmm0,%xmm7,%xmm7 - vpaddq %xmm7,%xmm11,%xmm11 - vpmuludq %xmm4,%xmm8,%xmm8 - vpaddq %xmm8,%xmm10,%xmm10 - - vpshufd $0x32,0(%rdi),%xmm7 - vpmuludq %xmm2,%xmm9,%xmm6 - vpaddq %xmm6,%xmm14,%xmm14 - vpmuludq %xmm1,%xmm9,%xmm5 - vpaddq %xmm5,%xmm13,%xmm13 - vpshufd $0x32,16(%rdi),%xmm8 - vpmuludq %xmm0,%xmm9,%xmm9 - vpaddq %xmm9,%xmm12,%xmm12 - vpmuludq %xmm4,%xmm7,%xmm6 - vpaddq %xmm6,%xmm11,%xmm11 - vpshufd $0x32,32(%rdi),%xmm9 - vpmuludq %xmm3,%xmm7,%xmm7 - vpaddq %xmm7,%xmm10,%xmm10 - - vpmuludq %xmm1,%xmm8,%xmm5 - vpaddq %xmm5,%xmm14,%xmm14 - vpmuludq %xmm0,%xmm8,%xmm8 - vpaddq %xmm8,%xmm13,%xmm13 - vpshufd $0x32,48(%rdi),%xmm7 - vpmuludq %xmm4,%xmm9,%xmm6 - vpaddq %xmm6,%xmm12,%xmm12 - vpshufd $0x32,64(%rdi),%xmm8 - vpmuludq %xmm3,%xmm9,%xmm5 - vpaddq %xmm5,%xmm11,%xmm11 - vpmuludq %xmm2,%xmm9,%xmm9 - vpaddq %xmm9,%xmm10,%xmm10 - - vpmuludq %xmm0,%xmm7,%xmm7 - vpaddq %xmm7,%xmm14,%xmm14 - vpmuludq %xmm4,%xmm8,%xmm6 - vpaddq %xmm6,%xmm13,%xmm13 - vpmuludq %xmm3,%xmm8,%xmm5 - vpaddq %xmm5,%xmm12,%xmm12 - vpmuludq %xmm2,%xmm8,%xmm6 - vpaddq %xmm6,%xmm11,%xmm11 - vpmuludq %xmm1,%xmm8,%xmm8 - vpaddq %xmm8,%xmm10,%xmm10 - -.Lshort_tail_avx: - - vpsrldq $8,%xmm14,%xmm9 - vpsrldq $8,%xmm13,%xmm8 - vpsrldq $8,%xmm11,%xmm6 - vpsrldq $8,%xmm10,%xmm5 - vpsrldq $8,%xmm12,%xmm7 - vpaddq %xmm8,%xmm13,%xmm13 - vpaddq %xmm9,%xmm14,%xmm14 - vpaddq %xmm5,%xmm10,%xmm10 - vpaddq %xmm6,%xmm11,%xmm11 - vpaddq %xmm7,%xmm12,%xmm12 - - vpsrlq $26,%xmm13,%xmm3 - vpand %xmm15,%xmm13,%xmm13 - vpaddq %xmm3,%xmm14,%xmm14 - - vpsrlq $26,%xmm10,%xmm0 - vpand %xmm15,%xmm10,%xmm10 - vpaddq %xmm0,%xmm11,%xmm11 - - vpsrlq $26,%xmm14,%xmm4 - vpand %xmm15,%xmm14,%xmm14 - - vpsrlq $26,%xmm11,%xmm1 - vpand %xmm15,%xmm11,%xmm11 - vpaddq %xmm1,%xmm12,%xmm12 - - vpaddq %xmm4,%xmm10,%xmm10 - vpsllq $2,%xmm4,%xmm4 - vpaddq %xmm4,%xmm10,%xmm10 - - vpsrlq 
$26,%xmm12,%xmm2 - vpand %xmm15,%xmm12,%xmm12 - vpaddq %xmm2,%xmm13,%xmm13 - - vpsrlq $26,%xmm10,%xmm0 - vpand %xmm15,%xmm10,%xmm10 - vpaddq %xmm0,%xmm11,%xmm11 - - vpsrlq $26,%xmm13,%xmm3 - vpand %xmm15,%xmm13,%xmm13 - vpaddq %xmm3,%xmm14,%xmm14 - - vmovd %xmm10,-112(%rdi) - vmovd %xmm11,-108(%rdi) - vmovd %xmm12,-104(%rdi) - vmovd %xmm13,-100(%rdi) - vmovd %xmm14,-96(%rdi) - leaq -8(%r10),%rsp - - vzeroupper - ret -ENDPROC(poly1305_blocks_avx) - -.align 32 -ENTRY(poly1305_emit_avx) - cmpl $0,20(%rdi) - je .Lemit - - movl 0(%rdi),%eax - movl 4(%rdi),%ecx - movl 8(%rdi),%r8d - movl 12(%rdi),%r11d - movl 16(%rdi),%r10d - - shlq $26,%rcx - movq %r8,%r9 - shlq $52,%r8 - addq %rcx,%rax - shrq $12,%r9 - addq %rax,%r8 - adcq $0,%r9 - - shlq $14,%r11 - movq %r10,%rax - shrq $24,%r10 - addq %r11,%r9 - shlq $40,%rax - addq %rax,%r9 - adcq $0,%r10 - - movq %r10,%rax - movq %r10,%rcx - andq $3,%r10 - shrq $2,%rax - andq $-4,%rcx - addq %rcx,%rax - addq %rax,%r8 - adcq $0,%r9 - adcq $0,%r10 - - movq %r8,%rax - addq $5,%r8 - movq %r9,%rcx - adcq $0,%r9 - adcq $0,%r10 - shrq $2,%r10 - cmovnzq %r8,%rax - cmovnzq %r9,%rcx - - addq 0(%rdx),%rax - adcq 8(%rdx),%rcx - movq %rax,0(%rsi) - movq %rcx,8(%rsi) - - ret -ENDPROC(poly1305_emit_avx) -#endif /* CONFIG_AS_AVX */ - -#ifdef CONFIG_AS_AVX2 -.align 32 -ENTRY(poly1305_blocks_avx2) - - movl 20(%rdi),%r8d - cmpq $128,%rdx - jae .Lblocks_avx2 - testl %r8d,%r8d - jz .Lblocks - -.Lblocks_avx2: - andq $-16,%rdx - jz .Lno_data_avx2 - - vzeroupper - - testl %r8d,%r8d - jz .Lbase2_64_avx2 - - testq $63,%rdx - jz .Leven_avx2 - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rdi - -.Lblocks_avx2_body: - - movq %rdx,%r15 - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movl 16(%rdi),%r10d - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - - movl %r8d,%r14d - andq $-2147483648,%r8 - movq %r9,%r12 - movl %r9d,%ebx - andq $-2147483648,%r9 - - shrq $6,%r8 - shlq $52,%r12 - addq %r8,%r14 - shrq $12,%rbx - shrq $18,%r9 - addq 
%r12,%r14 - adcq %r9,%rbx - - movq %r10,%r8 - shlq $40,%r8 - shrq $24,%r10 - addq %r8,%rbx - adcq $0,%r10 - - movq $-4,%r9 - movq %r10,%r8 - andq %r10,%r9 - shrq $2,%r8 - andq $3,%r10 - addq %r9,%r8 - addq %r8,%r14 - adcq $0,%rbx - adcq $0,%r10 - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - -.Lbase2_26_pre_avx2: - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%r10 - subq $16,%r15 - - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - movq %r12,%rax - - testq $63,%r15 - jnz .Lbase2_26_pre_avx2 - - testq %rcx,%rcx - jz .Lstore_base2_64_avx2 - - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r11 - movq %rbx,%r12 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq $12,%r11 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r11,%r14 - shlq $24,%r10 - andq $0x3ffffff,%r14 - shrq $40,%r12 - andq $0x3ffffff,%rbx - orq %r12,%r10 - - testq %r15,%r15 - jz .Lstore_base2_26_avx2 - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - vmovd %ebx,%xmm3 - vmovd %r10d,%xmm4 - jmp .Lproceed_avx2 - -.align 32 -.Lstore_base2_64_avx2: - movq %r14,0(%rdi) - movq %rbx,8(%rdi) - movq %r10,16(%rdi) - jmp .Ldone_avx2 - -.align 16 -.Lstore_base2_26_avx2: - movl %eax,0(%rdi) - movl %edx,4(%rdi) - movl %r14d,8(%rdi) - movl %ebx,12(%rdi) - movl %r10d,16(%rdi) -.align 16 -.Ldone_avx2: - movq 8(%rsp),%r15 - movq 16(%rsp),%r14 - movq 24(%rsp),%r13 - movq 32(%rsp),%r12 - movq 40(%rsp),%rbx - leaq 48(%rsp),%rsp - -.Lno_data_avx2: -.Lblocks_avx2_epilogue: - ret - - -.align 32 -.Lbase2_64_avx2: - - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rdi - -.Lbase2_64_avx2_body: - - movq %rdx,%r15 - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - movq 0(%rdi),%r14 - movq 8(%rdi),%rbx - movl 16(%rdi),%r10d - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - - testq $63,%rdx - jz .Linit_avx2 - -.Lbase2_64_pre_avx2: - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%r10 - subq 
$16,%r15 - - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - movq %r12,%rax - - testq $63,%r15 - jnz .Lbase2_64_pre_avx2 - -.Linit_avx2: - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r8 - movq %rbx,%r9 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq $12,%r8 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r8,%r14 - shlq $24,%r10 - andq $0x3ffffff,%r14 - shrq $40,%r9 - andq $0x3ffffff,%rbx - orq %r9,%r10 - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - vmovd %ebx,%xmm3 - vmovd %r10d,%xmm4 - movl $1,20(%rdi) - - __poly1305_init_avx - -.Lproceed_avx2: - movq %r15,%rdx - - movq 8(%rsp),%r15 - movq 16(%rsp),%r14 - movq 24(%rsp),%r13 - movq 32(%rsp),%r12 - movq 40(%rsp),%rbx - leaq 48(%rsp),%rax - leaq 48(%rsp),%rsp - -.Lbase2_64_avx2_epilogue: - jmp .Ldo_avx2 - - -.align 32 -.Leven_avx2: - - vmovd 0(%rdi),%xmm0 - vmovd 4(%rdi),%xmm1 - vmovd 8(%rdi),%xmm2 - vmovd 12(%rdi),%xmm3 - vmovd 16(%rdi),%xmm4 - -.Ldo_avx2: - leaq 8(%rsp),%r10 - subq $0x128,%rsp - leaq .Lconst(%rip),%rcx - leaq 48+64(%rdi),%rdi - vmovdqa 96(%rcx),%ymm7 - - - vmovdqu -64(%rdi),%xmm9 - andq $-512,%rsp - vmovdqu -48(%rdi),%xmm10 - vmovdqu -32(%rdi),%xmm6 - vmovdqu -16(%rdi),%xmm11 - vmovdqu 0(%rdi),%xmm12 - vmovdqu 16(%rdi),%xmm13 - leaq 144(%rsp),%rax - vmovdqu 32(%rdi),%xmm14 - vpermd %ymm9,%ymm7,%ymm9 - vmovdqu 48(%rdi),%xmm15 - vpermd %ymm10,%ymm7,%ymm10 - vmovdqu 64(%rdi),%xmm5 - vpermd %ymm6,%ymm7,%ymm6 - vmovdqa %ymm9,0(%rsp) - vpermd %ymm11,%ymm7,%ymm11 - vmovdqa %ymm10,32-144(%rax) - vpermd %ymm12,%ymm7,%ymm12 - vmovdqa %ymm6,64-144(%rax) - vpermd %ymm13,%ymm7,%ymm13 - vmovdqa %ymm11,96-144(%rax) - vpermd %ymm14,%ymm7,%ymm14 - vmovdqa %ymm12,128-144(%rax) - vpermd %ymm15,%ymm7,%ymm15 - vmovdqa %ymm13,160-144(%rax) - vpermd %ymm5,%ymm7,%ymm5 - vmovdqa %ymm14,192-144(%rax) - vmovdqa %ymm15,224-144(%rax) - vmovdqa %ymm5,256-144(%rax) - vmovdqa 64(%rcx),%ymm5 - - - - vmovdqu 0(%rsi),%xmm7 - vmovdqu 16(%rsi),%xmm8 - vinserti128 $1,32(%rsi),%ymm7,%ymm7 - 
vinserti128 $1,48(%rsi),%ymm8,%ymm8 - leaq 64(%rsi),%rsi - - vpsrldq $6,%ymm7,%ymm9 - vpsrldq $6,%ymm8,%ymm10 - vpunpckhqdq %ymm8,%ymm7,%ymm6 - vpunpcklqdq %ymm10,%ymm9,%ymm9 - vpunpcklqdq %ymm8,%ymm7,%ymm7 - - vpsrlq $30,%ymm9,%ymm10 - vpsrlq $4,%ymm9,%ymm9 - vpsrlq $26,%ymm7,%ymm8 - vpsrlq $40,%ymm6,%ymm6 - vpand %ymm5,%ymm9,%ymm9 - vpand %ymm5,%ymm7,%ymm7 - vpand %ymm5,%ymm8,%ymm8 - vpand %ymm5,%ymm10,%ymm10 - vpor 32(%rcx),%ymm6,%ymm6 - - vpaddq %ymm2,%ymm9,%ymm2 - subq $64,%rdx - jz .Ltail_avx2 - jmp .Loop_avx2 - -.align 32 -.Loop_avx2: - - vpaddq %ymm0,%ymm7,%ymm0 - vmovdqa 0(%rsp),%ymm7 - vpaddq %ymm1,%ymm8,%ymm1 - vmovdqa 32(%rsp),%ymm8 - vpaddq %ymm3,%ymm10,%ymm3 - vmovdqa 96(%rsp),%ymm9 - vpaddq %ymm4,%ymm6,%ymm4 - vmovdqa 48(%rax),%ymm10 - vmovdqa 112(%rax),%ymm5 - - vpmuludq %ymm2,%ymm7,%ymm13 - vpmuludq %ymm2,%ymm8,%ymm14 - vpmuludq %ymm2,%ymm9,%ymm15 - vpmuludq %ymm2,%ymm10,%ymm11 - vpmuludq %ymm2,%ymm5,%ymm12 - - vpmuludq %ymm0,%ymm8,%ymm6 - vpmuludq %ymm1,%ymm8,%ymm2 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq 64(%rsp),%ymm4,%ymm2 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm11,%ymm11 - vmovdqa -16(%rax),%ymm8 - - vpmuludq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm1,%ymm7,%ymm2 - vpaddq %ymm6,%ymm11,%ymm11 - vpaddq %ymm2,%ymm12,%ymm12 - vpmuludq %ymm3,%ymm7,%ymm6 - vpmuludq %ymm4,%ymm7,%ymm2 - vmovdqu 0(%rsi),%xmm7 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm2,%ymm15,%ymm15 - vinserti128 $1,32(%rsi),%ymm7,%ymm7 - - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq %ymm4,%ymm8,%ymm2 - vmovdqu 16(%rsi),%xmm8 - vpaddq %ymm6,%ymm11,%ymm11 - vpaddq %ymm2,%ymm12,%ymm12 - vmovdqa 16(%rax),%ymm2 - vpmuludq %ymm1,%ymm9,%ymm6 - vpmuludq %ymm0,%ymm9,%ymm9 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm9,%ymm13,%ymm13 - vinserti128 $1,48(%rsi),%ymm8,%ymm8 - leaq 64(%rsi),%rsi - - vpmuludq %ymm1,%ymm2,%ymm6 - vpmuludq %ymm0,%ymm2,%ymm2 - vpsrldq $6,%ymm7,%ymm9 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm14,%ymm14 - vpmuludq 
%ymm3,%ymm10,%ymm6 - vpmuludq %ymm4,%ymm10,%ymm2 - vpsrldq $6,%ymm8,%ymm10 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - vpunpckhqdq %ymm8,%ymm7,%ymm6 - - vpmuludq %ymm3,%ymm5,%ymm3 - vpmuludq %ymm4,%ymm5,%ymm4 - vpunpcklqdq %ymm8,%ymm7,%ymm7 - vpaddq %ymm3,%ymm13,%ymm2 - vpaddq %ymm4,%ymm14,%ymm3 - vpunpcklqdq %ymm10,%ymm9,%ymm10 - vpmuludq 80(%rax),%ymm0,%ymm4 - vpmuludq %ymm1,%ymm5,%ymm0 - vmovdqa 64(%rcx),%ymm5 - vpaddq %ymm4,%ymm15,%ymm4 - vpaddq %ymm0,%ymm11,%ymm0 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm12,%ymm1 - - vpsrlq $26,%ymm4,%ymm15 - vpand %ymm5,%ymm4,%ymm4 - - vpsrlq $4,%ymm10,%ymm9 - - vpsrlq $26,%ymm1,%ymm12 - vpand %ymm5,%ymm1,%ymm1 - vpaddq %ymm12,%ymm2,%ymm2 - - vpaddq %ymm15,%ymm0,%ymm0 - vpsllq $2,%ymm15,%ymm15 - vpaddq %ymm15,%ymm0,%ymm0 - - vpand %ymm5,%ymm9,%ymm9 - vpsrlq $26,%ymm7,%ymm8 - - vpsrlq $26,%ymm2,%ymm13 - vpand %ymm5,%ymm2,%ymm2 - vpaddq %ymm13,%ymm3,%ymm3 - - vpaddq %ymm9,%ymm2,%ymm2 - vpsrlq $30,%ymm10,%ymm10 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm1,%ymm1 - - vpsrlq $40,%ymm6,%ymm6 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vpand %ymm5,%ymm7,%ymm7 - vpand %ymm5,%ymm8,%ymm8 - vpand %ymm5,%ymm10,%ymm10 - vpor 32(%rcx),%ymm6,%ymm6 - - subq $64,%rdx - jnz .Loop_avx2 - -.byte 0x66,0x90 -.Ltail_avx2: - - vpaddq %ymm0,%ymm7,%ymm0 - vmovdqu 4(%rsp),%ymm7 - vpaddq %ymm1,%ymm8,%ymm1 - vmovdqu 36(%rsp),%ymm8 - vpaddq %ymm3,%ymm10,%ymm3 - vmovdqu 100(%rsp),%ymm9 - vpaddq %ymm4,%ymm6,%ymm4 - vmovdqu 52(%rax),%ymm10 - vmovdqu 116(%rax),%ymm5 - - vpmuludq %ymm2,%ymm7,%ymm13 - vpmuludq %ymm2,%ymm8,%ymm14 - vpmuludq %ymm2,%ymm9,%ymm15 - vpmuludq %ymm2,%ymm10,%ymm11 - vpmuludq %ymm2,%ymm5,%ymm12 - - vpmuludq %ymm0,%ymm8,%ymm6 - vpmuludq %ymm1,%ymm8,%ymm2 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - vpmuludq %ymm3,%ymm8,%ymm6 
- vpmuludq 68(%rsp),%ymm4,%ymm2 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm11,%ymm11 - - vpmuludq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm1,%ymm7,%ymm2 - vpaddq %ymm6,%ymm11,%ymm11 - vmovdqu -12(%rax),%ymm8 - vpaddq %ymm2,%ymm12,%ymm12 - vpmuludq %ymm3,%ymm7,%ymm6 - vpmuludq %ymm4,%ymm7,%ymm2 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm2,%ymm15,%ymm15 - - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq %ymm4,%ymm8,%ymm2 - vpaddq %ymm6,%ymm11,%ymm11 - vpaddq %ymm2,%ymm12,%ymm12 - vmovdqu 20(%rax),%ymm2 - vpmuludq %ymm1,%ymm9,%ymm6 - vpmuludq %ymm0,%ymm9,%ymm9 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm9,%ymm13,%ymm13 - - vpmuludq %ymm1,%ymm2,%ymm6 - vpmuludq %ymm0,%ymm2,%ymm2 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm14,%ymm14 - vpmuludq %ymm3,%ymm10,%ymm6 - vpmuludq %ymm4,%ymm10,%ymm2 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - - vpmuludq %ymm3,%ymm5,%ymm3 - vpmuludq %ymm4,%ymm5,%ymm4 - vpaddq %ymm3,%ymm13,%ymm2 - vpaddq %ymm4,%ymm14,%ymm3 - vpmuludq 84(%rax),%ymm0,%ymm4 - vpmuludq %ymm1,%ymm5,%ymm0 - vmovdqa 64(%rcx),%ymm5 - vpaddq %ymm4,%ymm15,%ymm4 - vpaddq %ymm0,%ymm11,%ymm0 - - vpsrldq $8,%ymm12,%ymm8 - vpsrldq $8,%ymm2,%ymm9 - vpsrldq $8,%ymm3,%ymm10 - vpsrldq $8,%ymm4,%ymm6 - vpsrldq $8,%ymm0,%ymm7 - vpaddq %ymm8,%ymm12,%ymm12 - vpaddq %ymm9,%ymm2,%ymm2 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm6,%ymm4,%ymm4 - vpaddq %ymm7,%ymm0,%ymm0 - - vpermq $0x2,%ymm3,%ymm10 - vpermq $0x2,%ymm4,%ymm6 - vpermq $0x2,%ymm0,%ymm7 - vpermq $0x2,%ymm12,%ymm8 - vpermq $0x2,%ymm2,%ymm9 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm6,%ymm4,%ymm4 - vpaddq %ymm7,%ymm0,%ymm0 - vpaddq %ymm8,%ymm12,%ymm12 - vpaddq %ymm9,%ymm2,%ymm2 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm12,%ymm1 - - vpsrlq $26,%ymm4,%ymm15 - vpand %ymm5,%ymm4,%ymm4 - - vpsrlq $26,%ymm1,%ymm12 - vpand %ymm5,%ymm1,%ymm1 - vpaddq %ymm12,%ymm2,%ymm2 - - vpaddq %ymm15,%ymm0,%ymm0 - vpsllq $2,%ymm15,%ymm15 
- vpaddq %ymm15,%ymm0,%ymm0 - - vpsrlq $26,%ymm2,%ymm13 - vpand %ymm5,%ymm2,%ymm2 - vpaddq %ymm13,%ymm3,%ymm3 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm1,%ymm1 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vmovd %xmm0,-112(%rdi) - vmovd %xmm1,-108(%rdi) - vmovd %xmm2,-104(%rdi) - vmovd %xmm3,-100(%rdi) - vmovd %xmm4,-96(%rdi) - leaq -8(%r10),%rsp - - vzeroupper - ret - -ENDPROC(poly1305_blocks_avx2) -#endif /* CONFIG_AS_AVX2 */ - -#ifdef CONFIG_AS_AVX512 -.align 32 -ENTRY(poly1305_blocks_avx512) - - movl 20(%rdi),%r8d - cmpq $128,%rdx - jae .Lblocks_avx2_512 - testl %r8d,%r8d - jz .Lblocks - -.Lblocks_avx2_512: - andq $-16,%rdx - jz .Lno_data_avx2_512 - - vzeroupper - - testl %r8d,%r8d - jz .Lbase2_64_avx2_512 - - testq $63,%rdx - jz .Leven_avx2_512 - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rdi - -.Lblocks_avx2_body_512: - - movq %rdx,%r15 - - movq 0(%rdi),%r8 - movq 8(%rdi),%r9 - movl 16(%rdi),%r10d - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - - movl %r8d,%r14d - andq $-2147483648,%r8 - movq %r9,%r12 - movl %r9d,%ebx - andq $-2147483648,%r9 - - shrq $6,%r8 - shlq $52,%r12 - addq %r8,%r14 - shrq $12,%rbx - shrq $18,%r9 - addq %r12,%r14 - adcq %r9,%rbx - - movq %r10,%r8 - shlq $40,%r8 - shrq $24,%r10 - addq %r8,%rbx - adcq $0,%r10 - - movq $-4,%r9 - movq %r10,%r8 - andq %r10,%r9 - shrq $2,%r8 - andq $3,%r10 - addq %r9,%r8 - addq %r8,%r14 - adcq $0,%rbx - adcq $0,%r10 - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - -.Lbase2_26_pre_avx2_512: - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%r10 - subq $16,%r15 - - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - movq %r12,%rax - - testq $63,%r15 - jnz .Lbase2_26_pre_avx2_512 - - testq %rcx,%rcx - jz .Lstore_base2_64_avx2_512 - - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r11 - movq %rbx,%r12 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq 
$12,%r11 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r11,%r14 - shlq $24,%r10 - andq $0x3ffffff,%r14 - shrq $40,%r12 - andq $0x3ffffff,%rbx - orq %r12,%r10 - - testq %r15,%r15 - jz .Lstore_base2_26_avx2_512 - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - vmovd %ebx,%xmm3 - vmovd %r10d,%xmm4 - jmp .Lproceed_avx2_512 - -.align 32 -.Lstore_base2_64_avx2_512: - movq %r14,0(%rdi) - movq %rbx,8(%rdi) - movq %r10,16(%rdi) - jmp .Ldone_avx2_512 - -.align 16 -.Lstore_base2_26_avx2_512: - movl %eax,0(%rdi) - movl %edx,4(%rdi) - movl %r14d,8(%rdi) - movl %ebx,12(%rdi) - movl %r10d,16(%rdi) -.align 16 -.Ldone_avx2_512: - movq 8(%rsp),%r15 - movq 16(%rsp),%r14 - movq 24(%rsp),%r13 - movq 32(%rsp),%r12 - movq 40(%rsp),%rbx - leaq 48(%rsp),%rsp - -.Lno_data_avx2_512: -.Lblocks_avx2_epilogue_512: - ret - - -.align 32 -.Lbase2_64_avx2_512: - - pushq %rbx - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - pushq %rdi - -.Lbase2_64_avx2_body_512: - - movq %rdx,%r15 - - movq 24(%rdi),%r11 - movq 32(%rdi),%r13 - - movq 0(%rdi),%r14 - movq 8(%rdi),%rbx - movl 16(%rdi),%r10d - - movq %r13,%r12 - movq %r13,%rax - shrq $2,%r13 - addq %r12,%r13 - - testq $63,%rdx - jz .Linit_avx2_512 - -.Lbase2_64_pre_avx2_512: - addq 0(%rsi),%r14 - adcq 8(%rsi),%rbx - leaq 16(%rsi),%rsi - adcq %rcx,%r10 - subq $16,%r15 - - movq %rdi,0(%rsp) - __poly1305_block - movq 0(%rsp),%rdi - movq %r12,%rax - - testq $63,%r15 - jnz .Lbase2_64_pre_avx2_512 - -.Linit_avx2_512: - - movq %r14,%rax - movq %r14,%rdx - shrq $52,%r14 - movq %rbx,%r8 - movq %rbx,%r9 - shrq $26,%rdx - andq $0x3ffffff,%rax - shlq $12,%r8 - andq $0x3ffffff,%rdx - shrq $14,%rbx - orq %r8,%r14 - shlq $24,%r10 - andq $0x3ffffff,%r14 - shrq $40,%r9 - andq $0x3ffffff,%rbx - orq %r9,%r10 - - vmovd %eax,%xmm0 - vmovd %edx,%xmm1 - vmovd %r14d,%xmm2 - vmovd %ebx,%xmm3 - vmovd %r10d,%xmm4 - movl $1,20(%rdi) - - __poly1305_init_avx - -.Lproceed_avx2_512: - movq %r15,%rdx - - movq 8(%rsp),%r15 - movq 16(%rsp),%r14 - movq 24(%rsp),%r13 - 
movq 32(%rsp),%r12 - movq 40(%rsp),%rbx - leaq 48(%rsp),%rax - leaq 48(%rsp),%rsp - -.Lbase2_64_avx2_epilogue_512: - jmp .Ldo_avx2_512 - - -.align 32 -.Leven_avx2_512: - - vmovd 0(%rdi),%xmm0 - vmovd 4(%rdi),%xmm1 - vmovd 8(%rdi),%xmm2 - vmovd 12(%rdi),%xmm3 - vmovd 16(%rdi),%xmm4 - -.Ldo_avx2_512: - cmpq $512,%rdx - jae .Lblocks_avx512 -.Lskip_avx512: - leaq 8(%rsp),%r10 - - subq $0x128,%rsp - leaq .Lconst(%rip),%rcx - leaq 48+64(%rdi),%rdi - vmovdqa 96(%rcx),%ymm7 - - - vmovdqu -64(%rdi),%xmm9 - andq $-512,%rsp - vmovdqu -48(%rdi),%xmm10 - vmovdqu -32(%rdi),%xmm6 - vmovdqu -16(%rdi),%xmm11 - vmovdqu 0(%rdi),%xmm12 - vmovdqu 16(%rdi),%xmm13 - leaq 144(%rsp),%rax - vmovdqu 32(%rdi),%xmm14 - vpermd %ymm9,%ymm7,%ymm9 - vmovdqu 48(%rdi),%xmm15 - vpermd %ymm10,%ymm7,%ymm10 - vmovdqu 64(%rdi),%xmm5 - vpermd %ymm6,%ymm7,%ymm6 - vmovdqa %ymm9,0(%rsp) - vpermd %ymm11,%ymm7,%ymm11 - vmovdqa %ymm10,32-144(%rax) - vpermd %ymm12,%ymm7,%ymm12 - vmovdqa %ymm6,64-144(%rax) - vpermd %ymm13,%ymm7,%ymm13 - vmovdqa %ymm11,96-144(%rax) - vpermd %ymm14,%ymm7,%ymm14 - vmovdqa %ymm12,128-144(%rax) - vpermd %ymm15,%ymm7,%ymm15 - vmovdqa %ymm13,160-144(%rax) - vpermd %ymm5,%ymm7,%ymm5 - vmovdqa %ymm14,192-144(%rax) - vmovdqa %ymm15,224-144(%rax) - vmovdqa %ymm5,256-144(%rax) - vmovdqa 64(%rcx),%ymm5 - - - - vmovdqu 0(%rsi),%xmm7 - vmovdqu 16(%rsi),%xmm8 - vinserti128 $1,32(%rsi),%ymm7,%ymm7 - vinserti128 $1,48(%rsi),%ymm8,%ymm8 - leaq 64(%rsi),%rsi - - vpsrldq $6,%ymm7,%ymm9 - vpsrldq $6,%ymm8,%ymm10 - vpunpckhqdq %ymm8,%ymm7,%ymm6 - vpunpcklqdq %ymm10,%ymm9,%ymm9 - vpunpcklqdq %ymm8,%ymm7,%ymm7 - - vpsrlq $30,%ymm9,%ymm10 - vpsrlq $4,%ymm9,%ymm9 - vpsrlq $26,%ymm7,%ymm8 - vpsrlq $40,%ymm6,%ymm6 - vpand %ymm5,%ymm9,%ymm9 - vpand %ymm5,%ymm7,%ymm7 - vpand %ymm5,%ymm8,%ymm8 - vpand %ymm5,%ymm10,%ymm10 - vpor 32(%rcx),%ymm6,%ymm6 - - vpaddq %ymm2,%ymm9,%ymm2 - subq $64,%rdx - jz .Ltail_avx2_512 - jmp .Loop_avx2_512 - -.align 32 -.Loop_avx2_512: - - vpaddq %ymm0,%ymm7,%ymm0 - vmovdqa 
0(%rsp),%ymm7 - vpaddq %ymm1,%ymm8,%ymm1 - vmovdqa 32(%rsp),%ymm8 - vpaddq %ymm3,%ymm10,%ymm3 - vmovdqa 96(%rsp),%ymm9 - vpaddq %ymm4,%ymm6,%ymm4 - vmovdqa 48(%rax),%ymm10 - vmovdqa 112(%rax),%ymm5 - - vpmuludq %ymm2,%ymm7,%ymm13 - vpmuludq %ymm2,%ymm8,%ymm14 - vpmuludq %ymm2,%ymm9,%ymm15 - vpmuludq %ymm2,%ymm10,%ymm11 - vpmuludq %ymm2,%ymm5,%ymm12 - - vpmuludq %ymm0,%ymm8,%ymm6 - vpmuludq %ymm1,%ymm8,%ymm2 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq 64(%rsp),%ymm4,%ymm2 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm11,%ymm11 - vmovdqa -16(%rax),%ymm8 - - vpmuludq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm1,%ymm7,%ymm2 - vpaddq %ymm6,%ymm11,%ymm11 - vpaddq %ymm2,%ymm12,%ymm12 - vpmuludq %ymm3,%ymm7,%ymm6 - vpmuludq %ymm4,%ymm7,%ymm2 - vmovdqu 0(%rsi),%xmm7 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm2,%ymm15,%ymm15 - vinserti128 $1,32(%rsi),%ymm7,%ymm7 - - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq %ymm4,%ymm8,%ymm2 - vmovdqu 16(%rsi),%xmm8 - vpaddq %ymm6,%ymm11,%ymm11 - vpaddq %ymm2,%ymm12,%ymm12 - vmovdqa 16(%rax),%ymm2 - vpmuludq %ymm1,%ymm9,%ymm6 - vpmuludq %ymm0,%ymm9,%ymm9 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm9,%ymm13,%ymm13 - vinserti128 $1,48(%rsi),%ymm8,%ymm8 - leaq 64(%rsi),%rsi - - vpmuludq %ymm1,%ymm2,%ymm6 - vpmuludq %ymm0,%ymm2,%ymm2 - vpsrldq $6,%ymm7,%ymm9 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm14,%ymm14 - vpmuludq %ymm3,%ymm10,%ymm6 - vpmuludq %ymm4,%ymm10,%ymm2 - vpsrldq $6,%ymm8,%ymm10 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - vpunpckhqdq %ymm8,%ymm7,%ymm6 - - vpmuludq %ymm3,%ymm5,%ymm3 - vpmuludq %ymm4,%ymm5,%ymm4 - vpunpcklqdq %ymm8,%ymm7,%ymm7 - vpaddq %ymm3,%ymm13,%ymm2 - vpaddq %ymm4,%ymm14,%ymm3 - vpunpcklqdq %ymm10,%ymm9,%ymm10 - vpmuludq 80(%rax),%ymm0,%ymm4 - vpmuludq %ymm1,%ymm5,%ymm0 - vmovdqa 64(%rcx),%ymm5 - vpaddq %ymm4,%ymm15,%ymm4 - vpaddq %ymm0,%ymm11,%ymm0 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vpsrlq 
$26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm12,%ymm1 - - vpsrlq $26,%ymm4,%ymm15 - vpand %ymm5,%ymm4,%ymm4 - - vpsrlq $4,%ymm10,%ymm9 - - vpsrlq $26,%ymm1,%ymm12 - vpand %ymm5,%ymm1,%ymm1 - vpaddq %ymm12,%ymm2,%ymm2 - - vpaddq %ymm15,%ymm0,%ymm0 - vpsllq $2,%ymm15,%ymm15 - vpaddq %ymm15,%ymm0,%ymm0 - - vpand %ymm5,%ymm9,%ymm9 - vpsrlq $26,%ymm7,%ymm8 - - vpsrlq $26,%ymm2,%ymm13 - vpand %ymm5,%ymm2,%ymm2 - vpaddq %ymm13,%ymm3,%ymm3 - - vpaddq %ymm9,%ymm2,%ymm2 - vpsrlq $30,%ymm10,%ymm10 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm1,%ymm1 - - vpsrlq $40,%ymm6,%ymm6 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vpand %ymm5,%ymm7,%ymm7 - vpand %ymm5,%ymm8,%ymm8 - vpand %ymm5,%ymm10,%ymm10 - vpor 32(%rcx),%ymm6,%ymm6 - - subq $64,%rdx - jnz .Loop_avx2_512 - -.byte 0x66,0x90 -.Ltail_avx2_512: - - vpaddq %ymm0,%ymm7,%ymm0 - vmovdqu 4(%rsp),%ymm7 - vpaddq %ymm1,%ymm8,%ymm1 - vmovdqu 36(%rsp),%ymm8 - vpaddq %ymm3,%ymm10,%ymm3 - vmovdqu 100(%rsp),%ymm9 - vpaddq %ymm4,%ymm6,%ymm4 - vmovdqu 52(%rax),%ymm10 - vmovdqu 116(%rax),%ymm5 - - vpmuludq %ymm2,%ymm7,%ymm13 - vpmuludq %ymm2,%ymm8,%ymm14 - vpmuludq %ymm2,%ymm9,%ymm15 - vpmuludq %ymm2,%ymm10,%ymm11 - vpmuludq %ymm2,%ymm5,%ymm12 - - vpmuludq %ymm0,%ymm8,%ymm6 - vpmuludq %ymm1,%ymm8,%ymm2 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq 68(%rsp),%ymm4,%ymm2 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm11,%ymm11 - - vpmuludq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm1,%ymm7,%ymm2 - vpaddq %ymm6,%ymm11,%ymm11 - vmovdqu -12(%rax),%ymm8 - vpaddq %ymm2,%ymm12,%ymm12 - vpmuludq %ymm3,%ymm7,%ymm6 - vpmuludq %ymm4,%ymm7,%ymm2 - vpaddq %ymm6,%ymm14,%ymm14 - vpaddq %ymm2,%ymm15,%ymm15 - - vpmuludq %ymm3,%ymm8,%ymm6 - vpmuludq %ymm4,%ymm8,%ymm2 - vpaddq %ymm6,%ymm11,%ymm11 - vpaddq %ymm2,%ymm12,%ymm12 - vmovdqu 20(%rax),%ymm2 - vpmuludq %ymm1,%ymm9,%ymm6 - vpmuludq %ymm0,%ymm9,%ymm9 - vpaddq 
%ymm6,%ymm14,%ymm14 - vpaddq %ymm9,%ymm13,%ymm13 - - vpmuludq %ymm1,%ymm2,%ymm6 - vpmuludq %ymm0,%ymm2,%ymm2 - vpaddq %ymm6,%ymm15,%ymm15 - vpaddq %ymm2,%ymm14,%ymm14 - vpmuludq %ymm3,%ymm10,%ymm6 - vpmuludq %ymm4,%ymm10,%ymm2 - vpaddq %ymm6,%ymm12,%ymm12 - vpaddq %ymm2,%ymm13,%ymm13 - - vpmuludq %ymm3,%ymm5,%ymm3 - vpmuludq %ymm4,%ymm5,%ymm4 - vpaddq %ymm3,%ymm13,%ymm2 - vpaddq %ymm4,%ymm14,%ymm3 - vpmuludq 84(%rax),%ymm0,%ymm4 - vpmuludq %ymm1,%ymm5,%ymm0 - vmovdqa 64(%rcx),%ymm5 - vpaddq %ymm4,%ymm15,%ymm4 - vpaddq %ymm0,%ymm11,%ymm0 - - vpsrldq $8,%ymm12,%ymm8 - vpsrldq $8,%ymm2,%ymm9 - vpsrldq $8,%ymm3,%ymm10 - vpsrldq $8,%ymm4,%ymm6 - vpsrldq $8,%ymm0,%ymm7 - vpaddq %ymm8,%ymm12,%ymm12 - vpaddq %ymm9,%ymm2,%ymm2 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm6,%ymm4,%ymm4 - vpaddq %ymm7,%ymm0,%ymm0 - - vpermq $0x2,%ymm3,%ymm10 - vpermq $0x2,%ymm4,%ymm6 - vpermq $0x2,%ymm0,%ymm7 - vpermq $0x2,%ymm12,%ymm8 - vpermq $0x2,%ymm2,%ymm9 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm6,%ymm4,%ymm4 - vpaddq %ymm7,%ymm0,%ymm0 - vpaddq %ymm8,%ymm12,%ymm12 - vpaddq %ymm9,%ymm2,%ymm2 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm12,%ymm1 - - vpsrlq $26,%ymm4,%ymm15 - vpand %ymm5,%ymm4,%ymm4 - - vpsrlq $26,%ymm1,%ymm12 - vpand %ymm5,%ymm1,%ymm1 - vpaddq %ymm12,%ymm2,%ymm2 - - vpaddq %ymm15,%ymm0,%ymm0 - vpsllq $2,%ymm15,%ymm15 - vpaddq %ymm15,%ymm0,%ymm0 - - vpsrlq $26,%ymm2,%ymm13 - vpand %ymm5,%ymm2,%ymm2 - vpaddq %ymm13,%ymm3,%ymm3 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm11,%ymm1,%ymm1 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpaddq %ymm14,%ymm4,%ymm4 - - vmovd %xmm0,-112(%rdi) - vmovd %xmm1,-108(%rdi) - vmovd %xmm2,-104(%rdi) - vmovd %xmm3,-100(%rdi) - vmovd %xmm4,-96(%rdi) - leaq -8(%r10),%rsp - - vzeroupper - ret - -.Lblocks_avx512: - - movl $15,%eax - kmovw %eax,%k2 - leaq 8(%rsp),%r10 - - subq $0x128,%rsp - leaq 
.Lconst(%rip),%rcx - leaq 48+64(%rdi),%rdi - vmovdqa 96(%rcx),%ymm9 - - vmovdqu32 -64(%rdi),%zmm16{%k2}{z} - andq $-512,%rsp - vmovdqu32 -48(%rdi),%zmm17{%k2}{z} - movq $0x20,%rax - vmovdqu32 -32(%rdi),%zmm21{%k2}{z} - vmovdqu32 -16(%rdi),%zmm18{%k2}{z} - vmovdqu32 0(%rdi),%zmm22{%k2}{z} - vmovdqu32 16(%rdi),%zmm19{%k2}{z} - vmovdqu32 32(%rdi),%zmm23{%k2}{z} - vmovdqu32 48(%rdi),%zmm20{%k2}{z} - vmovdqu32 64(%rdi),%zmm24{%k2}{z} - vpermd %zmm16,%zmm9,%zmm16 - vpbroadcastq 64(%rcx),%zmm5 - vpermd %zmm17,%zmm9,%zmm17 - vpermd %zmm21,%zmm9,%zmm21 - vpermd %zmm18,%zmm9,%zmm18 - vmovdqa64 %zmm16,0(%rsp){%k2} - vpsrlq $32,%zmm16,%zmm7 - vpermd %zmm22,%zmm9,%zmm22 - vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2} - vpsrlq $32,%zmm17,%zmm8 - vpermd %zmm19,%zmm9,%zmm19 - vmovdqa64 %zmm21,64(%rsp){%k2} - vpermd %zmm23,%zmm9,%zmm23 - vpermd %zmm20,%zmm9,%zmm20 - vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2} - vpermd %zmm24,%zmm9,%zmm24 - vmovdqa64 %zmm22,128(%rsp){%k2} - vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2} - vmovdqa64 %zmm23,192(%rsp){%k2} - vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2} - vmovdqa64 %zmm24,256(%rsp){%k2} - - vpmuludq %zmm7,%zmm16,%zmm11 - vpmuludq %zmm7,%zmm17,%zmm12 - vpmuludq %zmm7,%zmm18,%zmm13 - vpmuludq %zmm7,%zmm19,%zmm14 - vpmuludq %zmm7,%zmm20,%zmm15 - vpsrlq $32,%zmm18,%zmm9 - - vpmuludq %zmm8,%zmm24,%zmm25 - vpmuludq %zmm8,%zmm16,%zmm26 - vpmuludq %zmm8,%zmm17,%zmm27 - vpmuludq %zmm8,%zmm18,%zmm28 - vpmuludq %zmm8,%zmm19,%zmm29 - vpsrlq $32,%zmm19,%zmm10 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm27,%zmm13,%zmm13 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - - vpmuludq %zmm9,%zmm23,%zmm25 - vpmuludq %zmm9,%zmm24,%zmm26 - vpmuludq %zmm9,%zmm17,%zmm28 - vpmuludq %zmm9,%zmm18,%zmm29 - vpmuludq %zmm9,%zmm16,%zmm27 - vpsrlq $32,%zmm20,%zmm6 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm27,%zmm13,%zmm13 - - vpmuludq %zmm10,%zmm22,%zmm25 - vpmuludq 
%zmm10,%zmm16,%zmm28 - vpmuludq %zmm10,%zmm17,%zmm29 - vpmuludq %zmm10,%zmm23,%zmm26 - vpmuludq %zmm10,%zmm24,%zmm27 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm27,%zmm13,%zmm13 - - vpmuludq %zmm6,%zmm24,%zmm28 - vpmuludq %zmm6,%zmm16,%zmm29 - vpmuludq %zmm6,%zmm21,%zmm25 - vpmuludq %zmm6,%zmm22,%zmm26 - vpmuludq %zmm6,%zmm23,%zmm27 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm27,%zmm13,%zmm13 - - vmovdqu64 0(%rsi),%zmm10 - vmovdqu64 64(%rsi),%zmm6 - leaq 128(%rsi),%rsi - - vpsrlq $26,%zmm14,%zmm28 - vpandq %zmm5,%zmm14,%zmm14 - vpaddq %zmm28,%zmm15,%zmm15 - - vpsrlq $26,%zmm11,%zmm25 - vpandq %zmm5,%zmm11,%zmm11 - vpaddq %zmm25,%zmm12,%zmm12 - - vpsrlq $26,%zmm15,%zmm29 - vpandq %zmm5,%zmm15,%zmm15 - - vpsrlq $26,%zmm12,%zmm26 - vpandq %zmm5,%zmm12,%zmm12 - vpaddq %zmm26,%zmm13,%zmm13 - - vpaddq %zmm29,%zmm11,%zmm11 - vpsllq $2,%zmm29,%zmm29 - vpaddq %zmm29,%zmm11,%zmm11 - - vpsrlq $26,%zmm13,%zmm27 - vpandq %zmm5,%zmm13,%zmm13 - vpaddq %zmm27,%zmm14,%zmm14 - - vpsrlq $26,%zmm11,%zmm25 - vpandq %zmm5,%zmm11,%zmm11 - vpaddq %zmm25,%zmm12,%zmm12 - - vpsrlq $26,%zmm14,%zmm28 - vpandq %zmm5,%zmm14,%zmm14 - vpaddq %zmm28,%zmm15,%zmm15 - - vpunpcklqdq %zmm6,%zmm10,%zmm7 - vpunpckhqdq %zmm6,%zmm10,%zmm6 - - vmovdqa32 128(%rcx),%zmm25 - movl $0x7777,%eax - kmovw %eax,%k1 - - vpermd %zmm16,%zmm25,%zmm16 - vpermd %zmm17,%zmm25,%zmm17 - vpermd %zmm18,%zmm25,%zmm18 - vpermd %zmm19,%zmm25,%zmm19 - vpermd %zmm20,%zmm25,%zmm20 - - vpermd %zmm11,%zmm25,%zmm16{%k1} - vpermd %zmm12,%zmm25,%zmm17{%k1} - vpermd %zmm13,%zmm25,%zmm18{%k1} - vpermd %zmm14,%zmm25,%zmm19{%k1} - vpermd %zmm15,%zmm25,%zmm20{%k1} - - vpslld $2,%zmm17,%zmm21 - vpslld $2,%zmm18,%zmm22 - vpslld $2,%zmm19,%zmm23 - vpslld $2,%zmm20,%zmm24 - vpaddd %zmm17,%zmm21,%zmm21 - vpaddd %zmm18,%zmm22,%zmm22 - vpaddd %zmm19,%zmm23,%zmm23 - vpaddd 
%zmm20,%zmm24,%zmm24 - - vpbroadcastq 32(%rcx),%zmm30 - - vpsrlq $52,%zmm7,%zmm9 - vpsllq $12,%zmm6,%zmm10 - vporq %zmm10,%zmm9,%zmm9 - vpsrlq $26,%zmm7,%zmm8 - vpsrlq $14,%zmm6,%zmm10 - vpsrlq $40,%zmm6,%zmm6 - vpandq %zmm5,%zmm9,%zmm9 - vpandq %zmm5,%zmm7,%zmm7 - - vpaddq %zmm2,%zmm9,%zmm2 - subq $192,%rdx - jbe .Ltail_avx512 - jmp .Loop_avx512 - -.align 32 -.Loop_avx512: - - vpmuludq %zmm2,%zmm17,%zmm14 - vpaddq %zmm0,%zmm7,%zmm0 - vpmuludq %zmm2,%zmm18,%zmm15 - vpandq %zmm5,%zmm8,%zmm8 - vpmuludq %zmm2,%zmm23,%zmm11 - vpandq %zmm5,%zmm10,%zmm10 - vpmuludq %zmm2,%zmm24,%zmm12 - vporq %zmm30,%zmm6,%zmm6 - vpmuludq %zmm2,%zmm16,%zmm13 - vpaddq %zmm1,%zmm8,%zmm1 - vpaddq %zmm3,%zmm10,%zmm3 - vpaddq %zmm4,%zmm6,%zmm4 - - vmovdqu64 0(%rsi),%zmm10 - vmovdqu64 64(%rsi),%zmm6 - leaq 128(%rsi),%rsi - vpmuludq %zmm0,%zmm19,%zmm28 - vpmuludq %zmm0,%zmm20,%zmm29 - vpmuludq %zmm0,%zmm16,%zmm25 - vpmuludq %zmm0,%zmm17,%zmm26 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm26,%zmm12,%zmm12 - - vpmuludq %zmm1,%zmm18,%zmm28 - vpmuludq %zmm1,%zmm19,%zmm29 - vpmuludq %zmm1,%zmm24,%zmm25 - vpmuludq %zmm0,%zmm18,%zmm27 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm27,%zmm13,%zmm13 - - vpunpcklqdq %zmm6,%zmm10,%zmm7 - vpunpckhqdq %zmm6,%zmm10,%zmm6 - - vpmuludq %zmm3,%zmm16,%zmm28 - vpmuludq %zmm3,%zmm17,%zmm29 - vpmuludq %zmm1,%zmm16,%zmm26 - vpmuludq %zmm1,%zmm17,%zmm27 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm27,%zmm13,%zmm13 - - vpmuludq %zmm4,%zmm24,%zmm28 - vpmuludq %zmm4,%zmm16,%zmm29 - vpmuludq %zmm3,%zmm22,%zmm25 - vpmuludq %zmm3,%zmm23,%zmm26 - vpaddq %zmm28,%zmm14,%zmm14 - vpmuludq %zmm3,%zmm24,%zmm27 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm27,%zmm13,%zmm13 - - vpmuludq %zmm4,%zmm21,%zmm25 - vpmuludq %zmm4,%zmm22,%zmm26 - vpmuludq 
%zmm4,%zmm23,%zmm27 - vpaddq %zmm25,%zmm11,%zmm0 - vpaddq %zmm26,%zmm12,%zmm1 - vpaddq %zmm27,%zmm13,%zmm2 - - vpsrlq $52,%zmm7,%zmm9 - vpsllq $12,%zmm6,%zmm10 - - vpsrlq $26,%zmm14,%zmm3 - vpandq %zmm5,%zmm14,%zmm14 - vpaddq %zmm3,%zmm15,%zmm4 - - vporq %zmm10,%zmm9,%zmm9 - - vpsrlq $26,%zmm0,%zmm11 - vpandq %zmm5,%zmm0,%zmm0 - vpaddq %zmm11,%zmm1,%zmm1 - - vpandq %zmm5,%zmm9,%zmm9 - - vpsrlq $26,%zmm4,%zmm15 - vpandq %zmm5,%zmm4,%zmm4 - - vpsrlq $26,%zmm1,%zmm12 - vpandq %zmm5,%zmm1,%zmm1 - vpaddq %zmm12,%zmm2,%zmm2 - - vpaddq %zmm15,%zmm0,%zmm0 - vpsllq $2,%zmm15,%zmm15 - vpaddq %zmm15,%zmm0,%zmm0 - - vpaddq %zmm9,%zmm2,%zmm2 - vpsrlq $26,%zmm7,%zmm8 - - vpsrlq $26,%zmm2,%zmm13 - vpandq %zmm5,%zmm2,%zmm2 - vpaddq %zmm13,%zmm14,%zmm3 - - vpsrlq $14,%zmm6,%zmm10 - - vpsrlq $26,%zmm0,%zmm11 - vpandq %zmm5,%zmm0,%zmm0 - vpaddq %zmm11,%zmm1,%zmm1 - - vpsrlq $40,%zmm6,%zmm6 - - vpsrlq $26,%zmm3,%zmm14 - vpandq %zmm5,%zmm3,%zmm3 - vpaddq %zmm14,%zmm4,%zmm4 - - vpandq %zmm5,%zmm7,%zmm7 - - subq $128,%rdx - ja .Loop_avx512 - -.Ltail_avx512: - - vpsrlq $32,%zmm16,%zmm16 - vpsrlq $32,%zmm17,%zmm17 - vpsrlq $32,%zmm18,%zmm18 - vpsrlq $32,%zmm23,%zmm23 - vpsrlq $32,%zmm24,%zmm24 - vpsrlq $32,%zmm19,%zmm19 - vpsrlq $32,%zmm20,%zmm20 - vpsrlq $32,%zmm21,%zmm21 - vpsrlq $32,%zmm22,%zmm22 - - leaq (%rsi,%rdx,1),%rsi - - vpaddq %zmm0,%zmm7,%zmm0 - - vpmuludq %zmm2,%zmm17,%zmm14 - vpmuludq %zmm2,%zmm18,%zmm15 - vpmuludq %zmm2,%zmm23,%zmm11 - vpandq %zmm5,%zmm8,%zmm8 - vpmuludq %zmm2,%zmm24,%zmm12 - vpandq %zmm5,%zmm10,%zmm10 - vpmuludq %zmm2,%zmm16,%zmm13 - vporq %zmm30,%zmm6,%zmm6 - vpaddq %zmm1,%zmm8,%zmm1 - vpaddq %zmm3,%zmm10,%zmm3 - vpaddq %zmm4,%zmm6,%zmm4 - - vmovdqu 0(%rsi),%xmm7 - vpmuludq %zmm0,%zmm19,%zmm28 - vpmuludq %zmm0,%zmm20,%zmm29 - vpmuludq %zmm0,%zmm16,%zmm25 - vpmuludq %zmm0,%zmm17,%zmm26 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm26,%zmm12,%zmm12 - - vmovdqu 16(%rsi),%xmm8 - vpmuludq 
%zmm1,%zmm18,%zmm28 - vpmuludq %zmm1,%zmm19,%zmm29 - vpmuludq %zmm1,%zmm24,%zmm25 - vpmuludq %zmm0,%zmm18,%zmm27 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm27,%zmm13,%zmm13 - - vinserti128 $1,32(%rsi),%ymm7,%ymm7 - vpmuludq %zmm3,%zmm16,%zmm28 - vpmuludq %zmm3,%zmm17,%zmm29 - vpmuludq %zmm1,%zmm16,%zmm26 - vpmuludq %zmm1,%zmm17,%zmm27 - vpaddq %zmm28,%zmm14,%zmm14 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm27,%zmm13,%zmm13 - - vinserti128 $1,48(%rsi),%ymm8,%ymm8 - vpmuludq %zmm4,%zmm24,%zmm28 - vpmuludq %zmm4,%zmm16,%zmm29 - vpmuludq %zmm3,%zmm22,%zmm25 - vpmuludq %zmm3,%zmm23,%zmm26 - vpmuludq %zmm3,%zmm24,%zmm27 - vpaddq %zmm28,%zmm14,%zmm3 - vpaddq %zmm29,%zmm15,%zmm15 - vpaddq %zmm25,%zmm11,%zmm11 - vpaddq %zmm26,%zmm12,%zmm12 - vpaddq %zmm27,%zmm13,%zmm13 - - vpmuludq %zmm4,%zmm21,%zmm25 - vpmuludq %zmm4,%zmm22,%zmm26 - vpmuludq %zmm4,%zmm23,%zmm27 - vpaddq %zmm25,%zmm11,%zmm0 - vpaddq %zmm26,%zmm12,%zmm1 - vpaddq %zmm27,%zmm13,%zmm2 - - movl $1,%eax - vpermq $0xb1,%zmm3,%zmm14 - vpermq $0xb1,%zmm15,%zmm4 - vpermq $0xb1,%zmm0,%zmm11 - vpermq $0xb1,%zmm1,%zmm12 - vpermq $0xb1,%zmm2,%zmm13 - vpaddq %zmm14,%zmm3,%zmm3 - vpaddq %zmm15,%zmm4,%zmm4 - vpaddq %zmm11,%zmm0,%zmm0 - vpaddq %zmm12,%zmm1,%zmm1 - vpaddq %zmm13,%zmm2,%zmm2 - - kmovw %eax,%k3 - vpermq $0x2,%zmm3,%zmm14 - vpermq $0x2,%zmm4,%zmm15 - vpermq $0x2,%zmm0,%zmm11 - vpermq $0x2,%zmm1,%zmm12 - vpermq $0x2,%zmm2,%zmm13 - vpaddq %zmm14,%zmm3,%zmm3 - vpaddq %zmm15,%zmm4,%zmm4 - vpaddq %zmm11,%zmm0,%zmm0 - vpaddq %zmm12,%zmm1,%zmm1 - vpaddq %zmm13,%zmm2,%zmm2 - - vextracti64x4 $0x1,%zmm3,%ymm14 - vextracti64x4 $0x1,%zmm4,%ymm15 - vextracti64x4 $0x1,%zmm0,%ymm11 - vextracti64x4 $0x1,%zmm1,%ymm12 - vextracti64x4 $0x1,%zmm2,%ymm13 - vpaddq %zmm14,%zmm3,%zmm3{%k3}{z} - vpaddq %zmm15,%zmm4,%zmm4{%k3}{z} - vpaddq %zmm11,%zmm0,%zmm0{%k3}{z} - vpaddq %zmm12,%zmm1,%zmm1{%k3}{z} - vpaddq %zmm13,%zmm2,%zmm2{%k3}{z} - - vpsrlq 
$26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpsrldq $6,%ymm7,%ymm9 - vpsrldq $6,%ymm8,%ymm10 - vpunpckhqdq %ymm8,%ymm7,%ymm6 - vpaddq %ymm14,%ymm4,%ymm4 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpunpcklqdq %ymm10,%ymm9,%ymm9 - vpunpcklqdq %ymm8,%ymm7,%ymm7 - vpaddq %ymm11,%ymm1,%ymm1 - - vpsrlq $26,%ymm4,%ymm15 - vpand %ymm5,%ymm4,%ymm4 - - vpsrlq $26,%ymm1,%ymm12 - vpand %ymm5,%ymm1,%ymm1 - vpsrlq $30,%ymm9,%ymm10 - vpsrlq $4,%ymm9,%ymm9 - vpaddq %ymm12,%ymm2,%ymm2 - - vpaddq %ymm15,%ymm0,%ymm0 - vpsllq $2,%ymm15,%ymm15 - vpsrlq $26,%ymm7,%ymm8 - vpsrlq $40,%ymm6,%ymm6 - vpaddq %ymm15,%ymm0,%ymm0 - - vpsrlq $26,%ymm2,%ymm13 - vpand %ymm5,%ymm2,%ymm2 - vpand %ymm5,%ymm9,%ymm9 - vpand %ymm5,%ymm7,%ymm7 - vpaddq %ymm13,%ymm3,%ymm3 - - vpsrlq $26,%ymm0,%ymm11 - vpand %ymm5,%ymm0,%ymm0 - vpaddq %ymm2,%ymm9,%ymm2 - vpand %ymm5,%ymm8,%ymm8 - vpaddq %ymm11,%ymm1,%ymm1 - - vpsrlq $26,%ymm3,%ymm14 - vpand %ymm5,%ymm3,%ymm3 - vpand %ymm5,%ymm10,%ymm10 - vpor 32(%rcx),%ymm6,%ymm6 - vpaddq %ymm14,%ymm4,%ymm4 - - leaq 144(%rsp),%rax - addq $64,%rdx - jnz .Ltail_avx2_512 - - vpsubq %ymm9,%ymm2,%ymm2 - vmovd %xmm0,-112(%rdi) - vmovd %xmm1,-108(%rdi) - vmovd %xmm2,-104(%rdi) - vmovd %xmm3,-100(%rdi) - vmovd %xmm4,-96(%rdi) - vzeroall - leaq -8(%r10),%rsp - - ret - -ENDPROC(poly1305_blocks_avx512) -#endif /* CONFIG_AS_AVX512 */ diff --git a/src/crypto/zinc/poly1305/poly1305-x86_64.pl b/src/crypto/zinc/poly1305/poly1305-x86_64.pl new file mode 100644 index 0000000..342ad7f --- /dev/null +++ b/src/crypto/zinc/poly1305/poly1305-x86_64.pl @@ -0,0 +1,4159 @@ +#! /usr/bin/env perl +# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. 
You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# This module implements Poly1305 hash for x86_64. +# +# March 2015 +# +# Initial release. +# +# December 2016 +# +# Add AVX512F+VL+BW code path. +# +# November 2017 +# +# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be +# executed even on Knights Landing. Trigger for modification was +# observation that AVX512 code paths can negatively affect overall +# Skylake-X system performance. Since we are likely to suppress +# AVX512F capability flag [at least on Skylake-X], conversion serves +# as kind of "investment protection". Note that next *lake processor, +# Cannolake, has AVX512IFMA code path to execute... +# +# Numbers are cycles per processed byte with poly1305_blocks alone, +# measured with rdtsc at fixed clock frequency. +# +# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 +# P4 4.46/+120% - +# Core 2 2.41/+90% - +# Westmere 1.88/+120% - +# Sandy Bridge 1.39/+140% 1.10 +# Haswell 1.14/+175% 1.11 0.65 +# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] +# Silvermont 2.83/+95% - +# Knights L 3.60/? 
1.65 1.10 0.41(***) +# Goldmont 1.70/+180% - +# VIA Nano 1.82/+150% - +# Sledgehammer 1.38/+160% - +# Bulldozer 2.30/+130% 0.97 +# Ryzen 1.15/+200% 1.08 1.18 +# +# (*) improvement coefficients relative to clang are more modest and +# are ~50% on most processors, in both cases we are comparing to +# __int128 code; +# (**) SSE2 implementation was attempted, but among non-AVX processors +# it was faster than integer-only code only on older Intel P4 and +# Core processors, 50-30%, less newer processor is, but slower on +# contemporary ones, for example almost 2x slower on Atom, and as +# former are naturally disappearing, SSE2 is deemed unnecessary; +# (***) strangely enough performance seems to vary from core to core, +# listed result is best case; + +$flavour = shift; +$output = shift; +if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } + +$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { + $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12); + $avx += 2 if ($1==2.11 && $2>=8); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=12); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; +*STDOUT=*OUT; + +my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); +my ($mac,$nonce)=($inp,$len); # 
*_emit arguments +my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); +my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); + +sub poly1305_iteration { +# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 +# output: $h0-$h2 *= $r0-$r1 +$code.=<<___; + mulq $h0 # h0*r1 + mov %rax,$d2 + mov $r0,%rax + mov %rdx,$d3 + + mulq $h0 # h0*r0 + mov %rax,$h0 # future $h0 + mov $r0,%rax + mov %rdx,$d1 + + mulq $h1 # h1*r0 + add %rax,$d2 + mov $s1,%rax + adc %rdx,$d3 + + mulq $h1 # h1*s1 + mov $h2,$h1 # borrow $h1 + add %rax,$h0 + adc %rdx,$d1 + + imulq $s1,$h1 # h2*s1 + add $h1,$d2 + mov $d1,$h1 + adc \$0,$d3 + + imulq $r0,$h2 # h2*r0 + add $d2,$h1 + mov \$-4,%rax # mask value + adc $h2,$d3 + + and $d3,%rax # last reduction step + mov $d3,$h2 + shr \$2,$d3 + and \$3,$h2 + add $d3,%rax + add %rax,$h0 + adc \$0,$h1 + adc \$0,$h2 +___ +} + +######################################################################## +# Layout of opaque area is following. +# +# unsigned __int64 h[3]; # current hash value base 2^64 +# unsigned __int64 r[2]; # key value base 2^64 + +$code.=<<___; +.text + +.extern OPENSSL_ia32cap_P + +.globl poly1305_init +.hidden poly1305_init +.globl poly1305_blocks +.hidden poly1305_blocks +.globl poly1305_emit +.hidden poly1305_emit + +.type poly1305_init,\@function,3 +.align 32 +poly1305_init: + xor %rax,%rax + mov %rax,0($ctx) # initialize hash value + mov %rax,8($ctx) + mov %rax,16($ctx) + + cmp \$0,$inp + je .Lno_key + + lea poly1305_blocks(%rip),%r10 + lea poly1305_emit(%rip),%r11 +___ +$code.=<<___ if ($avx); + mov OPENSSL_ia32cap_P+4(%rip),%r9 + lea poly1305_blocks_avx(%rip),%rax + lea poly1305_emit_avx(%rip),%rcx + bt \$`60-32`,%r9 # AVX? + cmovc %rax,%r10 + cmovc %rcx,%r11 +___ +$code.=<<___ if ($avx>1); + lea poly1305_blocks_avx2(%rip),%rax + bt \$`5+32`,%r9 # AVX2? 
+ cmovc %rax,%r10 +___ +$code.=<<___ if ($avx>3); + mov \$`(1<<31|1<<21|1<<16)`,%rax + shr \$32,%r9 + and %rax,%r9 + cmp %rax,%r9 + je .Linit_base2_44 +___ +$code.=<<___; + mov \$0x0ffffffc0fffffff,%rax + mov \$0x0ffffffc0ffffffc,%rcx + and 0($inp),%rax + and 8($inp),%rcx + mov %rax,24($ctx) + mov %rcx,32($ctx) +___ +$code.=<<___ if ($flavour !~ /elf32/); + mov %r10,0(%rdx) + mov %r11,8(%rdx) +___ +$code.=<<___ if ($flavour =~ /elf32/); + mov %r10d,0(%rdx) + mov %r11d,4(%rdx) +___ +$code.=<<___; + mov \$1,%eax +.Lno_key: + ret +.size poly1305_init,.-poly1305_init + +.type poly1305_blocks,\@function,4 +.align 32 +poly1305_blocks: +.cfi_startproc +.Lblocks: + shr \$4,$len + jz .Lno_data # too short + + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2 + + mov $s1,$r1 + shr \$2,$s1 + mov $r1,%rax + add $r1,$s1 # s1 = r1 + (r1 >> 2) + jmp .Loop + +.align 32 +.Loop: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 +___ + &poly1305_iteration(); +$code.=<<___; + mov $r1,%rax + dec %r15 # len-=16 + jnz .Loop + + mov $h0,0($ctx) # store hash value + mov $h1,8($ctx) + mov $h2,16($ctx) + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbp +.cfi_restore %rbp + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data: +.Lblocks_epilogue: + ret +.cfi_endproc +.size poly1305_blocks,.-poly1305_blocks + +.type poly1305_emit,\@function,3 +.align 32 +poly1305_emit: +.Lemit: + mov 0($ctx),%r8 # load hash value + mov 8($ctx),%r9 + mov 16($ctx),%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus 
+ mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + ret +.size poly1305_emit,.-poly1305_emit +___ +if ($avx) { + +######################################################################## +# Layout of the opaque area is as follows. +# +# unsigned __int32 h[5]; # current hash value base 2^26 +# unsigned __int32 is_base2_26; +# unsigned __int64 r[2]; # key value base 2^64 +# unsigned __int64 pad; +# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; +# +# where r^n are the base 2^26 digits of powers of the multiplier key. +# There are 5 digits, but the last four are interleaved with their +# multiples of 5, totalling 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, +# 5*r3, r4, 5*r4. + +# AVX register map: $H0-$H4 hold the hash digits, $T0-$T4 the input +# digits and temporaries, $D0-$D4 the d0-d4 accumulators, and $MASK the +# 26-bit digit mask. +my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = + map("%xmm$_",(0..15)); + +# __poly1305_block: out-of-line copy of the poly1305_iteration sequence +# followed by ret, so the vector code paths can "call" a single scalar +# multiplication step (inputs and outputs as documented on +# poly1305_iteration). +$code.=<<___; +.type __poly1305_block,\@abi-omnipotent +.align 32 +__poly1305_block: +___ + &poly1305_iteration(); +$code.=<<___; + ret +.size __poly1305_block,.-__poly1305_block + +.type __poly1305_init_avx,\@abi-omnipotent +.align 32 +__poly1305_init_avx: + mov $r0,$h0 + mov $r1,$h1 + xor $h2,$h2 + + lea 48+64($ctx),$ctx # size optimization + + mov $r1,%rax + call __poly1305_block # r^2 + + mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 + mov \$0x3ffffff,%edx + mov $h0,$d1 + and $h0#d,%eax + mov $r0,$d2 + and $r0#d,%edx + mov %eax,`16*0+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*0+4-64`($ctx) + shr \$26,$d2 + + mov \$0x3ffffff,%eax + mov \$0x3ffffff,%edx + and $d1#d,%eax + and $d2#d,%edx + mov %eax,`16*1+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*1+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*2+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*2+4-64`($ctx) + shr \$26,$d2 + + mov $h1,%rax + mov $r1,%rdx + shl \$12,%rax + shl \$12,%rdx + or $d1,%rax + or $d2,%rdx + and \$0x3ffffff,%eax + and 
\$0x3ffffff,%edx + mov %eax,`16*3+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*3+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*4+0-64`($ctx) + mov $h1,$d1 + mov %edx,`16*4+4-64`($ctx) + mov $r1,$d2 + + mov \$0x3ffffff,%eax + mov \$0x3ffffff,%edx + shr \$14,$d1 + shr \$14,$d2 + and $d1#d,%eax + and $d2#d,%edx + mov %eax,`16*5+0-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov %edx,`16*5+4-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + mov %eax,`16*6+0-64`($ctx) + shr \$26,$d1 + mov %edx,`16*6+4-64`($ctx) + shr \$26,$d2 + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+0-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d2#d,`16*7+4-64`($ctx) + lea ($d2,$d2,4),$d2 # *5 + mov $d1#d,`16*8+0-64`($ctx) + mov $d2#d,`16*8+4-64`($ctx) + + mov $r1,%rax + call __poly1305_block # r^3 + + mov \$0x3ffffff,%eax # save r^3 base 2^26 + mov $h0,$d1 + and $h0#d,%eax + shr \$26,$d1 + mov %eax,`16*0+12-64`($ctx) + + mov \$0x3ffffff,%edx + and $d1#d,%edx + mov %edx,`16*1+12-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*2+12-64`($ctx) + + mov $h1,%rax + shl \$12,%rax + or $d1,%rax + and \$0x3ffffff,%eax + mov %eax,`16*3+12-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov $h1,$d1 + mov %eax,`16*4+12-64`($ctx) + + mov \$0x3ffffff,%edx + shr \$14,$d1 + and $d1#d,%edx + mov %edx,`16*5+12-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*6+12-64`($ctx) + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+12-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d1#d,`16*8+12-64`($ctx) + + mov $r1,%rax + call __poly1305_block # r^4 + + mov \$0x3ffffff,%eax # save r^4 base 2^26 + mov $h0,$d1 + and $h0#d,%eax + shr \$26,$d1 + mov %eax,`16*0+8-64`($ctx) + + mov \$0x3ffffff,%edx + and $d1#d,%edx + mov %edx,`16*1+8-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*2+8-64`($ctx) + + mov $h1,%rax + shl \$12,%rax + or $d1,%rax + and \$0x3ffffff,%eax + mov %eax,`16*3+8-64`($ctx) + lea (%rax,%rax,4),%eax # *5 + mov 
$h1,$d1 + mov %eax,`16*4+8-64`($ctx) + + mov \$0x3ffffff,%edx + shr \$14,$d1 + and $d1#d,%edx + mov %edx,`16*5+8-64`($ctx) + lea (%rdx,%rdx,4),%edx # *5 + shr \$26,$d1 + mov %edx,`16*6+8-64`($ctx) + + mov $h2,%rax + shl \$24,%rax + or %rax,$d1 + mov $d1#d,`16*7+8-64`($ctx) + lea ($d1,$d1,4),$d1 # *5 + mov $d1#d,`16*8+8-64`($ctx) + + lea -48-64($ctx),$ctx # size [de-]optimization + ret +.size __poly1305_init_avx,.-__poly1305_init_avx + +.type poly1305_blocks_avx,\@function,4 +.align 32 +poly1305_blocks_avx: +.cfi_startproc + mov 20($ctx),%r8d # is_base2_26 + cmp \$128,$len + jae .Lblocks_avx + test %r8d,%r8d + jz .Lblocks + +.Lblocks_avx: + and \$-16,$len + jz .Lno_data_avx + + vzeroupper + + test %r8d,%r8d + jz .Lbase2_64_avx + + test \$31,$len + jz .Leven_avx + + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_avx_body: + + mov $len,%r15 # reassign $len + + mov 0($ctx),$d1 # load hash value + mov 8($ctx),$d2 + mov 16($ctx),$h2#d + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + ################################# base 2^26 -> base 2^64 + mov $d1#d,$h0#d + and \$`-1*(1<<31)`,$d1 + mov $d2,$r1 # borrow $r1 + mov $d2#d,$h1#d + and \$`-1*(1<<31)`,$d2 + + shr \$6,$d1 + shl \$52,$r1 + add $d1,$h0 + shr \$12,$h1 + shr \$18,$d2 + add $r1,$h0 + adc $d2,$h1 + + mov $h2,$d1 + shl \$40,$d1 + shr \$24,$h2 + add $d1,$h1 + adc \$0,$h2 # can be partially reduced... + + mov \$-4,$d2 # ... 
so reduce + mov $h2,$d1 + and $h2,$d2 + shr \$2,$d1 + and \$3,$h2 + add $d2,$d1 # =*5 + add $d1,$h0 + adc \$0,$h1 + adc \$0,$h2 + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + + call __poly1305_block + + test $padbit,$padbit # if $padbit is zero, + jz .Lstore_base2_64_avx # store hash in base 2^64 format + + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$r0 + mov $h1,$r1 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$r0 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $r0,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$r1 + and \$0x3ffffff,$h1 # h[3] + or $r1,$h2 # h[4] + + sub \$16,%r15 + jz .Lstore_base2_26_avx + + vmovd %rax#d,$H0 + vmovd %rdx#d,$H1 + vmovd $h0#d,$H2 + vmovd $h1#d,$H3 + vmovd $h2#d,$H4 + jmp .Lproceed_avx + +.align 32 +.Lstore_base2_64_avx: + mov $h0,0($ctx) + mov $h1,8($ctx) + mov $h2,16($ctx) # note that is_base2_26 is zeroed + jmp .Ldone_avx + +.align 16 +.Lstore_base2_26_avx: + mov %rax#d,0($ctx) # store hash value base 2^26 + mov %rdx#d,4($ctx) + mov $h0#d,8($ctx) + mov $h1#d,12($ctx) + mov $h2#d,16($ctx) +.align 16 +.Ldone_avx: + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbp +.cfi_restore %rbp + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data_avx: +.Lblocks_avx_epilogue: + ret +.cfi_endproc + +.align 32 +.Lbase2_64_avx: +.cfi_startproc + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lbase2_64_avx_body: + + mov $len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 
8($ctx),$h1 + mov 16($ctx),$h2#d + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + test \$31,$len + jz .Linit_avx + + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + +.Linit_avx: + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$d1 + mov $h1,$d2 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$d1 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $d1,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$d2 + and \$0x3ffffff,$h1 # h[3] + or $d2,$h2 # h[4] + + vmovd %rax#d,$H0 + vmovd %rdx#d,$H1 + vmovd $h0#d,$H2 + vmovd $h1#d,$H3 + vmovd $h2#d,$H4 + movl \$1,20($ctx) # set is_base2_26 + + call __poly1305_init_avx + +.Lproceed_avx: + mov %r15,$len + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbp +.cfi_restore %rbp + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rax + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lbase2_64_avx_epilogue: + jmp .Ldo_avx +.cfi_endproc + +.align 32 +.Leven_avx: +.cfi_startproc + vmovd 4*0($ctx),$H0 # load hash value + vmovd 4*1($ctx),$H1 + vmovd 4*2($ctx),$H2 + vmovd 4*3($ctx),$H3 + vmovd 4*4($ctx),$H4 + +.Ldo_avx: +___ +$code.=<<___ if (!$win64); + lea -0x58(%rsp),%r11 +.cfi_def_cfa %r11,0x60 + sub \$0x178,%rsp +___ +$code.=<<___ if ($win64); + lea -0xf8(%rsp),%r11 + sub \$0x218,%rsp + vmovdqa %xmm6,0x50(%r11) + vmovdqa %xmm7,0x60(%r11) + vmovdqa %xmm8,0x70(%r11) + vmovdqa %xmm9,0x80(%r11) + vmovdqa %xmm10,0x90(%r11) + vmovdqa %xmm11,0xa0(%r11) + vmovdqa %xmm12,0xb0(%r11) + vmovdqa %xmm13,0xc0(%r11) + vmovdqa %xmm14,0xd0(%r11) + vmovdqa %xmm15,0xe0(%r11) +.Ldo_avx_body: +___ +$code.=<<___; + sub \$64,$len + lea -32($inp),%rax + cmovc %rax,$inp + + vmovdqu `16*3`($ctx),$D4 # preload r0^2 + lea 
`16*3+64`($ctx),$ctx # size optimization + lea .Lconst(%rip),%rcx + + ################################################################ + # load input + vmovdqu 16*2($inp),$T0 + vmovdqu 16*3($inp),$T1 + vmovdqa 64(%rcx),$MASK # .Lmask26 + + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + + vpsrlq \$40,$T4,$T4 # 4 + vpsrlq \$26,$T0,$T1 + vpand $MASK,$T0,$T0 # 0 + vpsrlq \$4,$T3,$T2 + vpand $MASK,$T1,$T1 # 1 + vpsrlq \$30,$T3,$T3 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + jbe .Lskip_loop_avx + + # expand and copy pre-calculated table to stack + vmovdqu `16*1-64`($ctx),$D1 + vmovdqu `16*2-64`($ctx),$D2 + vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 + vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 + vmovdqa $D3,-0x90(%r11) + vmovdqa $D0,0x00(%rsp) + vpshufd \$0xEE,$D1,$D4 + vmovdqu `16*3-64`($ctx),$D0 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D4,-0x80(%r11) + vmovdqa $D1,0x10(%rsp) + vpshufd \$0xEE,$D2,$D3 + vmovdqu `16*4-64`($ctx),$D1 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D3,-0x70(%r11) + vmovdqa $D2,0x20(%rsp) + vpshufd \$0xEE,$D0,$D4 + vmovdqu `16*5-64`($ctx),$D2 + vpshufd \$0x44,$D0,$D0 + vmovdqa $D4,-0x60(%r11) + vmovdqa $D0,0x30(%rsp) + vpshufd \$0xEE,$D1,$D3 + vmovdqu `16*6-64`($ctx),$D0 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D3,-0x50(%r11) + vmovdqa $D1,0x40(%rsp) + vpshufd \$0xEE,$D2,$D4 + vmovdqu `16*7-64`($ctx),$D1 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D4,-0x40(%r11) + vmovdqa $D2,0x50(%rsp) + vpshufd \$0xEE,$D0,$D3 + vmovdqu `16*8-64`($ctx),$D2 + vpshufd \$0x44,$D0,$D0 + vmovdqa $D3,-0x30(%r11) + vmovdqa $D0,0x60(%rsp) + vpshufd \$0xEE,$D1,$D4 + vpshufd \$0x44,$D1,$D1 + vmovdqa $D4,-0x20(%r11) + vmovdqa $D1,0x70(%rsp) + vpshufd \$0xEE,$D2,$D3 + vmovdqa 0x00(%rsp),$D4 # preload r0^2 + vpshufd \$0x44,$D2,$D2 + vmovdqa $D3,-0x10(%r11) + vmovdqa $D2,0x80(%rsp) + + jmp .Loop_avx + +.align 32 +.Loop_avx: + 
################################################################ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r + # \___________________/ + # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 + # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r + # \___________________/ \____________________/ + # + # Note that we start with inp[2:3]*r^2. This is because it + # doesn't depend on reduction in previous iteration. + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # though note that $Tx and $Hx are "reversed" in this section, + # and $D4 is preloaded with r0^2... + + vpmuludq $T0,$D4,$D0 # d0 = h0*r0 + vpmuludq $T1,$D4,$D1 # d1 = h1*r0 + vmovdqa $H2,0x20(%r11) # offload hash + vpmuludq $T2,$D4,$D2 # d2 = h2*r0 + vmovdqa 0x10(%rsp),$H2 # r1^2 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0 + + vmovdqa $H0,0x00(%r11) # + vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 + vmovdqa $H1,0x10(%r11) # + vpmuludq $T3,$H2,$H1 # h3*r1 + vpaddq $H0,$D0,$D0 # d0 += h4*s1 + vpaddq $H1,$D4,$D4 # d4 += h3*r1 + vmovdqa $H3,0x30(%r11) # + vpmuludq $T2,$H2,$H0 # h2*r1 + vpmuludq $T1,$H2,$H1 # h1*r1 + vpaddq $H0,$D3,$D3 # d3 += h2*r1 + vmovdqa 0x30(%rsp),$H3 # r2^2 + vpaddq $H1,$D2,$D2 # d2 += h1*r1 + vmovdqa $H4,0x40(%r11) # + vpmuludq $T0,$H2,$H2 # h0*r1 + vpmuludq $T2,$H3,$H0 # h2*r2 + vpaddq $H2,$D1,$D1 # d1 += h0*r1 + + vmovdqa 0x40(%rsp),$H4 # s2^2 + vpaddq $H0,$D4,$D4 # d4 += h2*r2 + vpmuludq $T1,$H3,$H1 # h1*r2 + vpmuludq $T0,$H3,$H3 # h0*r2 + vpaddq $H1,$D3,$D3 # d3 += h1*r2 + vmovdqa 0x50(%rsp),$H2 # r3^2 + vpaddq $H3,$D2,$D2 # d2 += h0*r2 + vpmuludq $T4,$H4,$H0 # h4*s2 + vpmuludq $T3,$H4,$H4 # h3*s2 + vpaddq $H0,$D1,$D1 # d1 
+= h4*s2 + vmovdqa 0x60(%rsp),$H3 # s3^2 + vpaddq $H4,$D0,$D0 # d0 += h3*s2 + + vmovdqa 0x80(%rsp),$H4 # s4^2 + vpmuludq $T1,$H2,$H1 # h1*r3 + vpmuludq $T0,$H2,$H2 # h0*r3 + vpaddq $H1,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $T4,$H3,$H0 # h4*s3 + vpmuludq $T3,$H3,$H1 # h3*s3 + vpaddq $H0,$D2,$D2 # d2 += h4*s3 + vmovdqu 16*0($inp),$H0 # load input + vpaddq $H1,$D1,$D1 # d1 += h3*s3 + vpmuludq $T2,$H3,$H3 # h2*s3 + vpmuludq $T2,$H4,$T2 # h2*s4 + vpaddq $H3,$D0,$D0 # d0 += h2*s3 + + vmovdqu 16*1($inp),$H1 # + vpaddq $T2,$D1,$D1 # d1 += h2*s4 + vpmuludq $T3,$H4,$T3 # h3*s4 + vpmuludq $T4,$H4,$T4 # h4*s4 + vpsrldq \$6,$H0,$H2 # splat input + vpaddq $T3,$D2,$D2 # d2 += h3*s4 + vpaddq $T4,$D3,$D3 # d3 += h4*s4 + vpsrldq \$6,$H1,$H3 # + vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 + vpmuludq $T1,$H4,$T0 # h1*s4 + vpunpckhqdq $H1,$H0,$H4 # 4 + vpaddq $T4,$D4,$D4 # d4 += h0*r4 + vmovdqa -0x90(%r11),$T4 # r0^4 + vpaddq $T0,$D0,$D0 # d0 += h1*s4 + + vpunpcklqdq $H1,$H0,$H0 # 0:1 + vpunpcklqdq $H3,$H2,$H3 # 2:3 + + #vpsrlq \$40,$H4,$H4 # 4 + vpsrldq \$`40/8`,$H4,$H4 # 4 + vpsrlq \$26,$H0,$H1 + vpand $MASK,$H0,$H0 # 0 + vpsrlq \$4,$H3,$H2 + vpand $MASK,$H1,$H1 # 1 + vpand 0(%rcx),$H4,$H4 # .Lmask24 + vpsrlq \$30,$H3,$H3 + vpand $MASK,$H2,$H2 # 2 + vpand $MASK,$H3,$H3 # 3 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always + + vpaddq 0x00(%r11),$H0,$H0 # add hash value + vpaddq 0x10(%r11),$H1,$H1 + vpaddq 0x20(%r11),$H2,$H2 + vpaddq 0x30(%r11),$H3,$H3 + vpaddq 0x40(%r11),$H4,$H4 + + lea 16*2($inp),%rax + lea 16*4($inp),$inp + sub \$64,$len + cmovc %rax,$inp + + ################################################################ + # Now we accumulate (inp[0:1]+hash)*r^4 + ################################################################ + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + 
h2*5*r3 + h1*5*r4 + + vpmuludq $H0,$T4,$T0 # h0*r0 + vpmuludq $H1,$T4,$T1 # h1*r0 + vpaddq $T0,$D0,$D0 + vpaddq $T1,$D1,$D1 + vmovdqa -0x80(%r11),$T2 # r1^4 + vpmuludq $H2,$T4,$T0 # h2*r0 + vpmuludq $H3,$T4,$T1 # h3*r0 + vpaddq $T0,$D2,$D2 + vpaddq $T1,$D3,$D3 + vpmuludq $H4,$T4,$T4 # h4*r0 + vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 + vpaddq $T4,$D4,$D4 + + vpaddq $T0,$D0,$D0 # d0 += h4*s1 + vpmuludq $H2,$T2,$T1 # h2*r1 + vpmuludq $H3,$T2,$T0 # h3*r1 + vpaddq $T1,$D3,$D3 # d3 += h2*r1 + vmovdqa -0x60(%r11),$T3 # r2^4 + vpaddq $T0,$D4,$D4 # d4 += h3*r1 + vpmuludq $H1,$T2,$T1 # h1*r1 + vpmuludq $H0,$T2,$T2 # h0*r1 + vpaddq $T1,$D2,$D2 # d2 += h1*r1 + vpaddq $T2,$D1,$D1 # d1 += h0*r1 + + vmovdqa -0x50(%r11),$T4 # s2^4 + vpmuludq $H2,$T3,$T0 # h2*r2 + vpmuludq $H1,$T3,$T1 # h1*r2 + vpaddq $T0,$D4,$D4 # d4 += h2*r2 + vpaddq $T1,$D3,$D3 # d3 += h1*r2 + vmovdqa -0x40(%r11),$T2 # r3^4 + vpmuludq $H0,$T3,$T3 # h0*r2 + vpmuludq $H4,$T4,$T0 # h4*s2 + vpaddq $T3,$D2,$D2 # d2 += h0*r2 + vpaddq $T0,$D1,$D1 # d1 += h4*s2 + vmovdqa -0x30(%r11),$T3 # s3^4 + vpmuludq $H3,$T4,$T4 # h3*s2 + vpmuludq $H1,$T2,$T1 # h1*r3 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + + vmovdqa -0x10(%r11),$T4 # s4^4 + vpaddq $T1,$D4,$D4 # d4 += h1*r3 + vpmuludq $H0,$T2,$T2 # h0*r3 + vpmuludq $H4,$T3,$T0 # h4*s3 + vpaddq $T2,$D3,$D3 # d3 += h0*r3 + vpaddq $T0,$D2,$D2 # d2 += h4*s3 + vmovdqu 16*2($inp),$T0 # load input + vpmuludq $H3,$T3,$T2 # h3*s3 + vpmuludq $H2,$T3,$T3 # h2*s3 + vpaddq $T2,$D1,$D1 # d1 += h3*s3 + vmovdqu 16*3($inp),$T1 # + vpaddq $T3,$D0,$D0 # d0 += h2*s3 + + vpmuludq $H2,$T4,$H2 # h2*s4 + vpmuludq $H3,$T4,$H3 # h3*s4 + vpsrldq \$6,$T0,$T2 # splat input + vpaddq $H2,$D1,$D1 # d1 += h2*s4 + vpmuludq $H4,$T4,$H4 # h4*s4 + vpsrldq \$6,$T1,$T3 # + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 + vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 + vpmuludq $H1,$T4,$H0 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + 
vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + + #vpsrlq \$40,$T4,$T4 # 4 + vpsrldq \$`40/8`,$T4,$T4 # 4 + vpsrlq \$26,$T0,$T1 + vmovdqa 0x00(%rsp),$D4 # preload r0^2 + vpand $MASK,$T0,$T0 # 0 + vpsrlq \$4,$T3,$T2 + vpand $MASK,$T1,$T1 # 1 + vpand 0(%rcx),$T4,$T4 # .Lmask24 + vpsrlq \$30,$T3,$T3 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + ################################################################ + # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein + # and P. Schwabe + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D0 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D0,$H0,$H0 + vpsllq \$2,$D0,$D0 + vpaddq $D0,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + ja .Loop_avx + +.Lskip_loop_avx: + ################################################################ + # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 + + vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 + add \$32,$len + jnz .Long_tail_avx + + vpaddq $H2,$T2,$T2 + vpaddq $H0,$T0,$T0 + vpaddq $H1,$T1,$T1 + vpaddq $H3,$T3,$T3 + vpaddq $H4,$T4,$T4 + +.Long_tail_avx: + vmovdqa $H2,0x20(%r11) + vmovdqa $H0,0x00(%r11) + vmovdqa $H1,0x10(%r11) + vmovdqa $H3,0x30(%r11) + vmovdqa $H4,0x40(%r11) + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + + vpmuludq $T2,$D4,$D2 # d2 = h2*r0 + vpmuludq $T0,$D4,$D0 # d0 = h0*r0 + vpshufd 
\$0x10,`16*1-64`($ctx),$H2 # r1^n + vpmuludq $T1,$D4,$D1 # d1 = h1*r0 + vpmuludq $T3,$D4,$D3 # d3 = h3*r0 + vpmuludq $T4,$D4,$D4 # d4 = h4*r0 + + vpmuludq $T3,$H2,$H0 # h3*r1 + vpaddq $H0,$D4,$D4 # d4 += h3*r1 + vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n + vpmuludq $T2,$H2,$H1 # h2*r1 + vpaddq $H1,$D3,$D3 # d3 += h2*r1 + vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n + vpmuludq $T1,$H2,$H0 # h1*r1 + vpaddq $H0,$D2,$D2 # d2 += h1*r1 + vpmuludq $T0,$H2,$H2 # h0*r1 + vpaddq $H2,$D1,$D1 # d1 += h0*r1 + vpmuludq $T4,$H3,$H3 # h4*s1 + vpaddq $H3,$D0,$D0 # d0 += h4*s1 + + vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n + vpmuludq $T2,$H4,$H1 # h2*r2 + vpaddq $H1,$D4,$D4 # d4 += h2*r2 + vpmuludq $T1,$H4,$H0 # h1*r2 + vpaddq $H0,$D3,$D3 # d3 += h1*r2 + vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n + vpmuludq $T0,$H4,$H4 # h0*r2 + vpaddq $H4,$D2,$D2 # d2 += h0*r2 + vpmuludq $T4,$H2,$H1 # h4*s2 + vpaddq $H1,$D1,$D1 # d1 += h4*s2 + vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n + vpmuludq $T3,$H2,$H2 # h3*s2 + vpaddq $H2,$D0,$D0 # d0 += h3*s2 + + vpmuludq $T1,$H3,$H0 # h1*r3 + vpaddq $H0,$D4,$D4 # d4 += h1*r3 + vpmuludq $T0,$H3,$H3 # h0*r3 + vpaddq $H3,$D3,$D3 # d3 += h0*r3 + vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n + vpmuludq $T4,$H4,$H1 # h4*s3 + vpaddq $H1,$D2,$D2 # d2 += h4*s3 + vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n + vpmuludq $T3,$H4,$H0 # h3*s3 + vpaddq $H0,$D1,$D1 # d1 += h3*s3 + vpmuludq $T2,$H4,$H4 # h2*s3 + vpaddq $H4,$D0,$D0 # d0 += h2*s3 + + vpmuludq $T0,$H2,$H2 # h0*r4 + vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 + vpmuludq $T4,$H3,$H1 # h4*s4 + vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 + vpmuludq $T3,$H3,$H0 # h3*s4 + vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 + vpmuludq $T2,$H3,$H1 # h2*s4 + vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 + vpmuludq $T1,$H3,$H3 # h1*s4 + vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 + + jz .Lshort_tail_avx + + vmovdqu 16*0($inp),$H0 # load input + vmovdqu 16*1($inp),$H1 + + vpsrldq \$6,$H0,$H2 # splat input + vpsrldq \$6,$H1,$H3 + vpunpckhqdq $H1,$H0,$H4 # 4 + vpunpcklqdq 
$H1,$H0,$H0 # 0:1 + vpunpcklqdq $H3,$H2,$H3 # 2:3 + + vpsrlq \$40,$H4,$H4 # 4 + vpsrlq \$26,$H0,$H1 + vpand $MASK,$H0,$H0 # 0 + vpsrlq \$4,$H3,$H2 + vpand $MASK,$H1,$H1 # 1 + vpsrlq \$30,$H3,$H3 + vpand $MASK,$H2,$H2 # 2 + vpand $MASK,$H3,$H3 # 3 + vpor 32(%rcx),$H4,$H4 # padbit, yes, always + + vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 + vpaddq 0x00(%r11),$H0,$H0 + vpaddq 0x10(%r11),$H1,$H1 + vpaddq 0x20(%r11),$H2,$H2 + vpaddq 0x30(%r11),$H3,$H3 + vpaddq 0x40(%r11),$H4,$H4 + + ################################################################ + # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate + + vpmuludq $H0,$T4,$T0 # h0*r0 + vpaddq $T0,$D0,$D0 # d0 += h0*r0 + vpmuludq $H1,$T4,$T1 # h1*r0 + vpaddq $T1,$D1,$D1 # d1 += h1*r0 + vpmuludq $H2,$T4,$T0 # h2*r0 + vpaddq $T0,$D2,$D2 # d2 += h2*r0 + vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n + vpmuludq $H3,$T4,$T1 # h3*r0 + vpaddq $T1,$D3,$D3 # d3 += h3*r0 + vpmuludq $H4,$T4,$T4 # h4*r0 + vpaddq $T4,$D4,$D4 # d4 += h4*r0 + + vpmuludq $H3,$T2,$T0 # h3*r1 + vpaddq $T0,$D4,$D4 # d4 += h3*r1 + vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 + vpmuludq $H2,$T2,$T1 # h2*r1 + vpaddq $T1,$D3,$D3 # d3 += h2*r1 + vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 + vpmuludq $H1,$T2,$T0 # h1*r1 + vpaddq $T0,$D2,$D2 # d2 += h1*r1 + vpmuludq $H0,$T2,$T2 # h0*r1 + vpaddq $T2,$D1,$D1 # d1 += h0*r1 + vpmuludq $H4,$T3,$T3 # h4*s1 + vpaddq $T3,$D0,$D0 # d0 += h4*s1 + + vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 + vpmuludq $H2,$T4,$T1 # h2*r2 + vpaddq $T1,$D4,$D4 # d4 += h2*r2 + vpmuludq $H1,$T4,$T0 # h1*r2 + vpaddq $T0,$D3,$D3 # d3 += h1*r2 + vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 + vpmuludq $H0,$T4,$T4 # h0*r2 + vpaddq $T4,$D2,$D2 # d2 += h0*r2 + vpmuludq $H4,$T2,$T1 # h4*s2 + vpaddq $T1,$D1,$D1 # d1 += h4*s2 + vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 + vpmuludq $H3,$T2,$T2 # h3*s2 + vpaddq $T2,$D0,$D0 # d0 += h3*s2 + + vpmuludq $H1,$T3,$T0 # h1*r3 + vpaddq $T0,$D4,$D4 # d4 += h1*r3 + vpmuludq $H0,$T3,$T3 # h0*r3 + vpaddq $T3,$D3,$D3 # d3 += 
h0*r3 + vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 + vpmuludq $H4,$T4,$T1 # h4*s3 + vpaddq $T1,$D2,$D2 # d2 += h4*s3 + vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 + vpmuludq $H3,$T4,$T0 # h3*s3 + vpaddq $T0,$D1,$D1 # d1 += h3*s3 + vpmuludq $H2,$T4,$T4 # h2*s3 + vpaddq $T4,$D0,$D0 # d0 += h2*s3 + + vpmuludq $H0,$T2,$T2 # h0*r4 + vpaddq $T2,$D4,$D4 # d4 += h0*r4 + vpmuludq $H4,$T3,$T1 # h4*s4 + vpaddq $T1,$D3,$D3 # d3 += h4*s4 + vpmuludq $H3,$T3,$T0 # h3*s4 + vpaddq $T0,$D2,$D2 # d2 += h3*s4 + vpmuludq $H2,$T3,$T1 # h2*s4 + vpaddq $T1,$D1,$D1 # d1 += h2*s4 + vpmuludq $H1,$T3,$T3 # h1*s4 + vpaddq $T3,$D0,$D0 # d0 += h1*s4 + +.Lshort_tail_avx: + ################################################################ + # horizontal addition + + vpsrldq \$8,$D4,$T4 + vpsrldq \$8,$D3,$T3 + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$D0,$T0 + vpsrldq \$8,$D2,$T2 + vpaddq $T3,$D3,$D3 + vpaddq $T4,$D4,$D4 + vpaddq $T0,$D0,$D0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$D2,$D2 + + ################################################################ + # lazy reduction + + vpsrlq \$26,$D3,$H3 + vpand $MASK,$D3,$D3 + vpaddq $H3,$D4,$D4 # h3 -> h4 + + vpsrlq \$26,$D0,$H0 + vpand $MASK,$D0,$D0 + vpaddq $H0,$D1,$D1 # h0 -> h1 + + vpsrlq \$26,$D4,$H4 + vpand $MASK,$D4,$D4 + + vpsrlq \$26,$D1,$H1 + vpand $MASK,$D1,$D1 + vpaddq $H1,$D2,$D2 # h1 -> h2 + + vpaddq $H4,$D0,$D0 + vpsllq \$2,$H4,$H4 + vpaddq $H4,$D0,$D0 # h4 -> h0 + + vpsrlq \$26,$D2,$H2 + vpand $MASK,$D2,$D2 + vpaddq $H2,$D3,$D3 # h2 -> h3 + + vpsrlq \$26,$D0,$H0 + vpand $MASK,$D0,$D0 + vpaddq $H0,$D1,$D1 # h0 -> h1 + + vpsrlq \$26,$D3,$H3 + vpand $MASK,$D3,$D3 + vpaddq $H3,$D4,$D4 # h3 -> h4 + + vmovd $D0,`4*0-48-64`($ctx) # save partially reduced + vmovd $D1,`4*1-48-64`($ctx) + vmovd $D2,`4*2-48-64`($ctx) + vmovd $D3,`4*3-48-64`($ctx) + vmovd $D4,`4*4-48-64`($ctx) +___ +$code.=<<___ if ($win64); + vmovdqa 0x50(%r11),%xmm6 + vmovdqa 0x60(%r11),%xmm7 + vmovdqa 0x70(%r11),%xmm8 + vmovdqa 0x80(%r11),%xmm9 + vmovdqa 0x90(%r11),%xmm10 + vmovdqa 0xa0(%r11),%xmm11 
+ vmovdqa 0xb0(%r11),%xmm12 + vmovdqa 0xc0(%r11),%xmm13 + vmovdqa 0xd0(%r11),%xmm14 + vmovdqa 0xe0(%r11),%xmm15 + lea 0xf8(%r11),%rsp +.Ldo_avx_epilogue: +___ +$code.=<<___ if (!$win64); + lea 0x58(%r11),%rsp +.cfi_def_cfa %rsp,8 +___ +$code.=<<___; + vzeroupper + ret +.cfi_endproc +.size poly1305_blocks_avx,.-poly1305_blocks_avx + +.type poly1305_emit_avx,\@function,3 +.align 32 +poly1305_emit_avx: + cmpl \$0,20($ctx) # is_base2_26? + je .Lemit + + mov 0($ctx),%eax # load hash value base 2^26 + mov 4($ctx),%ecx + mov 8($ctx),%r8d + mov 12($ctx),%r11d + mov 16($ctx),%r10d + + shl \$26,%rcx # base 2^26 -> base 2^64 + mov %r8,%r9 + shl \$52,%r8 + add %rcx,%rax + shr \$12,%r9 + add %rax,%r8 # h0 + adc \$0,%r9 + + shl \$14,%r11 + mov %r10,%rax + shr \$24,%r10 + add %r11,%r9 + shl \$40,%rax + add %rax,%r9 # h1 + adc \$0,%r10 # h2 + + mov %r10,%rax # could be partially reduced, so reduce + mov %r10,%rcx + and \$3,%r10 + shr \$2,%rax + and \$-4,%rcx + add %rcx,%rax + add %rax,%r8 + adc \$0,%r9 + adc \$0,%r10 + + mov %r8,%rax + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? 
+ cmovnz %r8,%rax + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + ret +.size poly1305_emit_avx,.-poly1305_emit_avx +___ + +if ($avx>1) { +my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = + map("%ymm$_",(0..15)); +my $S4=$MASK; + +$code.=<<___; +.type poly1305_blocks_avx2,\@function,4 +.align 32 +poly1305_blocks_avx2: +.cfi_startproc + mov 20($ctx),%r8d # is_base2_26 + cmp \$128,$len + jae .Lblocks_avx2 + test %r8d,%r8d + jz .Lblocks + +.Lblocks_avx2: + and \$-16,$len + jz .Lno_data_avx2 + + vzeroupper + + test %r8d,%r8d + jz .Lbase2_64_avx2 + + test \$63,$len + jz .Leven_avx2 + + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lblocks_avx2_body: + + mov $len,%r15 # reassign $len + + mov 0($ctx),$d1 # load hash value + mov 8($ctx),$d2 + mov 16($ctx),$h2#d + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + ################################# base 2^26 -> base 2^64 + mov $d1#d,$h0#d + and \$`-1*(1<<31)`,$d1 + mov $d2,$r1 # borrow $r1 + mov $d2#d,$h1#d + and \$`-1*(1<<31)`,$d2 + + shr \$6,$d1 + shl \$52,$r1 + add $d1,$h0 + shr \$12,$h1 + shr \$18,$d2 + add $r1,$h0 + adc $d2,$h1 + + mov $h2,$d1 + shl \$40,$d1 + shr \$24,$h2 + add $d1,$h1 + adc \$0,$h2 # can be partially reduced... + + mov \$-4,$d2 # ... 
so reduce + mov $h2,$d1 + and $h2,$d2 + shr \$2,$d1 + and \$3,$h2 + add $d2,$d1 # =*5 + add $d1,$h0 + adc \$0,$h1 + adc \$0,$h2 + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + +.Lbase2_26_pre_avx2: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + mov $r1,%rax + + test \$63,%r15 + jnz .Lbase2_26_pre_avx2 + + test $padbit,$padbit # if $padbit is zero, + jz .Lstore_base2_64_avx2 # store hash in base 2^64 format + + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$r0 + mov $h1,$r1 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$r0 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $r0,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$r1 + and \$0x3ffffff,$h1 # h[3] + or $r1,$h2 # h[4] + + test %r15,%r15 + jz .Lstore_base2_26_avx2 + + vmovd %rax#d,%x#$H0 + vmovd %rdx#d,%x#$H1 + vmovd $h0#d,%x#$H2 + vmovd $h1#d,%x#$H3 + vmovd $h2#d,%x#$H4 + jmp .Lproceed_avx2 + +.align 32 +.Lstore_base2_64_avx2: + mov $h0,0($ctx) + mov $h1,8($ctx) + mov $h2,16($ctx) # note that is_base2_26 is zeroed + jmp .Ldone_avx2 + +.align 16 +.Lstore_base2_26_avx2: + mov %rax#d,0($ctx) # store hash value base 2^26 + mov %rdx#d,4($ctx) + mov $h0#d,8($ctx) + mov $h1#d,12($ctx) + mov $h2#d,16($ctx) +.align 16 +.Ldone_avx2: + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbp +.cfi_restore %rbp + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data_avx2: +.Lblocks_avx2_epilogue: + ret +.cfi_endproc + +.align 32 +.Lbase2_64_avx2: +.cfi_startproc + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 +.Lbase2_64_avx2_body: + + mov 
$len,%r15 # reassign $len + + mov 24($ctx),$r0 # load r + mov 32($ctx),$s1 + + mov 0($ctx),$h0 # load hash value + mov 8($ctx),$h1 + mov 16($ctx),$h2#d + + mov $s1,$r1 + mov $s1,%rax + shr \$2,$s1 + add $r1,$s1 # s1 = r1 + (r1 >> 2) + + test \$63,$len + jz .Linit_avx2 + +.Lbase2_64_pre_avx2: + add 0($inp),$h0 # accumulate input + adc 8($inp),$h1 + lea 16($inp),$inp + adc $padbit,$h2 + sub \$16,%r15 + + call __poly1305_block + mov $r1,%rax + + test \$63,%r15 + jnz .Lbase2_64_pre_avx2 + +.Linit_avx2: + ################################# base 2^64 -> base 2^26 + mov $h0,%rax + mov $h0,%rdx + shr \$52,$h0 + mov $h1,$d1 + mov $h1,$d2 + shr \$26,%rdx + and \$0x3ffffff,%rax # h[0] + shl \$12,$d1 + and \$0x3ffffff,%rdx # h[1] + shr \$14,$h1 + or $d1,$h0 + shl \$24,$h2 + and \$0x3ffffff,$h0 # h[2] + shr \$40,$d2 + and \$0x3ffffff,$h1 # h[3] + or $d2,$h2 # h[4] + + vmovd %rax#d,%x#$H0 + vmovd %rdx#d,%x#$H1 + vmovd $h0#d,%x#$H2 + vmovd $h1#d,%x#$H3 + vmovd $h2#d,%x#$H4 + movl \$1,20($ctx) # set is_base2_26 + + call __poly1305_init_avx + +.Lproceed_avx2: + mov %r15,$len # restore $len + mov OPENSSL_ia32cap_P+8(%rip),%r10d + mov \$`(1<<31|1<<30|1<<16)`,%r11d + + mov 0(%rsp),%r15 +.cfi_restore %r15 + mov 8(%rsp),%r14 +.cfi_restore %r14 + mov 16(%rsp),%r13 +.cfi_restore %r13 + mov 24(%rsp),%r12 +.cfi_restore %r12 + mov 32(%rsp),%rbp +.cfi_restore %rbp + mov 40(%rsp),%rbx +.cfi_restore %rbx + lea 48(%rsp),%rax + lea 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lbase2_64_avx2_epilogue: + jmp .Ldo_avx2 +.cfi_endproc + +.align 32 +.Leven_avx2: +.cfi_startproc + mov OPENSSL_ia32cap_P+8(%rip),%r10d + vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 + vmovd 4*1($ctx),%x#$H1 + vmovd 4*2($ctx),%x#$H2 + vmovd 4*3($ctx),%x#$H3 + vmovd 4*4($ctx),%x#$H4 + +.Ldo_avx2: +___ +$code.=<<___ if ($avx>2); + cmp \$512,$len + jb .Lskip_avx512 + and %r11d,%r10d + test \$`1<<16`,%r10d # check for AVX512F + jnz .Lblocks_avx512 +.Lskip_avx512: +___ +$code.=<<___ if (!$win64); + lea -8(%rsp),%r11 
+.cfi_def_cfa %r11,16 + sub \$0x128,%rsp +___ +$code.=<<___ if ($win64); + lea -0xf8(%rsp),%r11 + sub \$0x1c8,%rsp + vmovdqa %xmm6,0x50(%r11) + vmovdqa %xmm7,0x60(%r11) + vmovdqa %xmm8,0x70(%r11) + vmovdqa %xmm9,0x80(%r11) + vmovdqa %xmm10,0x90(%r11) + vmovdqa %xmm11,0xa0(%r11) + vmovdqa %xmm12,0xb0(%r11) + vmovdqa %xmm13,0xc0(%r11) + vmovdqa %xmm14,0xd0(%r11) + vmovdqa %xmm15,0xe0(%r11) +.Ldo_avx2_body: +___ +$code.=<<___; + lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 + + # expand and copy pre-calculated table to stack + vmovdqu `16*0-64`($ctx),%x#$T2 + and \$-512,%rsp + vmovdqu `16*1-64`($ctx),%x#$T3 + vmovdqu `16*2-64`($ctx),%x#$T4 + vmovdqu `16*3-64`($ctx),%x#$D0 + vmovdqu `16*4-64`($ctx),%x#$D1 + vmovdqu `16*5-64`($ctx),%x#$D2 + lea 0x90(%rsp),%rax # size optimization + vmovdqu `16*6-64`($ctx),%x#$D3 + vpermd $T2,$T0,$T2 # 00003412 -> 14243444 + vmovdqu `16*7-64`($ctx),%x#$D4 + vpermd $T3,$T0,$T3 + vmovdqu `16*8-64`($ctx),%x#$MASK + vpermd $T4,$T0,$T4 + vmovdqa $T2,0x00(%rsp) + vpermd $D0,$T0,$D0 + vmovdqa $T3,0x20-0x90(%rax) + vpermd $D1,$T0,$D1 + vmovdqa $T4,0x40-0x90(%rax) + vpermd $D2,$T0,$D2 + vmovdqa $D0,0x60-0x90(%rax) + vpermd $D3,$T0,$D3 + vmovdqa $D1,0x80-0x90(%rax) + vpermd $D4,$T0,$D4 + vmovdqa $D2,0xa0-0x90(%rax) + vpermd $MASK,$T0,$MASK + vmovdqa $D3,0xc0-0x90(%rax) + vmovdqa $D4,0xe0-0x90(%rax) + vmovdqa $MASK,0x100-0x90(%rax) + vmovdqa 64(%rcx),$MASK # .Lmask26 + + ################################################################ + # load input + vmovdqu 16*0($inp),%x#$T0 + vmovdqu 16*1($inp),%x#$T1 + vinserti128 \$1,16*2($inp),$T0,$T0 + vinserti128 \$1,16*3($inp),$T1,$T1 + lea 16*4($inp),$inp + + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpunpcklqdq $T3,$T2,$T2 # 2:3 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + + vpsrlq \$30,$T2,$T3 + vpsrlq \$4,$T2,$T2 + vpsrlq \$26,$T0,$T1 + vpsrlq \$40,$T4,$T4 # 4 + vpand $MASK,$T2,$T2 # 2 + vpand $MASK,$T0,$T0 # 0 
+ vpand $MASK,$T1,$T1 # 1 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + vpaddq $H2,$T2,$H2 # accumulate input + sub \$64,$len + jz .Ltail_avx2 + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: + ################################################################ + # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 + # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 + # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 + # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 + # \________/\__________/ + ################################################################ + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + vmovdqa `32*0`(%rsp),$T0 # r0^4 + vpaddq $H1,$T1,$H1 + vmovdqa `32*1`(%rsp),$T1 # r1^4 + vpaddq $H3,$T3,$H3 + vmovdqa `32*3`(%rsp),$T2 # r2^4 + vpaddq $H4,$T4,$H4 + vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 + vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # however, as h2 is "chronologically" first one available pull + # corresponding operations up, so it's + # + # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 + # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 + + vpmuludq $H2,$T0,$D2 # d2 = h2*r0 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + + vpmuludq $H0,$T1,$T4 # h0*r1 + vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp + vpaddq $T4,$D1,$D1 # d1 += h0*r1 + vpaddq $H2,$D2,$D2 # d2 += h1*r1 + vpmuludq $H3,$T1,$T4 # h3*r1 + vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 + vpaddq $T4,$D4,$D4 # d4 += h3*r1 + vpaddq $H2,$D0,$D0 # d0 += h4*s1 + vmovdqa `32*4-0x90`(%rax),$T1 # s2 + + 
vpmuludq $H0,$T0,$T4 # h0*r0 + vpmuludq $H1,$T0,$H2 # h1*r0 + vpaddq $T4,$D0,$D0 # d0 += h0*r0 + vpaddq $H2,$D1,$D1 # d1 += h1*r0 + vpmuludq $H3,$T0,$T4 # h3*r0 + vpmuludq $H4,$T0,$H2 # h4*r0 + vmovdqu 16*0($inp),%x#$T0 # load input + vpaddq $T4,$D3,$D3 # d3 += h3*r0 + vpaddq $H2,$D4,$D4 # d4 += h4*r0 + vinserti128 \$1,16*2($inp),$T0,$T0 + + vpmuludq $H3,$T1,$T4 # h3*s2 + vpmuludq $H4,$T1,$H2 # h4*s2 + vmovdqu 16*1($inp),%x#$T1 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + vpaddq $H2,$D1,$D1 # d1 += h4*s2 + vmovdqa `32*5-0x90`(%rax),$H2 # r3 + vpmuludq $H1,$T2,$T4 # h1*r2 + vpmuludq $H0,$T2,$T2 # h0*r2 + vpaddq $T4,$D3,$D3 # d3 += h1*r2 + vpaddq $T2,$D2,$D2 # d2 += h0*r2 + vinserti128 \$1,16*3($inp),$T1,$T1 + lea 16*4($inp),$inp + + vpmuludq $H1,$H2,$T4 # h1*r3 + vpmuludq $H0,$H2,$H2 # h0*r3 + vpsrldq \$6,$T0,$T2 # splat input + vpaddq $T4,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $H3,$T3,$T4 # h3*s3 + vpmuludq $H4,$T3,$H2 # h4*s3 + vpsrldq \$6,$T1,$T3 + vpaddq $T4,$D1,$D1 # d1 += h3*s3 + vpaddq $H2,$D2,$D2 # d2 += h4*s3 + vpunpckhqdq $T1,$T0,$T4 # 4 + + vpmuludq $H3,$S4,$H3 # h3*s4 + vpmuludq $H4,$S4,$H4 # h4*s4 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 + vpunpcklqdq $T3,$T2,$T3 # 2:3 + vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 + vpmuludq $H1,$S4,$H0 # h1*s4 + vmovdqa 64(%rcx),$MASK # .Lmask26 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + ################################################################ + # lazy reduction (interleaved with tail of input splat) + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$4,$T3,$T2 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpand 
$MASK,$T2,$T2 # 2 + vpsrlq \$26,$T0,$T1 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpaddq $T2,$H2,$H2 # modulo-scheduled + vpsrlq \$30,$T3,$T3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$40,$T4,$T4 # 4 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpand $MASK,$T0,$T0 # 0 + vpand $MASK,$T1,$T1 # 1 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + + sub \$64,$len + jnz .Loop_avx2 + + .byte 0x66,0x90 +.Ltail_avx2: + ################################################################ + # while above multiplications were by r^4 in all lanes, in last + # iteration we multiply least significant lane by r^4 and most + # significant one by r, so copy of above except that references + # to the precomputed table are displaced by 4... + + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + vmovdqu `32*0+4`(%rsp),$T0 # r0^4 + vpaddq $H1,$T1,$H1 + vmovdqu `32*1+4`(%rsp),$T1 # r1^4 + vpaddq $H3,$T3,$H3 + vmovdqu `32*3+4`(%rsp),$T2 # r2^4 + vpaddq $H4,$T4,$H4 + vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 + vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 + + vpmuludq $H2,$T0,$D2 # d2 = h2*r0 + vpmuludq $H2,$T1,$D3 # d3 = h2*r1 + vpmuludq $H2,$T2,$D4 # d4 = h2*r2 + vpmuludq $H2,$T3,$D0 # d0 = h2*s3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + + vpmuludq $H0,$T1,$T4 # h0*r1 + vpmuludq $H1,$T1,$H2 # h1*r1 + vpaddq $T4,$D1,$D1 # d1 += h0*r1 + vpaddq $H2,$D2,$D2 # d2 += h1*r1 + vpmuludq $H3,$T1,$T4 # h3*r1 + vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 + vpaddq $T4,$D4,$D4 # d4 += h3*r1 + vpaddq $H2,$D0,$D0 # d0 += h4*s1 + + vpmuludq $H0,$T0,$T4 # h0*r0 + vpmuludq $H1,$T0,$H2 # h1*r0 + vpaddq $T4,$D0,$D0 # d0 += h0*r0 + vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 + vpaddq $H2,$D1,$D1 # d1 += h1*r0 + vpmuludq $H3,$T0,$T4 # h3*r0 + vpmuludq $H4,$T0,$H2 # h4*r0 + vpaddq $T4,$D3,$D3 # d3 += h3*r0 + vpaddq $H2,$D4,$D4 # d4 += h4*r0 + + vpmuludq $H3,$T1,$T4 # h3*s2 + 
vpmuludq $H4,$T1,$H2 # h4*s2 + vpaddq $T4,$D0,$D0 # d0 += h3*s2 + vpaddq $H2,$D1,$D1 # d1 += h4*s2 + vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 + vpmuludq $H1,$T2,$T4 # h1*r2 + vpmuludq $H0,$T2,$T2 # h0*r2 + vpaddq $T4,$D3,$D3 # d3 += h1*r2 + vpaddq $T2,$D2,$D2 # d2 += h0*r2 + + vpmuludq $H1,$H2,$T4 # h1*r3 + vpmuludq $H0,$H2,$H2 # h0*r3 + vpaddq $T4,$D4,$D4 # d4 += h1*r3 + vpaddq $H2,$D3,$D3 # d3 += h0*r3 + vpmuludq $H3,$T3,$T4 # h3*s3 + vpmuludq $H4,$T3,$H2 # h4*s3 + vpaddq $T4,$D1,$D1 # d1 += h3*s3 + vpaddq $H2,$D2,$D2 # d2 += h4*s3 + + vpmuludq $H3,$S4,$H3 # h3*s4 + vpmuludq $H4,$S4,$H4 # h4*s4 + vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 + vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 + vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 + vpmuludq $H1,$S4,$H0 # h1*s4 + vmovdqa 64(%rcx),$MASK # .Lmask26 + vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 + vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 + + ################################################################ + # horizontal addition + + vpsrldq \$8,$D1,$T1 + vpsrldq \$8,$H2,$T2 + vpsrldq \$8,$H3,$T3 + vpsrldq \$8,$H4,$T4 + vpsrldq \$8,$H0,$T0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + + vpermq \$0x2,$H3,$T3 + vpermq \$0x2,$H4,$T4 + vpermq \$0x2,$H0,$T0 + vpermq \$0x2,$D1,$T1 + vpermq \$0x2,$H2,$T2 + vpaddq $T3,$H3,$H3 + vpaddq $T4,$H4,$H4 + vpaddq $T0,$H0,$H0 + vpaddq $T1,$D1,$D1 + vpaddq $T2,$H2,$H2 + + ################################################################ + # lazy reduction + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$D1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> 
h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced + vmovd %x#$H1,`4*1-48-64`($ctx) + vmovd %x#$H2,`4*2-48-64`($ctx) + vmovd %x#$H3,`4*3-48-64`($ctx) + vmovd %x#$H4,`4*4-48-64`($ctx) +___ +$code.=<<___ if ($win64); + vmovdqa 0x50(%r11),%xmm6 + vmovdqa 0x60(%r11),%xmm7 + vmovdqa 0x70(%r11),%xmm8 + vmovdqa 0x80(%r11),%xmm9 + vmovdqa 0x90(%r11),%xmm10 + vmovdqa 0xa0(%r11),%xmm11 + vmovdqa 0xb0(%r11),%xmm12 + vmovdqa 0xc0(%r11),%xmm13 + vmovdqa 0xd0(%r11),%xmm14 + vmovdqa 0xe0(%r11),%xmm15 + lea 0xf8(%r11),%rsp +.Ldo_avx2_epilogue: +___ +$code.=<<___ if (!$win64); + lea 8(%r11),%rsp +.cfi_def_cfa %rsp,8 +___ +$code.=<<___; + vzeroupper + ret +.cfi_endproc +.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 +___ +####################################################################### +if ($avx>2) { +# On entry we have input length divisible by 64. But since inner loop +# processes 128 bytes per iteration, cases when length is not divisible +# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this +# reason stack layout is kept identical to poly1305_blocks_avx2. If not +# for this tail, we wouldn't have to even allocate stack frame... 
+ +my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); +my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); +my $PADBIT="%zmm30"; + +map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain +map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); +map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); +map(s/%y/%z/,($MASK)); + +$code.=<<___; +.type poly1305_blocks_avx512,\@function,4 +.align 32 +poly1305_blocks_avx512: +.cfi_startproc +.Lblocks_avx512: + mov \$15,%eax + kmovw %eax,%k2 # %k2 = 0x0f: four low qwords (256 bits), used for the table stores below +___ +$code.=<<___ if (!$win64); + lea -8(%rsp),%r11 +.cfi_def_cfa %r11,16 + sub \$0x128,%rsp +___ +$code.=<<___ if ($win64); + lea -0xf8(%rsp),%r11 + sub \$0x1c8,%rsp + vmovdqa %xmm6,0x50(%r11) + vmovdqa %xmm7,0x60(%r11) + vmovdqa %xmm8,0x70(%r11) + vmovdqa %xmm9,0x80(%r11) + vmovdqa %xmm10,0x90(%r11) + vmovdqa %xmm11,0xa0(%r11) + vmovdqa %xmm12,0xb0(%r11) + vmovdqa %xmm13,0xc0(%r11) + vmovdqa %xmm14,0xd0(%r11) + vmovdqa %xmm15,0xe0(%r11) +.Ldo_avx512_body: +___ +$code.=<<___; + lea .Lconst(%rip),%rcx + lea 48+64($ctx),$ctx # size optimization + vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 + + # expand pre-calculated table + vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} + and \$-512,%rsp + vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} + mov \$0x20,%rax + vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} + vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} + vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} + vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} + vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} + vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} + vmovdqu `16*8-64`($ctx),%x#$T4 # ...
${S4} + vpermd $D0,$T2,$R0 # 00003412 -> 14243444 + vpbroadcastq 64(%rcx),$MASK # .Lmask26 + vpermd $D1,$T2,$R1 + vpermd $T0,$T2,$S1 + vpermd $D2,$T2,$R2 + vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 + vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 + vpermd $T1,$T2,$S2 + vmovdqu64 $R1,0x00(%rsp,%rax){%k2} + vpsrlq \$32,$R1,$T1 + vpermd $D3,$T2,$R3 + vmovdqa64 $S1,0x40(%rsp){%k2} + vpermd $T3,$T2,$S3 + vpermd $D4,$T2,$R4 + vmovdqu64 $R2,0x40(%rsp,%rax){%k2} + vpermd $T4,$T2,$S4 + vmovdqa64 $S2,0x80(%rsp){%k2} + vmovdqu64 $R3,0x80(%rsp,%rax){%k2} + vmovdqa64 $S3,0xc0(%rsp){%k2} + vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} + vmovdqa64 $S4,0x100(%rsp){%k2} + + ################################################################ + # calculate 5th through 8th powers of the key + # + # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 + # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 + # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 + # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 + # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 + + vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 + vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 + vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 + vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 + vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 + vpsrlq \$32,$R2,$T2 + + vpmuludq $T1,$S4,$M0 + vpmuludq $T1,$R0,$M1 + vpmuludq $T1,$R1,$M2 + vpmuludq $T1,$R2,$M3 + vpmuludq $T1,$R3,$M4 + vpsrlq \$32,$R3,$T3 + vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 + vpaddq $M1,$D1,$D1 # d1 += r1'*r0 + vpaddq $M2,$D2,$D2 # d2 += r1'*r1 + vpaddq $M3,$D3,$D3 # d3 += r1'*r2 + vpaddq $M4,$D4,$D4 # d4 += r1'*r3 + + vpmuludq $T2,$S3,$M0 + vpmuludq $T2,$S4,$M1 + vpmuludq $T2,$R1,$M3 + vpmuludq $T2,$R2,$M4 + vpmuludq $T2,$R0,$M2 + vpsrlq \$32,$R4,$T4 + vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 + vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 + vpaddq $M3,$D3,$D3 # d3 += r2'*r1 + vpaddq $M4,$D4,$D4 # d4 += r2'*r2 + vpaddq $M2,$D2,$D2 # d2 += r2'*r0 + + vpmuludq $T3,$S2,$M0 + vpmuludq $T3,$R0,$M3 + vpmuludq $T3,$R1,$M4 + vpmuludq
$T3,$S3,$M1 + vpmuludq $T3,$S4,$M2 + vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 + vpaddq $M3,$D3,$D3 # d3 += r3'*r0 + vpaddq $M4,$D4,$D4 # d4 += r3'*r1 + vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 + vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 + + vpmuludq $T4,$S4,$M3 + vpmuludq $T4,$R0,$M4 + vpmuludq $T4,$S1,$M0 + vpmuludq $T4,$S2,$M1 + vpmuludq $T4,$S3,$M2 + vpaddq $M3,$D3,$D3 # d3 += r4'*5*r4 + vpaddq $M4,$D4,$D4 # d4 += r4'*r0 + vpaddq $M0,$D0,$D0 # d0 += r4'*5*r1 + vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2 + vpaddq $M2,$D2,$D2 # d2 += r4'*5*r3 + + ################################################################ + # load input + vmovdqu64 16*0($inp),%z#$T3 + vmovdqu64 16*4($inp),%z#$T4 + lea 16*8($inp),$inp + + ################################################################ + # lazy reduction + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D4,$M4 + vpandq $MASK,$D4,$D4 + + vpsrlq \$26,$D1,$M1 + vpandq $MASK,$D1,$D1 + vpaddq $M1,$D2,$D2 # d1 -> d2 + + vpaddq $M4,$D0,$D0 + vpsllq \$2,$M4,$M4 + vpaddq $M4,$D0,$D0 # d4 -> d0 + + vpsrlq \$26,$D2,$M2 + vpandq $MASK,$D2,$D2 + vpaddq $M2,$D3,$D3 # d2 -> d3 + + vpsrlq \$26,$D0,$M0 + vpandq $MASK,$D0,$D0 + vpaddq $M0,$D1,$D1 # d0 -> d1 + + vpsrlq \$26,$D3,$M3 + vpandq $MASK,$D3,$D3 + vpaddq $M3,$D4,$D4 # d3 -> d4 + + ################################################################ + # at this point we have 14243444 in $R0-$S4 and 05060708 in + # $D0-$D4, ... + + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + # ... since input 64-bit lanes are ordered as 73625140, we could + # "vperm" it to 76543210 (here and in each loop iteration), *or* + # we could just flow along, hence the goal for $R0-$S4 is + # 1858286838784888 ...
+ + vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: + mov \$0x7777,%eax + kmovw %eax,%k1 # %k1 = 0x7777: top dword of every four is left unwritten + + vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- + vpermd $R1,$M0,$R1 + vpermd $R2,$M0,$R2 + vpermd $R3,$M0,$R3 + vpermd $R4,$M0,$R4 + + vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 + vpermd $D1,$M0,${R1}{%k1} + vpermd $D2,$M0,${R2}{%k1} + vpermd $D3,$M0,${R3}{%k1} + vpermd $D4,$M0,${R4}{%k1} + + vpslld \$2,$R1,$S1 # *5 + vpslld \$2,$R2,$S2 + vpslld \$2,$R3,$S3 + vpslld \$2,$R4,$S4 + vpaddd $R1,$S1,$S1 + vpaddd $R2,$S2,$S2 + vpaddd $R3,$S3,$S3 + vpaddd $R4,$S4,$S4 + + vpbroadcastq 32(%rcx),$PADBIT # .L129 + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + vporq $T3,$T2,$T2 + vpsrlq \$26,$T0,$T1 + vpsrlq \$14,$T4,$T3 + vpsrlq \$40,$T4,$T4 # 4 + vpandq $MASK,$T2,$T2 # 2 + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + vpaddq $H2,$T2,$H2 # accumulate input + sub \$192,$len + jbe .Ltail_avx512 + jmp .Loop_avx512 + +.align 32 +.Loop_avx512: + ################################################################ + # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 + # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 + # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 + # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 + # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 + # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 + # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 + # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 + # \________/\___________/ + ################################################################ + #vpaddq $H2,$T2,$H2 # accumulate input + + # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 + # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 + # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 + # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 + # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 + # + # however, as h2 is "chronologically" first one available pull + # corresponding operations up, so it's + # + # d3 = h2*r1 + h0*r3 + h1*r2 +
h3*r0 + h4*5*r4 + # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 + # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 + # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 + # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpaddq $H0,$T0,$H0 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu64 16*0($inp),$T3 # load input + vmovdqu64 16*4($inp),$T4 + lea 16*8($inp),$inp + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vpunpcklqdq $T4,$T3,$T0 # transpose input + vpunpckhqdq $T4,$T3,$T4 + + vpmuludq $H3,$R0,$M3 + vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpaddq $M3,$D3,$D3 # d3 += h4*s4 + vpmuludq $H3,$S4,$M2 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 + vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3 + +
################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$52,$T0,$T2 # splat input + vpsllq \$12,$T4,$T3 + + vpsrlq \$26,$D3,$H3 + vpandq $MASK,$D3,$D3 + vpaddq $H3,$D4,$H4 # h3 -> h4 + + vporq $T3,$T2,$T2 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpandq $MASK,$T2,$T2 # 2 + + vpsrlq \$26,$H4,$D4 + vpandq $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpandq $MASK,$H1,$H1 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpaddq $T2,$H2,$H2 # modulo-scheduled + vpsrlq \$26,$T0,$T1 + + vpsrlq \$26,$H2,$D2 + vpandq $MASK,$H2,$H2 + vpaddq $D2,$D3,$H3 # h2 -> h3 + + vpsrlq \$14,$T4,$T3 + + vpsrlq \$26,$H0,$D0 + vpandq $MASK,$H0,$H0 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$40,$T4,$T4 # 4 + + vpsrlq \$26,$H3,$D3 + vpandq $MASK,$H3,$H3 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpandq $MASK,$T0,$T0 # 0 + #vpandq $MASK,$T1,$T1 # 1 + #vpandq $MASK,$T3,$T3 # 3 + #vporq $PADBIT,$T4,$T4 # padbit, yes, always + + sub \$128,$len + ja .Loop_avx512 + +.Ltail_avx512: + ################################################################ + # while above multiplications were by r^8 in all lanes, in last + # iteration we multiply least significant lane by r^8 and most + # significant one by r, that's why table gets shifted...
+ + vpsrlq \$32,$R0,$R0 # 0105020603070408 + vpsrlq \$32,$R1,$R1 + vpsrlq \$32,$R2,$R2 + vpsrlq \$32,$S3,$S3 + vpsrlq \$32,$S4,$S4 + vpsrlq \$32,$R3,$R3 + vpsrlq \$32,$R4,$R4 + vpsrlq \$32,$S1,$S1 + vpsrlq \$32,$S2,$S2 + + ################################################################ + # load either next or last 64 bytes of input + lea ($inp,$len),$inp + + #vpaddq $H2,$T2,$H2 # accumulate input + vpaddq $H0,$T0,$H0 + + vpmuludq $H2,$R1,$D3 # d3 = h2*r1 + vpmuludq $H2,$R2,$D4 # d4 = h2*r2 + vpmuludq $H2,$S3,$D0 # d0 = h2*s3 + vpandq $MASK,$T1,$T1 # 1 + vpmuludq $H2,$S4,$D1 # d1 = h2*s4 + vpandq $MASK,$T3,$T3 # 3 + vpmuludq $H2,$R0,$D2 # d2 = h2*r0 + vporq $PADBIT,$T4,$T4 # padbit, yes, always + vpaddq $H1,$T1,$H1 # accumulate input + vpaddq $H3,$T3,$H3 + vpaddq $H4,$T4,$H4 + + vmovdqu 16*0($inp),%x#$T0 + vpmuludq $H0,$R3,$M3 + vpmuludq $H0,$R4,$M4 + vpmuludq $H0,$R0,$M0 + vpmuludq $H0,$R1,$M1 + vpaddq $M3,$D3,$D3 # d3 += h0*r3 + vpaddq $M4,$D4,$D4 # d4 += h0*r4 + vpaddq $M0,$D0,$D0 # d0 += h0*r0 + vpaddq $M1,$D1,$D1 # d1 += h0*r1 + + vmovdqu 16*1($inp),%x#$T1 + vpmuludq $H1,$R2,$M3 + vpmuludq $H1,$R3,$M4 + vpmuludq $H1,$S4,$M0 + vpmuludq $H0,$R2,$M2 + vpaddq $M3,$D3,$D3 # d3 += h1*r2 + vpaddq $M4,$D4,$D4 # d4 += h1*r3 + vpaddq $M0,$D0,$D0 # d0 += h1*s4 + vpaddq $M2,$D2,$D2 # d2 += h0*r2 + + vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 + vpmuludq $H3,$R0,$M3 + vpmuludq $H3,$R1,$M4 + vpmuludq $H1,$R0,$M1 + vpmuludq $H1,$R1,$M2 + vpaddq $M3,$D3,$D3 # d3 += h3*r0 + vpaddq $M4,$D4,$D4 # d4 += h3*r1 + vpaddq $M1,$D1,$D1 # d1 += h1*r0 + vpaddq $M2,$D2,$D2 # d2 += h1*r1 + + vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 + vpmuludq $H4,$S4,$M3 + vpmuludq $H4,$R0,$M4 + vpmuludq $H3,$S2,$M0 + vpmuludq $H3,$S3,$M1 + vpmuludq $H3,$S4,$M2 + vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 + vpaddq $M4,$D4,$D4 # d4 += h4*r0 + vpaddq $M0,$D0,$D0 # d0 += h3*s2 + vpaddq $M1,$D1,$D1 # d1 += h3*s3 + vpaddq $M2,$D2,$D2 # d2 += h3*s4 + + vpmuludq $H4,$S1,$M0 + vpmuludq $H4,$S2,$M1 + vpmuludq $H4,$S3,$M2 +
vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 + vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2 + vpaddq $M2,$D2,$H2 # h2 = d2 + h4*s3 + + ################################################################ + # horizontal addition + + mov \$1,%eax + vpermq \$0xb1,$H3,$D3 + vpermq \$0xb1,$D4,$H4 + vpermq \$0xb1,$H0,$D0 + vpermq \$0xb1,$H1,$D1 + vpermq \$0xb1,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + kmovw %eax,%k3 # %k3 = 1: lowest qword only + vpermq \$0x2,$H3,$D3 + vpermq \$0x2,$H4,$D4 + vpermq \$0x2,$H0,$D0 + vpermq \$0x2,$H1,$D1 + vpermq \$0x2,$H2,$D2 + vpaddq $D3,$H3,$H3 + vpaddq $D4,$H4,$H4 + vpaddq $D0,$H0,$H0 + vpaddq $D1,$H1,$H1 + vpaddq $D2,$H2,$H2 + + vextracti64x4 \$0x1,$H3,%y#$D3 + vextracti64x4 \$0x1,$H4,%y#$D4 + vextracti64x4 \$0x1,$H0,%y#$D0 + vextracti64x4 \$0x1,$H1,%y#$D1 + vextracti64x4 \$0x1,$H2,%y#$D2 + vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case + vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 + vpaddq $D0,$H0,${H0}{%k3}{z} + vpaddq $D1,$H1,${H1}{%k3}{z} + vpaddq $D2,$H2,${H2}{%k3}{z} +___ +map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); +map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); +$code.=<<___; + ################################################################ + # lazy reduction (interleaved with input splat) + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpsrldq \$6,$T0,$T2 # splat input + vpsrldq \$6,$T1,$T3 + vpunpckhqdq $T1,$T0,$T4 # 4 + vpaddq $D3,$H4,$H4 # h3 -> h4 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpunpcklqdq $T3,$T2,$T2 # 2:3 + vpunpcklqdq $T1,$T0,$T0 # 0:1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H4,$D4 + vpand $MASK,$H4,$H4 + + vpsrlq \$26,$H1,$D1 + vpand $MASK,$H1,$H1 + vpsrlq \$30,$T2,$T3 + vpsrlq \$4,$T2,$T2 + vpaddq $D1,$H2,$H2 # h1 -> h2 + + vpaddq $D4,$H0,$H0 + vpsllq \$2,$D4,$D4 + vpsrlq \$26,$T0,$T1 + vpsrlq \$40,$T4,$T4 # 4 + vpaddq $D4,$H0,$H0 # h4 -> h0 + + vpsrlq \$26,$H2,$D2 + vpand $MASK,$H2,$H2 + vpand $MASK,$T2,$T2 # 2 + vpand
$MASK,$T0,$T0 # 0 + vpaddq $D2,$H3,$H3 # h2 -> h3 + + vpsrlq \$26,$H0,$D0 + vpand $MASK,$H0,$H0 + vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 + vpand $MASK,$T1,$T1 # 1 + vpaddq $D0,$H1,$H1 # h0 -> h1 + + vpsrlq \$26,$H3,$D3 + vpand $MASK,$H3,$H3 + vpand $MASK,$T3,$T3 # 3 + vpor 32(%rcx),$T4,$T4 # padbit, yes, always + vpaddq $D3,$H4,$H4 # h3 -> h4 + + lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 + add \$64,$len + jnz .Ltail_avx2 + + vpsubq $T2,$H2,$H2 # undo input accumulation + vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced + vmovd %x#$H1,`4*1-48-64`($ctx) + vmovd %x#$H2,`4*2-48-64`($ctx) + vmovd %x#$H3,`4*3-48-64`($ctx) + vmovd %x#$H4,`4*4-48-64`($ctx) + vzeroall +___ +$code.=<<___ if ($win64); + movdqa 0x50(%r11),%xmm6 + movdqa 0x60(%r11),%xmm7 + movdqa 0x70(%r11),%xmm8 + movdqa 0x80(%r11),%xmm9 + movdqa 0x90(%r11),%xmm10 + movdqa 0xa0(%r11),%xmm11 + movdqa 0xb0(%r11),%xmm12 + movdqa 0xc0(%r11),%xmm13 + movdqa 0xd0(%r11),%xmm14 + movdqa 0xe0(%r11),%xmm15 + lea 0xf8(%r11),%rsp +.Ldo_avx512_epilogue: +___ +$code.=<<___ if (!$win64); + lea 8(%r11),%rsp +.cfi_def_cfa %rsp,8 +___ +$code.=<<___; + ret +.cfi_endproc +.size poly1305_blocks_avx512,.-poly1305_blocks_avx512 +___ +if ($avx>3) { +######################################################################## +# VPMADD52 version using 2^44 radix. +# +# One can argue that base 2^52 would be more natural. Well, even though +# some operations would be more natural, one has to recognize couple of +# things. Base 2^52 doesn't provide advantage over base 2^44 if you look +# at amount of multiply-n-accumulate operations. Secondly, it makes it +# impossible to pre-compute multiples of 5 [referred to as s[]/sN in +# reference implementations], which means that more such operations +# would have to be performed in inner loop, which in turn makes critical +# path longer. In other words, even though base 2^44 reduction might +# look less elegant, overall critical path is actually shorter...
+ +######################################################################## +# Layout of opaque area is as follows. +# +# unsigned __int64 h[3]; # current hash value base 2^44 +# unsigned __int64 s[2]; # key value*20 base 2^44 +# unsigned __int64 r[3]; # key value base 2^44 +# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; +# # r^n positions reflect +# # placement in register, not +# # memory, R[3] is R[1]*20 + +$code.=<<___; +.type poly1305_init_base2_44,\@function,3 +.align 32 +poly1305_init_base2_44: + xor %rax,%rax + mov %rax,0($ctx) # initialize hash value + mov %rax,8($ctx) + mov %rax,16($ctx) + +.Linit_base2_44: + lea poly1305_blocks_vpmadd52(%rip),%r10 + lea poly1305_emit_base2_44(%rip),%r11 + + mov \$0x0ffffffc0fffffff,%rax + mov \$0x0ffffffc0ffffffc,%rcx + and 0($inp),%rax + mov \$0x00000fffffffffff,%r8 + and 8($inp),%rcx + mov \$0x00000fffffffffff,%r9 + and %rax,%r8 + shrd \$44,%rcx,%rax + mov %r8,40($ctx) # r0 + and %r9,%rax + shr \$24,%rcx + mov %rax,48($ctx) # r1 + lea (%rax,%rax,4),%rax # *5 + mov %rcx,56($ctx) # r2 + shl \$2,%rax # magic <<2, i.e. r1*5*4 = r1*20 + lea (%rcx,%rcx,4),%rcx # *5 + shl \$2,%rcx # magic <<2, i.e. r2*5*4 = r2*20 + mov %rax,24($ctx) # s1 + mov %rcx,32($ctx) # s2 + movq \$-1,64($ctx) # write impossible value +___ +$code.=<<___ if ($flavour !~ /elf32/); + mov %r10,0(%rdx) # store blocks_vpmadd52/emit_base2_44 addresses via %rdx + mov %r11,8(%rdx) +___ +$code.=<<___ if ($flavour =~ /elf32/); + mov %r10d,0(%rdx) # same two addresses, 32-bit slots + mov %r11d,4(%rdx) +___ +$code.=<<___; + mov \$1,%eax + ret +.size poly1305_init_base2_44,.-poly1305_init_base2_44 +___ +{ +my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); +my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); +my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); # NOTE: four registers for three names, %ymm25 is not bound here + +$code.=<<___; +.type poly1305_blocks_vpmadd52,\@function,4 +.align 32 +poly1305_blocks_vpmadd52: + shr \$4,$len # byte count -> number of 16-byte blocks + jz .Lno_data_vpmadd52 # too short + + shl \$40,$padbit # bit 128 of a block is bit 128-2*44=40 of the top limb + mov 64($ctx),%r8 # peek on power of the key + + # if powers of the key are not calculated yet, process up
to 3 + # blocks with this single-block subroutine, otherwise ensure that + # length is divisible by 2 blocks and pass the rest down to next + # subroutine... + + mov \$3,%rax # block-count mask: 3 by default ... + mov \$1,%r10 + cmp \$4,$len # is input long + cmovae %r10,%rax # ... 1 when there are 4 or more blocks + test %r8,%r8 # is power value impossible? + cmovns %r10,%rax # ... 1 when the peeked value is non-negative + + and $len,%rax # is input of favourable length? + jz .Lblocks_vpmadd52_4x + + sub %rax,$len # blocks left after the single-block loop below + mov \$7,%r10d + mov \$1,%r11d + kmovw %r10d,%k7 # %k7 = 0b111: three low qwords + lea .L2_44_inp_permd(%rip),%r10 + kmovw %r11d,%k1 # %k1 = 0b001: lowest qword only + + vmovq $padbit,%x#$PAD + vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd + vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift + vpermq \$0xcf,$PAD,$PAD + vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask + + vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value + vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys + vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} + vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} + + vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt + vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft + + jmp .Loop_vpmadd52 + +.align 32 +.Loop_vpmadd52: + vmovdqu32 0($inp),%x#$T0 # load input as ----3210 + lea 16($inp),$inp + + vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 + vpsrlvq $inp_shift,$T0,$T0 + vpandq $reduc_mask,$T0,$T0 + vporq $PAD,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo # accumulate input + + vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value + vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} + vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} + + vpxord $Dlo,$Dlo,$Dlo + vpxord $Dhi,$Dhi,$Dhi + + vpmadd52luq $r2r1r0,$H0,$Dlo + vpmadd52huq $r2r1r0,$H0,$Dhi + + vpmadd52luq $r1r0s2,$H1,$Dlo + vpmadd52huq $r1r0s2,$H1,$Dhi + + vpmadd52luq $r0s2s1,$H2,$Dlo + vpmadd52huq $r0s2s1,$H2,$Dhi + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword + vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword + vpandq $reduc_mask,$Dlo,$Dlo + + vpaddq $T0,$Dhi,$Dhi + + vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword + + vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) + + vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word + vpandq
$reduc_mask,$Dlo,$Dlo + + vpermq \$0b10010011,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} + + vpaddq $T0,$Dlo,$Dlo + vpsllq \$2,$T0,$T0 + + vpaddq $T0,$Dlo,$Dlo + + dec %rax # len-=16 + jnz .Loop_vpmadd52 + + vmovdqu64 $Dlo,0($ctx){%k7} # store hash value + + test $len,$len + jnz .Lblocks_vpmadd52_4x + +.Lno_data_vpmadd52: + ret +.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 +___ +} +{ +######################################################################## +# As implied by its name 4x subroutine processes 4 blocks in parallel +# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power +# and is handled in 256-bit %ymm registers. + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_4x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_4x: + shr \$4,$len + jz .Lno_data_vpmadd52_4x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + +.Lblocks_vpmadd52_4x: + vpbroadcastq $padbit,$PAD + + vmovdqa64 .Lx_mask44(%rip),$mask44 + mov \$5,%eax + vmovdqa64 .Lx_mask42(%rip),$mask42 + kmovw %eax,%k1 # used in 2x path + + test %r8,%r8 # is power value impossible? + js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? + jnz .Lblocks_vpmadd52_2x_do + +.Lblocks_vpmadd52_4x_do: + vpbroadcastq 64($ctx),$R0 # load 4th power of the key + vpbroadcastq 96($ctx),$R1 + vpbroadcastq 128($ctx),$R2 + vpbroadcastq 160($ctx),$S1 + +.Lblocks_vpmadd52_4x_key_loaded: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + test \$7,$len # is len 8*n?
+ jz .Lblocks_vpmadd52_8x + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 3-1-2-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$4,$len + jz .Ltail_vpmadd52_4x + jmp .Loop_vpmadd52_4x + ud2 + +.align 32 +.Linit_vpmadd52: + vmovq 24($ctx),%x#$S1 # load key + vmovq 56($ctx),%x#$H2 + vmovq 32($ctx),%x#$S2 + vmovq 40($ctx),%x#$R0 + vmovq 48($ctx),%x#$R1 + + vmovdqa $R0,$H0 + vmovdqa $R1,$H1 + vmovdqa $H2,$R2 + + mov \$2,%eax + +.Lmul_init_vpmadd52: + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + 
+ vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + dec %eax + jz .Ldone_init_vpmadd52 + + vpunpcklqdq $R1,$H1,$R1 # 1,2 + vpbroadcastq %x#$H1,%x#$H1 # 2,2 + vpunpcklqdq $R2,$H2,$R2 + vpbroadcastq %x#$H2,%x#$H2 + vpunpcklqdq $R0,$H0,$R0 + vpbroadcastq %x#$H0,%x#$H0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R1,$S1,$S1 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S1,$S1 + vpsllq \$2,$S2,$S2 + + jmp .Lmul_init_vpmadd52 + ud2 + +.align 32 +.Ldone_init_vpmadd52: + vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 + vinserti128 \$1,%x#$R2,$H2,$R2 + vinserti128 \$1,%x#$R0,$H0,$R0 + + vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 + vpermq \$0b11011000,$R2,$R2 + vpermq \$0b11011000,$R0,$R0 + + vpsllq \$2,$R1,$S1 # S1 = R1*5*4 + vpaddq $R1,$S1,$S1 + vpsllq \$2,$S1,$S1 + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + + test \$3,$len # is length 4*n+2? + jnz .Ldone_init_vpmadd52_2x + + vmovdqu64 $R0,64($ctx) # save key powers + vpbroadcastq %x#$R0,$R0 # broadcast 4th power + vmovdqu64 $R1,96($ctx) + vpbroadcastq %x#$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpbroadcastq %x#$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpbroadcastq %x#$S1,$S1 + + jmp .Lblocks_vpmadd52_4x_key_loaded + ud2 + +.align 32 +.Ldone_init_vpmadd52_2x: + vmovdqu64 $R0,64($ctx) # save key powers + vpsrldq \$8,$R0,$R0 # 0-1-0-2 + vmovdqu64 $R1,96($ctx) + vpsrldq \$8,$R1,$R1 + vmovdqu64 $R2,128($ctx) + vpsrldq \$8,$R2,$R2 + vmovdqu64 $S1,160($ctx) + vpsrldq \$8,$S1,$S1 + jmp .Lblocks_vpmadd52_2x_key_loaded + ud2 + +.align 32 +.Lblocks_vpmadd52_2x_do: + vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers + vmovdqu64 160+8($ctx),${S1}{%k1}{z} + vmovdqu64 64+8($ctx),${R0}{%k1}{z} + vmovdqu64 96+8($ctx),${R1}{%k1}{z} + +.Lblocks_vpmadd52_2x_key_loaded: + vmovdqu64 16*0($inp),$T2 # load data + vpxorq $T3,$T3,$T3 + lea 16*2($inp),$inp + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at 
this point 64-bit lanes are ordered as x-1-x-0 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + jmp .Ltail_vpmadd52_2x + ud2 + +.align 32 +.Loop_vpmadd52_4x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*2($inp),$T3 + lea 16*4($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq 
$mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$4,$len # len-=64 + jnz .Loop_vpmadd52_4x + +.Ltail_vpmadd52_4x: + vmovdqu64 128($ctx),$R2 # load all key powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + +.Ltail_vpmadd52_2x: + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$R0,$D2hi + + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + 
################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + # at this point $len is + # either 4*n+2 or 0... + sub \$2,$len # len-=32 + ja .Lblocks_vpmadd52_4x_do + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_4x: + ret +.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x +___ +} +{ +######################################################################## +# As implied by its name 8x subroutine processes 8 blocks in parallel... +# This is intermediate version, as it's used only in cases when input +# length is either 8*n, 8*n+1 or 8*n+2... + +my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); +my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); +my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); +my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); + +$code.=<<___; +.type poly1305_blocks_vpmadd52_8x,\@function,4 +.align 32 +poly1305_blocks_vpmadd52_8x: + shr \$4,$len + jz .Lno_data_vpmadd52_8x # too short + + shl \$40,$padbit + mov 64($ctx),%r8 # peek on power of the key + + vmovdqa64 .Lx_mask44(%rip),$mask44 + vmovdqa64 .Lx_mask42(%rip),$mask42 + + test %r8,%r8 # is power value impossible? 
+ js .Linit_vpmadd52 # if it is, then init R[4] + + vmovq 0($ctx),%x#$H0 # load current hash value + vmovq 8($ctx),%x#$H1 + vmovq 16($ctx),%x#$H2 + +.Lblocks_vpmadd52_8x: + ################################################################ + # first we calculate more key powers + + vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers + vmovdqu64 160($ctx),$S1 + vmovdqu64 64($ctx),$R0 + vmovdqu64 96($ctx),$R1 + + vpsllq \$2,$R2,$S2 # S2 = R2*5*4 + vpaddq $R2,$S2,$S2 + vpsllq \$2,$S2,$S2 + + vpbroadcastq %x#$R2,$RR2 # broadcast 4th power + vpbroadcastq %x#$R0,$RR0 + vpbroadcastq %x#$R1,$RR1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $RR2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $RR2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $RR2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $RR2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $RR2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $RR2,$R0,$D2hi + + vpmadd52luq $RR0,$R0,$D0lo + vpmadd52huq $RR0,$R0,$D0hi + vpmadd52luq $RR0,$R1,$D1lo + vpmadd52huq $RR0,$R1,$D1hi + vpmadd52luq $RR0,$R2,$D2lo + vpmadd52huq $RR0,$R2,$D2hi + + vpmadd52luq $RR1,$S2,$D0lo + vpmadd52huq $RR1,$S2,$D0hi + vpmadd52luq $RR1,$R0,$D1lo + vpmadd52huq $RR1,$R0,$D1hi + vpmadd52luq $RR1,$R1,$D2lo + vpmadd52huq $RR1,$R1,$D2hi + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$RR0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$RR1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$RR2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$RR0,$RR0 + + vpsrlq \$44,$RR0,$tmp # additional step + vpandq $mask44,$RR0,$RR0 + + vpaddq $tmp,$RR1,$RR1 + + ################################################################ + # At this 
point Rx holds 1324 powers, RRx - 5768, and the goal + # is 15263748, which reflects how data is loaded... + + vpunpcklqdq $R2,$RR2,$T2 # 3748 + vpunpckhqdq $R2,$RR2,$R2 # 1526 + vpunpcklqdq $R0,$RR0,$T0 + vpunpckhqdq $R0,$RR0,$R0 + vpunpcklqdq $R1,$RR1,$T1 + vpunpckhqdq $R1,$RR1,$R1 +___ +######## switch to %zmm +map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); +map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); + +$code.=<<___; + vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 + vshufi64x2 \$0x44,$R0,$T0,$RR0 + vshufi64x2 \$0x44,$R1,$T1,$RR1 + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + + vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 + vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 + vpaddq $RR2,$SS2,$SS2 + vpaddq $RR1,$SS1,$SS1 + vpsllq \$2,$SS2,$SS2 + vpsllq \$2,$SS1,$SS1 + + vpbroadcastq $padbit,$PAD + vpbroadcastq %x#$mask44,$mask44 + vpbroadcastq %x#$mask42,$mask42 + + vpbroadcastq %x#$SS1,$S1 # broadcast 8th power + vpbroadcastq %x#$SS2,$S2 + vpbroadcastq %x#$RR0,$R0 + vpbroadcastq %x#$RR1,$R1 + vpbroadcastq %x#$RR2,$R2 + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + + # at this point 64-bit lanes are ordered as 73625140 + + vpsrlq \$24,$T3,$T2 # splat the data + vporq $PAD,$T2,$T2 + vpaddq $T2,$H2,$H2 # accumulate input + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + sub \$8,$len + jz .Ltail_vpmadd52_8x + jmp .Loop_vpmadd52_8x + +.align 32 +.Loop_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$S1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$S1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$S2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$S2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$R0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + 
vpmadd52huq $H2,$R0,$D2hi + + vmovdqu64 16*0($inp),$T2 # load data + vmovdqu64 16*4($inp),$T3 + lea 16*8($inp),$inp + vpmadd52luq $H0,$R0,$D0lo + vpmadd52huq $H0,$R0,$D0hi + vpmadd52luq $H0,$R1,$D1lo + vpmadd52huq $H0,$R1,$D1hi + vpmadd52luq $H0,$R2,$D2lo + vpmadd52huq $H0,$R2,$D2hi + + vpunpcklqdq $T3,$T2,$T1 # transpose data + vpunpckhqdq $T3,$T2,$T3 + vpmadd52luq $H1,$S2,$D0lo + vpmadd52huq $H1,$S2,$D0hi + vpmadd52luq $H1,$R0,$D1lo + vpmadd52huq $H1,$R0,$D1hi + vpmadd52luq $H1,$R1,$D2lo + vpmadd52huq $H1,$R1,$D2hi + + ################################################################ + # partial reduction (interleaved with data splat) + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpsrlq \$24,$T3,$T2 + vporq $PAD,$T2,$T2 + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpandq $mask44,$T1,$T0 + vpsrlq \$44,$T1,$T1 + vpsllq \$20,$T3,$T3 + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq $mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vporq $T3,$T1,$T1 + vpandq $mask44,$T1,$T1 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + sub \$8,$len # len-=128 + jnz .Loop_vpmadd52_8x + +.Ltail_vpmadd52_8x: + #vpaddq $T2,$H2,$H2 # accumulate input + vpaddq $T0,$H0,$H0 + vpaddq $T1,$H1,$H1 + + vpxorq $D0lo,$D0lo,$D0lo + vpmadd52luq $H2,$SS1,$D0lo + vpxorq $D0hi,$D0hi,$D0hi + vpmadd52huq $H2,$SS1,$D0hi + vpxorq $D1lo,$D1lo,$D1lo + vpmadd52luq $H2,$SS2,$D1lo + vpxorq $D1hi,$D1hi,$D1hi + vpmadd52huq $H2,$SS2,$D1hi + vpxorq $D2lo,$D2lo,$D2lo + vpmadd52luq $H2,$RR0,$D2lo + vpxorq $D2hi,$D2hi,$D2hi + vpmadd52huq $H2,$RR0,$D2hi + + vpmadd52luq $H0,$RR0,$D0lo + vpmadd52huq $H0,$RR0,$D0hi + vpmadd52luq $H0,$RR1,$D1lo + vpmadd52huq $H0,$RR1,$D1hi + 
vpmadd52luq $H0,$RR2,$D2lo + vpmadd52huq $H0,$RR2,$D2hi + + vpmadd52luq $H1,$SS2,$D0lo + vpmadd52huq $H1,$SS2,$D0hi + vpmadd52luq $H1,$RR0,$D1lo + vpmadd52huq $H1,$RR0,$D1hi + vpmadd52luq $H1,$RR1,$D2lo + vpmadd52huq $H1,$RR1,$D2hi + + ################################################################ + # horizontal addition + + mov \$1,%eax + kmovw %eax,%k1 + vpsrldq \$8,$D0lo,$T0 + vpsrldq \$8,$D0hi,$H0 + vpsrldq \$8,$D1lo,$T1 + vpsrldq \$8,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpsrldq \$8,$D2lo,$T2 + vpsrldq \$8,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vpermq \$0x2,$D0lo,$T0 + vpermq \$0x2,$D0hi,$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vpermq \$0x2,$D1lo,$T1 + vpermq \$0x2,$D1hi,$H1 + vpaddq $T0,$D0lo,$D0lo + vpaddq $H0,$D0hi,$D0hi + vpermq \$0x2,$D2lo,$T2 + vpermq \$0x2,$D2hi,$H2 + vpaddq $T1,$D1lo,$D1lo + vpaddq $H1,$D1hi,$D1hi + vextracti64x4 \$1,$D0lo,%y#$T0 + vextracti64x4 \$1,$D0hi,%y#$H0 + vpaddq $T2,$D2lo,$D2lo + vpaddq $H2,$D2hi,$D2hi + + vextracti64x4 \$1,$D1lo,%y#$T1 + vextracti64x4 \$1,$D1hi,%y#$H1 + vextracti64x4 \$1,$D2lo,%y#$T2 + vextracti64x4 \$1,$D2hi,%y#$H2 +___ +######## switch back to %ymm +map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); +map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); +map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); + +$code.=<<___; + vpaddq $T0,$D0lo,${D0lo}{%k1}{z} + vpaddq $H0,$D0hi,${D0hi}{%k1}{z} + vpaddq $T1,$D1lo,${D1lo}{%k1}{z} + vpaddq $H1,$D1hi,${D1hi}{%k1}{z} + vpaddq $T2,$D2lo,${D2lo}{%k1}{z} + vpaddq $H2,$D2hi,${D2hi}{%k1}{z} + + ################################################################ + # partial reduction + vpsrlq \$44,$D0lo,$tmp + vpsllq \$8,$D0hi,$D0hi + vpandq $mask44,$D0lo,$H0 + vpaddq $tmp,$D0hi,$D0hi + + vpaddq $D0hi,$D1lo,$D1lo + + vpsrlq \$44,$D1lo,$tmp + vpsllq \$8,$D1hi,$D1hi + vpandq $mask44,$D1lo,$H1 + vpaddq $tmp,$D1hi,$D1hi + + vpaddq $D1hi,$D2lo,$D2lo + + vpsrlq \$42,$D2lo,$tmp + vpsllq \$10,$D2hi,$D2hi + vpandq 
$mask42,$D2lo,$H2 + vpaddq $tmp,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + vpsllq \$2,$D2hi,$D2hi + + vpaddq $D2hi,$H0,$H0 + + vpsrlq \$44,$H0,$tmp # additional step + vpandq $mask44,$H0,$H0 + + vpaddq $tmp,$H1,$H1 + + ################################################################ + + vmovq %x#$H0,0($ctx) + vmovq %x#$H1,8($ctx) + vmovq %x#$H2,16($ctx) + vzeroall + +.Lno_data_vpmadd52_8x: + ret +.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x +___ +} +# poly1305_emit_base2_44 reads the three hash limbs stored at 0/8/16($ctx), +# repacks them into one value held in r8 (low), r9 and r10 (top), picks +# value+5 instead of value when that sum carries into bit 130 or above, +# adds the 128-bit quantity at $nonce and stores the low 128 bits at $mac. +$code.=<<___; +.type poly1305_emit_base2_44,\@function,3 +.align 32 +poly1305_emit_base2_44: + mov 0($ctx),%r8 # load hash value + mov 8($ctx),%r9 + mov 16($ctx),%r10 + + mov %r9,%rax + shr \$20,%r9 # part of limb 1 that spills past bit 64 + shl \$44,%rax # limb 1 enters at bit 44 + mov %r10,%rcx + shr \$40,%r10 # part of limb 2 that spills past bit 128 + shl \$24,%rcx # limb 2 enters at bit 88 (64+24) + + add %rax,%r8 + adc %rcx,%r9 + adc \$0,%r10 + + mov %r8,%rax # rax:rcx keep the unmodified low 128 bits + add \$5,%r8 # compare to modulus + mov %r9,%rcx + adc \$0,%r9 + adc \$0,%r10 + shr \$2,%r10 # did 130-bit value overflow? + cmovnz %r8,%rax # yes: take the +5 result instead + cmovnz %r9,%rcx + + add 0($nonce),%rax # accumulate nonce + adc 8($nonce),%rcx + mov %rax,0($mac) # write result + mov %rcx,8($mac) + + ret +.size poly1305_emit_base2_44,.-poly1305_emit_base2_44 +___ +} } } +$code.=<<___; +.align 64 +.Lconst: +.Lmask24: +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 +.L129: +.long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 +.Lmask26: +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 +.Lpermd_avx2: +.long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.L2_44_inp_permd: +.long 0,1,1,2,2,3,7,7 +.L2_44_inp_shift: +.quad 0,12,24,64 +.L2_44_mask: +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +.L2_44_shift_rgt: +.quad 44,44,42,64 +.L2_44_shift_lft: +.quad 8,8,10,64 + +.align 64 +.Lx_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.Lx_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +___ 
+} +# NOTE(review): the banner string below stops right after "by "; an e-mail +# address was probably lost when this patch was extracted - confirm upstream. +$code.=<<___; +.asciz "Poly1305 for x86_64, CRYPTOGAMS by " +.align 16 +___ + +{ # chacha20-poly1305 helpers +my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order + ("%rdi","%rsi","%rdx","%rcx"); # Unix order +# xor128_encrypt_n_pad(out, inp, otp, len): xors len bytes of inp with the +# bytes at otp, writes the result to out and back over otp, zero-fills otp +# up to a multiple of 16 bytes when len is not one, and returns the advanced +# otp pointer in %rax. +# xor128_decrypt_n_pad: same, except otp is overwritten with the inp bytes +# rather than with the xor result. +# otp is accessed with movdqa and a pxor memory operand, so it has to be +# 16-byte aligned. len == 0 returns otp untouched: without the guard at the +# tail label the byte loop would start with a zero counter and wrap around. +$code.=<<___; +.globl xor128_encrypt_n_pad +.type xor128_encrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_encrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_enc + nop +.Loop_enc_xmm: + movdqu ($inp,$otp),%xmm0 + pxor ($otp),%xmm0 + movdqu %xmm0,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_enc_xmm + + and \$15,%r10 # len % 16 + jz .Ldone_enc + +.Ltail_enc: + test %r10,%r10 # zero-length input: nothing to xor or pad + jz .Ldone_enc + mov \$16,$len + sub %r10,$len + xor %eax,%eax +.Loop_enc_byte: + mov ($inp,$otp),%al + xor ($otp),%al + mov %al,($out,$otp) + mov %al,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_enc_byte + + xor %eax,%eax +.Loop_enc_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_enc_pad + +.Ldone_enc: + mov $otp,%rax + ret +.size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad + +.globl xor128_decrypt_n_pad +.type xor128_decrypt_n_pad,\@abi-omnipotent +.align 16 +xor128_decrypt_n_pad: + sub $otp,$inp + sub $otp,$out + mov $len,%r10 # put len aside + shr \$4,$len # len / 16 + jz .Ltail_dec + nop +.Loop_dec_xmm: + movdqu ($inp,$otp),%xmm0 + movdqa ($otp),%xmm1 + pxor %xmm0,%xmm1 + movdqu %xmm1,($out,$otp) + movdqa %xmm0,($otp) + lea 16($otp),$otp + dec $len + jnz .Loop_dec_xmm + + pxor %xmm1,%xmm1 + and \$15,%r10 # len % 16 + jz .Ldone_dec + +.Ltail_dec: + test %r10,%r10 # zero-length input: nothing to xor or pad + jz .Ldone_dec + mov \$16,$len + sub %r10,$len + xor %eax,%eax + xor %r11,%r11 +.Loop_dec_byte: + mov ($inp,$otp),%r11b + mov ($otp),%al + xor %r11b,%al + mov %al,($out,$otp) + mov %r11b,($otp) + lea 1($otp),$otp + dec %r10 + jnz .Loop_dec_byte + + xor %eax,%eax +.Loop_dec_pad: + mov %al,($otp) + lea 1($otp),$otp + dec $len + jnz .Loop_dec_pad + +.Ldone_dec: + mov $otp,%rax + ret +.size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad +___ +} + +# 
EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, +# CONTEXT *context,DISPATCHER_CONTEXT *disp) +# +# se_handler and avx_handler both compare context->Rip with the two labels +# stored in HandlerData[] (offsets from disp->ImageBase) and branch to +# .Lcommon_seh_tail with %rax = context->Rax when Rip is before the first +# label, or with %rax = context->Rsp when Rip is at or past the second one. +# In between, se_handler reloads rbx, rbp and r12-r15 from the 48 bytes at +# context->Rsp, records them in the CONTEXT and leaves %rax = Rsp+48. +if ($win64) { +$rec="%rcx"; +$frame="%rdx"; +$context="%r8"; +$disp="%r9"; + +$code.=<<___; +.extern __imp_RtlVirtualUnwind +.type se_handler,\@abi-omnipotent +.align 16 +se_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip<.Lprologue + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # context->Rip>=.Lepilogue + jae .Lcommon_seh_tail + + lea 48(%rax),%rax + + mov -8(%rax),%rbx + mov -16(%rax),%rbp + mov -24(%rax),%r12 + mov -32(%rax),%r13 + mov -40(%rax),%r14 + mov -48(%rax),%r15 + mov %rbx,144($context) # restore context->Rbx + mov %rbp,160($context) # restore context->Rbp + mov %r12,216($context) # restore context->R12 + mov %r13,224($context) # restore context->R13 + mov %r14,232($context) # restore context->R14 + mov %r15,240($context) # restore context->R15 + + jmp .Lcommon_seh_tail +.size se_handler,.-se_handler + +.type avx_handler,\@abi-omnipotent +.align 16 +avx_handler: + push %rsi + push %rdi + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + pushfq + sub \$64,%rsp + + mov 120($context),%rax # pull context->Rax + mov 248($context),%rbx # pull context->Rip + + mov 8($disp),%rsi # disp->ImageBase + mov 56($disp),%r11 # disp->HandlerData + + mov 0(%r11),%r10d # HandlerData[0] + lea (%rsi,%r10),%r10 # prologue label + cmp %r10,%rbx # context->Rip < prologue label + jb .Lcommon_seh_tail + + mov 152($context),%rax # pull context->Rsp + + mov 4(%r11),%r10d # HandlerData[1] + lea (%rsi,%r10),%r10 # epilogue label + cmp %r10,%rbx # 
context->Rip>=epilogue label + jae .Lcommon_seh_tail + + mov 208($context),%rax # pull context->R11 + + lea 0x50(%rax),%rsi + lea 0xf8(%rax),%rax + lea 512($context),%rdi # &context.Xmm6 + mov \$20,%ecx + .long 0xa548f3fc # cld; rep movsq + +.Lcommon_seh_tail: + mov 8(%rax),%rdi + mov 16(%rax),%rsi + mov %rax,152($context) # restore context->Rsp + mov %rsi,168($context) # restore context->Rsi + mov %rdi,176($context) # restore context->Rdi + + mov 40($disp),%rdi # disp->ContextRecord + mov $context,%rsi # context + mov \$154,%ecx # sizeof(CONTEXT) + .long 0xa548f3fc # cld; rep movsq + + mov $disp,%rsi + xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER + mov 8(%rsi),%rdx # arg2, disp->ImageBase + mov 0(%rsi),%r8 # arg3, disp->ControlPc + mov 16(%rsi),%r9 # arg4, disp->FunctionEntry + mov 40(%rsi),%r10 # disp->ContextRecord + lea 56(%rsi),%r11 # &disp->HandlerData + lea 24(%rsi),%r12 # &disp->EstablisherFrame + mov %r10,32(%rsp) # arg5 + mov %r11,40(%rsp) # arg6 + mov %r12,48(%rsp) # arg7 + mov %rcx,56(%rsp) # arg8, (NULL) + call *__imp_RtlVirtualUnwind(%rip) + + mov \$1,%eax # ExceptionContinueSearch + add \$64,%rsp + popfq + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + pop %rdi + pop %rsi + ret +.size avx_handler,.-avx_handler + +.section .pdata +.align 4 + .rva .LSEH_begin_poly1305_init + .rva .LSEH_end_poly1305_init + .rva .LSEH_info_poly1305_init + + .rva .LSEH_begin_poly1305_blocks + .rva .LSEH_end_poly1305_blocks + .rva .LSEH_info_poly1305_blocks + + .rva .LSEH_begin_poly1305_emit + .rva .LSEH_end_poly1305_emit + .rva .LSEH_info_poly1305_emit +___ +$code.=<<___ if ($avx); + .rva .LSEH_begin_poly1305_blocks_avx + .rva .Lbase2_64_avx + .rva .LSEH_info_poly1305_blocks_avx_1 + + .rva .Lbase2_64_avx + .rva .Leven_avx + .rva .LSEH_info_poly1305_blocks_avx_2 + + .rva .Leven_avx + .rva .LSEH_end_poly1305_blocks_avx + .rva .LSEH_info_poly1305_blocks_avx_3 + + .rva .LSEH_begin_poly1305_emit_avx + .rva .LSEH_end_poly1305_emit_avx + .rva 
.LSEH_info_poly1305_emit_avx +___ +$code.=<<___ if ($avx>1); + .rva .LSEH_begin_poly1305_blocks_avx2 + .rva .Lbase2_64_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_1 + + .rva .Lbase2_64_avx2 + .rva .Leven_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_2 + + .rva .Leven_avx2 + .rva .LSEH_end_poly1305_blocks_avx2 + .rva .LSEH_info_poly1305_blocks_avx2_3 +___ +$code.=<<___ if ($avx>2); + .rva .LSEH_begin_poly1305_blocks_avx512 + .rva .LSEH_end_poly1305_blocks_avx512 + .rva .LSEH_info_poly1305_blocks_avx512 +___ +$code.=<<___; +.section .xdata +.align 8 +.LSEH_info_poly1305_init: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init + +.LSEH_info_poly1305_blocks: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_body,.Lblocks_epilogue + +.LSEH_info_poly1305_emit: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit +___ +$code.=<<___ if ($avx); +.LSEH_info_poly1305_blocks_avx_1: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx_2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx_3: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] + +.LSEH_info_poly1305_emit_avx: + .byte 9,0,0,0 + .rva se_handler + .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx +___ +$code.=<<___ if ($avx>1); +.LSEH_info_poly1305_blocks_avx2_1: + .byte 9,0,0,0 + .rva se_handler + .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx2_2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] + +.LSEH_info_poly1305_blocks_avx2_3: + .byte 9,0,0,0 + .rva avx_handler + .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] +___ +$code.=<<___ if ($avx>2); +.LSEH_info_poly1305_blocks_avx512: + .byte 9,0,0,0 + .rva avx_handler 
+ .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] +___ +} + +foreach (split('\n',$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + s/%r([a-z]+)#d/%e$1/g; + s/%r([0-9]+)#d/%r$1d/g; + s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; + + print $_,"\n"; +} +close STDOUT; -- cgit v1.2.3-59-g8ed1b