From 427847284618ecaf4b8ba60fad2f2cb4aed24586 Mon Sep 17 00:00:00 2001
From: Samuel Neves
Date: Thu, 23 Nov 2017 16:08:46 +0000
Subject: blake2s: tweak avx512 code

This is not as ideal as using zmm, but zmm downclocks. And it's not as
fast single-threaded as using the gathers. But it is faster when
multithreaded, which is what WireGuard is doing.

Signed-off-by: Samuel Neves
---
 src/crypto/blake2s-x86_64.S | 111 +++++++++++++++++++-------------------------
 1 file changed, 47 insertions(+), 64 deletions(-)

diff --git a/src/crypto/blake2s-x86_64.S b/src/crypto/blake2s-x86_64.S
index 294750e..d1e0c03 100644
--- a/src/crypto/blake2s-x86_64.S
+++ b/src/crypto/blake2s-x86_64.S
@@ -5,7 +5,7 @@
 
 #include 
 
-.section .rodata.cst32.BLAKECONST, "aM", @progbits, 32
+.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
 .align 32
 IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
 	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
@@ -16,38 +16,19 @@ ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
 .align 16
 ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
 #ifdef CONFIG_AS_AVX512
-.section .rodata.cst64.BLAKESIGMA, "aM", @progbits, 640
+.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 640
 .align 64
 SIGMA:
-	.long 0, 2, 4, 6, 1, 3, 5, 7
-	.long 8, 10, 12, 14, 9, 11, 13, 15
-
-	.long 14, 4, 9, 13, 10, 8, 15, 6
-	.long 1, 0, 11, 5, 12, 2, 7, 3
-
-	.long 11, 12, 5, 15, 8, 0, 2, 13
-	.long 10, 3, 7, 9, 14, 6, 1, 4
-
-	.long 7, 3, 13, 11, 9, 1, 12, 14
-	.long 2, 5, 4, 15, 6, 10, 0, 8
-
-	.long 9, 5, 2, 10, 0, 7, 4, 15
-	.long 14, 11, 6, 3, 1, 12, 8, 13
-
-	.long 2, 6, 0, 8, 12, 10, 11, 3
-	.long 4, 7, 15, 1, 13, 5, 14, 9
-
-	.long 12, 1, 14, 4, 5, 15, 13, 10
-	.long 0, 6, 9, 8, 7, 3, 2, 11
-
-	.long 13, 7, 12, 3, 11, 14, 1, 9
-	.long 5, 15, 8, 2, 0, 4, 6, 10
-
-	.long 6, 14, 11, 0, 15, 9, 3, 8
-	.long 12, 13, 1, 10, 2, 7, 4, 5
-
-	.long 10, 8, 7, 1, 2, 4, 6, 5
-	.long 15, 9, 3, 13, 11, 14, 12, 0
+.long 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15
+.long 11, 2, 12, 14, 9, 8, 15, 3, 4, 0, 13, 6, 10, 1, 7, 5
+.long 10, 12, 11, 6, 5, 9, 13, 3, 4, 15, 14, 2, 0, 7, 8, 1
+.long 10, 9, 7, 0, 11, 14, 1, 12, 6, 2, 15, 3, 13, 8, 5, 4
+.long 4, 9, 8, 13, 14, 0, 10, 11, 7, 3, 12, 1, 5, 6, 15, 2
+.long 2, 10, 4, 14, 13, 3, 9, 11, 6, 5, 7, 12, 15, 1, 8, 0
+.long 4, 11, 14, 8, 13, 10, 12, 5, 2, 1, 15, 3, 9, 7, 0, 6
+.long 6, 12, 0, 13, 15, 2, 1, 10, 4, 5, 11, 14, 8, 3, 9, 7
+.long 14, 5, 4, 12, 9, 7, 3, 10, 2, 0, 6, 15, 11, 1, 13, 8
+.long 11, 7, 13, 10, 12, 14, 0, 15, 4, 5, 6, 9, 2, 1, 8, 3
 #endif /* CONFIG_AS_AVX512 */
 
 .text
@@ -625,32 +606,40 @@ ENDPROC(blake2s_compress_avx)
 ENTRY(blake2s_compress_avx512)
 	vmovdqu (%rdi),%xmm0
 	vmovdqu 0x10(%rdi),%xmm1
-	vmovdqu 0x20(%rdi),%xmm15
-	vmovq %rcx,%xmm13
-	jmp .Lblake2s_compress_avx512_mainloop
+	vmovdqu 0x20(%rdi),%xmm4
+	vmovq %rcx,%xmm5
+	vmovdqa IV(%rip),%xmm14
+	vmovdqa IV+16(%rip),%xmm15
+	jmp .Lblake2s_compress_avx512_mainloop
 .align 32
 .Lblake2s_compress_avx512_mainloop:
-	vpaddq %xmm13,%xmm15,%xmm15
-	vmovdqa IV(%rip),%xmm2
-	vpxor IV+16(%rip),%xmm15,%xmm3
-	lea SIGMA(%rip),%rax
-	movl $10,%ecx
+	vmovdqa %xmm0,%xmm10
+	vmovdqa %xmm1,%xmm11
+	vpaddq %xmm5,%xmm4,%xmm4
+	vmovdqa %xmm14,%xmm2
+	vpxor %xmm15,%xmm4,%xmm3
+	vmovdqu (%rsi),%ymm6
+	vmovdqu 0x20(%rsi),%ymm7
+	addq $0x40,%rsi
+	leaq SIGMA(%rip),%rax
+	movb $0xa,%cl
 .Lblake2s_compress_avx512_roundloop:
-	add $0x40,%rax
-	vmovdqa -0x40(%rax),%xmm7
-	vpcmpeqd %xmm14,%xmm14,%xmm14
-	vpgatherdd %xmm14,(%rsi,%xmm7,4),%xmm6
-	vpaddd %xmm6,%xmm0,%xmm0
+	addq $0x40,%rax
+	vmovdqa -0x40(%rax),%ymm8
+	vmovdqa -0x20(%rax),%ymm9
+	vpermi2d %ymm7,%ymm6,%ymm8
+	vpermi2d %ymm7,%ymm6,%ymm9
+	vmovdqa %ymm8,%ymm6
+	vmovdqa %ymm9,%ymm7
+	vpaddd %xmm8,%xmm0,%xmm0
 	vpaddd %xmm1,%xmm0,%xmm0
 	vpxor %xmm0,%xmm3,%xmm3
 	vprord $0x10,%xmm3,%xmm3
 	vpaddd %xmm3,%xmm2,%xmm2
 	vpxor %xmm2,%xmm1,%xmm1
 	vprord $0xc,%xmm1,%xmm1
-	vmovdqa -0x30(%rax),%xmm7
-	vpcmpeqd %xmm14,%xmm14,%xmm14
-	vpgatherdd %xmm14,(%rsi,%xmm7,4),%xmm6
-	vpaddd %xmm6,%xmm0,%xmm0
+	vextracti128 $0x1,%ymm8,%xmm8
+	vpaddd %xmm8,%xmm0,%xmm0
 	vpaddd %xmm1,%xmm0,%xmm0
 	vpxor %xmm0,%xmm3,%xmm3
 	vprord $0x8,%xmm3,%xmm3
@@ -660,20 +649,15 @@ ENTRY(blake2s_compress_avx512)
 	vpshufd $0x39,%xmm1,%xmm1
 	vpshufd $0x4e,%xmm2,%xmm2
 	vpshufd $0x93,%xmm3,%xmm3
-	vmovdqa -0x20(%rax),%xmm7
-	vpcmpeqd %xmm14,%xmm14,%xmm14
-	vpgatherdd %xmm14,(%rsi,%xmm7,4),%xmm6
-	vpaddd %xmm6,%xmm0,%xmm0
+	vpaddd %xmm9,%xmm0,%xmm0
 	vpaddd %xmm1,%xmm0,%xmm0
 	vpxor %xmm0,%xmm3,%xmm3
 	vprord $0x10,%xmm3,%xmm3
 	vpaddd %xmm3,%xmm2,%xmm2
 	vpxor %xmm2,%xmm1,%xmm1
 	vprord $0xc,%xmm1,%xmm1
-	vmovdqa -0x10(%rax),%xmm7
-	vpcmpeqd %xmm14,%xmm14,%xmm14
-	vpgatherdd %xmm14,(%rsi,%xmm7,4),%xmm6
-	vpaddd %xmm6,%xmm0,%xmm0
+	vextracti128 $0x1,%ymm9,%xmm9
+	vpaddd %xmm9,%xmm0,%xmm0
 	vpaddd %xmm1,%xmm0,%xmm0
 	vpxor %xmm0,%xmm3,%xmm3
 	vprord $0x8,%xmm3,%xmm3
@@ -683,19 +667,18 @@ ENTRY(blake2s_compress_avx512)
 	vpshufd $0x93,%xmm1,%xmm1
 	vpshufd $0x4e,%xmm2,%xmm2
 	vpshufd $0x39,%xmm3,%xmm3
-	decl %ecx
+	decb %cl
 	jne .Lblake2s_compress_avx512_roundloop
-	add $0x40,%rsi
-	vpxor (%rdi),%xmm0,%xmm0
-	vpxor 0x10(%rdi),%xmm1,%xmm1
+	vpxor %xmm10,%xmm0,%xmm0
+	vpxor %xmm11,%xmm1,%xmm1
 	vpxor %xmm2,%xmm0,%xmm0
 	vpxor %xmm3,%xmm1,%xmm1
+	decq %rdx
+	jne .Lblake2s_compress_avx512_mainloop
 	vmovdqu %xmm0,(%rdi)
 	vmovdqu %xmm1,0x10(%rdi)
-	dec %rdx
-	jne .Lblake2s_compress_avx512_mainloop
-	vmovdqu %xmm15,0x20(%rdi)
-	vzeroupper
+	vmovdqu %xmm4,0x20(%rdi)
+	vzeroupper
 	retq
 ENDPROC(blake2s_compress_avx512)
 #endif /* CONFIG_AS_AVX512 */
-- 
cgit v1.2.3-59-g8ed1b
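
For reference, the new SIGMA table is not the usual table of absolute gather
indices: each row is a permutation expressed relative to the message order that
the previous round left in %ymm6/%ymm7, which is what lets the round loop swap
the vpgatherdd loads for two vpermi2d shuffles of in-register data. The
standalone C sketch below (not part of the patch; the array names sigma_abs and
sigma_rel are made up here, and both tables are copied verbatim from the hunks
above) checks that composing the relative rows round by round reproduces the
absolute indices encoded by the removed table.

#include <stdio.h>

/* Old table: absolute word indices, as fed to vpgatherdd each round. */
static const int sigma_abs[10][16] = {
	{ 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15 },
	{ 14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3 },
	{ 11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4 },
	{ 7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8 },
	{ 9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13 },
	{ 2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9 },
	{ 12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11 },
	{ 13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10 },
	{ 6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5 },
	{ 10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0 },
};

/* New table: indices relative to the previous round's in-register order,
 * as consumed by vpermi2d. */
static const int sigma_rel[10][16] = {
	{ 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15 },
	{ 11, 2, 12, 14, 9, 8, 15, 3, 4, 0, 13, 6, 10, 1, 7, 5 },
	{ 10, 12, 11, 6, 5, 9, 13, 3, 4, 15, 14, 2, 0, 7, 8, 1 },
	{ 10, 9, 7, 0, 11, 14, 1, 12, 6, 2, 15, 3, 13, 8, 5, 4 },
	{ 4, 9, 8, 13, 14, 0, 10, 11, 7, 3, 12, 1, 5, 6, 15, 2 },
	{ 2, 10, 4, 14, 13, 3, 9, 11, 6, 5, 7, 12, 15, 1, 8, 0 },
	{ 4, 11, 14, 8, 13, 10, 12, 5, 2, 1, 15, 3, 9, 7, 0, 6 },
	{ 6, 12, 0, 13, 15, 2, 1, 10, 4, 5, 11, 14, 8, 3, 9, 7 },
	{ 14, 5, 4, 12, 9, 7, 3, 10, 2, 0, 6, 15, 11, 1, 13, 8 },
	{ 11, 7, 13, 10, 12, 14, 0, 15, 4, 5, 6, 9, 2, 1, 8, 3 },
};

int main(void)
{
	int order[16], next[16], r, i;

	/* The message block starts out in natural order in %ymm6/%ymm7. */
	for (i = 0; i < 16; ++i)
		order[i] = i;

	for (r = 0; r < 10; ++r) {
		/* vpermi2d: select lanes of the previous order by relative index. */
		for (i = 0; i < 16; ++i)
			next[i] = order[sigma_rel[r][i]];
		for (i = 0; i < 16; ++i) {
			if (next[i] != sigma_abs[r][i]) {
				printf("mismatch at round %d, lane %d\n", r, i);
				return 1;
			}
			order[i] = next[i];
		}
	}
	printf("relative SIGMA table matches the old gather indices\n");
	return 0;
}

Built with any C compiler, this should print the success message, i.e. the two
tables describe the same message schedule; the assembly just applies it with
in-register shuffles instead of per-round gathers.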