/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */

#include <linux/linkage.h>

/* BLAKE2s initialization vector (the same constants as the SHA-256 IV). */
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
/* pshufb mask: rotate each 32-bit lane right by 16 bits. */
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
/* pshufb mask: rotate each 32-bit lane right by 8 bits. */
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
/*
 * BLAKE2s message word schedule, one row per round, reordered relative to
 * the reference sigma to match the vectorized load pattern below.
 */
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
/* The same schedule expressed as dword indices, for vpermi2d in the AVX-512 path. */
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif /* CONFIG_AS_AVX512 */

.text
#ifdef CONFIG_AS_SSSE3
/*
 * Arguments (System V AMD64):
 *   %rdi = state (h[0..7] at 0x00/0x10, block counter and flags at 0x20)
 *   %rsi = message blocks
 *   %rdx = number of 64-byte blocks
 *   %rcx = counter increment applied per block
 */
ENTRY(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop
	movdqu		(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14
	movq		%rcx,%xmm15
	leaq		SIGMA+0xa0(%rip),%r8
	jmp		.Lbeginofloop
	.align		32
.Lbeginofloop:
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3
	leaq		SIGMA(%rip),%rcx
/* One full round per SIGMA row: ten rounds, until %rcx reaches SIGMA+0xa0. */
.Lroundloop:
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx
	cmpq		%r8,%rcx
	jnz		.Lroundloop
	/* Feed-forward: combine the working vectors with the saved chaining value. */
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi
	decq		%rdx
	jnz		.Lbeginofloop
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	ret
ENDPROC(blake2s_compress_ssse3)
#endif /* CONFIG_AS_SSSE3 */

#ifdef CONFIG_AS_AVX512
/*
 * Same arguments as the SSSE3 version; uses AVX-512 (VL) rotates (vprord)
 * and vpermi2d over SIGMA2 to build each round's message schedule in registers.
 */
ENTRY(blake2s_compress_avx512)
	vmovdqu		(%rdi),%xmm0
	vmovdqu		0x10(%rdi),%xmm1
	vmovdqu		0x20(%rdi),%xmm4
	vmovq		%rcx,%xmm5
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
	.align		32
.Lblake2s_compress_avx512_mainloop:
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3
	vmovdqu		(%rsi),%ymm6
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl	/* ten rounds */
.Lblake2s_compress_avx512_roundloop:
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
	/* Feed-forward into the chaining value. */
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper
	retq
ENDPROC(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */
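
For reference, a minimal C-side sketch of how these two entry points might be declared and dispatched, inferred from the System V AMD64 calling convention and the memory accesses above (%rdi = state, %rsi = message, %rdx = block count, %rcx = counter increment). The struct and function layout below is an illustrative assumption, not a definition taken from this file or from any particular glue code.

#include <stddef.h>
#include <stdint.h>

/* Assumed state layout, matching the 0x00/0x10/0x20(%rdi) accesses above. */
struct blake2s_state_sketch {
	uint32_t h[8];	/* chaining value, loaded into xmm0/xmm1           */
	uint32_t t[2];	/* 64-bit block counter, low half of 0x20(%rdi)    */
	uint32_t f[2];	/* finalization flags, high half of 0x20(%rdi)     */
};

/* Hypothetical prototypes; both routines take the same four arguments. */
void blake2s_compress_ssse3(struct blake2s_state_sketch *state,
			    const uint8_t *block, size_t nblocks,
			    uint32_t inc);
void blake2s_compress_avx512(struct blake2s_state_sketch *state,
			     const uint8_t *block, size_t nblocks,
			     uint32_t inc);

/* Example dispatcher: prefer the AVX-512 path when the CPU supports it. */
static inline void blake2s_compress_sketch(struct blake2s_state_sketch *state,
					   const uint8_t *block, size_t nblocks,
					   uint32_t inc, int have_avx512)
{
	if (have_avx512)
		blake2s_compress_avx512(state, block, nblocks, inc);
	else
		blake2s_compress_ssse3(state, block, nblocks, inc);
}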