Diffstat (limited to 'src/crypto/zinc/blake2s/blake2s-x86_64.S')
-rw-r--r--  src/crypto/zinc/blake2s/blake2s-x86_64.S  1526
1 files changed, 945 insertions, 581 deletions
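
This diff, from the WireGuard "zinc" crypto tree, drops the hand-written AVX blake2s_compress routine and replaces it with an SSSE3 one (blake2s_compress_ssse3); the SIGMA message schedule in .rodata is re-permuted at the same time to match the order in which the new routine gathers message words. As a reading aid only, here is a minimal portable C sketch of the BLAKE2s G mixing step (the standard RFC 7693 formulation, not taken from these sources). The SSSE3 body below performs four G steps at once, one per 32-bit lane of the row registers (%xmm0 = a, %xmm1 = b, %xmm2 = c, %xmm3 = d):

#include <stdint.h>

static inline uint32_t rotr32(uint32_t w, unsigned c)
{
	return (w >> c) | (w << (32 - c));
}

/* One BLAKE2s G step on state words a, b, c, d and message words x, y.
 * In the SSSE3 routine each line acts on a whole 4-lane row: the 16- and
 * 8-bit rotates are pshufb with the ROT16/ROR328 masks, the 12- and 7-bit
 * rotates are pslld/psrld/por pairs. */
static inline void blake2s_g(uint32_t v[16], int a, int b, int c, int d,
			     uint32_t x, uint32_t y)
{
	v[a] += v[b] + x;
	v[d] = rotr32(v[d] ^ v[a], 16);
	v[c] += v[d];
	v[b] = rotr32(v[b] ^ v[c], 12);
	v[a] += v[b] + y;
	v[d] = rotr32(v[d] ^ v[a], 8);
	v[c] += v[d];
	v[b] = rotr32(v[b] ^ v[c], 7);
}
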
diff --git a/src/crypto/zinc/blake2s/blake2s-x86_64.S b/src/crypto/zinc/blake2s/blake2s-x86_64.S
index 675288f..9bb4c83 100644
--- a/src/crypto/zinc/blake2s/blake2s-x86_64.S
+++ b/src/crypto/zinc/blake2s/blake2s-x86_64.S
@@ -20,588 +20,952 @@ ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 640
.align 64
SIGMA:
-.long 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15
-.long 11, 2, 12, 14, 9, 8, 15, 3, 4, 0, 13, 6, 10, 1, 7, 5
-.long 10, 12, 11, 6, 5, 9, 13, 3, 4, 15, 14, 2, 0, 7, 8, 1
-.long 10, 9, 7, 0, 11, 14, 1, 12, 6, 2, 15, 3, 13, 8, 5, 4
-.long 4, 9, 8, 13, 14, 0, 10, 11, 7, 3, 12, 1, 5, 6, 15, 2
-.long 2, 10, 4, 14, 13, 3, 9, 11, 6, 5, 7, 12, 15, 1, 8, 0
-.long 4, 11, 14, 8, 13, 10, 12, 5, 2, 1, 15, 3, 9, 7, 0, 6
-.long 6, 12, 0, 13, 15, 2, 1, 10, 4, 5, 11, 14, 8, 3, 9, 7
-.long 14, 5, 4, 12, 9, 7, 3, 10, 2, 0, 6, 15, 11, 1, 13, 8
-.long 11, 7, 13, 10, 12, 14, 0, 15, 4, 5, 6, 9, 2, 1, 8, 3
+.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
+.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
+.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
+.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
+.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
+.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
+.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
+.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
+.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
+.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
#endif /* CONFIG_AS_AVX512 */
.text
-#ifdef CONFIG_AS_AVX
-ENTRY(blake2s_compress_avx)
- movl %ecx, %ecx
- testq %rdx, %rdx
- je .Lendofloop
+#ifdef CONFIG_AS_SSSE3
+ENTRY(blake2s_compress_ssse3)
+ testq %rdx, %rdx
+ je .Lendofloop
+ movdqu (%rdi),%xmm0
+ movdqu 0x10(%rdi),%xmm1
+ movdqa ROT16(%rip),%xmm12
+ movdqa ROR328(%rip),%xmm13
+ movdqu 0x20(%rdi),%xmm14
+ movq %rcx,%xmm15
+ jmp .Lbeginofloop
.align 32
.Lbeginofloop:
- addq %rcx, 32(%rdi)
- vmovdqu IV+16(%rip), %xmm1
- vmovdqu (%rsi), %xmm4
- vpxor 32(%rdi), %xmm1, %xmm1
- vmovdqu 16(%rsi), %xmm3
- vshufps $136, %xmm3, %xmm4, %xmm6
- vmovdqa ROT16(%rip), %xmm7
- vpaddd (%rdi), %xmm6, %xmm6
- vpaddd 16(%rdi), %xmm6, %xmm6
- vpxor %xmm6, %xmm1, %xmm1
- vmovdqu IV(%rip), %xmm8
- vpshufb %xmm7, %xmm1, %xmm1
- vmovdqu 48(%rsi), %xmm5
- vpaddd %xmm1, %xmm8, %xmm8
- vpxor 16(%rdi), %xmm8, %xmm9
- vmovdqu 32(%rsi), %xmm2
- vpblendw $12, %xmm3, %xmm5, %xmm13
- vshufps $221, %xmm5, %xmm2, %xmm12
- vpunpckhqdq %xmm2, %xmm4, %xmm14
- vpslld $20, %xmm9, %xmm0
- vpsrld $12, %xmm9, %xmm9
- vpxor %xmm0, %xmm9, %xmm0
- vshufps $221, %xmm3, %xmm4, %xmm9
- vpaddd %xmm9, %xmm6, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vmovdqa ROR328(%rip), %xmm6
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm8, %xmm8
- vpxor %xmm8, %xmm0, %xmm0
- vpshufd $147, %xmm1, %xmm1
- vpshufd $78, %xmm8, %xmm8
- vpslld $25, %xmm0, %xmm10
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm10, %xmm0, %xmm0
- vshufps $136, %xmm5, %xmm2, %xmm10
- vpshufd $57, %xmm0, %xmm0
- vpaddd %xmm10, %xmm9, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpaddd %xmm12, %xmm9, %xmm9
- vpblendw $12, %xmm2, %xmm3, %xmm12
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm8, %xmm8
- vpxor %xmm8, %xmm0, %xmm10
- vpslld $20, %xmm10, %xmm0
- vpsrld $12, %xmm10, %xmm10
- vpxor %xmm0, %xmm10, %xmm0
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm8, %xmm8
- vpxor %xmm8, %xmm0, %xmm0
- vpshufd $57, %xmm1, %xmm1
- vpshufd $78, %xmm8, %xmm8
- vpslld $25, %xmm0, %xmm10
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm10, %xmm0, %xmm0
- vpslldq $4, %xmm5, %xmm10
- vpblendw $240, %xmm10, %xmm12, %xmm12
- vpshufd $147, %xmm0, %xmm0
- vpshufd $147, %xmm12, %xmm12
- vpaddd %xmm9, %xmm12, %xmm12
- vpaddd %xmm0, %xmm12, %xmm12
- vpxor %xmm12, %xmm1, %xmm1
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm8, %xmm8
- vpxor %xmm8, %xmm0, %xmm11
- vpslld $20, %xmm11, %xmm9
- vpsrld $12, %xmm11, %xmm11
- vpxor %xmm9, %xmm11, %xmm0
- vpshufd $8, %xmm2, %xmm9
- vpblendw $192, %xmm5, %xmm3, %xmm11
- vpblendw $240, %xmm11, %xmm9, %xmm9
- vpshufd $177, %xmm9, %xmm9
- vpaddd %xmm12, %xmm9, %xmm9
- vpaddd %xmm0, %xmm9, %xmm11
- vpxor %xmm11, %xmm1, %xmm1
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm8, %xmm8
- vpxor %xmm8, %xmm0, %xmm9
- vpshufd $147, %xmm1, %xmm1
- vpshufd $78, %xmm8, %xmm8
- vpslld $25, %xmm9, %xmm0
- vpsrld $7, %xmm9, %xmm9
- vpxor %xmm0, %xmm9, %xmm0
- vpslldq $4, %xmm3, %xmm9
- vpblendw $48, %xmm9, %xmm2, %xmm9
- vpblendw $240, %xmm9, %xmm4, %xmm9
- vpshufd $57, %xmm0, %xmm0
- vpshufd $177, %xmm9, %xmm9
- vpaddd %xmm11, %xmm9, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm8, %xmm11
- vpxor %xmm11, %xmm0, %xmm0
- vpslld $20, %xmm0, %xmm8
- vpsrld $12, %xmm0, %xmm0
- vpxor %xmm8, %xmm0, %xmm0
- vpunpckhdq %xmm3, %xmm4, %xmm8
- vpblendw $12, %xmm10, %xmm8, %xmm12
- vpshufd $177, %xmm12, %xmm12
- vpaddd %xmm9, %xmm12, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm11, %xmm11
- vpxor %xmm11, %xmm0, %xmm0
- vpshufd $57, %xmm1, %xmm1
- vpshufd $78, %xmm11, %xmm11
- vpslld $25, %xmm0, %xmm12
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm12, %xmm0, %xmm0
- vpunpckhdq %xmm5, %xmm2, %xmm12
- vpshufd $147, %xmm0, %xmm0
- vpblendw $15, %xmm13, %xmm12, %xmm12
- vpslldq $8, %xmm5, %xmm13
- vpshufd $210, %xmm12, %xmm12
- vpaddd %xmm9, %xmm12, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm11, %xmm11
- vpxor %xmm11, %xmm0, %xmm0
- vpslld $20, %xmm0, %xmm12
- vpsrld $12, %xmm0, %xmm0
- vpxor %xmm12, %xmm0, %xmm0
- vpunpckldq %xmm4, %xmm2, %xmm12
- vpblendw $240, %xmm4, %xmm12, %xmm12
- vpblendw $192, %xmm13, %xmm12, %xmm12
- vpsrldq $12, %xmm3, %xmm13
- vpaddd %xmm12, %xmm9, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm11, %xmm11
- vpxor %xmm11, %xmm0, %xmm0
- vpshufd $147, %xmm1, %xmm1
- vpshufd $78, %xmm11, %xmm11
- vpslld $25, %xmm0, %xmm12
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm12, %xmm0, %xmm0
- vpblendw $60, %xmm2, %xmm4, %xmm12
- vpblendw $3, %xmm13, %xmm12, %xmm12
- vpshufd $57, %xmm0, %xmm0
- vpshufd $78, %xmm12, %xmm12
- vpaddd %xmm9, %xmm12, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm11, %xmm11
- vpxor %xmm11, %xmm0, %xmm12
- vpslld $20, %xmm12, %xmm13
- vpsrld $12, %xmm12, %xmm0
- vpblendw $51, %xmm3, %xmm4, %xmm12
- vpxor %xmm13, %xmm0, %xmm0
- vpblendw $192, %xmm10, %xmm12, %xmm10
- vpslldq $8, %xmm2, %xmm12
- vpshufd $27, %xmm10, %xmm10
- vpaddd %xmm9, %xmm10, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm11, %xmm11
- vpxor %xmm11, %xmm0, %xmm0
- vpshufd $57, %xmm1, %xmm1
- vpshufd $78, %xmm11, %xmm11
- vpslld $25, %xmm0, %xmm10
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm10, %xmm0, %xmm0
- vpunpckhdq %xmm2, %xmm8, %xmm10
- vpshufd $147, %xmm0, %xmm0
- vpblendw $12, %xmm5, %xmm10, %xmm10
- vpshufd $210, %xmm10, %xmm10
- vpaddd %xmm9, %xmm10, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm11, %xmm11
- vpxor %xmm11, %xmm0, %xmm10
- vpslld $20, %xmm10, %xmm0
- vpsrld $12, %xmm10, %xmm10
- vpxor %xmm0, %xmm10, %xmm0
- vpblendw $12, %xmm4, %xmm5, %xmm10
- vpblendw $192, %xmm12, %xmm10, %xmm10
- vpunpckldq %xmm2, %xmm4, %xmm12
- vpshufd $135, %xmm10, %xmm10
- vpaddd %xmm9, %xmm10, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm11, %xmm13
- vpxor %xmm13, %xmm0, %xmm0
- vpshufd $147, %xmm1, %xmm1
- vpshufd $78, %xmm13, %xmm13
- vpslld $25, %xmm0, %xmm10
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm10, %xmm0, %xmm0
- vpblendw $15, %xmm3, %xmm4, %xmm10
- vpblendw $192, %xmm5, %xmm10, %xmm10
- vpshufd $57, %xmm0, %xmm0
- vpshufd $198, %xmm10, %xmm10
- vpaddd %xmm9, %xmm10, %xmm10
- vpaddd %xmm0, %xmm10, %xmm10
- vpxor %xmm10, %xmm1, %xmm1
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm13, %xmm13
- vpxor %xmm13, %xmm0, %xmm9
- vpslld $20, %xmm9, %xmm0
- vpsrld $12, %xmm9, %xmm9
- vpxor %xmm0, %xmm9, %xmm0
- vpunpckhdq %xmm2, %xmm3, %xmm9
- vpunpcklqdq %xmm12, %xmm9, %xmm15
- vpunpcklqdq %xmm12, %xmm8, %xmm12
- vpblendw $15, %xmm5, %xmm8, %xmm8
- vpaddd %xmm15, %xmm10, %xmm15
- vpaddd %xmm0, %xmm15, %xmm15
- vpxor %xmm15, %xmm1, %xmm1
- vpshufd $141, %xmm8, %xmm8
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm13, %xmm13
- vpxor %xmm13, %xmm0, %xmm0
- vpshufd $57, %xmm1, %xmm1
- vpshufd $78, %xmm13, %xmm13
- vpslld $25, %xmm0, %xmm10
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm10, %xmm0, %xmm0
- vpunpcklqdq %xmm2, %xmm3, %xmm10
- vpshufd $147, %xmm0, %xmm0
- vpblendw $51, %xmm14, %xmm10, %xmm14
- vpshufd $135, %xmm14, %xmm14
- vpaddd %xmm15, %xmm14, %xmm14
- vpaddd %xmm0, %xmm14, %xmm14
- vpxor %xmm14, %xmm1, %xmm1
- vpunpcklqdq %xmm3, %xmm4, %xmm15
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm13, %xmm13
- vpxor %xmm13, %xmm0, %xmm0
- vpslld $20, %xmm0, %xmm11
- vpsrld $12, %xmm0, %xmm0
- vpxor %xmm11, %xmm0, %xmm0
- vpunpckhqdq %xmm5, %xmm3, %xmm11
- vpblendw $51, %xmm15, %xmm11, %xmm11
- vpunpckhqdq %xmm3, %xmm5, %xmm15
- vpaddd %xmm11, %xmm14, %xmm11
- vpaddd %xmm0, %xmm11, %xmm11
- vpxor %xmm11, %xmm1, %xmm1
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm13, %xmm13
- vpxor %xmm13, %xmm0, %xmm0
- vpshufd $147, %xmm1, %xmm1
- vpshufd $78, %xmm13, %xmm13
- vpslld $25, %xmm0, %xmm14
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm14, %xmm0, %xmm14
- vpunpckhqdq %xmm4, %xmm2, %xmm0
- vpshufd $57, %xmm14, %xmm14
- vpblendw $51, %xmm15, %xmm0, %xmm15
- vpaddd %xmm15, %xmm11, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm1, %xmm1
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm13, %xmm13
- vpxor %xmm13, %xmm14, %xmm14
- vpslld $20, %xmm14, %xmm11
- vpsrld $12, %xmm14, %xmm14
- vpxor %xmm11, %xmm14, %xmm14
- vpblendw $3, %xmm2, %xmm4, %xmm11
- vpslldq $8, %xmm11, %xmm0
- vpblendw $15, %xmm5, %xmm0, %xmm0
- vpshufd $99, %xmm0, %xmm0
- vpaddd %xmm15, %xmm0, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm1, %xmm0
- vpaddd %xmm12, %xmm15, %xmm15
- vpshufb %xmm6, %xmm0, %xmm0
- vpaddd %xmm0, %xmm13, %xmm13
- vpxor %xmm13, %xmm14, %xmm14
- vpshufd $57, %xmm0, %xmm0
- vpshufd $78, %xmm13, %xmm13
- vpslld $25, %xmm14, %xmm1
- vpsrld $7, %xmm14, %xmm14
- vpxor %xmm1, %xmm14, %xmm14
- vpblendw $3, %xmm5, %xmm4, %xmm1
- vpshufd $147, %xmm14, %xmm14
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpshufb %xmm7, %xmm0, %xmm0
- vpaddd %xmm0, %xmm13, %xmm13
- vpxor %xmm13, %xmm14, %xmm14
- vpslld $20, %xmm14, %xmm12
- vpsrld $12, %xmm14, %xmm14
- vpxor %xmm12, %xmm14, %xmm14
- vpsrldq $4, %xmm2, %xmm12
- vpblendw $60, %xmm12, %xmm1, %xmm1
- vpaddd %xmm1, %xmm15, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpblendw $12, %xmm4, %xmm3, %xmm1
- vpshufb %xmm6, %xmm0, %xmm0
- vpaddd %xmm0, %xmm13, %xmm13
- vpxor %xmm13, %xmm14, %xmm14
- vpshufd $147, %xmm0, %xmm0
- vpshufd $78, %xmm13, %xmm13
- vpslld $25, %xmm14, %xmm12
- vpsrld $7, %xmm14, %xmm14
- vpxor %xmm12, %xmm14, %xmm14
- vpsrldq $4, %xmm5, %xmm12
- vpblendw $48, %xmm12, %xmm1, %xmm1
- vpshufd $33, %xmm5, %xmm12
- vpshufd $57, %xmm14, %xmm14
- vpshufd $108, %xmm1, %xmm1
- vpblendw $51, %xmm12, %xmm10, %xmm12
- vpaddd %xmm15, %xmm1, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpaddd %xmm12, %xmm15, %xmm15
- vpshufb %xmm7, %xmm0, %xmm0
- vpaddd %xmm0, %xmm13, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpslld $20, %xmm14, %xmm13
- vpsrld $12, %xmm14, %xmm14
- vpxor %xmm13, %xmm14, %xmm14
- vpslldq $12, %xmm3, %xmm13
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpshufb %xmm6, %xmm0, %xmm0
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpshufd $57, %xmm0, %xmm0
- vpshufd $78, %xmm1, %xmm1
- vpslld $25, %xmm14, %xmm12
- vpsrld $7, %xmm14, %xmm14
- vpxor %xmm12, %xmm14, %xmm14
- vpblendw $51, %xmm5, %xmm4, %xmm12
- vpshufd $147, %xmm14, %xmm14
- vpblendw $192, %xmm13, %xmm12, %xmm12
- vpaddd %xmm12, %xmm15, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpsrldq $4, %xmm3, %xmm12
- vpshufb %xmm7, %xmm0, %xmm0
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpslld $20, %xmm14, %xmm13
- vpsrld $12, %xmm14, %xmm14
- vpxor %xmm13, %xmm14, %xmm14
- vpblendw $48, %xmm2, %xmm5, %xmm13
- vpblendw $3, %xmm12, %xmm13, %xmm13
- vpshufd $156, %xmm13, %xmm13
- vpaddd %xmm15, %xmm13, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpshufb %xmm6, %xmm0, %xmm0
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpshufd $147, %xmm0, %xmm0
- vpshufd $78, %xmm1, %xmm1
- vpslld $25, %xmm14, %xmm13
- vpsrld $7, %xmm14, %xmm14
- vpxor %xmm13, %xmm14, %xmm14
- vpunpcklqdq %xmm2, %xmm4, %xmm13
- vpshufd $57, %xmm14, %xmm14
- vpblendw $12, %xmm12, %xmm13, %xmm12
- vpshufd $180, %xmm12, %xmm12
- vpaddd %xmm15, %xmm12, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpshufb %xmm7, %xmm0, %xmm0
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpslld $20, %xmm14, %xmm12
- vpsrld $12, %xmm14, %xmm14
- vpxor %xmm12, %xmm14, %xmm14
- vpunpckhqdq %xmm9, %xmm4, %xmm12
- vpshufd $198, %xmm12, %xmm12
- vpaddd %xmm15, %xmm12, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpaddd %xmm15, %xmm8, %xmm15
- vpshufb %xmm6, %xmm0, %xmm0
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpshufd $57, %xmm0, %xmm0
- vpshufd $78, %xmm1, %xmm1
- vpslld $25, %xmm14, %xmm12
- vpsrld $7, %xmm14, %xmm14
- vpxor %xmm12, %xmm14, %xmm14
- vpsrldq $4, %xmm4, %xmm12
- vpshufd $147, %xmm14, %xmm14
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpshufb %xmm7, %xmm0, %xmm0
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpslld $20, %xmm14, %xmm8
- vpsrld $12, %xmm14, %xmm14
- vpxor %xmm14, %xmm8, %xmm14
- vpblendw $48, %xmm5, %xmm2, %xmm8
- vpblendw $3, %xmm12, %xmm8, %xmm8
- vpunpckhqdq %xmm5, %xmm4, %xmm12
- vpshufd $75, %xmm8, %xmm8
- vpblendw $60, %xmm10, %xmm12, %xmm10
- vpaddd %xmm15, %xmm8, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm0, %xmm15, %xmm0
- vpshufd $45, %xmm10, %xmm10
- vpshufb %xmm6, %xmm0, %xmm0
- vpaddd %xmm15, %xmm10, %xmm15
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpshufd $147, %xmm0, %xmm0
- vpshufd $78, %xmm1, %xmm1
- vpslld $25, %xmm14, %xmm8
- vpsrld $7, %xmm14, %xmm14
- vpxor %xmm14, %xmm8, %xmm8
- vpshufd $57, %xmm8, %xmm8
- vpaddd %xmm8, %xmm15, %xmm15
- vpxor %xmm0, %xmm15, %xmm0
- vpshufb %xmm7, %xmm0, %xmm0
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm8, %xmm1, %xmm8
- vpslld $20, %xmm8, %xmm10
- vpsrld $12, %xmm8, %xmm8
- vpxor %xmm8, %xmm10, %xmm10
- vpunpckldq %xmm3, %xmm4, %xmm8
- vpunpcklqdq %xmm9, %xmm8, %xmm9
- vpaddd %xmm9, %xmm15, %xmm9
- vpaddd %xmm10, %xmm9, %xmm9
- vpxor %xmm0, %xmm9, %xmm8
- vpshufb %xmm6, %xmm8, %xmm8
- vpaddd %xmm8, %xmm1, %xmm1
- vpxor %xmm1, %xmm10, %xmm10
- vpshufd $57, %xmm8, %xmm8
- vpshufd $78, %xmm1, %xmm1
- vpslld $25, %xmm10, %xmm12
- vpsrld $7, %xmm10, %xmm10
- vpxor %xmm10, %xmm12, %xmm10
- vpblendw $48, %xmm4, %xmm3, %xmm12
- vpshufd $147, %xmm10, %xmm0
- vpunpckhdq %xmm5, %xmm3, %xmm10
- vpshufd $78, %xmm12, %xmm12
- vpunpcklqdq %xmm4, %xmm10, %xmm10
- vpblendw $192, %xmm2, %xmm10, %xmm10
- vpshufhw $78, %xmm10, %xmm10
- vpaddd %xmm10, %xmm9, %xmm10
- vpaddd %xmm0, %xmm10, %xmm10
- vpxor %xmm8, %xmm10, %xmm8
- vpshufb %xmm7, %xmm8, %xmm8
- vpaddd %xmm8, %xmm1, %xmm1
- vpxor %xmm0, %xmm1, %xmm9
- vpslld $20, %xmm9, %xmm0
- vpsrld $12, %xmm9, %xmm9
- vpxor %xmm9, %xmm0, %xmm0
- vpunpckhdq %xmm5, %xmm4, %xmm9
- vpblendw $240, %xmm9, %xmm2, %xmm13
- vpshufd $39, %xmm13, %xmm13
- vpaddd %xmm10, %xmm13, %xmm10
- vpaddd %xmm0, %xmm10, %xmm10
- vpxor %xmm8, %xmm10, %xmm8
- vpblendw $12, %xmm4, %xmm2, %xmm13
- vpshufb %xmm6, %xmm8, %xmm8
- vpslldq $4, %xmm13, %xmm13
- vpblendw $15, %xmm5, %xmm13, %xmm13
- vpaddd %xmm8, %xmm1, %xmm1
- vpxor %xmm1, %xmm0, %xmm0
- vpaddd %xmm13, %xmm10, %xmm13
- vpshufd $147, %xmm8, %xmm8
- vpshufd $78, %xmm1, %xmm1
- vpslld $25, %xmm0, %xmm14
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm0, %xmm14, %xmm14
- vpshufd $57, %xmm14, %xmm14
- vpaddd %xmm14, %xmm13, %xmm13
- vpxor %xmm8, %xmm13, %xmm8
- vpaddd %xmm13, %xmm12, %xmm12
- vpshufb %xmm7, %xmm8, %xmm8
- vpaddd %xmm8, %xmm1, %xmm1
- vpxor %xmm14, %xmm1, %xmm14
- vpslld $20, %xmm14, %xmm10
- vpsrld $12, %xmm14, %xmm14
- vpxor %xmm14, %xmm10, %xmm10
- vpaddd %xmm10, %xmm12, %xmm12
- vpxor %xmm8, %xmm12, %xmm8
- vpshufb %xmm6, %xmm8, %xmm8
- vpaddd %xmm8, %xmm1, %xmm1
- vpxor %xmm1, %xmm10, %xmm0
- vpshufd $57, %xmm8, %xmm8
- vpshufd $78, %xmm1, %xmm1
- vpslld $25, %xmm0, %xmm10
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm0, %xmm10, %xmm10
- vpblendw $48, %xmm2, %xmm3, %xmm0
- vpblendw $15, %xmm11, %xmm0, %xmm0
- vpshufd $147, %xmm10, %xmm10
- vpshufd $114, %xmm0, %xmm0
- vpaddd %xmm12, %xmm0, %xmm0
- vpaddd %xmm10, %xmm0, %xmm0
- vpxor %xmm8, %xmm0, %xmm8
- vpshufb %xmm7, %xmm8, %xmm8
- vpaddd %xmm8, %xmm1, %xmm1
- vpxor %xmm10, %xmm1, %xmm10
- vpslld $20, %xmm10, %xmm11
- vpsrld $12, %xmm10, %xmm10
- vpxor %xmm10, %xmm11, %xmm10
- vpslldq $4, %xmm4, %xmm11
- vpblendw $192, %xmm11, %xmm3, %xmm3
- vpunpckldq %xmm5, %xmm4, %xmm4
- vpshufd $99, %xmm3, %xmm3
- vpaddd %xmm0, %xmm3, %xmm3
- vpaddd %xmm10, %xmm3, %xmm3
- vpxor %xmm8, %xmm3, %xmm11
- vpunpckldq %xmm5, %xmm2, %xmm0
- vpblendw $192, %xmm2, %xmm5, %xmm2
- vpshufb %xmm6, %xmm11, %xmm11
- vpunpckhqdq %xmm0, %xmm9, %xmm0
- vpblendw $15, %xmm4, %xmm2, %xmm4
- vpaddd %xmm11, %xmm1, %xmm1
- vpxor %xmm1, %xmm10, %xmm10
- vpshufd $147, %xmm11, %xmm11
- vpshufd $201, %xmm0, %xmm0
- vpslld $25, %xmm10, %xmm8
- vpsrld $7, %xmm10, %xmm10
- vpxor %xmm10, %xmm8, %xmm10
- vpshufd $78, %xmm1, %xmm1
- vpaddd %xmm3, %xmm0, %xmm0
- vpshufd $27, %xmm4, %xmm4
- vpshufd $57, %xmm10, %xmm10
- vpaddd %xmm10, %xmm0, %xmm0
- vpxor %xmm11, %xmm0, %xmm11
- vpaddd %xmm0, %xmm4, %xmm0
- vpshufb %xmm7, %xmm11, %xmm7
- vpaddd %xmm7, %xmm1, %xmm1
- vpxor %xmm10, %xmm1, %xmm10
- vpslld $20, %xmm10, %xmm8
- vpsrld $12, %xmm10, %xmm10
- vpxor %xmm10, %xmm8, %xmm8
- vpaddd %xmm8, %xmm0, %xmm0
- vpxor %xmm7, %xmm0, %xmm7
- vpshufb %xmm6, %xmm7, %xmm6
- vpaddd %xmm6, %xmm1, %xmm1
- vpxor %xmm1, %xmm8, %xmm8
- vpshufd $78, %xmm1, %xmm1
- vpshufd $57, %xmm6, %xmm6
- vpslld $25, %xmm8, %xmm2
- vpsrld $7, %xmm8, %xmm8
- vpxor %xmm8, %xmm2, %xmm8
- vpxor (%rdi), %xmm1, %xmm1
- vpshufd $147, %xmm8, %xmm8
- vpxor %xmm0, %xmm1, %xmm0
- vmovups %xmm0, (%rdi)
- vpxor 16(%rdi), %xmm8, %xmm0
- vpxor %xmm6, %xmm0, %xmm6
- vmovups %xmm6, 16(%rdi)
- addq $64, %rsi
- decq %rdx
- jnz .Lbeginofloop
+ movdqa %xmm0,%xmm10
+ movdqa %xmm1,%xmm11
+ paddq %xmm15,%xmm14
+ movdqa IV(%rip),%xmm2
+ movdqa %xmm14,%xmm3
+ pxor IV+0x10(%rip),%xmm3
+ movl 0x8(%rsi),%r8d
+ movl 0x18(%rsi),%r9d
+ movl (%rsi),%r10d
+ movl 0x10(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0xc(%rsi),%r8d
+ movl 0x1c(%rsi),%r9d
+ movl 0x4(%rsi),%r10d
+ movl 0x14(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x20(%rsi),%r8d
+ movl 0x30(%rsi),%r9d
+ movl 0x38(%rsi),%r10d
+ movl 0x28(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x24(%rsi),%r8d
+ movl 0x34(%rsi),%r9d
+ movl 0x3c(%rsi),%r10d
+ movl 0x2c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x10(%rsi),%r8d
+ movl 0x34(%rsi),%r9d
+ movl 0x38(%rsi),%r10d
+ movl 0x24(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x20(%rsi),%r8d
+ movl 0x18(%rsi),%r9d
+ movl 0x28(%rsi),%r10d
+ movl 0x3c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x4(%rsi),%r8d
+ movl 0x2c(%rsi),%r9d
+ movl 0x14(%rsi),%r10d
+ movl (%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x30(%rsi),%r8d
+ movl 0x1c(%rsi),%r9d
+ movl 0xc(%rsi),%r10d
+ movl 0x8(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x30(%rsi),%r8d
+ movl 0x3c(%rsi),%r9d
+ movl 0x2c(%rsi),%r10d
+ movl 0x14(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl (%rsi),%r8d
+ movl 0x34(%rsi),%r9d
+ movl 0x20(%rsi),%r10d
+ movl 0x8(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x28(%rsi),%r8d
+ movl 0x1c(%rsi),%r9d
+ movl 0x24(%rsi),%r10d
+ movl 0xc(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x38(%rsi),%r8d
+ movl 0x4(%rsi),%r9d
+ movl 0x10(%rsi),%r10d
+ movl 0x18(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0xc(%rsi),%r8d
+ movl 0x2c(%rsi),%r9d
+ movl 0x1c(%rsi),%r10d
+ movl 0x34(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x4(%rsi),%r8d
+ movl 0x38(%rsi),%r9d
+ movl 0x24(%rsi),%r10d
+ movl 0x30(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x8(%rsi),%r8d
+ movl 0x10(%rsi),%r9d
+ movl 0x3c(%rsi),%r10d
+ movl 0x14(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x18(%rsi),%r8d
+ movl (%rsi),%r9d
+ movl 0x20(%rsi),%r10d
+ movl 0x28(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x14(%rsi),%r8d
+ movl 0x28(%rsi),%r9d
+ movl 0x24(%rsi),%r10d
+ movl 0x8(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x1c(%rsi),%r8d
+ movl 0x3c(%rsi),%r9d
+ movl (%rsi),%r10d
+ movl 0x10(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x38(%rsi),%r8d
+ movl 0x18(%rsi),%r9d
+ movl 0xc(%rsi),%r10d
+ movl 0x2c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x4(%rsi),%r8d
+ movl 0x20(%rsi),%r9d
+ movl 0x34(%rsi),%r10d
+ movl 0x30(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x18(%rsi),%r8d
+ movl 0x20(%rsi),%r9d
+ movl 0x8(%rsi),%r10d
+ movl (%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x28(%rsi),%r8d
+ movl 0xc(%rsi),%r9d
+ movl 0x30(%rsi),%r10d
+ movl 0x2c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x10(%rsi),%r8d
+ movl 0x3c(%rsi),%r9d
+ movl 0x4(%rsi),%r10d
+ movl 0x1c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x34(%rsi),%r8d
+ movl 0x38(%rsi),%r9d
+ movl 0x24(%rsi),%r10d
+ movl 0x14(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x4(%rsi),%r8d
+ movl 0x10(%rsi),%r9d
+ movl 0x30(%rsi),%r10d
+ movl 0x38(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x3c(%rsi),%r8d
+ movl 0x28(%rsi),%r9d
+ movl 0x14(%rsi),%r10d
+ movl 0x34(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl (%rsi),%r8d
+ movl 0x24(%rsi),%r9d
+ movl 0x20(%rsi),%r10d
+ movl 0x18(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x1c(%rsi),%r8d
+ movl 0x8(%rsi),%r9d
+ movl 0x2c(%rsi),%r10d
+ movl 0xc(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x1c(%rsi),%r8d
+ movl 0xc(%rsi),%r9d
+ movl 0x34(%rsi),%r10d
+ movl 0x30(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x38(%rsi),%r8d
+ movl 0x24(%rsi),%r9d
+ movl 0x2c(%rsi),%r10d
+ movl 0x4(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x14(%rsi),%r8d
+ movl 0x20(%rsi),%r9d
+ movl 0x8(%rsi),%r10d
+ movl 0x3c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl (%rsi),%r8d
+ movl 0x18(%rsi),%r9d
+ movl 0x28(%rsi),%r10d
+ movl 0x10(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x38(%rsi),%r8d
+ movl (%rsi),%r9d
+ movl 0x18(%rsi),%r10d
+ movl 0x2c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x24(%rsi),%r8d
+ movl 0x20(%rsi),%r9d
+ movl 0x3c(%rsi),%r10d
+ movl 0xc(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x30(%rsi),%r8d
+ movl 0x4(%rsi),%r9d
+ movl 0x28(%rsi),%r10d
+ movl 0x34(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x8(%rsi),%r8d
+ movl 0x10(%rsi),%r9d
+ movl 0x14(%rsi),%r10d
+ movl 0x1c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x20(%rsi),%r8d
+ movl 0x4(%rsi),%r9d
+ movl 0x28(%rsi),%r10d
+ movl 0x1c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x10(%rsi),%r8d
+ movl 0x14(%rsi),%r9d
+ movl 0x8(%rsi),%r10d
+ movl 0x18(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x3c(%rsi),%r8d
+ movl 0xc(%rsi),%r9d
+ movl 0x34(%rsi),%r10d
+ movl 0x24(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x2c(%rsi),%r8d
+ movl 0x30(%rsi),%r9d
+ movl (%rsi),%r10d
+ movl 0x38(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm1
+ pxor %xmm10,%xmm0
+ pxor %xmm11,%xmm1
+ addq $0x40,%rsi
+ decq %rdx
+ jnz .Lbeginofloop
+ movdqu %xmm0,(%rdi)
+ movdqu %xmm1,0x10(%rdi)
+ movdqu %xmm14,0x20(%rdi)
.Lendofloop:
ret
-ENDPROC(blake2s_compress_avx)
-#endif /* CONFIG_AS_AVX */
+ENDPROC(blake2s_compress_ssse3)
+#endif /* CONFIG_AS_SSSE3 */
#ifdef CONFIG_AS_AVX512
ENTRY(blake2s_compress_avx512)
@@ -647,9 +1011,9 @@ ENTRY(blake2s_compress_avx512)
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0x7,%xmm1,%xmm1
- vpshufd $0x39,%xmm1,%xmm1
- vpshufd $0x4e,%xmm2,%xmm2
- vpshufd $0x93,%xmm3,%xmm3
+ vpshufd $0x93,%xmm0,%xmm0
+ vpshufd $0x4e,%xmm3,%xmm3
+ vpshufd $0x39,%xmm2,%xmm2
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
@@ -665,9 +1029,9 @@ ENTRY(blake2s_compress_avx512)
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0x7,%xmm1,%xmm1
- vpshufd $0x93,%xmm1,%xmm1
- vpshufd $0x4e,%xmm2,%xmm2
- vpshufd $0x39,%xmm3,%xmm3
+ vpshufd $0x39,%xmm0,%xmm0
+ vpshufd $0x4e,%xmm3,%xmm3
+ vpshufd $0x93,%xmm2,%xmm2
decb %cl
jne .Lblake2s_compress_avx512_roundloop
vpxor %xmm10,%xmm0,%xmm0
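
Two recurring idioms in the new SSSE3 body, sketched here for reference. First, each vector of message words is assembled from four scalar movl loads, packed pairwise into 64-bit registers with shlq/orq and joined with movq + punpcklqdq. Second, between the column and diagonal halves of a round the rows are rotated with pshufd: $0x93 moves each lane to the next higher position, $0x39 moves each to the next lower position, and $0x4e swaps the two 64-bit halves. The helper below (gather4 is my own name, and the intrinsics framing is illustration only, not the zinc code) mirrors the gather pattern; for example, the first gather of round 0 corresponds to gather4(m, 0, 2, 4, 6) under the re-permuted SIGMA ordering:

#include <stdint.h>
#include <emmintrin.h>	/* SSE2 intrinsics */

/* Gather four 32-bit message words m[i0..i3] into one vector the way the
 * assembly does: two scalar loads packed into each 64-bit half
 * (movl/shlq/orq/movq), then the halves joined with punpcklqdq. */
static inline __m128i gather4(const uint32_t *m, int i0, int i1, int i2, int i3)
{
	uint64_t lo = ((uint64_t)m[i1] << 32) | m[i0];
	uint64_t hi = ((uint64_t)m[i3] << 32) | m[i2];

	return _mm_set_epi64x((long long)hi, (long long)lo);
}
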