author     Samuel Neves <sneves@dei.uc.pt>      2019-05-04 17:14:09 +0100
committer  Jason A. Donenfeld <Jason@zx2c4.com> 2019-05-29 01:23:24 +0200
commit     22bbac4d2ffb62f28b0483f05f24a0f41639b787 (patch)
tree       262a0864dc669ac71dd27264f119c145799c4bc0 /src/crypto/zinc
parent     qemu: do not check for alignment with ubsan (diff)
blake2s,chacha: latency tweak
In every odd-numbered round, instead of operating over the state

    x00 x01 x02 x03
    x05 x06 x07 x04
    x10 x11 x08 x09
    x15 x12 x13 x14

we operate over the rotated state

    x03 x00 x01 x02
    x04 x05 x06 x07
    x09 x10 x11 x08
    x14 x15 x12 x13

The advantage here is that this requires no changes to the 'x04 x05 x06 x07' row, which is in the critical path. This results in a noticeable latency improvement of roughly R cycles, for R diagonal rounds in the primitive.

In the case of BLAKE2s, which I also moved from requiring AVX to only requiring SSSE3, we save approximately 30 cycles per compression function call on Haswell and Skylake. In other words, this is an improvement of ~0.6 cpb.

This idea was pointed out to me by Shunsuke Shimizu, though it appears to have been around for longer.

Signed-off-by: Samuel Neves <sneves@dei.uc.pt>
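As a reading aid (not part of the patch), here is a minimal C-intrinsics sketch of the diagonalization change for a ChaCha-style state held as four row vectors a, b, c, d. The helper names are illustrative, but the shuffle constants (0x39, 0x4e, 0x93) are the same ones that appear in the pshufd hunks of the x86_64 ChaCha code below: the classic scheme keeps row a fixed and rotates b, c, d, while the rotated-state scheme keeps the critical-path row b fixed and rotates a, c, d instead (the undiagonalize step uses the inverse rotations).

    /*
     * Illustrative sketch only; not the patch itself.  Requires SSE2.
     */
    #include <emmintrin.h>

    /* Classic diagonal round setup: row a stays put; b, c, d rotate. */
    static inline void diagonalize_classic(__m128i *b, __m128i *c, __m128i *d)
    {
            *b = _mm_shuffle_epi32(*b, _MM_SHUFFLE(0, 3, 2, 1)); /* 0x39: rotate by 1 */
            *c = _mm_shuffle_epi32(*c, _MM_SHUFFLE(1, 0, 3, 2)); /* 0x4e: rotate by 2 */
            *d = _mm_shuffle_epi32(*d, _MM_SHUFFLE(2, 1, 0, 3)); /* 0x93: rotate by 3 */
    }

    /* Rotated-state variant from this commit: row b (the 'x04 x05 x06 x07'
     * row on the critical path) stays put; a, c, d rotate instead. */
    static inline void diagonalize_rotated(__m128i *a, __m128i *c, __m128i *d)
    {
            *a = _mm_shuffle_epi32(*a, _MM_SHUFFLE(2, 1, 0, 3)); /* 0x93: rotate by 3 */
            *d = _mm_shuffle_epi32(*d, _MM_SHUFFLE(1, 0, 3, 2)); /* 0x4e: rotate by 2 */
            *c = _mm_shuffle_epi32(*c, _MM_SHUFFLE(0, 3, 2, 1)); /* 0x39: rotate by 1 */
    }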
Diffstat (limited to 'src/crypto/zinc')
-rw-r--r--  src/crypto/zinc/blake2s/blake2s-x86_64-glue.c  |   14
-rw-r--r--  src/crypto/zinc/blake2s/blake2s-x86_64.S       | 1526
-rw-r--r--  src/crypto/zinc/chacha20/chacha20-arm.pl       |    6
-rw-r--r--  src/crypto/zinc/chacha20/chacha20-arm64.pl     |    6
-rw-r--r--  src/crypto/zinc/chacha20/chacha20-x86_64.pl    |   48
5 files changed, 982 insertions, 618 deletions
diff --git a/src/crypto/zinc/blake2s/blake2s-x86_64-glue.c b/src/crypto/zinc/blake2s/blake2s-x86_64-glue.c
index 9a956be..087a48d 100644
--- a/src/crypto/zinc/blake2s/blake2s-x86_64-glue.c
+++ b/src/crypto/zinc/blake2s/blake2s-x86_64-glue.c
@@ -8,22 +8,22 @@
#include <asm/processor.h>
#include <asm/fpu/api.h>
-asmlinkage void blake2s_compress_avx(struct blake2s_state *state,
+asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
const u8 *block, const size_t nblocks,
const u32 inc);
asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
const u8 *block, const size_t nblocks,
const u32 inc);
-static bool blake2s_use_avx __ro_after_init;
+static bool blake2s_use_ssse3 __ro_after_init;
static bool blake2s_use_avx512 __ro_after_init;
static bool *const blake2s_nobs[] __initconst = { &blake2s_use_avx512 };
static void __init blake2s_fpu_init(void)
{
- blake2s_use_avx =
- boot_cpu_has(X86_FEATURE_AVX) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+ blake2s_use_ssse3 =
+ boot_cpu_has(X86_FEATURE_SSSE3) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL);
#ifndef COMPAT_CANNOT_USE_AVX512
blake2s_use_avx512 =
boot_cpu_has(X86_FEATURE_AVX) &&
@@ -47,7 +47,7 @@ static inline bool blake2s_compress_arch(struct blake2s_state *state,
simd_get(&simd_context);
- if (!IS_ENABLED(CONFIG_AS_AVX) || !blake2s_use_avx ||
+ if (!IS_ENABLED(CONFIG_AS_SSSE3) || !blake2s_use_ssse3 ||
!simd_use(&simd_context))
goto out;
used_arch = true;
@@ -59,7 +59,7 @@ static inline bool blake2s_compress_arch(struct blake2s_state *state,
if (IS_ENABLED(CONFIG_AS_AVX512) && blake2s_use_avx512)
blake2s_compress_avx512(state, block, blocks, inc);
else
- blake2s_compress_avx(state, block, blocks, inc);
+ blake2s_compress_ssse3(state, block, blocks, inc);
nblocks -= blocks;
if (!nblocks)
diff --git a/src/crypto/zinc/blake2s/blake2s-x86_64.S b/src/crypto/zinc/blake2s/blake2s-x86_64.S
index 675288f..9bb4c83 100644
--- a/src/crypto/zinc/blake2s/blake2s-x86_64.S
+++ b/src/crypto/zinc/blake2s/blake2s-x86_64.S
@@ -20,588 +20,952 @@ ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 640
.align 64
SIGMA:
-.long 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15
-.long 11, 2, 12, 14, 9, 8, 15, 3, 4, 0, 13, 6, 10, 1, 7, 5
-.long 10, 12, 11, 6, 5, 9, 13, 3, 4, 15, 14, 2, 0, 7, 8, 1
-.long 10, 9, 7, 0, 11, 14, 1, 12, 6, 2, 15, 3, 13, 8, 5, 4
-.long 4, 9, 8, 13, 14, 0, 10, 11, 7, 3, 12, 1, 5, 6, 15, 2
-.long 2, 10, 4, 14, 13, 3, 9, 11, 6, 5, 7, 12, 15, 1, 8, 0
-.long 4, 11, 14, 8, 13, 10, 12, 5, 2, 1, 15, 3, 9, 7, 0, 6
-.long 6, 12, 0, 13, 15, 2, 1, 10, 4, 5, 11, 14, 8, 3, 9, 7
-.long 14, 5, 4, 12, 9, 7, 3, 10, 2, 0, 6, 15, 11, 1, 13, 8
-.long 11, 7, 13, 10, 12, 14, 0, 15, 4, 5, 6, 9, 2, 1, 8, 3
+.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
+.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
+.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
+.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
+.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
+.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
+.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
+.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
+.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
+.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
#endif /* CONFIG_AS_AVX512 */
.text
-#ifdef CONFIG_AS_AVX
-ENTRY(blake2s_compress_avx)
- movl %ecx, %ecx
- testq %rdx, %rdx
- je .Lendofloop
+#ifdef CONFIG_AS_SSSE3
+ENTRY(blake2s_compress_ssse3)
+ testq %rdx, %rdx
+ je .Lendofloop
+ movdqu (%rdi),%xmm0
+ movdqu 0x10(%rdi),%xmm1
+ movdqa ROT16(%rip),%xmm12
+ movdqa ROR328(%rip),%xmm13
+ movdqu 0x20(%rdi),%xmm14
+ movq %rcx,%xmm15
+ jmp .Lbeginofloop
.align 32
.Lbeginofloop:
- addq %rcx, 32(%rdi)
- vmovdqu IV+16(%rip), %xmm1
- vmovdqu (%rsi), %xmm4
- vpxor 32(%rdi), %xmm1, %xmm1
- vmovdqu 16(%rsi), %xmm3
- vshufps $136, %xmm3, %xmm4, %xmm6
- vmovdqa ROT16(%rip), %xmm7
- vpaddd (%rdi), %xmm6, %xmm6
- vpaddd 16(%rdi), %xmm6, %xmm6
- vpxor %xmm6, %xmm1, %xmm1
- vmovdqu IV(%rip), %xmm8
- vpshufb %xmm7, %xmm1, %xmm1
- vmovdqu 48(%rsi), %xmm5
- vpaddd %xmm1, %xmm8, %xmm8
- vpxor 16(%rdi), %xmm8, %xmm9
- vmovdqu 32(%rsi), %xmm2
- vpblendw $12, %xmm3, %xmm5, %xmm13
- vshufps $221, %xmm5, %xmm2, %xmm12
- vpunpckhqdq %xmm2, %xmm4, %xmm14
- vpslld $20, %xmm9, %xmm0
- vpsrld $12, %xmm9, %xmm9
- vpxor %xmm0, %xmm9, %xmm0
- vshufps $221, %xmm3, %xmm4, %xmm9
- vpaddd %xmm9, %xmm6, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vmovdqa ROR328(%rip), %xmm6
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm8, %xmm8
- vpxor %xmm8, %xmm0, %xmm0
- vpshufd $147, %xmm1, %xmm1
- vpshufd $78, %xmm8, %xmm8
- vpslld $25, %xmm0, %xmm10
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm10, %xmm0, %xmm0
- vshufps $136, %xmm5, %xmm2, %xmm10
- vpshufd $57, %xmm0, %xmm0
- vpaddd %xmm10, %xmm9, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpaddd %xmm12, %xmm9, %xmm9
- vpblendw $12, %xmm2, %xmm3, %xmm12
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm8, %xmm8
- vpxor %xmm8, %xmm0, %xmm10
- vpslld $20, %xmm10, %xmm0
- vpsrld $12, %xmm10, %xmm10
- vpxor %xmm0, %xmm10, %xmm0
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm8, %xmm8
- vpxor %xmm8, %xmm0, %xmm0
- vpshufd $57, %xmm1, %xmm1
- vpshufd $78, %xmm8, %xmm8
- vpslld $25, %xmm0, %xmm10
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm10, %xmm0, %xmm0
- vpslldq $4, %xmm5, %xmm10
- vpblendw $240, %xmm10, %xmm12, %xmm12
- vpshufd $147, %xmm0, %xmm0
- vpshufd $147, %xmm12, %xmm12
- vpaddd %xmm9, %xmm12, %xmm12
- vpaddd %xmm0, %xmm12, %xmm12
- vpxor %xmm12, %xmm1, %xmm1
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm8, %xmm8
- vpxor %xmm8, %xmm0, %xmm11
- vpslld $20, %xmm11, %xmm9
- vpsrld $12, %xmm11, %xmm11
- vpxor %xmm9, %xmm11, %xmm0
- vpshufd $8, %xmm2, %xmm9
- vpblendw $192, %xmm5, %xmm3, %xmm11
- vpblendw $240, %xmm11, %xmm9, %xmm9
- vpshufd $177, %xmm9, %xmm9
- vpaddd %xmm12, %xmm9, %xmm9
- vpaddd %xmm0, %xmm9, %xmm11
- vpxor %xmm11, %xmm1, %xmm1
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm8, %xmm8
- vpxor %xmm8, %xmm0, %xmm9
- vpshufd $147, %xmm1, %xmm1
- vpshufd $78, %xmm8, %xmm8
- vpslld $25, %xmm9, %xmm0
- vpsrld $7, %xmm9, %xmm9
- vpxor %xmm0, %xmm9, %xmm0
- vpslldq $4, %xmm3, %xmm9
- vpblendw $48, %xmm9, %xmm2, %xmm9
- vpblendw $240, %xmm9, %xmm4, %xmm9
- vpshufd $57, %xmm0, %xmm0
- vpshufd $177, %xmm9, %xmm9
- vpaddd %xmm11, %xmm9, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm8, %xmm11
- vpxor %xmm11, %xmm0, %xmm0
- vpslld $20, %xmm0, %xmm8
- vpsrld $12, %xmm0, %xmm0
- vpxor %xmm8, %xmm0, %xmm0
- vpunpckhdq %xmm3, %xmm4, %xmm8
- vpblendw $12, %xmm10, %xmm8, %xmm12
- vpshufd $177, %xmm12, %xmm12
- vpaddd %xmm9, %xmm12, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm11, %xmm11
- vpxor %xmm11, %xmm0, %xmm0
- vpshufd $57, %xmm1, %xmm1
- vpshufd $78, %xmm11, %xmm11
- vpslld $25, %xmm0, %xmm12
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm12, %xmm0, %xmm0
- vpunpckhdq %xmm5, %xmm2, %xmm12
- vpshufd $147, %xmm0, %xmm0
- vpblendw $15, %xmm13, %xmm12, %xmm12
- vpslldq $8, %xmm5, %xmm13
- vpshufd $210, %xmm12, %xmm12
- vpaddd %xmm9, %xmm12, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm11, %xmm11
- vpxor %xmm11, %xmm0, %xmm0
- vpslld $20, %xmm0, %xmm12
- vpsrld $12, %xmm0, %xmm0
- vpxor %xmm12, %xmm0, %xmm0
- vpunpckldq %xmm4, %xmm2, %xmm12
- vpblendw $240, %xmm4, %xmm12, %xmm12
- vpblendw $192, %xmm13, %xmm12, %xmm12
- vpsrldq $12, %xmm3, %xmm13
- vpaddd %xmm12, %xmm9, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm11, %xmm11
- vpxor %xmm11, %xmm0, %xmm0
- vpshufd $147, %xmm1, %xmm1
- vpshufd $78, %xmm11, %xmm11
- vpslld $25, %xmm0, %xmm12
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm12, %xmm0, %xmm0
- vpblendw $60, %xmm2, %xmm4, %xmm12
- vpblendw $3, %xmm13, %xmm12, %xmm12
- vpshufd $57, %xmm0, %xmm0
- vpshufd $78, %xmm12, %xmm12
- vpaddd %xmm9, %xmm12, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm11, %xmm11
- vpxor %xmm11, %xmm0, %xmm12
- vpslld $20, %xmm12, %xmm13
- vpsrld $12, %xmm12, %xmm0
- vpblendw $51, %xmm3, %xmm4, %xmm12
- vpxor %xmm13, %xmm0, %xmm0
- vpblendw $192, %xmm10, %xmm12, %xmm10
- vpslldq $8, %xmm2, %xmm12
- vpshufd $27, %xmm10, %xmm10
- vpaddd %xmm9, %xmm10, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm11, %xmm11
- vpxor %xmm11, %xmm0, %xmm0
- vpshufd $57, %xmm1, %xmm1
- vpshufd $78, %xmm11, %xmm11
- vpslld $25, %xmm0, %xmm10
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm10, %xmm0, %xmm0
- vpunpckhdq %xmm2, %xmm8, %xmm10
- vpshufd $147, %xmm0, %xmm0
- vpblendw $12, %xmm5, %xmm10, %xmm10
- vpshufd $210, %xmm10, %xmm10
- vpaddd %xmm9, %xmm10, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm11, %xmm11
- vpxor %xmm11, %xmm0, %xmm10
- vpslld $20, %xmm10, %xmm0
- vpsrld $12, %xmm10, %xmm10
- vpxor %xmm0, %xmm10, %xmm0
- vpblendw $12, %xmm4, %xmm5, %xmm10
- vpblendw $192, %xmm12, %xmm10, %xmm10
- vpunpckldq %xmm2, %xmm4, %xmm12
- vpshufd $135, %xmm10, %xmm10
- vpaddd %xmm9, %xmm10, %xmm9
- vpaddd %xmm0, %xmm9, %xmm9
- vpxor %xmm9, %xmm1, %xmm1
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm11, %xmm13
- vpxor %xmm13, %xmm0, %xmm0
- vpshufd $147, %xmm1, %xmm1
- vpshufd $78, %xmm13, %xmm13
- vpslld $25, %xmm0, %xmm10
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm10, %xmm0, %xmm0
- vpblendw $15, %xmm3, %xmm4, %xmm10
- vpblendw $192, %xmm5, %xmm10, %xmm10
- vpshufd $57, %xmm0, %xmm0
- vpshufd $198, %xmm10, %xmm10
- vpaddd %xmm9, %xmm10, %xmm10
- vpaddd %xmm0, %xmm10, %xmm10
- vpxor %xmm10, %xmm1, %xmm1
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm13, %xmm13
- vpxor %xmm13, %xmm0, %xmm9
- vpslld $20, %xmm9, %xmm0
- vpsrld $12, %xmm9, %xmm9
- vpxor %xmm0, %xmm9, %xmm0
- vpunpckhdq %xmm2, %xmm3, %xmm9
- vpunpcklqdq %xmm12, %xmm9, %xmm15
- vpunpcklqdq %xmm12, %xmm8, %xmm12
- vpblendw $15, %xmm5, %xmm8, %xmm8
- vpaddd %xmm15, %xmm10, %xmm15
- vpaddd %xmm0, %xmm15, %xmm15
- vpxor %xmm15, %xmm1, %xmm1
- vpshufd $141, %xmm8, %xmm8
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm13, %xmm13
- vpxor %xmm13, %xmm0, %xmm0
- vpshufd $57, %xmm1, %xmm1
- vpshufd $78, %xmm13, %xmm13
- vpslld $25, %xmm0, %xmm10
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm10, %xmm0, %xmm0
- vpunpcklqdq %xmm2, %xmm3, %xmm10
- vpshufd $147, %xmm0, %xmm0
- vpblendw $51, %xmm14, %xmm10, %xmm14
- vpshufd $135, %xmm14, %xmm14
- vpaddd %xmm15, %xmm14, %xmm14
- vpaddd %xmm0, %xmm14, %xmm14
- vpxor %xmm14, %xmm1, %xmm1
- vpunpcklqdq %xmm3, %xmm4, %xmm15
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm13, %xmm13
- vpxor %xmm13, %xmm0, %xmm0
- vpslld $20, %xmm0, %xmm11
- vpsrld $12, %xmm0, %xmm0
- vpxor %xmm11, %xmm0, %xmm0
- vpunpckhqdq %xmm5, %xmm3, %xmm11
- vpblendw $51, %xmm15, %xmm11, %xmm11
- vpunpckhqdq %xmm3, %xmm5, %xmm15
- vpaddd %xmm11, %xmm14, %xmm11
- vpaddd %xmm0, %xmm11, %xmm11
- vpxor %xmm11, %xmm1, %xmm1
- vpshufb %xmm6, %xmm1, %xmm1
- vpaddd %xmm1, %xmm13, %xmm13
- vpxor %xmm13, %xmm0, %xmm0
- vpshufd $147, %xmm1, %xmm1
- vpshufd $78, %xmm13, %xmm13
- vpslld $25, %xmm0, %xmm14
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm14, %xmm0, %xmm14
- vpunpckhqdq %xmm4, %xmm2, %xmm0
- vpshufd $57, %xmm14, %xmm14
- vpblendw $51, %xmm15, %xmm0, %xmm15
- vpaddd %xmm15, %xmm11, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm1, %xmm1
- vpshufb %xmm7, %xmm1, %xmm1
- vpaddd %xmm1, %xmm13, %xmm13
- vpxor %xmm13, %xmm14, %xmm14
- vpslld $20, %xmm14, %xmm11
- vpsrld $12, %xmm14, %xmm14
- vpxor %xmm11, %xmm14, %xmm14
- vpblendw $3, %xmm2, %xmm4, %xmm11
- vpslldq $8, %xmm11, %xmm0
- vpblendw $15, %xmm5, %xmm0, %xmm0
- vpshufd $99, %xmm0, %xmm0
- vpaddd %xmm15, %xmm0, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm1, %xmm0
- vpaddd %xmm12, %xmm15, %xmm15
- vpshufb %xmm6, %xmm0, %xmm0
- vpaddd %xmm0, %xmm13, %xmm13
- vpxor %xmm13, %xmm14, %xmm14
- vpshufd $57, %xmm0, %xmm0
- vpshufd $78, %xmm13, %xmm13
- vpslld $25, %xmm14, %xmm1
- vpsrld $7, %xmm14, %xmm14
- vpxor %xmm1, %xmm14, %xmm14
- vpblendw $3, %xmm5, %xmm4, %xmm1
- vpshufd $147, %xmm14, %xmm14
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpshufb %xmm7, %xmm0, %xmm0
- vpaddd %xmm0, %xmm13, %xmm13
- vpxor %xmm13, %xmm14, %xmm14
- vpslld $20, %xmm14, %xmm12
- vpsrld $12, %xmm14, %xmm14
- vpxor %xmm12, %xmm14, %xmm14
- vpsrldq $4, %xmm2, %xmm12
- vpblendw $60, %xmm12, %xmm1, %xmm1
- vpaddd %xmm1, %xmm15, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpblendw $12, %xmm4, %xmm3, %xmm1
- vpshufb %xmm6, %xmm0, %xmm0
- vpaddd %xmm0, %xmm13, %xmm13
- vpxor %xmm13, %xmm14, %xmm14
- vpshufd $147, %xmm0, %xmm0
- vpshufd $78, %xmm13, %xmm13
- vpslld $25, %xmm14, %xmm12
- vpsrld $7, %xmm14, %xmm14
- vpxor %xmm12, %xmm14, %xmm14
- vpsrldq $4, %xmm5, %xmm12
- vpblendw $48, %xmm12, %xmm1, %xmm1
- vpshufd $33, %xmm5, %xmm12
- vpshufd $57, %xmm14, %xmm14
- vpshufd $108, %xmm1, %xmm1
- vpblendw $51, %xmm12, %xmm10, %xmm12
- vpaddd %xmm15, %xmm1, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpaddd %xmm12, %xmm15, %xmm15
- vpshufb %xmm7, %xmm0, %xmm0
- vpaddd %xmm0, %xmm13, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpslld $20, %xmm14, %xmm13
- vpsrld $12, %xmm14, %xmm14
- vpxor %xmm13, %xmm14, %xmm14
- vpslldq $12, %xmm3, %xmm13
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpshufb %xmm6, %xmm0, %xmm0
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpshufd $57, %xmm0, %xmm0
- vpshufd $78, %xmm1, %xmm1
- vpslld $25, %xmm14, %xmm12
- vpsrld $7, %xmm14, %xmm14
- vpxor %xmm12, %xmm14, %xmm14
- vpblendw $51, %xmm5, %xmm4, %xmm12
- vpshufd $147, %xmm14, %xmm14
- vpblendw $192, %xmm13, %xmm12, %xmm12
- vpaddd %xmm12, %xmm15, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpsrldq $4, %xmm3, %xmm12
- vpshufb %xmm7, %xmm0, %xmm0
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpslld $20, %xmm14, %xmm13
- vpsrld $12, %xmm14, %xmm14
- vpxor %xmm13, %xmm14, %xmm14
- vpblendw $48, %xmm2, %xmm5, %xmm13
- vpblendw $3, %xmm12, %xmm13, %xmm13
- vpshufd $156, %xmm13, %xmm13
- vpaddd %xmm15, %xmm13, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpshufb %xmm6, %xmm0, %xmm0
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpshufd $147, %xmm0, %xmm0
- vpshufd $78, %xmm1, %xmm1
- vpslld $25, %xmm14, %xmm13
- vpsrld $7, %xmm14, %xmm14
- vpxor %xmm13, %xmm14, %xmm14
- vpunpcklqdq %xmm2, %xmm4, %xmm13
- vpshufd $57, %xmm14, %xmm14
- vpblendw $12, %xmm12, %xmm13, %xmm12
- vpshufd $180, %xmm12, %xmm12
- vpaddd %xmm15, %xmm12, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpshufb %xmm7, %xmm0, %xmm0
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpslld $20, %xmm14, %xmm12
- vpsrld $12, %xmm14, %xmm14
- vpxor %xmm12, %xmm14, %xmm14
- vpunpckhqdq %xmm9, %xmm4, %xmm12
- vpshufd $198, %xmm12, %xmm12
- vpaddd %xmm15, %xmm12, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpaddd %xmm15, %xmm8, %xmm15
- vpshufb %xmm6, %xmm0, %xmm0
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpshufd $57, %xmm0, %xmm0
- vpshufd $78, %xmm1, %xmm1
- vpslld $25, %xmm14, %xmm12
- vpsrld $7, %xmm14, %xmm14
- vpxor %xmm12, %xmm14, %xmm14
- vpsrldq $4, %xmm4, %xmm12
- vpshufd $147, %xmm14, %xmm14
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm15, %xmm0, %xmm0
- vpshufb %xmm7, %xmm0, %xmm0
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpslld $20, %xmm14, %xmm8
- vpsrld $12, %xmm14, %xmm14
- vpxor %xmm14, %xmm8, %xmm14
- vpblendw $48, %xmm5, %xmm2, %xmm8
- vpblendw $3, %xmm12, %xmm8, %xmm8
- vpunpckhqdq %xmm5, %xmm4, %xmm12
- vpshufd $75, %xmm8, %xmm8
- vpblendw $60, %xmm10, %xmm12, %xmm10
- vpaddd %xmm15, %xmm8, %xmm15
- vpaddd %xmm14, %xmm15, %xmm15
- vpxor %xmm0, %xmm15, %xmm0
- vpshufd $45, %xmm10, %xmm10
- vpshufb %xmm6, %xmm0, %xmm0
- vpaddd %xmm15, %xmm10, %xmm15
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm1, %xmm14, %xmm14
- vpshufd $147, %xmm0, %xmm0
- vpshufd $78, %xmm1, %xmm1
- vpslld $25, %xmm14, %xmm8
- vpsrld $7, %xmm14, %xmm14
- vpxor %xmm14, %xmm8, %xmm8
- vpshufd $57, %xmm8, %xmm8
- vpaddd %xmm8, %xmm15, %xmm15
- vpxor %xmm0, %xmm15, %xmm0
- vpshufb %xmm7, %xmm0, %xmm0
- vpaddd %xmm0, %xmm1, %xmm1
- vpxor %xmm8, %xmm1, %xmm8
- vpslld $20, %xmm8, %xmm10
- vpsrld $12, %xmm8, %xmm8
- vpxor %xmm8, %xmm10, %xmm10
- vpunpckldq %xmm3, %xmm4, %xmm8
- vpunpcklqdq %xmm9, %xmm8, %xmm9
- vpaddd %xmm9, %xmm15, %xmm9
- vpaddd %xmm10, %xmm9, %xmm9
- vpxor %xmm0, %xmm9, %xmm8
- vpshufb %xmm6, %xmm8, %xmm8
- vpaddd %xmm8, %xmm1, %xmm1
- vpxor %xmm1, %xmm10, %xmm10
- vpshufd $57, %xmm8, %xmm8
- vpshufd $78, %xmm1, %xmm1
- vpslld $25, %xmm10, %xmm12
- vpsrld $7, %xmm10, %xmm10
- vpxor %xmm10, %xmm12, %xmm10
- vpblendw $48, %xmm4, %xmm3, %xmm12
- vpshufd $147, %xmm10, %xmm0
- vpunpckhdq %xmm5, %xmm3, %xmm10
- vpshufd $78, %xmm12, %xmm12
- vpunpcklqdq %xmm4, %xmm10, %xmm10
- vpblendw $192, %xmm2, %xmm10, %xmm10
- vpshufhw $78, %xmm10, %xmm10
- vpaddd %xmm10, %xmm9, %xmm10
- vpaddd %xmm0, %xmm10, %xmm10
- vpxor %xmm8, %xmm10, %xmm8
- vpshufb %xmm7, %xmm8, %xmm8
- vpaddd %xmm8, %xmm1, %xmm1
- vpxor %xmm0, %xmm1, %xmm9
- vpslld $20, %xmm9, %xmm0
- vpsrld $12, %xmm9, %xmm9
- vpxor %xmm9, %xmm0, %xmm0
- vpunpckhdq %xmm5, %xmm4, %xmm9
- vpblendw $240, %xmm9, %xmm2, %xmm13
- vpshufd $39, %xmm13, %xmm13
- vpaddd %xmm10, %xmm13, %xmm10
- vpaddd %xmm0, %xmm10, %xmm10
- vpxor %xmm8, %xmm10, %xmm8
- vpblendw $12, %xmm4, %xmm2, %xmm13
- vpshufb %xmm6, %xmm8, %xmm8
- vpslldq $4, %xmm13, %xmm13
- vpblendw $15, %xmm5, %xmm13, %xmm13
- vpaddd %xmm8, %xmm1, %xmm1
- vpxor %xmm1, %xmm0, %xmm0
- vpaddd %xmm13, %xmm10, %xmm13
- vpshufd $147, %xmm8, %xmm8
- vpshufd $78, %xmm1, %xmm1
- vpslld $25, %xmm0, %xmm14
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm0, %xmm14, %xmm14
- vpshufd $57, %xmm14, %xmm14
- vpaddd %xmm14, %xmm13, %xmm13
- vpxor %xmm8, %xmm13, %xmm8
- vpaddd %xmm13, %xmm12, %xmm12
- vpshufb %xmm7, %xmm8, %xmm8
- vpaddd %xmm8, %xmm1, %xmm1
- vpxor %xmm14, %xmm1, %xmm14
- vpslld $20, %xmm14, %xmm10
- vpsrld $12, %xmm14, %xmm14
- vpxor %xmm14, %xmm10, %xmm10
- vpaddd %xmm10, %xmm12, %xmm12
- vpxor %xmm8, %xmm12, %xmm8
- vpshufb %xmm6, %xmm8, %xmm8
- vpaddd %xmm8, %xmm1, %xmm1
- vpxor %xmm1, %xmm10, %xmm0
- vpshufd $57, %xmm8, %xmm8
- vpshufd $78, %xmm1, %xmm1
- vpslld $25, %xmm0, %xmm10
- vpsrld $7, %xmm0, %xmm0
- vpxor %xmm0, %xmm10, %xmm10
- vpblendw $48, %xmm2, %xmm3, %xmm0
- vpblendw $15, %xmm11, %xmm0, %xmm0
- vpshufd $147, %xmm10, %xmm10
- vpshufd $114, %xmm0, %xmm0
- vpaddd %xmm12, %xmm0, %xmm0
- vpaddd %xmm10, %xmm0, %xmm0
- vpxor %xmm8, %xmm0, %xmm8
- vpshufb %xmm7, %xmm8, %xmm8
- vpaddd %xmm8, %xmm1, %xmm1
- vpxor %xmm10, %xmm1, %xmm10
- vpslld $20, %xmm10, %xmm11
- vpsrld $12, %xmm10, %xmm10
- vpxor %xmm10, %xmm11, %xmm10
- vpslldq $4, %xmm4, %xmm11
- vpblendw $192, %xmm11, %xmm3, %xmm3
- vpunpckldq %xmm5, %xmm4, %xmm4
- vpshufd $99, %xmm3, %xmm3
- vpaddd %xmm0, %xmm3, %xmm3
- vpaddd %xmm10, %xmm3, %xmm3
- vpxor %xmm8, %xmm3, %xmm11
- vpunpckldq %xmm5, %xmm2, %xmm0
- vpblendw $192, %xmm2, %xmm5, %xmm2
- vpshufb %xmm6, %xmm11, %xmm11
- vpunpckhqdq %xmm0, %xmm9, %xmm0
- vpblendw $15, %xmm4, %xmm2, %xmm4
- vpaddd %xmm11, %xmm1, %xmm1
- vpxor %xmm1, %xmm10, %xmm10
- vpshufd $147, %xmm11, %xmm11
- vpshufd $201, %xmm0, %xmm0
- vpslld $25, %xmm10, %xmm8
- vpsrld $7, %xmm10, %xmm10
- vpxor %xmm10, %xmm8, %xmm10
- vpshufd $78, %xmm1, %xmm1
- vpaddd %xmm3, %xmm0, %xmm0
- vpshufd $27, %xmm4, %xmm4
- vpshufd $57, %xmm10, %xmm10
- vpaddd %xmm10, %xmm0, %xmm0
- vpxor %xmm11, %xmm0, %xmm11
- vpaddd %xmm0, %xmm4, %xmm0
- vpshufb %xmm7, %xmm11, %xmm7
- vpaddd %xmm7, %xmm1, %xmm1
- vpxor %xmm10, %xmm1, %xmm10
- vpslld $20, %xmm10, %xmm8
- vpsrld $12, %xmm10, %xmm10
- vpxor %xmm10, %xmm8, %xmm8
- vpaddd %xmm8, %xmm0, %xmm0
- vpxor %xmm7, %xmm0, %xmm7
- vpshufb %xmm6, %xmm7, %xmm6
- vpaddd %xmm6, %xmm1, %xmm1
- vpxor %xmm1, %xmm8, %xmm8
- vpshufd $78, %xmm1, %xmm1
- vpshufd $57, %xmm6, %xmm6
- vpslld $25, %xmm8, %xmm2
- vpsrld $7, %xmm8, %xmm8
- vpxor %xmm8, %xmm2, %xmm8
- vpxor (%rdi), %xmm1, %xmm1
- vpshufd $147, %xmm8, %xmm8
- vpxor %xmm0, %xmm1, %xmm0
- vmovups %xmm0, (%rdi)
- vpxor 16(%rdi), %xmm8, %xmm0
- vpxor %xmm6, %xmm0, %xmm6
- vmovups %xmm6, 16(%rdi)
- addq $64, %rsi
- decq %rdx
- jnz .Lbeginofloop
+ movdqa %xmm0,%xmm10
+ movdqa %xmm1,%xmm11
+ paddq %xmm15,%xmm14
+ movdqa IV(%rip),%xmm2
+ movdqa %xmm14,%xmm3
+ pxor IV+0x10(%rip),%xmm3
+ movl 0x8(%rsi),%r8d
+ movl 0x18(%rsi),%r9d
+ movl (%rsi),%r10d
+ movl 0x10(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0xc(%rsi),%r8d
+ movl 0x1c(%rsi),%r9d
+ movl 0x4(%rsi),%r10d
+ movl 0x14(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x20(%rsi),%r8d
+ movl 0x30(%rsi),%r9d
+ movl 0x38(%rsi),%r10d
+ movl 0x28(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x24(%rsi),%r8d
+ movl 0x34(%rsi),%r9d
+ movl 0x3c(%rsi),%r10d
+ movl 0x2c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x10(%rsi),%r8d
+ movl 0x34(%rsi),%r9d
+ movl 0x38(%rsi),%r10d
+ movl 0x24(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x20(%rsi),%r8d
+ movl 0x18(%rsi),%r9d
+ movl 0x28(%rsi),%r10d
+ movl 0x3c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x4(%rsi),%r8d
+ movl 0x2c(%rsi),%r9d
+ movl 0x14(%rsi),%r10d
+ movl (%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x30(%rsi),%r8d
+ movl 0x1c(%rsi),%r9d
+ movl 0xc(%rsi),%r10d
+ movl 0x8(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x30(%rsi),%r8d
+ movl 0x3c(%rsi),%r9d
+ movl 0x2c(%rsi),%r10d
+ movl 0x14(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl (%rsi),%r8d
+ movl 0x34(%rsi),%r9d
+ movl 0x20(%rsi),%r10d
+ movl 0x8(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x28(%rsi),%r8d
+ movl 0x1c(%rsi),%r9d
+ movl 0x24(%rsi),%r10d
+ movl 0xc(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x38(%rsi),%r8d
+ movl 0x4(%rsi),%r9d
+ movl 0x10(%rsi),%r10d
+ movl 0x18(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0xc(%rsi),%r8d
+ movl 0x2c(%rsi),%r9d
+ movl 0x1c(%rsi),%r10d
+ movl 0x34(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x4(%rsi),%r8d
+ movl 0x38(%rsi),%r9d
+ movl 0x24(%rsi),%r10d
+ movl 0x30(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x8(%rsi),%r8d
+ movl 0x10(%rsi),%r9d
+ movl 0x3c(%rsi),%r10d
+ movl 0x14(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x18(%rsi),%r8d
+ movl (%rsi),%r9d
+ movl 0x20(%rsi),%r10d
+ movl 0x28(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x14(%rsi),%r8d
+ movl 0x28(%rsi),%r9d
+ movl 0x24(%rsi),%r10d
+ movl 0x8(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x1c(%rsi),%r8d
+ movl 0x3c(%rsi),%r9d
+ movl (%rsi),%r10d
+ movl 0x10(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x38(%rsi),%r8d
+ movl 0x18(%rsi),%r9d
+ movl 0xc(%rsi),%r10d
+ movl 0x2c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x4(%rsi),%r8d
+ movl 0x20(%rsi),%r9d
+ movl 0x34(%rsi),%r10d
+ movl 0x30(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x18(%rsi),%r8d
+ movl 0x20(%rsi),%r9d
+ movl 0x8(%rsi),%r10d
+ movl (%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x28(%rsi),%r8d
+ movl 0xc(%rsi),%r9d
+ movl 0x30(%rsi),%r10d
+ movl 0x2c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x10(%rsi),%r8d
+ movl 0x3c(%rsi),%r9d
+ movl 0x4(%rsi),%r10d
+ movl 0x1c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x34(%rsi),%r8d
+ movl 0x38(%rsi),%r9d
+ movl 0x24(%rsi),%r10d
+ movl 0x14(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x4(%rsi),%r8d
+ movl 0x10(%rsi),%r9d
+ movl 0x30(%rsi),%r10d
+ movl 0x38(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x3c(%rsi),%r8d
+ movl 0x28(%rsi),%r9d
+ movl 0x14(%rsi),%r10d
+ movl 0x34(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl (%rsi),%r8d
+ movl 0x24(%rsi),%r9d
+ movl 0x20(%rsi),%r10d
+ movl 0x18(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x1c(%rsi),%r8d
+ movl 0x8(%rsi),%r9d
+ movl 0x2c(%rsi),%r10d
+ movl 0xc(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x1c(%rsi),%r8d
+ movl 0xc(%rsi),%r9d
+ movl 0x34(%rsi),%r10d
+ movl 0x30(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x38(%rsi),%r8d
+ movl 0x24(%rsi),%r9d
+ movl 0x2c(%rsi),%r10d
+ movl 0x4(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x14(%rsi),%r8d
+ movl 0x20(%rsi),%r9d
+ movl 0x8(%rsi),%r10d
+ movl 0x3c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl (%rsi),%r8d
+ movl 0x18(%rsi),%r9d
+ movl 0x28(%rsi),%r10d
+ movl 0x10(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x38(%rsi),%r8d
+ movl (%rsi),%r9d
+ movl 0x18(%rsi),%r10d
+ movl 0x2c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x24(%rsi),%r8d
+ movl 0x20(%rsi),%r9d
+ movl 0x3c(%rsi),%r10d
+ movl 0xc(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x30(%rsi),%r8d
+ movl 0x4(%rsi),%r9d
+ movl 0x28(%rsi),%r10d
+ movl 0x34(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x8(%rsi),%r8d
+ movl 0x10(%rsi),%r9d
+ movl 0x14(%rsi),%r10d
+ movl 0x1c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ movl 0x20(%rsi),%r8d
+ movl 0x4(%rsi),%r9d
+ movl 0x28(%rsi),%r10d
+ movl 0x1c(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm4
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm4
+ paddd %xmm4,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x10(%rsi),%r8d
+ movl 0x14(%rsi),%r9d
+ movl 0x8(%rsi),%r10d
+ movl 0x18(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm5
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm5
+ paddd %xmm5,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x93,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x39,%xmm2,%xmm2
+ movl 0x3c(%rsi),%r8d
+ movl 0xc(%rsi),%r9d
+ movl 0x34(%rsi),%r10d
+ movl 0x24(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm6
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm6
+ paddd %xmm6,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm12,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0xc,%xmm1
+ pslld $0x14,%xmm8
+ por %xmm8,%xmm1
+ movl 0x2c(%rsi),%r8d
+ movl 0x30(%rsi),%r9d
+ movl (%rsi),%r10d
+ movl 0x38(%rsi),%r11d
+ shlq $0x20,%r8
+ shlq $0x20,%r9
+ orq %r10,%r8
+ orq %r11,%r9
+ movq %r8,%xmm7
+ movq %r9,%xmm8
+ punpcklqdq %xmm8,%xmm7
+ paddd %xmm7,%xmm0
+ paddd %xmm1,%xmm0
+ pxor %xmm0,%xmm3
+ pshufb %xmm13,%xmm3
+ paddd %xmm3,%xmm2
+ pxor %xmm2,%xmm1
+ movdqa %xmm1,%xmm8
+ psrld $0x7,%xmm1
+ pslld $0x19,%xmm8
+ por %xmm8,%xmm1
+ pshufd $0x39,%xmm0,%xmm0
+ pshufd $0x4e,%xmm3,%xmm3
+ pshufd $0x93,%xmm2,%xmm2
+ pxor %xmm2,%xmm0
+ pxor %xmm3,%xmm1
+ pxor %xmm10,%xmm0
+ pxor %xmm11,%xmm1
+ addq $0x40,%rsi
+ decq %rdx
+ jnz .Lbeginofloop
+ movdqu %xmm0,(%rdi)
+ movdqu %xmm1,0x10(%rdi)
+ movdqu %xmm14,0x20(%rdi)
.Lendofloop:
ret
-ENDPROC(blake2s_compress_avx)
-#endif /* CONFIG_AS_AVX */
+ENDPROC(blake2s_compress_ssse3)
+#endif /* CONFIG_AS_SSSE3 */
#ifdef CONFIG_AS_AVX512
ENTRY(blake2s_compress_avx512)
@@ -647,9 +1011,9 @@ ENTRY(blake2s_compress_avx512)
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0x7,%xmm1,%xmm1
- vpshufd $0x39,%xmm1,%xmm1
- vpshufd $0x4e,%xmm2,%xmm2
- vpshufd $0x93,%xmm3,%xmm3
+ vpshufd $0x93,%xmm0,%xmm0
+ vpshufd $0x4e,%xmm3,%xmm3
+ vpshufd $0x39,%xmm2,%xmm2
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
@@ -665,9 +1029,9 @@ ENTRY(blake2s_compress_avx512)
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $0x7,%xmm1,%xmm1
- vpshufd $0x93,%xmm1,%xmm1
- vpshufd $0x4e,%xmm2,%xmm2
- vpshufd $0x39,%xmm3,%xmm3
+ vpshufd $0x39,%xmm0,%xmm0
+ vpshufd $0x4e,%xmm3,%xmm3
+ vpshufd $0x93,%xmm2,%xmm2
decb %cl
jne .Lblake2s_compress_avx512_roundloop
vpxor %xmm10,%xmm0,%xmm0
diff --git a/src/crypto/zinc/chacha20/chacha20-arm.pl b/src/crypto/zinc/chacha20/chacha20-arm.pl
index 6a7d62c..6785383 100644
--- a/src/crypto/zinc/chacha20/chacha20-arm.pl
+++ b/src/crypto/zinc/chacha20/chacha20-arm.pl
@@ -686,9 +686,9 @@ my ($a,$b,$c,$d,$t)=@_;
"&vshr_u32 ($b,$t,25)",
"&vsli_32 ($b,$t,7)",
- "&vext_8 ($c,$c,$c,8)",
- "&vext_8 ($b,$b,$b,$odd?12:4)",
- "&vext_8 ($d,$d,$d,$odd?4:12)"
+ "&vext_8 ($a,$a,$a,$odd?4:12)",
+ "&vext_8 ($d,$d,$d,8)",
+ "&vext_8 ($c,$c,$c,$odd?12:4)"
);
}
diff --git a/src/crypto/zinc/chacha20/chacha20-arm64.pl b/src/crypto/zinc/chacha20/chacha20-arm64.pl
index fc63cc8..ac14a99 100644
--- a/src/crypto/zinc/chacha20/chacha20-arm64.pl
+++ b/src/crypto/zinc/chacha20/chacha20-arm64.pl
@@ -378,9 +378,9 @@ my ($a,$b,$c,$d,$t)=@_;
"&ushr ('$b','$t',25)",
"&sli ('$b','$t',7)",
- "&ext ('$c','$c','$c',8)",
- "&ext ('$d','$d','$d',$odd?4:12)",
- "&ext ('$b','$b','$b',$odd?12:4)"
+ "&ext ('$a','$a','$a',$odd?4:12)",
+ "&ext ('$d','$d','$d',8)",
+ "&ext ('$c','$c','$c',$odd?12:4)"
);
}
diff --git a/src/crypto/zinc/chacha20/chacha20-x86_64.pl b/src/crypto/zinc/chacha20/chacha20-x86_64.pl
index 38532f8..116c16e 100644
--- a/src/crypto/zinc/chacha20/chacha20-x86_64.pl
+++ b/src/crypto/zinc/chacha20/chacha20-x86_64.pl
@@ -525,15 +525,15 @@ $code.=<<___;
1:
___
&SSSE3ROUND();
- &pshufd ($c,$c,0b01001110);
- &pshufd ($b,$b,0b00111001);
- &pshufd ($d,$d,0b10010011);
+ &pshufd ($a,$a,0b10010011);
+ &pshufd ($d,$d,0b01001110);
+ &pshufd ($c,$c,0b00111001);
&nop ();
&SSSE3ROUND();
- &pshufd ($c,$c,0b01001110);
- &pshufd ($b,$b,0b10010011);
- &pshufd ($d,$d,0b00111001);
+ &pshufd ($a,$a,0b00111001);
+ &pshufd ($d,$d,0b01001110);
+ &pshufd ($c,$c,0b10010011);
&dec ($counter);
&jnz ("1b");
@@ -600,15 +600,15 @@ $code.=<<___;
.Loop_ssse3:
___
&SSSE3ROUND();
- &pshufd ($c,$c,0b01001110);
- &pshufd ($b,$b,0b00111001);
- &pshufd ($d,$d,0b10010011);
+ &pshufd ($a,$a,0b10010011);
+ &pshufd ($d,$d,0b01001110);
+ &pshufd ($c,$c,0b00111001);
&nop ();
&SSSE3ROUND();
- &pshufd ($c,$c,0b01001110);
- &pshufd ($b,$b,0b10010011);
- &pshufd ($d,$d,0b00111001);
+ &pshufd ($a,$a,0b00111001);
+ &pshufd ($d,$d,0b01001110);
+ &pshufd ($c,$c,0b10010011);
&dec ($counter);
&jnz (".Loop_ssse3");
@@ -770,20 +770,20 @@ $code.=<<___;
.Loop_128:
___
&SSSE3ROUND_2x();
- &pshufd ($c,$c,0b01001110);
- &pshufd ($b,$b,0b00111001);
- &pshufd ($d,$d,0b10010011);
- &pshufd ($c1,$c1,0b01001110);
- &pshufd ($b1,$b1,0b00111001);
- &pshufd ($d1,$d1,0b10010011);
+ &pshufd ($a,$a,0b10010011);
+ &pshufd ($d,$d,0b01001110);
+ &pshufd ($c,$c,0b00111001);
+ &pshufd ($a1,$a1,0b10010011);
+ &pshufd ($d1,$d1,0b01001110);
+ &pshufd ($c1,$c1,0b00111001);
&SSSE3ROUND_2x();
- &pshufd ($c,$c,0b01001110);
- &pshufd ($b,$b,0b10010011);
- &pshufd ($d,$d,0b00111001);
- &pshufd ($c1,$c1,0b01001110);
- &pshufd ($b1,$b1,0b10010011);
- &pshufd ($d1,$d1,0b00111001);
+ &pshufd ($a,$a,0b00111001);
+ &pshufd ($d,$d,0b01001110);
+ &pshufd ($c,$c,0b10010011);
+ &pshufd ($a1,$a1,0b00111001);
+ &pshufd ($d1,$d1,0b01001110);
+ &pshufd ($c1,$c1,0b10010011);
&dec ($counter);
&jnz (".Loop_128");