From 22bbac4d2ffb62f28b0483f05f24a0f41639b787 Mon Sep 17 00:00:00 2001
From: Samuel Neves
Date: Sat, 4 May 2019 17:14:09 +0100
Subject: blake2s,chacha: latency tweak

In every odd-numbered round, instead of operating over the state

    x00 x01 x02 x03
    x05 x06 x07 x04
    x10 x11 x08 x09
    x15 x12 x13 x14

we operate over the rotated state

    x03 x00 x01 x02
    x04 x05 x06 x07
    x09 x10 x11 x08
    x14 x15 x12 x13

The advantage here is that this requires no changes to the 'x04 x05 x06
x07' row, which is in the critical path. This results in a noticeable
latency improvement of roughly R cycles, for R diagonal rounds in the
primitive.

In the case of BLAKE2s, which I also moved from requiring AVX to only
requiring SSSE3, we save approximately 30 cycles per compression
function call on Haswell and Skylake. In other words, this is an
improvement of ~0.6 cpb.

This idea was pointed out to me by Shunsuke Shimizu, though it appears
to have been around for longer.

Signed-off-by: Samuel Neves
---
 src/crypto/zinc/blake2s/blake2s-x86_64-glue.c |   14 +-
 src/crypto/zinc/blake2s/blake2s-x86_64.S      | 1526 +++++++++++++++----------
 src/crypto/zinc/chacha20/chacha20-arm.pl      |    6 +-
 src/crypto/zinc/chacha20/chacha20-arm64.pl    |    6 +-
 src/crypto/zinc/chacha20/chacha20-x86_64.pl   |   48 +-
 5 files changed, 982 insertions(+), 618 deletions(-)

(limited to 'src/crypto')

diff --git a/src/crypto/zinc/blake2s/blake2s-x86_64-glue.c b/src/crypto/zinc/blake2s/blake2s-x86_64-glue.c
index 9a956be..087a48d 100644
--- a/src/crypto/zinc/blake2s/blake2s-x86_64-glue.c
+++ b/src/crypto/zinc/blake2s/blake2s-x86_64-glue.c
@@ -8,22 +8,22 @@
 #include
 #include
-asmlinkage void blake2s_compress_avx(struct blake2s_state *state,
+asmlinkage void blake2s_compress_ssse3(struct blake2s_state *state,
				       const u8 *block, const size_t nblocks,
				       const u32 inc);
 asmlinkage void blake2s_compress_avx512(struct blake2s_state *state,
					const u8 *block, const size_t nblocks,
					const u32 inc);
-static bool blake2s_use_avx __ro_after_init;
+static bool blake2s_use_ssse3 __ro_after_init;
 static bool blake2s_use_avx512 __ro_after_init;
 static bool *const blake2s_nobs[] __initconst = { &blake2s_use_avx512 };
 static void __init blake2s_fpu_init(void)
 {
-	blake2s_use_avx =
-		boot_cpu_has(X86_FEATURE_AVX) &&
-		cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+	blake2s_use_ssse3 =
+		boot_cpu_has(X86_FEATURE_SSSE3) &&
+		cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL);
 #ifndef COMPAT_CANNOT_USE_AVX512
	blake2s_use_avx512 =
		boot_cpu_has(X86_FEATURE_AVX) &&
@@ -47,7 +47,7 @@ static inline bool blake2s_compress_arch(struct blake2s_state *state,
	simd_get(&simd_context);
-	if (!IS_ENABLED(CONFIG_AS_AVX) || !blake2s_use_avx ||
+	if (!IS_ENABLED(CONFIG_AS_SSSE3) || !blake2s_use_ssse3 ||
	    !simd_use(&simd_context))
		goto out;
	used_arch = true;
@@ -59,7 +59,7 @@ static inline bool blake2s_compress_arch(struct blake2s_state *state,
		if (IS_ENABLED(CONFIG_AS_AVX512) && blake2s_use_avx512)
			blake2s_compress_avx512(state, block, blocks, inc);
		else
-			blake2s_compress_avx(state, block, blocks, inc);
+			blake2s_compress_ssse3(state, block, blocks, inc);
		nblocks -= blocks;
		if (!nblocks)
diff --git a/src/crypto/zinc/blake2s/blake2s-x86_64.S b/src/crypto/zinc/blake2s/blake2s-x86_64.S
index 675288f..9bb4c83 100644
--- a/src/crypto/zinc/blake2s/blake2s-x86_64.S
+++ b/src/crypto/zinc/blake2s/blake2s-x86_64.S
@@ -20,588 +20,952 @@ ROR328: .octa 0x0C0F0E0D080B0A090407060500030201
 .section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 640
 .align 64
 SIGMA:
-.long 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15
-.long
11, 2, 12, 14, 9, 8, 15, 3, 4, 0, 13, 6, 10, 1, 7, 5 -.long 10, 12, 11, 6, 5, 9, 13, 3, 4, 15, 14, 2, 0, 7, 8, 1 -.long 10, 9, 7, 0, 11, 14, 1, 12, 6, 2, 15, 3, 13, 8, 5, 4 -.long 4, 9, 8, 13, 14, 0, 10, 11, 7, 3, 12, 1, 5, 6, 15, 2 -.long 2, 10, 4, 14, 13, 3, 9, 11, 6, 5, 7, 12, 15, 1, 8, 0 -.long 4, 11, 14, 8, 13, 10, 12, 5, 2, 1, 15, 3, 9, 7, 0, 6 -.long 6, 12, 0, 13, 15, 2, 1, 10, 4, 5, 11, 14, 8, 3, 9, 7 -.long 14, 5, 4, 12, 9, 7, 3, 10, 2, 0, 6, 15, 11, 1, 13, 8 -.long 11, 7, 13, 10, 12, 14, 0, 15, 4, 5, 6, 9, 2, 1, 8, 3 +.long 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13 +.long 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7 +.long 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9 +.long 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5 +.long 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12 +.long 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9 +.long 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0 +.long 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10 +.long 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14 +.long 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9 #endif /* CONFIG_AS_AVX512 */ .text -#ifdef CONFIG_AS_AVX -ENTRY(blake2s_compress_avx) - movl %ecx, %ecx - testq %rdx, %rdx - je .Lendofloop +#ifdef CONFIG_AS_SSSE3 +ENTRY(blake2s_compress_ssse3) + testq %rdx, %rdx + je .Lendofloop + movdqu (%rdi),%xmm0 + movdqu 0x10(%rdi),%xmm1 + movdqa ROT16(%rip),%xmm12 + movdqa ROR328(%rip),%xmm13 + movdqu 0x20(%rdi),%xmm14 + movq %rcx,%xmm15 + jmp .Lbeginofloop .align 32 .Lbeginofloop: - addq %rcx, 32(%rdi) - vmovdqu IV+16(%rip), %xmm1 - vmovdqu (%rsi), %xmm4 - vpxor 32(%rdi), %xmm1, %xmm1 - vmovdqu 16(%rsi), %xmm3 - vshufps $136, %xmm3, %xmm4, %xmm6 - vmovdqa ROT16(%rip), %xmm7 - vpaddd (%rdi), %xmm6, %xmm6 - vpaddd 16(%rdi), %xmm6, %xmm6 - vpxor %xmm6, %xmm1, %xmm1 - vmovdqu IV(%rip), %xmm8 - vpshufb %xmm7, %xmm1, %xmm1 - vmovdqu 48(%rsi), %xmm5 - vpaddd %xmm1, %xmm8, %xmm8 - vpxor 16(%rdi), %xmm8, %xmm9 - vmovdqu 32(%rsi), %xmm2 - vpblendw $12, %xmm3, %xmm5, %xmm13 - vshufps $221, %xmm5, %xmm2, %xmm12 - vpunpckhqdq %xmm2, %xmm4, %xmm14 - vpslld $20, %xmm9, %xmm0 - vpsrld $12, %xmm9, %xmm9 - vpxor %xmm0, %xmm9, %xmm0 - vshufps $221, %xmm3, %xmm4, %xmm9 - vpaddd %xmm9, %xmm6, %xmm9 - vpaddd %xmm0, %xmm9, %xmm9 - vpxor %xmm9, %xmm1, %xmm1 - vmovdqa ROR328(%rip), %xmm6 - vpshufb %xmm6, %xmm1, %xmm1 - vpaddd %xmm1, %xmm8, %xmm8 - vpxor %xmm8, %xmm0, %xmm0 - vpshufd $147, %xmm1, %xmm1 - vpshufd $78, %xmm8, %xmm8 - vpslld $25, %xmm0, %xmm10 - vpsrld $7, %xmm0, %xmm0 - vpxor %xmm10, %xmm0, %xmm0 - vshufps $136, %xmm5, %xmm2, %xmm10 - vpshufd $57, %xmm0, %xmm0 - vpaddd %xmm10, %xmm9, %xmm9 - vpaddd %xmm0, %xmm9, %xmm9 - vpxor %xmm9, %xmm1, %xmm1 - vpaddd %xmm12, %xmm9, %xmm9 - vpblendw $12, %xmm2, %xmm3, %xmm12 - vpshufb %xmm7, %xmm1, %xmm1 - vpaddd %xmm1, %xmm8, %xmm8 - vpxor %xmm8, %xmm0, %xmm10 - vpslld $20, %xmm10, %xmm0 - vpsrld $12, %xmm10, %xmm10 - vpxor %xmm0, %xmm10, %xmm0 - vpaddd %xmm0, %xmm9, %xmm9 - vpxor %xmm9, %xmm1, %xmm1 - vpshufb %xmm6, %xmm1, %xmm1 - vpaddd %xmm1, %xmm8, %xmm8 - vpxor %xmm8, %xmm0, %xmm0 - vpshufd $57, %xmm1, %xmm1 - vpshufd $78, %xmm8, %xmm8 - vpslld $25, %xmm0, %xmm10 - vpsrld $7, %xmm0, %xmm0 - vpxor %xmm10, %xmm0, %xmm0 - vpslldq $4, %xmm5, %xmm10 - vpblendw $240, %xmm10, %xmm12, %xmm12 - vpshufd $147, %xmm0, %xmm0 - vpshufd $147, %xmm12, %xmm12 - vpaddd %xmm9, %xmm12, %xmm12 - vpaddd %xmm0, %xmm12, %xmm12 - vpxor %xmm12, %xmm1, %xmm1 - vpshufb %xmm7, %xmm1, %xmm1 - vpaddd %xmm1, %xmm8, %xmm8 - vpxor %xmm8, 
%xmm0, %xmm11 - vpslld $20, %xmm11, %xmm9 - vpsrld $12, %xmm11, %xmm11 - vpxor %xmm9, %xmm11, %xmm0 - vpshufd $8, %xmm2, %xmm9 - vpblendw $192, %xmm5, %xmm3, %xmm11 - vpblendw $240, %xmm11, %xmm9, %xmm9 - vpshufd $177, %xmm9, %xmm9 - vpaddd %xmm12, %xmm9, %xmm9 - vpaddd %xmm0, %xmm9, %xmm11 - vpxor %xmm11, %xmm1, %xmm1 - vpshufb %xmm6, %xmm1, %xmm1 - vpaddd %xmm1, %xmm8, %xmm8 - vpxor %xmm8, %xmm0, %xmm9 - vpshufd $147, %xmm1, %xmm1 - vpshufd $78, %xmm8, %xmm8 - vpslld $25, %xmm9, %xmm0 - vpsrld $7, %xmm9, %xmm9 - vpxor %xmm0, %xmm9, %xmm0 - vpslldq $4, %xmm3, %xmm9 - vpblendw $48, %xmm9, %xmm2, %xmm9 - vpblendw $240, %xmm9, %xmm4, %xmm9 - vpshufd $57, %xmm0, %xmm0 - vpshufd $177, %xmm9, %xmm9 - vpaddd %xmm11, %xmm9, %xmm9 - vpaddd %xmm0, %xmm9, %xmm9 - vpxor %xmm9, %xmm1, %xmm1 - vpshufb %xmm7, %xmm1, %xmm1 - vpaddd %xmm1, %xmm8, %xmm11 - vpxor %xmm11, %xmm0, %xmm0 - vpslld $20, %xmm0, %xmm8 - vpsrld $12, %xmm0, %xmm0 - vpxor %xmm8, %xmm0, %xmm0 - vpunpckhdq %xmm3, %xmm4, %xmm8 - vpblendw $12, %xmm10, %xmm8, %xmm12 - vpshufd $177, %xmm12, %xmm12 - vpaddd %xmm9, %xmm12, %xmm9 - vpaddd %xmm0, %xmm9, %xmm9 - vpxor %xmm9, %xmm1, %xmm1 - vpshufb %xmm6, %xmm1, %xmm1 - vpaddd %xmm1, %xmm11, %xmm11 - vpxor %xmm11, %xmm0, %xmm0 - vpshufd $57, %xmm1, %xmm1 - vpshufd $78, %xmm11, %xmm11 - vpslld $25, %xmm0, %xmm12 - vpsrld $7, %xmm0, %xmm0 - vpxor %xmm12, %xmm0, %xmm0 - vpunpckhdq %xmm5, %xmm2, %xmm12 - vpshufd $147, %xmm0, %xmm0 - vpblendw $15, %xmm13, %xmm12, %xmm12 - vpslldq $8, %xmm5, %xmm13 - vpshufd $210, %xmm12, %xmm12 - vpaddd %xmm9, %xmm12, %xmm9 - vpaddd %xmm0, %xmm9, %xmm9 - vpxor %xmm9, %xmm1, %xmm1 - vpshufb %xmm7, %xmm1, %xmm1 - vpaddd %xmm1, %xmm11, %xmm11 - vpxor %xmm11, %xmm0, %xmm0 - vpslld $20, %xmm0, %xmm12 - vpsrld $12, %xmm0, %xmm0 - vpxor %xmm12, %xmm0, %xmm0 - vpunpckldq %xmm4, %xmm2, %xmm12 - vpblendw $240, %xmm4, %xmm12, %xmm12 - vpblendw $192, %xmm13, %xmm12, %xmm12 - vpsrldq $12, %xmm3, %xmm13 - vpaddd %xmm12, %xmm9, %xmm9 - vpaddd %xmm0, %xmm9, %xmm9 - vpxor %xmm9, %xmm1, %xmm1 - vpshufb %xmm6, %xmm1, %xmm1 - vpaddd %xmm1, %xmm11, %xmm11 - vpxor %xmm11, %xmm0, %xmm0 - vpshufd $147, %xmm1, %xmm1 - vpshufd $78, %xmm11, %xmm11 - vpslld $25, %xmm0, %xmm12 - vpsrld $7, %xmm0, %xmm0 - vpxor %xmm12, %xmm0, %xmm0 - vpblendw $60, %xmm2, %xmm4, %xmm12 - vpblendw $3, %xmm13, %xmm12, %xmm12 - vpshufd $57, %xmm0, %xmm0 - vpshufd $78, %xmm12, %xmm12 - vpaddd %xmm9, %xmm12, %xmm9 - vpaddd %xmm0, %xmm9, %xmm9 - vpxor %xmm9, %xmm1, %xmm1 - vpshufb %xmm7, %xmm1, %xmm1 - vpaddd %xmm1, %xmm11, %xmm11 - vpxor %xmm11, %xmm0, %xmm12 - vpslld $20, %xmm12, %xmm13 - vpsrld $12, %xmm12, %xmm0 - vpblendw $51, %xmm3, %xmm4, %xmm12 - vpxor %xmm13, %xmm0, %xmm0 - vpblendw $192, %xmm10, %xmm12, %xmm10 - vpslldq $8, %xmm2, %xmm12 - vpshufd $27, %xmm10, %xmm10 - vpaddd %xmm9, %xmm10, %xmm9 - vpaddd %xmm0, %xmm9, %xmm9 - vpxor %xmm9, %xmm1, %xmm1 - vpshufb %xmm6, %xmm1, %xmm1 - vpaddd %xmm1, %xmm11, %xmm11 - vpxor %xmm11, %xmm0, %xmm0 - vpshufd $57, %xmm1, %xmm1 - vpshufd $78, %xmm11, %xmm11 - vpslld $25, %xmm0, %xmm10 - vpsrld $7, %xmm0, %xmm0 - vpxor %xmm10, %xmm0, %xmm0 - vpunpckhdq %xmm2, %xmm8, %xmm10 - vpshufd $147, %xmm0, %xmm0 - vpblendw $12, %xmm5, %xmm10, %xmm10 - vpshufd $210, %xmm10, %xmm10 - vpaddd %xmm9, %xmm10, %xmm9 - vpaddd %xmm0, %xmm9, %xmm9 - vpxor %xmm9, %xmm1, %xmm1 - vpshufb %xmm7, %xmm1, %xmm1 - vpaddd %xmm1, %xmm11, %xmm11 - vpxor %xmm11, %xmm0, %xmm10 - vpslld $20, %xmm10, %xmm0 - vpsrld $12, %xmm10, %xmm10 - vpxor %xmm0, %xmm10, %xmm0 - vpblendw $12, %xmm4, %xmm5, %xmm10 - 
vpblendw $192, %xmm12, %xmm10, %xmm10 - vpunpckldq %xmm2, %xmm4, %xmm12 - vpshufd $135, %xmm10, %xmm10 - vpaddd %xmm9, %xmm10, %xmm9 - vpaddd %xmm0, %xmm9, %xmm9 - vpxor %xmm9, %xmm1, %xmm1 - vpshufb %xmm6, %xmm1, %xmm1 - vpaddd %xmm1, %xmm11, %xmm13 - vpxor %xmm13, %xmm0, %xmm0 - vpshufd $147, %xmm1, %xmm1 - vpshufd $78, %xmm13, %xmm13 - vpslld $25, %xmm0, %xmm10 - vpsrld $7, %xmm0, %xmm0 - vpxor %xmm10, %xmm0, %xmm0 - vpblendw $15, %xmm3, %xmm4, %xmm10 - vpblendw $192, %xmm5, %xmm10, %xmm10 - vpshufd $57, %xmm0, %xmm0 - vpshufd $198, %xmm10, %xmm10 - vpaddd %xmm9, %xmm10, %xmm10 - vpaddd %xmm0, %xmm10, %xmm10 - vpxor %xmm10, %xmm1, %xmm1 - vpshufb %xmm7, %xmm1, %xmm1 - vpaddd %xmm1, %xmm13, %xmm13 - vpxor %xmm13, %xmm0, %xmm9 - vpslld $20, %xmm9, %xmm0 - vpsrld $12, %xmm9, %xmm9 - vpxor %xmm0, %xmm9, %xmm0 - vpunpckhdq %xmm2, %xmm3, %xmm9 - vpunpcklqdq %xmm12, %xmm9, %xmm15 - vpunpcklqdq %xmm12, %xmm8, %xmm12 - vpblendw $15, %xmm5, %xmm8, %xmm8 - vpaddd %xmm15, %xmm10, %xmm15 - vpaddd %xmm0, %xmm15, %xmm15 - vpxor %xmm15, %xmm1, %xmm1 - vpshufd $141, %xmm8, %xmm8 - vpshufb %xmm6, %xmm1, %xmm1 - vpaddd %xmm1, %xmm13, %xmm13 - vpxor %xmm13, %xmm0, %xmm0 - vpshufd $57, %xmm1, %xmm1 - vpshufd $78, %xmm13, %xmm13 - vpslld $25, %xmm0, %xmm10 - vpsrld $7, %xmm0, %xmm0 - vpxor %xmm10, %xmm0, %xmm0 - vpunpcklqdq %xmm2, %xmm3, %xmm10 - vpshufd $147, %xmm0, %xmm0 - vpblendw $51, %xmm14, %xmm10, %xmm14 - vpshufd $135, %xmm14, %xmm14 - vpaddd %xmm15, %xmm14, %xmm14 - vpaddd %xmm0, %xmm14, %xmm14 - vpxor %xmm14, %xmm1, %xmm1 - vpunpcklqdq %xmm3, %xmm4, %xmm15 - vpshufb %xmm7, %xmm1, %xmm1 - vpaddd %xmm1, %xmm13, %xmm13 - vpxor %xmm13, %xmm0, %xmm0 - vpslld $20, %xmm0, %xmm11 - vpsrld $12, %xmm0, %xmm0 - vpxor %xmm11, %xmm0, %xmm0 - vpunpckhqdq %xmm5, %xmm3, %xmm11 - vpblendw $51, %xmm15, %xmm11, %xmm11 - vpunpckhqdq %xmm3, %xmm5, %xmm15 - vpaddd %xmm11, %xmm14, %xmm11 - vpaddd %xmm0, %xmm11, %xmm11 - vpxor %xmm11, %xmm1, %xmm1 - vpshufb %xmm6, %xmm1, %xmm1 - vpaddd %xmm1, %xmm13, %xmm13 - vpxor %xmm13, %xmm0, %xmm0 - vpshufd $147, %xmm1, %xmm1 - vpshufd $78, %xmm13, %xmm13 - vpslld $25, %xmm0, %xmm14 - vpsrld $7, %xmm0, %xmm0 - vpxor %xmm14, %xmm0, %xmm14 - vpunpckhqdq %xmm4, %xmm2, %xmm0 - vpshufd $57, %xmm14, %xmm14 - vpblendw $51, %xmm15, %xmm0, %xmm15 - vpaddd %xmm15, %xmm11, %xmm15 - vpaddd %xmm14, %xmm15, %xmm15 - vpxor %xmm15, %xmm1, %xmm1 - vpshufb %xmm7, %xmm1, %xmm1 - vpaddd %xmm1, %xmm13, %xmm13 - vpxor %xmm13, %xmm14, %xmm14 - vpslld $20, %xmm14, %xmm11 - vpsrld $12, %xmm14, %xmm14 - vpxor %xmm11, %xmm14, %xmm14 - vpblendw $3, %xmm2, %xmm4, %xmm11 - vpslldq $8, %xmm11, %xmm0 - vpblendw $15, %xmm5, %xmm0, %xmm0 - vpshufd $99, %xmm0, %xmm0 - vpaddd %xmm15, %xmm0, %xmm15 - vpaddd %xmm14, %xmm15, %xmm15 - vpxor %xmm15, %xmm1, %xmm0 - vpaddd %xmm12, %xmm15, %xmm15 - vpshufb %xmm6, %xmm0, %xmm0 - vpaddd %xmm0, %xmm13, %xmm13 - vpxor %xmm13, %xmm14, %xmm14 - vpshufd $57, %xmm0, %xmm0 - vpshufd $78, %xmm13, %xmm13 - vpslld $25, %xmm14, %xmm1 - vpsrld $7, %xmm14, %xmm14 - vpxor %xmm1, %xmm14, %xmm14 - vpblendw $3, %xmm5, %xmm4, %xmm1 - vpshufd $147, %xmm14, %xmm14 - vpaddd %xmm14, %xmm15, %xmm15 - vpxor %xmm15, %xmm0, %xmm0 - vpshufb %xmm7, %xmm0, %xmm0 - vpaddd %xmm0, %xmm13, %xmm13 - vpxor %xmm13, %xmm14, %xmm14 - vpslld $20, %xmm14, %xmm12 - vpsrld $12, %xmm14, %xmm14 - vpxor %xmm12, %xmm14, %xmm14 - vpsrldq $4, %xmm2, %xmm12 - vpblendw $60, %xmm12, %xmm1, %xmm1 - vpaddd %xmm1, %xmm15, %xmm15 - vpaddd %xmm14, %xmm15, %xmm15 - vpxor %xmm15, %xmm0, %xmm0 - vpblendw $12, %xmm4, %xmm3, %xmm1 - 
vpshufb %xmm6, %xmm0, %xmm0 - vpaddd %xmm0, %xmm13, %xmm13 - vpxor %xmm13, %xmm14, %xmm14 - vpshufd $147, %xmm0, %xmm0 - vpshufd $78, %xmm13, %xmm13 - vpslld $25, %xmm14, %xmm12 - vpsrld $7, %xmm14, %xmm14 - vpxor %xmm12, %xmm14, %xmm14 - vpsrldq $4, %xmm5, %xmm12 - vpblendw $48, %xmm12, %xmm1, %xmm1 - vpshufd $33, %xmm5, %xmm12 - vpshufd $57, %xmm14, %xmm14 - vpshufd $108, %xmm1, %xmm1 - vpblendw $51, %xmm12, %xmm10, %xmm12 - vpaddd %xmm15, %xmm1, %xmm15 - vpaddd %xmm14, %xmm15, %xmm15 - vpxor %xmm15, %xmm0, %xmm0 - vpaddd %xmm12, %xmm15, %xmm15 - vpshufb %xmm7, %xmm0, %xmm0 - vpaddd %xmm0, %xmm13, %xmm1 - vpxor %xmm1, %xmm14, %xmm14 - vpslld $20, %xmm14, %xmm13 - vpsrld $12, %xmm14, %xmm14 - vpxor %xmm13, %xmm14, %xmm14 - vpslldq $12, %xmm3, %xmm13 - vpaddd %xmm14, %xmm15, %xmm15 - vpxor %xmm15, %xmm0, %xmm0 - vpshufb %xmm6, %xmm0, %xmm0 - vpaddd %xmm0, %xmm1, %xmm1 - vpxor %xmm1, %xmm14, %xmm14 - vpshufd $57, %xmm0, %xmm0 - vpshufd $78, %xmm1, %xmm1 - vpslld $25, %xmm14, %xmm12 - vpsrld $7, %xmm14, %xmm14 - vpxor %xmm12, %xmm14, %xmm14 - vpblendw $51, %xmm5, %xmm4, %xmm12 - vpshufd $147, %xmm14, %xmm14 - vpblendw $192, %xmm13, %xmm12, %xmm12 - vpaddd %xmm12, %xmm15, %xmm15 - vpaddd %xmm14, %xmm15, %xmm15 - vpxor %xmm15, %xmm0, %xmm0 - vpsrldq $4, %xmm3, %xmm12 - vpshufb %xmm7, %xmm0, %xmm0 - vpaddd %xmm0, %xmm1, %xmm1 - vpxor %xmm1, %xmm14, %xmm14 - vpslld $20, %xmm14, %xmm13 - vpsrld $12, %xmm14, %xmm14 - vpxor %xmm13, %xmm14, %xmm14 - vpblendw $48, %xmm2, %xmm5, %xmm13 - vpblendw $3, %xmm12, %xmm13, %xmm13 - vpshufd $156, %xmm13, %xmm13 - vpaddd %xmm15, %xmm13, %xmm15 - vpaddd %xmm14, %xmm15, %xmm15 - vpxor %xmm15, %xmm0, %xmm0 - vpshufb %xmm6, %xmm0, %xmm0 - vpaddd %xmm0, %xmm1, %xmm1 - vpxor %xmm1, %xmm14, %xmm14 - vpshufd $147, %xmm0, %xmm0 - vpshufd $78, %xmm1, %xmm1 - vpslld $25, %xmm14, %xmm13 - vpsrld $7, %xmm14, %xmm14 - vpxor %xmm13, %xmm14, %xmm14 - vpunpcklqdq %xmm2, %xmm4, %xmm13 - vpshufd $57, %xmm14, %xmm14 - vpblendw $12, %xmm12, %xmm13, %xmm12 - vpshufd $180, %xmm12, %xmm12 - vpaddd %xmm15, %xmm12, %xmm15 - vpaddd %xmm14, %xmm15, %xmm15 - vpxor %xmm15, %xmm0, %xmm0 - vpshufb %xmm7, %xmm0, %xmm0 - vpaddd %xmm0, %xmm1, %xmm1 - vpxor %xmm1, %xmm14, %xmm14 - vpslld $20, %xmm14, %xmm12 - vpsrld $12, %xmm14, %xmm14 - vpxor %xmm12, %xmm14, %xmm14 - vpunpckhqdq %xmm9, %xmm4, %xmm12 - vpshufd $198, %xmm12, %xmm12 - vpaddd %xmm15, %xmm12, %xmm15 - vpaddd %xmm14, %xmm15, %xmm15 - vpxor %xmm15, %xmm0, %xmm0 - vpaddd %xmm15, %xmm8, %xmm15 - vpshufb %xmm6, %xmm0, %xmm0 - vpaddd %xmm0, %xmm1, %xmm1 - vpxor %xmm1, %xmm14, %xmm14 - vpshufd $57, %xmm0, %xmm0 - vpshufd $78, %xmm1, %xmm1 - vpslld $25, %xmm14, %xmm12 - vpsrld $7, %xmm14, %xmm14 - vpxor %xmm12, %xmm14, %xmm14 - vpsrldq $4, %xmm4, %xmm12 - vpshufd $147, %xmm14, %xmm14 - vpaddd %xmm14, %xmm15, %xmm15 - vpxor %xmm15, %xmm0, %xmm0 - vpshufb %xmm7, %xmm0, %xmm0 - vpaddd %xmm0, %xmm1, %xmm1 - vpxor %xmm1, %xmm14, %xmm14 - vpslld $20, %xmm14, %xmm8 - vpsrld $12, %xmm14, %xmm14 - vpxor %xmm14, %xmm8, %xmm14 - vpblendw $48, %xmm5, %xmm2, %xmm8 - vpblendw $3, %xmm12, %xmm8, %xmm8 - vpunpckhqdq %xmm5, %xmm4, %xmm12 - vpshufd $75, %xmm8, %xmm8 - vpblendw $60, %xmm10, %xmm12, %xmm10 - vpaddd %xmm15, %xmm8, %xmm15 - vpaddd %xmm14, %xmm15, %xmm15 - vpxor %xmm0, %xmm15, %xmm0 - vpshufd $45, %xmm10, %xmm10 - vpshufb %xmm6, %xmm0, %xmm0 - vpaddd %xmm15, %xmm10, %xmm15 - vpaddd %xmm0, %xmm1, %xmm1 - vpxor %xmm1, %xmm14, %xmm14 - vpshufd $147, %xmm0, %xmm0 - vpshufd $78, %xmm1, %xmm1 - vpslld $25, %xmm14, %xmm8 - vpsrld $7, %xmm14, %xmm14 - 
vpxor %xmm14, %xmm8, %xmm8 - vpshufd $57, %xmm8, %xmm8 - vpaddd %xmm8, %xmm15, %xmm15 - vpxor %xmm0, %xmm15, %xmm0 - vpshufb %xmm7, %xmm0, %xmm0 - vpaddd %xmm0, %xmm1, %xmm1 - vpxor %xmm8, %xmm1, %xmm8 - vpslld $20, %xmm8, %xmm10 - vpsrld $12, %xmm8, %xmm8 - vpxor %xmm8, %xmm10, %xmm10 - vpunpckldq %xmm3, %xmm4, %xmm8 - vpunpcklqdq %xmm9, %xmm8, %xmm9 - vpaddd %xmm9, %xmm15, %xmm9 - vpaddd %xmm10, %xmm9, %xmm9 - vpxor %xmm0, %xmm9, %xmm8 - vpshufb %xmm6, %xmm8, %xmm8 - vpaddd %xmm8, %xmm1, %xmm1 - vpxor %xmm1, %xmm10, %xmm10 - vpshufd $57, %xmm8, %xmm8 - vpshufd $78, %xmm1, %xmm1 - vpslld $25, %xmm10, %xmm12 - vpsrld $7, %xmm10, %xmm10 - vpxor %xmm10, %xmm12, %xmm10 - vpblendw $48, %xmm4, %xmm3, %xmm12 - vpshufd $147, %xmm10, %xmm0 - vpunpckhdq %xmm5, %xmm3, %xmm10 - vpshufd $78, %xmm12, %xmm12 - vpunpcklqdq %xmm4, %xmm10, %xmm10 - vpblendw $192, %xmm2, %xmm10, %xmm10 - vpshufhw $78, %xmm10, %xmm10 - vpaddd %xmm10, %xmm9, %xmm10 - vpaddd %xmm0, %xmm10, %xmm10 - vpxor %xmm8, %xmm10, %xmm8 - vpshufb %xmm7, %xmm8, %xmm8 - vpaddd %xmm8, %xmm1, %xmm1 - vpxor %xmm0, %xmm1, %xmm9 - vpslld $20, %xmm9, %xmm0 - vpsrld $12, %xmm9, %xmm9 - vpxor %xmm9, %xmm0, %xmm0 - vpunpckhdq %xmm5, %xmm4, %xmm9 - vpblendw $240, %xmm9, %xmm2, %xmm13 - vpshufd $39, %xmm13, %xmm13 - vpaddd %xmm10, %xmm13, %xmm10 - vpaddd %xmm0, %xmm10, %xmm10 - vpxor %xmm8, %xmm10, %xmm8 - vpblendw $12, %xmm4, %xmm2, %xmm13 - vpshufb %xmm6, %xmm8, %xmm8 - vpslldq $4, %xmm13, %xmm13 - vpblendw $15, %xmm5, %xmm13, %xmm13 - vpaddd %xmm8, %xmm1, %xmm1 - vpxor %xmm1, %xmm0, %xmm0 - vpaddd %xmm13, %xmm10, %xmm13 - vpshufd $147, %xmm8, %xmm8 - vpshufd $78, %xmm1, %xmm1 - vpslld $25, %xmm0, %xmm14 - vpsrld $7, %xmm0, %xmm0 - vpxor %xmm0, %xmm14, %xmm14 - vpshufd $57, %xmm14, %xmm14 - vpaddd %xmm14, %xmm13, %xmm13 - vpxor %xmm8, %xmm13, %xmm8 - vpaddd %xmm13, %xmm12, %xmm12 - vpshufb %xmm7, %xmm8, %xmm8 - vpaddd %xmm8, %xmm1, %xmm1 - vpxor %xmm14, %xmm1, %xmm14 - vpslld $20, %xmm14, %xmm10 - vpsrld $12, %xmm14, %xmm14 - vpxor %xmm14, %xmm10, %xmm10 - vpaddd %xmm10, %xmm12, %xmm12 - vpxor %xmm8, %xmm12, %xmm8 - vpshufb %xmm6, %xmm8, %xmm8 - vpaddd %xmm8, %xmm1, %xmm1 - vpxor %xmm1, %xmm10, %xmm0 - vpshufd $57, %xmm8, %xmm8 - vpshufd $78, %xmm1, %xmm1 - vpslld $25, %xmm0, %xmm10 - vpsrld $7, %xmm0, %xmm0 - vpxor %xmm0, %xmm10, %xmm10 - vpblendw $48, %xmm2, %xmm3, %xmm0 - vpblendw $15, %xmm11, %xmm0, %xmm0 - vpshufd $147, %xmm10, %xmm10 - vpshufd $114, %xmm0, %xmm0 - vpaddd %xmm12, %xmm0, %xmm0 - vpaddd %xmm10, %xmm0, %xmm0 - vpxor %xmm8, %xmm0, %xmm8 - vpshufb %xmm7, %xmm8, %xmm8 - vpaddd %xmm8, %xmm1, %xmm1 - vpxor %xmm10, %xmm1, %xmm10 - vpslld $20, %xmm10, %xmm11 - vpsrld $12, %xmm10, %xmm10 - vpxor %xmm10, %xmm11, %xmm10 - vpslldq $4, %xmm4, %xmm11 - vpblendw $192, %xmm11, %xmm3, %xmm3 - vpunpckldq %xmm5, %xmm4, %xmm4 - vpshufd $99, %xmm3, %xmm3 - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm10, %xmm3, %xmm3 - vpxor %xmm8, %xmm3, %xmm11 - vpunpckldq %xmm5, %xmm2, %xmm0 - vpblendw $192, %xmm2, %xmm5, %xmm2 - vpshufb %xmm6, %xmm11, %xmm11 - vpunpckhqdq %xmm0, %xmm9, %xmm0 - vpblendw $15, %xmm4, %xmm2, %xmm4 - vpaddd %xmm11, %xmm1, %xmm1 - vpxor %xmm1, %xmm10, %xmm10 - vpshufd $147, %xmm11, %xmm11 - vpshufd $201, %xmm0, %xmm0 - vpslld $25, %xmm10, %xmm8 - vpsrld $7, %xmm10, %xmm10 - vpxor %xmm10, %xmm8, %xmm10 - vpshufd $78, %xmm1, %xmm1 - vpaddd %xmm3, %xmm0, %xmm0 - vpshufd $27, %xmm4, %xmm4 - vpshufd $57, %xmm10, %xmm10 - vpaddd %xmm10, %xmm0, %xmm0 - vpxor %xmm11, %xmm0, %xmm11 - vpaddd %xmm0, %xmm4, %xmm0 - vpshufb %xmm7, %xmm11, %xmm7 - vpaddd 
%xmm7, %xmm1, %xmm1 - vpxor %xmm10, %xmm1, %xmm10 - vpslld $20, %xmm10, %xmm8 - vpsrld $12, %xmm10, %xmm10 - vpxor %xmm10, %xmm8, %xmm8 - vpaddd %xmm8, %xmm0, %xmm0 - vpxor %xmm7, %xmm0, %xmm7 - vpshufb %xmm6, %xmm7, %xmm6 - vpaddd %xmm6, %xmm1, %xmm1 - vpxor %xmm1, %xmm8, %xmm8 - vpshufd $78, %xmm1, %xmm1 - vpshufd $57, %xmm6, %xmm6 - vpslld $25, %xmm8, %xmm2 - vpsrld $7, %xmm8, %xmm8 - vpxor %xmm8, %xmm2, %xmm8 - vpxor (%rdi), %xmm1, %xmm1 - vpshufd $147, %xmm8, %xmm8 - vpxor %xmm0, %xmm1, %xmm0 - vmovups %xmm0, (%rdi) - vpxor 16(%rdi), %xmm8, %xmm0 - vpxor %xmm6, %xmm0, %xmm6 - vmovups %xmm6, 16(%rdi) - addq $64, %rsi - decq %rdx - jnz .Lbeginofloop + movdqa %xmm0,%xmm10 + movdqa %xmm1,%xmm11 + paddq %xmm15,%xmm14 + movdqa IV(%rip),%xmm2 + movdqa %xmm14,%xmm3 + pxor IV+0x10(%rip),%xmm3 + movl 0x8(%rsi),%r8d + movl 0x18(%rsi),%r9d + movl (%rsi),%r10d + movl 0x10(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm4 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0xc(%rsi),%r8d + movl 0x1c(%rsi),%r9d + movl 0x4(%rsi),%r10d + movl 0x14(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm5 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movl 0x20(%rsi),%r8d + movl 0x30(%rsi),%r9d + movl 0x38(%rsi),%r10d + movl 0x28(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm6 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x24(%rsi),%r8d + movl 0x34(%rsi),%r9d + movl 0x3c(%rsi),%r10d + movl 0x2c(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm7 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + movl 0x10(%rsi),%r8d + movl 0x34(%rsi),%r9d + movl 0x38(%rsi),%r10d + movl 0x24(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm4 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x20(%rsi),%r8d + movl 0x18(%rsi),%r9d + movl 0x28(%rsi),%r10d + movl 0x3c(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm5 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movl 0x4(%rsi),%r8d + movl 
0x2c(%rsi),%r9d + movl 0x14(%rsi),%r10d + movl (%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm6 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x30(%rsi),%r8d + movl 0x1c(%rsi),%r9d + movl 0xc(%rsi),%r10d + movl 0x8(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm7 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + movl 0x30(%rsi),%r8d + movl 0x3c(%rsi),%r9d + movl 0x2c(%rsi),%r10d + movl 0x14(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm4 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl (%rsi),%r8d + movl 0x34(%rsi),%r9d + movl 0x20(%rsi),%r10d + movl 0x8(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm5 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movl 0x28(%rsi),%r8d + movl 0x1c(%rsi),%r9d + movl 0x24(%rsi),%r10d + movl 0xc(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm6 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x38(%rsi),%r8d + movl 0x4(%rsi),%r9d + movl 0x10(%rsi),%r10d + movl 0x18(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm7 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + movl 0xc(%rsi),%r8d + movl 0x2c(%rsi),%r9d + movl 0x1c(%rsi),%r10d + movl 0x34(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm4 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x4(%rsi),%r8d + movl 0x38(%rsi),%r9d + movl 0x24(%rsi),%r10d + movl 0x30(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm5 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd 
$0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movl 0x8(%rsi),%r8d + movl 0x10(%rsi),%r9d + movl 0x3c(%rsi),%r10d + movl 0x14(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm6 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x18(%rsi),%r8d + movl (%rsi),%r9d + movl 0x20(%rsi),%r10d + movl 0x28(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm7 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + movl 0x14(%rsi),%r8d + movl 0x28(%rsi),%r9d + movl 0x24(%rsi),%r10d + movl 0x8(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm4 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x1c(%rsi),%r8d + movl 0x3c(%rsi),%r9d + movl (%rsi),%r10d + movl 0x10(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm5 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movl 0x38(%rsi),%r8d + movl 0x18(%rsi),%r9d + movl 0xc(%rsi),%r10d + movl 0x2c(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm6 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x4(%rsi),%r8d + movl 0x20(%rsi),%r9d + movl 0x34(%rsi),%r10d + movl 0x30(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm7 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + movl 0x18(%rsi),%r8d + movl 0x20(%rsi),%r9d + movl 0x8(%rsi),%r10d + movl (%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm4 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x28(%rsi),%r8d + movl 0xc(%rsi),%r9d + movl 0x30(%rsi),%r10d + movl 0x2c(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm5 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + 
pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movl 0x10(%rsi),%r8d + movl 0x3c(%rsi),%r9d + movl 0x4(%rsi),%r10d + movl 0x1c(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm6 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x34(%rsi),%r8d + movl 0x38(%rsi),%r9d + movl 0x24(%rsi),%r10d + movl 0x14(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm7 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + movl 0x4(%rsi),%r8d + movl 0x10(%rsi),%r9d + movl 0x30(%rsi),%r10d + movl 0x38(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm4 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x3c(%rsi),%r8d + movl 0x28(%rsi),%r9d + movl 0x14(%rsi),%r10d + movl 0x34(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm5 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movl (%rsi),%r8d + movl 0x24(%rsi),%r9d + movl 0x20(%rsi),%r10d + movl 0x18(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm6 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x1c(%rsi),%r8d + movl 0x8(%rsi),%r9d + movl 0x2c(%rsi),%r10d + movl 0xc(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm7 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + movl 0x1c(%rsi),%r8d + movl 0xc(%rsi),%r9d + movl 0x34(%rsi),%r10d + movl 0x30(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm4 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x38(%rsi),%r8d + movl 0x24(%rsi),%r9d + movl 0x2c(%rsi),%r10d + movl 0x4(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm5 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + 
paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movl 0x14(%rsi),%r8d + movl 0x20(%rsi),%r9d + movl 0x8(%rsi),%r10d + movl 0x3c(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm6 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl (%rsi),%r8d + movl 0x18(%rsi),%r9d + movl 0x28(%rsi),%r10d + movl 0x10(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm7 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + movl 0x38(%rsi),%r8d + movl (%rsi),%r9d + movl 0x18(%rsi),%r10d + movl 0x2c(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm4 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x24(%rsi),%r8d + movl 0x20(%rsi),%r9d + movl 0x3c(%rsi),%r10d + movl 0xc(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm5 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm5 + paddd %xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movl 0x30(%rsi),%r8d + movl 0x4(%rsi),%r9d + movl 0x28(%rsi),%r10d + movl 0x34(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm6 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x8(%rsi),%r8d + movl 0x10(%rsi),%r9d + movl 0x14(%rsi),%r10d + movl 0x1c(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm7 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + movl 0x20(%rsi),%r8d + movl 0x4(%rsi),%r9d + movl 0x28(%rsi),%r10d + movl 0x1c(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm4 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm4 + paddd %xmm4,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x10(%rsi),%r8d + movl 0x14(%rsi),%r9d + movl 0x8(%rsi),%r10d + movl 0x18(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm5 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm5 + paddd 
%xmm5,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x93,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x39,%xmm2,%xmm2 + movl 0x3c(%rsi),%r8d + movl 0xc(%rsi),%r9d + movl 0x34(%rsi),%r10d + movl 0x24(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm6 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm6 + paddd %xmm6,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm12,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0xc,%xmm1 + pslld $0x14,%xmm8 + por %xmm8,%xmm1 + movl 0x2c(%rsi),%r8d + movl 0x30(%rsi),%r9d + movl (%rsi),%r10d + movl 0x38(%rsi),%r11d + shlq $0x20,%r8 + shlq $0x20,%r9 + orq %r10,%r8 + orq %r11,%r9 + movq %r8,%xmm7 + movq %r9,%xmm8 + punpcklqdq %xmm8,%xmm7 + paddd %xmm7,%xmm0 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm13,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm8 + psrld $0x7,%xmm1 + pslld $0x19,%xmm8 + por %xmm8,%xmm1 + pshufd $0x39,%xmm0,%xmm0 + pshufd $0x4e,%xmm3,%xmm3 + pshufd $0x93,%xmm2,%xmm2 + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm1 + pxor %xmm10,%xmm0 + pxor %xmm11,%xmm1 + addq $0x40,%rsi + decq %rdx + jnz .Lbeginofloop + movdqu %xmm0,(%rdi) + movdqu %xmm1,0x10(%rdi) + movdqu %xmm14,0x20(%rdi) .Lendofloop: ret -ENDPROC(blake2s_compress_avx) -#endif /* CONFIG_AS_AVX */ +ENDPROC(blake2s_compress_ssse3) +#endif /* CONFIG_AS_SSSE3 */ #ifdef CONFIG_AS_AVX512 ENTRY(blake2s_compress_avx512) @@ -647,9 +1011,9 @@ ENTRY(blake2s_compress_avx512) vpaddd %xmm3,%xmm2,%xmm2 vpxor %xmm2,%xmm1,%xmm1 vprord $0x7,%xmm1,%xmm1 - vpshufd $0x39,%xmm1,%xmm1 - vpshufd $0x4e,%xmm2,%xmm2 - vpshufd $0x93,%xmm3,%xmm3 + vpshufd $0x93,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x39,%xmm2,%xmm2 vpaddd %xmm9,%xmm0,%xmm0 vpaddd %xmm1,%xmm0,%xmm0 vpxor %xmm0,%xmm3,%xmm3 @@ -665,9 +1029,9 @@ ENTRY(blake2s_compress_avx512) vpaddd %xmm3,%xmm2,%xmm2 vpxor %xmm2,%xmm1,%xmm1 vprord $0x7,%xmm1,%xmm1 - vpshufd $0x93,%xmm1,%xmm1 - vpshufd $0x4e,%xmm2,%xmm2 - vpshufd $0x39,%xmm3,%xmm3 + vpshufd $0x39,%xmm0,%xmm0 + vpshufd $0x4e,%xmm3,%xmm3 + vpshufd $0x93,%xmm2,%xmm2 decb %cl jne .Lblake2s_compress_avx512_roundloop vpxor %xmm10,%xmm0,%xmm0 diff --git a/src/crypto/zinc/chacha20/chacha20-arm.pl b/src/crypto/zinc/chacha20/chacha20-arm.pl index 6a7d62c..6785383 100644 --- a/src/crypto/zinc/chacha20/chacha20-arm.pl +++ b/src/crypto/zinc/chacha20/chacha20-arm.pl @@ -686,9 +686,9 @@ my ($a,$b,$c,$d,$t)=@_; "&vshr_u32 ($b,$t,25)", "&vsli_32 ($b,$t,7)", - "&vext_8 ($c,$c,$c,8)", - "&vext_8 ($b,$b,$b,$odd?12:4)", - "&vext_8 ($d,$d,$d,$odd?4:12)" + "&vext_8 ($a,$a,$a,$odd?4:12)", + "&vext_8 ($d,$d,$d,8)", + "&vext_8 ($c,$c,$c,$odd?12:4)" ); } diff --git a/src/crypto/zinc/chacha20/chacha20-arm64.pl b/src/crypto/zinc/chacha20/chacha20-arm64.pl index fc63cc8..ac14a99 100644 --- a/src/crypto/zinc/chacha20/chacha20-arm64.pl +++ b/src/crypto/zinc/chacha20/chacha20-arm64.pl @@ -378,9 +378,9 @@ my ($a,$b,$c,$d,$t)=@_; "&ushr ('$b','$t',25)", "&sli ('$b','$t',7)", - "&ext ('$c','$c','$c',8)", - "&ext ('$d','$d','$d',$odd?4:12)", - "&ext ('$b','$b','$b',$odd?12:4)" + "&ext ('$a','$a','$a',$odd?4:12)", + "&ext ('$d','$d','$d',8)", + "&ext ('$c','$c','$c',$odd?12:4)" ); } diff --git a/src/crypto/zinc/chacha20/chacha20-x86_64.pl b/src/crypto/zinc/chacha20/chacha20-x86_64.pl index 38532f8..116c16e 100644 --- a/src/crypto/zinc/chacha20/chacha20-x86_64.pl +++ 
b/src/crypto/zinc/chacha20/chacha20-x86_64.pl
@@ -525,15 +525,15 @@ $code.=<<___;
 1:
 ___
	&SSSE3ROUND();
-	&pshufd ($c,$c,0b01001110);
-	&pshufd ($b,$b,0b00111001);
-	&pshufd ($d,$d,0b10010011);
+	&pshufd ($a,$a,0b10010011);
+	&pshufd ($d,$d,0b01001110);
+	&pshufd ($c,$c,0b00111001);
	&nop ();
	&SSSE3ROUND();
-	&pshufd ($c,$c,0b01001110);
-	&pshufd ($b,$b,0b10010011);
-	&pshufd ($d,$d,0b00111001);
+	&pshufd ($a,$a,0b00111001);
+	&pshufd ($d,$d,0b01001110);
+	&pshufd ($c,$c,0b10010011);
	&dec ($counter);
	&jnz ("1b");
@@ -600,15 +600,15 @@ $code.=<<___;
 .Loop_ssse3:
 ___
	&SSSE3ROUND();
-	&pshufd ($c,$c,0b01001110);
-	&pshufd ($b,$b,0b00111001);
-	&pshufd ($d,$d,0b10010011);
+	&pshufd ($a,$a,0b10010011);
+	&pshufd ($d,$d,0b01001110);
+	&pshufd ($c,$c,0b00111001);
	&nop ();
	&SSSE3ROUND();
-	&pshufd ($c,$c,0b01001110);
-	&pshufd ($b,$b,0b10010011);
-	&pshufd ($d,$d,0b00111001);
+	&pshufd ($a,$a,0b00111001);
+	&pshufd ($d,$d,0b01001110);
+	&pshufd ($c,$c,0b10010011);
	&dec ($counter);
	&jnz (".Loop_ssse3");
@@ -770,20 +770,20 @@ $code.=<<___;
 .Loop_128:
 ___
	&SSSE3ROUND_2x();
-	&pshufd ($c,$c,0b01001110);
-	&pshufd ($b,$b,0b00111001);
-	&pshufd ($d,$d,0b10010011);
-	&pshufd ($c1,$c1,0b01001110);
-	&pshufd ($b1,$b1,0b00111001);
-	&pshufd ($d1,$d1,0b10010011);
+	&pshufd ($a,$a,0b10010011);
+	&pshufd ($d,$d,0b01001110);
+	&pshufd ($c,$c,0b00111001);
+	&pshufd ($a1,$a1,0b10010011);
+	&pshufd ($d1,$d1,0b01001110);
+	&pshufd ($c1,$c1,0b00111001);
	&SSSE3ROUND_2x();
-	&pshufd ($c,$c,0b01001110);
-	&pshufd ($b,$b,0b10010011);
-	&pshufd ($d,$d,0b00111001);
-	&pshufd ($c1,$c1,0b01001110);
-	&pshufd ($b1,$b1,0b10010011);
-	&pshufd ($d1,$d1,0b00111001);
+	&pshufd ($a,$a,0b00111001);
+	&pshufd ($d,$d,0b01001110);
+	&pshufd ($c,$c,0b10010011);
+	&pshufd ($a1,$a1,0b00111001);
+	&pshufd ($d1,$d1,0b01001110);
+	&pshufd ($c1,$c1,0b10010011);
	&dec ($counter);
	&jnz (".Loop_128");
--
cgit v1.2.3-59-g8ed1b
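
The rotation described in the commit message can be sanity-checked with a short,
self-contained C program. This is an illustrative sketch only, not code from the
patch: the helpers qr, rotl_row and column_round are invented for the example.
It verifies that a ChaCha double round computed with the classic diagonalization
(shuffling rows b, c, d) equals one computed with the rotated state used above
(shuffling rows a, c, d and leaving the critical-path row b untouched). Comments
note which pshufd immediate each lane rotation corresponds to.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha quarter-round over four 32-bit words. */
static void qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = ROTL32(*d, 16);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 12);
	*a += *b; *d ^= *a; *d = ROTL32(*d, 8);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}

/*
 * Rotate one 4-lane row left by n lanes, i.e. new[i] = old[(i + n) % 4].
 * n = 1, 2, 3 correspond to pshufd with immediates 0x39, 0x4e, 0x93.
 */
static void rotl_row(uint32_t row[4], int n)
{
	uint32_t tmp[4];
	for (int i = 0; i < 4; ++i)
		tmp[i] = row[(i + n) & 3];
	memcpy(row, tmp, sizeof(tmp));
}

/* Quarter-rounds down the four columns of rows a, b, c, d. */
static void column_round(uint32_t x[16])
{
	for (int i = 0; i < 4; ++i)
		qr(&x[i], &x[4 + i], &x[8 + i], &x[12 + i]);
}

/* Classic diagonal round: shuffle b, c, d so the diagonals become columns
 * (pshufd $0x39/$0x4e/$0x93 on b/c/d in the old code), then undo it. */
static void diagonal_round_classic(uint32_t x[16])
{
	rotl_row(&x[4], 1); rotl_row(&x[8], 2); rotl_row(&x[12], 3);
	column_round(x);
	rotl_row(&x[4], 3); rotl_row(&x[8], 2); rotl_row(&x[12], 1);
}

/* Variant from this patch: leave b (x4..x7) alone and shuffle a, c, d
 * instead (pshufd $0x93 on a, $0x39 on c, $0x4e on d). The same four
 * diagonals line up, only in a different column order, so the result is
 * identical while the critical-path row needs no shuffle. */
static void diagonal_round_rotated(uint32_t x[16])
{
	rotl_row(&x[0], 3); rotl_row(&x[8], 1); rotl_row(&x[12], 2);
	column_round(x);
	rotl_row(&x[0], 1); rotl_row(&x[8], 3); rotl_row(&x[12], 2);
}

int main(void)
{
	uint32_t x[16], y[16];

	for (int i = 0; i < 16; ++i)
		x[i] = y[i] = 0x9e3779b9u * (i + 1);

	column_round(x); diagonal_round_classic(x);
	column_round(y); diagonal_round_rotated(y);

	assert(!memcmp(x, y, sizeof(x)));
	puts("classic and rotated double rounds agree");
	return 0;
}

Compiling and running this (e.g. cc -O2 rot.c && ./a.out, with a hypothetical
file name) should print the success line, since both orderings apply the
quarter-round to the same four diagonals; the only difference is which rows pay
for the lane shuffles, which is exactly the latency argument made above. The
same reasoning carries over to the BLAKE2s rounds, where the per-round message
schedule (the SIGMA table in blake2s-x86_64.S) is adjusted to match the rotated
state.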