aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/crypto/Makefile18
-rw-r--r--arch/x86/crypto/aesni-intel_avx-x86_64.S2125
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c353
-rw-r--r--arch/x86/crypto/chacha-avx2-x86_64.S1025
-rw-r--r--arch/x86/crypto/chacha-avx512vl-x86_64.S836
-rw-r--r--arch/x86/crypto/chacha-ssse3-x86_64.S (renamed from arch/x86/crypto/chacha20-ssse3-x86_64.S)327
-rw-r--r--arch/x86/crypto/chacha20-avx2-x86_64.S448
-rw-r--r--arch/x86/crypto/chacha20_glue.c146
-rw-r--r--arch/x86/crypto/chacha_glue.c304
-rw-r--r--arch/x86/crypto/nh-avx2-x86_64.S157
-rw-r--r--arch/x86/crypto/nh-sse2-x86_64.S123
-rw-r--r--arch/x86/crypto/nhpoly1305-avx2-glue.c77
-rw-r--r--arch/x86/crypto/nhpoly1305-sse2-glue.c76
-rw-r--r--arch/x86/crypto/poly1305_glue.c20
14 files changed, 3987 insertions, 2048 deletions
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index a4b0007a54e1..45734e1cf967 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -8,6 +8,7 @@ OBJECT_FILES_NON_STANDARD := y
avx_supported := $(call as-instr,vpxor %xmm0$(comma)%xmm0$(comma)%xmm0,yes,no)
avx2_supported := $(call as-instr,vpgatherdd %ymm0$(comma)(%eax$(comma)%ymm1\
$(comma)4)$(comma)%ymm2,yes,no)
+avx512_supported :=$(call as-instr,vpmovm2b %k1$(comma)%zmm5,yes,no)
sha1_ni_supported :=$(call as-instr,sha1msg1 %xmm0$(comma)%xmm1,yes,no)
sha256_ni_supported :=$(call as-instr,sha256msg1 %xmm0$(comma)%xmm1,yes,no)
@@ -23,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CAMELLIA_X86_64) += camellia-x86_64.o
obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
-obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha20-x86_64.o
+obj-$(CONFIG_CRYPTO_CHACHA20_X86_64) += chacha-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
@@ -46,6 +47,9 @@ obj-$(CONFIG_CRYPTO_MORUS1280_GLUE) += morus1280_glue.o
obj-$(CONFIG_CRYPTO_MORUS640_SSE2) += morus640-sse2.o
obj-$(CONFIG_CRYPTO_MORUS1280_SSE2) += morus1280-sse2.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_SSE2) += nhpoly1305-sse2.o
+obj-$(CONFIG_CRYPTO_NHPOLY1305_AVX2) += nhpoly1305-avx2.o
+
# These modules require assembler to support AVX.
ifeq ($(avx_supported),yes)
obj-$(CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64) += \
@@ -74,7 +78,7 @@ camellia-x86_64-y := camellia-x86_64-asm_64.o camellia_glue.o
blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
-chacha20-x86_64-y := chacha20-ssse3-x86_64.o chacha20_glue.o
+chacha-x86_64-y := chacha-ssse3-x86_64.o chacha_glue.o
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
aegis128-aesni-y := aegis128-aesni-asm.o aegis128-aesni-glue.o
@@ -84,6 +88,8 @@ aegis256-aesni-y := aegis256-aesni-asm.o aegis256-aesni-glue.o
morus640-sse2-y := morus640-sse2-asm.o morus640-sse2-glue.o
morus1280-sse2-y := morus1280-sse2-asm.o morus1280-sse2-glue.o
+nhpoly1305-sse2-y := nh-sse2-x86_64.o nhpoly1305-sse2-glue.o
+
ifeq ($(avx_supported),yes)
camellia-aesni-avx-x86_64-y := camellia-aesni-avx-asm_64.o \
camellia_aesni_avx_glue.o
@@ -97,10 +103,16 @@ endif
ifeq ($(avx2_supported),yes)
camellia-aesni-avx2-y := camellia-aesni-avx2-asm_64.o camellia_aesni_avx2_glue.o
- chacha20-x86_64-y += chacha20-avx2-x86_64.o
+ chacha-x86_64-y += chacha-avx2-x86_64.o
serpent-avx2-y := serpent-avx2-asm_64.o serpent_avx2_glue.o
morus1280-avx2-y := morus1280-avx2-asm.o morus1280-avx2-glue.o
+
+ nhpoly1305-avx2-y := nh-avx2-x86_64.o nhpoly1305-avx2-glue.o
+endif
+
+ifeq ($(avx512_supported),yes)
+ chacha-x86_64-y += chacha-avx512vl-x86_64.o
endif
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S
index 1985ea0b551b..91c039ab5699 100644
--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S
+++ b/arch/x86/crypto/aesni-intel_avx-x86_64.S
@@ -182,43 +182,30 @@ aad_shift_arr:
.text
-##define the fields of the gcm aes context
-#{
-# u8 expanded_keys[16*11] store expanded keys
-# u8 shifted_hkey_1[16] store HashKey <<1 mod poly here
-# u8 shifted_hkey_2[16] store HashKey^2 <<1 mod poly here
-# u8 shifted_hkey_3[16] store HashKey^3 <<1 mod poly here
-# u8 shifted_hkey_4[16] store HashKey^4 <<1 mod poly here
-# u8 shifted_hkey_5[16] store HashKey^5 <<1 mod poly here
-# u8 shifted_hkey_6[16] store HashKey^6 <<1 mod poly here
-# u8 shifted_hkey_7[16] store HashKey^7 <<1 mod poly here
-# u8 shifted_hkey_8[16] store HashKey^8 <<1 mod poly here
-# u8 shifted_hkey_1_k[16] store XOR HashKey <<1 mod poly here (for Karatsuba purposes)
-# u8 shifted_hkey_2_k[16] store XOR HashKey^2 <<1 mod poly here (for Karatsuba purposes)
-# u8 shifted_hkey_3_k[16] store XOR HashKey^3 <<1 mod poly here (for Karatsuba purposes)
-# u8 shifted_hkey_4_k[16] store XOR HashKey^4 <<1 mod poly here (for Karatsuba purposes)
-# u8 shifted_hkey_5_k[16] store XOR HashKey^5 <<1 mod poly here (for Karatsuba purposes)
-# u8 shifted_hkey_6_k[16] store XOR HashKey^6 <<1 mod poly here (for Karatsuba purposes)
-# u8 shifted_hkey_7_k[16] store XOR HashKey^7 <<1 mod poly here (for Karatsuba purposes)
-# u8 shifted_hkey_8_k[16] store XOR HashKey^8 <<1 mod poly here (for Karatsuba purposes)
-#} gcm_ctx#
-
-HashKey = 16*11 # store HashKey <<1 mod poly here
-HashKey_2 = 16*12 # store HashKey^2 <<1 mod poly here
-HashKey_3 = 16*13 # store HashKey^3 <<1 mod poly here
-HashKey_4 = 16*14 # store HashKey^4 <<1 mod poly here
-HashKey_5 = 16*15 # store HashKey^5 <<1 mod poly here
-HashKey_6 = 16*16 # store HashKey^6 <<1 mod poly here
-HashKey_7 = 16*17 # store HashKey^7 <<1 mod poly here
-HashKey_8 = 16*18 # store HashKey^8 <<1 mod poly here
-HashKey_k = 16*19 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
-HashKey_2_k = 16*20 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
-HashKey_3_k = 16*21 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
-HashKey_4_k = 16*22 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
-HashKey_5_k = 16*23 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
-HashKey_6_k = 16*24 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
-HashKey_7_k = 16*25 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
-HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
+#define AadHash 16*0
+#define AadLen 16*1
+#define InLen (16*1)+8
+#define PBlockEncKey 16*2
+#define OrigIV 16*3
+#define CurCount 16*4
+#define PBlockLen 16*5
+
+HashKey = 16*6 # store HashKey <<1 mod poly here
+HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here
+HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here
+HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here
+HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here
+HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here
+HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here
+HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here
+HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
+HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
+HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
+HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
+HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
+HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
+HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
+HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
#define arg1 %rdi
#define arg2 %rsi
@@ -229,6 +216,8 @@ HashKey_8_k = 16*26 # store XOR of HashKey^8 <<1 mod poly here (for Karatsu
#define arg7 STACK_OFFSET+8*1(%r14)
#define arg8 STACK_OFFSET+8*2(%r14)
#define arg9 STACK_OFFSET+8*3(%r14)
+#define arg10 STACK_OFFSET+8*4(%r14)
+#define keysize 2*15*16(arg1)
i = 0
j = 0
@@ -267,19 +256,636 @@ VARIABLE_OFFSET = 16*8
# Utility Macros
################################
+.macro FUNC_SAVE
+ #the number of pushes must equal STACK_OFFSET
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+
+ mov %rsp, %r14
+
+
+
+ sub $VARIABLE_OFFSET, %rsp
+ and $~63, %rsp # align rsp to 64 bytes
+.endm
+
+.macro FUNC_RESTORE
+ mov %r14, %rsp
+
+ pop %r15
+ pop %r14
+ pop %r13
+ pop %r12
+.endm
+
# Encryption of a single block
-.macro ENCRYPT_SINGLE_BLOCK XMM0
+.macro ENCRYPT_SINGLE_BLOCK REP XMM0
vpxor (arg1), \XMM0, \XMM0
- i = 1
- setreg
-.rep 9
+ i = 1
+ setreg
+.rep \REP
vaesenc 16*i(arg1), \XMM0, \XMM0
- i = (i+1)
- setreg
+ i = (i+1)
+ setreg
.endr
- vaesenclast 16*10(arg1), \XMM0, \XMM0
+ vaesenclast 16*i(arg1), \XMM0, \XMM0
.endm
+# combined for GCM encrypt and decrypt functions
+# clobbering all xmm registers
+# clobbering r10, r11, r12, r13, r14, r15
+.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
+ vmovdqu AadHash(arg2), %xmm8
+ vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey
+ add arg5, InLen(arg2)
+
+ # initialize the data pointer offset as zero
+ xor %r11d, %r11d
+
+ PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
+ sub %r11, arg5
+
+ mov arg5, %r13 # save the number of bytes of plaintext/ciphertext
+ and $-16, %r13 # r13 = r13 - (r13 mod 16)
+
+ mov %r13, %r12
+ shr $4, %r12
+ and $7, %r12
+ jz _initial_num_blocks_is_0\@
+
+ cmp $7, %r12
+ je _initial_num_blocks_is_7\@
+ cmp $6, %r12
+ je _initial_num_blocks_is_6\@
+ cmp $5, %r12
+ je _initial_num_blocks_is_5\@
+ cmp $4, %r12
+ je _initial_num_blocks_is_4\@
+ cmp $3, %r12
+ je _initial_num_blocks_is_3\@
+ cmp $2, %r12
+ je _initial_num_blocks_is_2\@
+
+ jmp _initial_num_blocks_is_1\@
+
+_initial_num_blocks_is_7\@:
+ \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*7, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_6\@:
+ \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*6, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_5\@:
+ \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*5, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_4\@:
+ \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*4, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_3\@:
+ \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*3, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_2\@:
+ \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*2, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_1\@:
+ \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+ sub $16*1, %r13
+ jmp _initial_blocks_encrypted\@
+
+_initial_num_blocks_is_0\@:
+ \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
+
+
+_initial_blocks_encrypted\@:
+ cmp $0, %r13
+ je _zero_cipher_left\@
+
+ sub $128, %r13
+ je _eight_cipher_left\@
+
+
+
+
+ vmovd %xmm9, %r15d
+ and $255, %r15d
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+_encrypt_by_8_new\@:
+ cmp $(255-8), %r15d
+ jg _encrypt_by_8\@
+
+
+
+ add $8, %r15b
+ \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
+ add $128, %r11
+ sub $128, %r13
+ jne _encrypt_by_8_new\@
+
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ jmp _eight_cipher_left\@
+
+_encrypt_by_8\@:
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ add $8, %r15b
+ \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ add $128, %r11
+ sub $128, %r13
+ jne _encrypt_by_8_new\@
+
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+
+
+
+_eight_cipher_left\@:
+ \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
+
+
+_zero_cipher_left\@:
+ vmovdqu %xmm14, AadHash(arg2)
+ vmovdqu %xmm9, CurCount(arg2)
+
+ # check for 0 length
+ mov arg5, %r13
+ and $15, %r13 # r13 = (arg5 mod 16)
+
+ je _multiple_of_16_bytes\@
+
+ # handle the last <16 Byte block separately
+
+ mov %r13, PBlockLen(arg2)
+
+ vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
+ vmovdqu %xmm9, CurCount(arg2)
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+
+ ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn)
+ vmovdqu %xmm9, PBlockEncKey(arg2)
+
+ cmp $16, arg5
+ jge _large_enough_update\@
+
+ lea (arg4,%r11,1), %r10
+ mov %r13, %r12
+
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm1
+
+ lea SHIFT_MASK+16(%rip), %r12
+ sub %r13, %r12 # adjust the shuffle mask pointer to be
+ # able to shift 16-r13 bytes (r13 is the
+ # number of bytes in plaintext mod 16)
+
+ jmp _final_ghash_mul\@
+
+_large_enough_update\@:
+ sub $16, %r11
+ add %r13, %r11
+
+ # receive the last <16 Byte block
+ vmovdqu (arg4, %r11, 1), %xmm1
+
+ sub %r13, %r11
+ add $16, %r11
+
+ lea SHIFT_MASK+16(%rip), %r12
+ # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
+ # (r13 is the number of bytes in plaintext mod 16)
+ sub %r13, %r12
+ # get the appropriate shuffle mask
+ vmovdqu (%r12), %xmm2
+ # shift right 16-r13 bytes
+ vpshufb %xmm2, %xmm1, %xmm1
+
+_final_ghash_mul\@:
+ .if \ENC_DEC == DEC
+ vmovdqa %xmm1, %xmm2
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
+ # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm2, %xmm2
+ vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
+ vpxor %xmm2, %xmm14, %xmm14
+
+ vmovdqu %xmm14, AadHash(arg2)
+ .else
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
+ # mask out top 16-r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
+ vpxor %xmm9, %xmm14, %xmm14
+
+ vmovdqu %xmm14, AadHash(arg2)
+ vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
+ .endif
+
+
+ #############################
+ # output r13 Bytes
+ vmovq %xmm9, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left\@
+
+ mov %rax, (arg3 , %r11)
+ add $8, %r11
+ vpsrldq $8, %xmm9, %xmm9
+ vmovq %xmm9, %rax
+ sub $8, %r13
+
+_less_than_8_bytes_left\@:
+ movb %al, (arg3 , %r11)
+ add $1, %r11
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left\@
+ #############################
+
+_multiple_of_16_bytes\@:
+.endm
+
+
+# GCM_COMPLETE Finishes update of tag of last partial block
+# Output: Authorization Tag (AUTH_TAG)
+# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
+.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
+ vmovdqu AadHash(arg2), %xmm14
+ vmovdqu HashKey(arg2), %xmm13
+
+ mov PBlockLen(arg2), %r12
+ cmp $0, %r12
+ je _partial_done\@
+
+ #GHASH computation for the last <16 Byte block
+ \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+
+_partial_done\@:
+ mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes)
+ shl $3, %r12 # convert into number of bits
+ vmovd %r12d, %xmm15 # len(A) in xmm15
+
+ mov InLen(arg2), %r12
+ shl $3, %r12 # len(C) in bits (*128)
+ vmovq %r12, %xmm1
+ vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
+ vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
+
+ vpxor %xmm15, %xmm14, %xmm14
+ \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
+ vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
+
+ vmovdqu OrigIV(arg2), %xmm9
+
+ ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0)
+
+ vpxor %xmm14, %xmm9, %xmm9
+
+
+
+_return_T\@:
+ mov \AUTH_TAG, %r10 # r10 = authTag
+ mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len
+
+ cmp $16, %r11
+ je _T_16\@
+
+ cmp $8, %r11
+ jl _T_4\@
+
+_T_8\@:
+ vmovq %xmm9, %rax
+ mov %rax, (%r10)
+ add $8, %r10
+ sub $8, %r11
+ vpsrldq $8, %xmm9, %xmm9
+ cmp $0, %r11
+ je _return_T_done\@
+_T_4\@:
+ vmovd %xmm9, %eax
+ mov %eax, (%r10)
+ add $4, %r10
+ sub $4, %r11
+ vpsrldq $4, %xmm9, %xmm9
+ cmp $0, %r11
+ je _return_T_done\@
+_T_123\@:
+ vmovd %xmm9, %eax
+ cmp $2, %r11
+ jl _T_1\@
+ mov %ax, (%r10)
+ cmp $2, %r11
+ je _return_T_done\@
+ add $2, %r10
+ sar $16, %eax
+_T_1\@:
+ mov %al, (%r10)
+ jmp _return_T_done\@
+
+_T_16\@:
+ vmovdqu %xmm9, (%r10)
+
+_return_T_done\@:
+.endm
+
+.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8
+
+ mov \AAD, %r10 # r10 = AAD
+ mov \AADLEN, %r12 # r12 = aadLen
+
+
+ mov %r12, %r11
+
+ vpxor \T8, \T8, \T8
+ vpxor \T7, \T7, \T7
+ cmp $16, %r11
+ jl _get_AAD_rest8\@
+_get_AAD_blocks\@:
+ vmovdqu (%r10), \T7
+ vpshufb SHUF_MASK(%rip), \T7, \T7
+ vpxor \T7, \T8, \T8
+ \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6
+ add $16, %r10
+ sub $16, %r12
+ sub $16, %r11
+ cmp $16, %r11
+ jge _get_AAD_blocks\@
+ vmovdqu \T8, \T7
+ cmp $0, %r11
+ je _get_AAD_done\@
+
+ vpxor \T7, \T7, \T7
+
+ /* read the last <16B of AAD. since we have at least 4B of
+ data right after the AAD (the ICV, and maybe some CT), we can
+ read 4B/8B blocks safely, and then get rid of the extra stuff */
+_get_AAD_rest8\@:
+ cmp $4, %r11
+ jle _get_AAD_rest4\@
+ movq (%r10), \T1
+ add $8, %r10
+ sub $8, %r11
+ vpslldq $8, \T1, \T1
+ vpsrldq $8, \T7, \T7
+ vpxor \T1, \T7, \T7
+ jmp _get_AAD_rest8\@
+_get_AAD_rest4\@:
+ cmp $0, %r11
+ jle _get_AAD_rest0\@
+ mov (%r10), %eax
+ movq %rax, \T1
+ add $4, %r10
+ sub $4, %r11
+ vpslldq $12, \T1, \T1
+ vpsrldq $4, \T7, \T7
+ vpxor \T1, \T7, \T7
+_get_AAD_rest0\@:
+ /* finalize: shift out the extra bytes we read, and align
+ left. since pslldq can only shift by an immediate, we use
+ vpshufb and an array of shuffle masks */
+ movq %r12, %r11
+ salq $4, %r11
+ vmovdqu aad_shift_arr(%r11), \T1
+ vpshufb \T1, \T7, \T7
+_get_AAD_rest_final\@:
+ vpshufb SHUF_MASK(%rip), \T7, \T7
+ vpxor \T8, \T7, \T7
+ \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6
+
+_get_AAD_done\@:
+ vmovdqu \T7, AadHash(arg2)
+.endm
+
+.macro INIT GHASH_MUL PRECOMPUTE
+ mov arg6, %r11
+ mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
+ xor %r11d, %r11d
+ mov %r11, InLen(arg2) # ctx_data.in_length = 0
+
+ mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
+ mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
+ mov arg3, %rax
+ movdqu (%rax), %xmm0
+ movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv
+
+ vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
+ movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv
+
+ vmovdqu (arg4), %xmm6 # xmm6 = HashKey
+
+ vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
+ ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
+ vmovdqa %xmm6, %xmm2
+ vpsllq $1, %xmm6, %xmm6
+ vpsrlq $63, %xmm2, %xmm2
+ vmovdqa %xmm2, %xmm1
+ vpslldq $8, %xmm2, %xmm2
+ vpsrldq $8, %xmm1, %xmm1
+ vpor %xmm2, %xmm6, %xmm6
+ #reduction
+ vpshufd $0b00100100, %xmm1, %xmm2
+ vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
+ vpand POLY(%rip), %xmm2, %xmm2
+ vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
+ #######################################################################
+ vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly
+
+ CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0
+
+ \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
+.endm
+
+
+# Reads DLEN bytes starting at DPTR and stores in XMMDst
+# where 0 < DLEN < 16
+# Clobbers %rax, DLEN
+.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
+ vpxor \XMMDst, \XMMDst, \XMMDst
+
+ cmp $8, \DLEN
+ jl _read_lt8_\@
+ mov (\DPTR), %rax
+ vpinsrq $0, %rax, \XMMDst, \XMMDst
+ sub $8, \DLEN
+ jz _done_read_partial_block_\@
+ xor %eax, %eax
+_read_next_byte_\@:
+ shl $8, %rax
+ mov 7(\DPTR, \DLEN, 1), %al
+ dec \DLEN
+ jnz _read_next_byte_\@
+ vpinsrq $1, %rax, \XMMDst, \XMMDst
+ jmp _done_read_partial_block_\@
+_read_lt8_\@:
+ xor %eax, %eax
+_read_next_byte_lt8_\@:
+ shl $8, %rax
+ mov -1(\DPTR, \DLEN, 1), %al
+ dec \DLEN
+ jnz _read_next_byte_lt8_\@
+ vpinsrq $0, %rax, \XMMDst, \XMMDst
+_done_read_partial_block_\@:
+.endm
+
+# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
+# between update calls.
+# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
+# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
+# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
+.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
+ AAD_HASH ENC_DEC
+ mov PBlockLen(arg2), %r13
+ cmp $0, %r13
+ je _partial_block_done_\@ # Leave Macro if no partial blocks
+ # Read in input data without over reading
+ cmp $16, \PLAIN_CYPH_LEN
+ jl _fewer_than_16_bytes_\@
+ vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm
+ jmp _data_read_\@
+
+_fewer_than_16_bytes_\@:
+ lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
+ mov \PLAIN_CYPH_LEN, %r12
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm1
+
+ mov PBlockLen(arg2), %r13
+
+_data_read_\@: # Finished reading in data
+
+ vmovdqu PBlockEncKey(arg2), %xmm9
+ vmovdqu HashKey(arg2), %xmm13
+
+ lea SHIFT_MASK(%rip), %r12
+
+ # adjust the shuffle mask pointer to be able to shift r13 bytes
+ # r16-r13 is the number of bytes in plaintext mod 16)
+ add %r13, %r12
+ vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
+ vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes
+
+.if \ENC_DEC == DEC
+ vmovdqa %xmm1, %xmm3
+ pxor %xmm1, %xmm9 # Cyphertext XOR E(K, Yn)
+
+ mov \PLAIN_CYPH_LEN, %r10
+ add %r13, %r10
+ # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+ sub $16, %r10
+ # Determine if if partial block is not being filled and
+ # shift mask accordingly
+ jge _no_extra_mask_1_\@
+ sub %r10, %r12
+_no_extra_mask_1_\@:
+
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ # get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9
+
+ vpand %xmm1, %xmm3, %xmm3
+ vmovdqa SHUF_MASK(%rip), %xmm10
+ vpshufb %xmm10, %xmm3, %xmm3
+ vpshufb %xmm2, %xmm3, %xmm3
+ vpxor %xmm3, \AAD_HASH, \AAD_HASH
+
+ cmp $0, %r10
+ jl _partial_incomplete_1_\@
+
+ # GHASH computation for the last <16 Byte block
+ \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ xor %eax,%eax
+
+ mov %rax, PBlockLen(arg2)
+ jmp _dec_done_\@
+_partial_incomplete_1_\@:
+ add \PLAIN_CYPH_LEN, PBlockLen(arg2)
+_dec_done_\@:
+ vmovdqu \AAD_HASH, AadHash(arg2)
+.else
+ vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
+
+ mov \PLAIN_CYPH_LEN, %r10
+ add %r13, %r10
+ # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
+ sub $16, %r10
+ # Determine if if partial block is not being filled and
+ # shift mask accordingly
+ jge _no_extra_mask_2_\@
+ sub %r10, %r12
+_no_extra_mask_2_\@:
+
+ vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ # get the appropriate mask to mask out bottom r13 bytes of xmm9
+ vpand %xmm1, %xmm9, %xmm9
+
+ vmovdqa SHUF_MASK(%rip), %xmm1
+ vpshufb %xmm1, %xmm9, %xmm9
+ vpshufb %xmm2, %xmm9, %xmm9
+ vpxor %xmm9, \AAD_HASH, \AAD_HASH
+
+ cmp $0, %r10
+ jl _partial_incomplete_2_\@
+
+ # GHASH computation for the last <16 Byte block
+ \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
+ xor %eax,%eax
+
+ mov %rax, PBlockLen(arg2)
+ jmp _encode_done_\@
+_partial_incomplete_2_\@:
+ add \PLAIN_CYPH_LEN, PBlockLen(arg2)
+_encode_done_\@:
+ vmovdqu \AAD_HASH, AadHash(arg2)
+
+ vmovdqa SHUF_MASK(%rip), %xmm10
+ # shuffle xmm9 back to output as ciphertext
+ vpshufb %xmm10, %xmm9, %xmm9
+ vpshufb %xmm2, %xmm9, %xmm9
+.endif
+ # output encrypted Bytes
+ cmp $0, %r10
+ jl _partial_fill_\@
+ mov %r13, %r12
+ mov $16, %r13
+ # Set r13 to be the number of bytes to write out
+ sub %r12, %r13
+ jmp _count_set_\@
+_partial_fill_\@:
+ mov \PLAIN_CYPH_LEN, %r13
+_count_set_\@:
+ vmovdqa %xmm9, %xmm0
+ vmovq %xmm0, %rax
+ cmp $8, %r13
+ jle _less_than_8_bytes_left_\@
+
+ mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+ add $8, \DATA_OFFSET
+ psrldq $8, %xmm0
+ vmovq %xmm0, %rax
+ sub $8, %r13
+_less_than_8_bytes_left_\@:
+ movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
+ add $1, \DATA_OFFSET
+ shr $8, %rax
+ sub $1, %r13
+ jne _less_than_8_bytes_left_\@
+_partial_block_done_\@:
+.endm # PARTIAL_BLOCK
+
#ifdef CONFIG_AS_AVX
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
@@ -341,49 +947,49 @@ VARIABLE_OFFSET = 16*8
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_k(arg1)
+ vmovdqu \T1, HashKey_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
- vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
+ vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_2_k(arg1)
+ vmovdqu \T1, HashKey_2_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
- vmovdqa \T5, HashKey_3(arg1)
+ vmovdqu \T5, HashKey_3(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_3_k(arg1)
+ vmovdqu \T1, HashKey_3_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
- vmovdqa \T5, HashKey_4(arg1)
+ vmovdqu \T5, HashKey_4(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_4_k(arg1)
+ vmovdqu \T1, HashKey_4_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
- vmovdqa \T5, HashKey_5(arg1)
+ vmovdqu \T5, HashKey_5(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_5_k(arg1)
+ vmovdqu \T1, HashKey_5_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
- vmovdqa \T5, HashKey_6(arg1)
+ vmovdqu \T5, HashKey_6(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_6_k(arg1)
+ vmovdqu \T1, HashKey_6_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
- vmovdqa \T5, HashKey_7(arg1)
+ vmovdqu \T5, HashKey_7(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_7_k(arg1)
+ vmovdqu \T1, HashKey_7_k(arg2)
GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
- vmovdqa \T5, HashKey_8(arg1)
+ vmovdqu \T5, HashKey_8(arg2)
vpshufd $0b01001110, \T5, \T1
vpxor \T5, \T1, \T1
- vmovdqa \T1, HashKey_8_k(arg1)
+ vmovdqu \T1, HashKey_8_k(arg2)
.endm
@@ -392,84 +998,15 @@ VARIABLE_OFFSET = 16*8
## num_initial_blocks = b mod 4#
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
-## arg1, arg2, arg3, r14 are used as a pointer only, not modified
+## arg1, arg3, arg4, r14 are used as a pointer only, not modified
-.macro INITIAL_BLOCKS_AVX num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
+.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
i = (8-\num_initial_blocks)
- j = 0
setreg
-
- mov arg6, %r10 # r10 = AAD
- mov arg7, %r12 # r12 = aadLen
-
-
- mov %r12, %r11
-
- vpxor reg_j, reg_j, reg_j
- vpxor reg_i, reg_i, reg_i
- cmp $16, %r11
- jl _get_AAD_rest8\@
-_get_AAD_blocks\@:
- vmovdqu (%r10), reg_i
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
- vpxor reg_i, reg_j, reg_j
- GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6
- add $16, %r10
- sub $16, %r12
- sub $16, %r11
- cmp $16, %r11
- jge _get_AAD_blocks\@
- vmovdqu reg_j, reg_i
- cmp $0, %r11
- je _get_AAD_done\@
-
- vpxor reg_i, reg_i, reg_i
-
- /* read the last <16B of AAD. since we have at least 4B of
- data right after the AAD (the ICV, and maybe some CT), we can
- read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\@:
- cmp $4, %r11
- jle _get_AAD_rest4\@
- movq (%r10), \T1
- add $8, %r10
- sub $8, %r11
- vpslldq $8, \T1, \T1
- vpsrldq $8, reg_i, reg_i
- vpxor \T1, reg_i, reg_i
- jmp _get_AAD_rest8\@
-_get_AAD_rest4\@:
- cmp $0, %r11
- jle _get_AAD_rest0\@
- mov (%r10), %eax
- movq %rax, \T1
- add $4, %r10
- sub $4, %r11
- vpslldq $12, \T1, \T1
- vpsrldq $4, reg_i, reg_i
- vpxor \T1, reg_i, reg_i
-_get_AAD_rest0\@:
- /* finalize: shift out the extra bytes we read, and align
- left. since pslldq can only shift by an immediate, we use
- vpshufb and an array of shuffle masks */
- movq %r12, %r11
- salq $4, %r11
- movdqu aad_shift_arr(%r11), \T1
- vpshufb \T1, reg_i, reg_i
-_get_AAD_rest_final\@:
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
- vpxor reg_j, reg_i, reg_i
- GHASH_MUL_AVX reg_i, \T2, \T1, \T3, \T4, \T5, \T6
-
-_get_AAD_done\@:
- # initialize the data pointer offset as zero
- xor %r11d, %r11d
+ vmovdqu AadHash(arg2), reg_i
# start AES for num_initial_blocks blocks
- mov arg5, %rax # rax = *Y0
- vmovdqu (%rax), \CTR # CTR = Y0
- vpshufb SHUF_MASK(%rip), \CTR, \CTR
-
+ vmovdqu CurCount(arg2), \CTR
i = (9-\num_initial_blocks)
setreg
@@ -490,10 +1027,10 @@ _get_AAD_done\@:
setreg
.endr
- j = 1
- setreg
-.rep 9
- vmovdqa 16*j(arg1), \T_key
+ j = 1
+ setreg
+.rep \REP
+ vmovdqa 16*j(arg1), \T_key
i = (9-\num_initial_blocks)
setreg
.rep \num_initial_blocks
@@ -502,12 +1039,11 @@ _get_AAD_done\@:
setreg
.endr
- j = (j+1)
- setreg
+ j = (j+1)
+ setreg
.endr
-
- vmovdqa 16*10(arg1), \T_key
+ vmovdqa 16*j(arg1), \T_key
i = (9-\num_initial_blocks)
setreg
.rep \num_initial_blocks
@@ -519,9 +1055,9 @@ _get_AAD_done\@:
i = (9-\num_initial_blocks)
setreg
.rep \num_initial_blocks
- vmovdqu (arg3, %r11), \T1
+ vmovdqu (arg4, %r11), \T1
vpxor \T1, reg_i, reg_i
- vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for num_initial_blocks blocks
+ vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks
add $16, %r11
.if \ENC_DEC == DEC
vmovdqa \T1, reg_i
@@ -595,9 +1131,9 @@ _get_AAD_done\@:
vpxor \T_key, \XMM7, \XMM7
vpxor \T_key, \XMM8, \XMM8
- i = 1
- setreg
-.rep 9 # do 9 rounds
+ i = 1
+ setreg
+.rep \REP # do REP rounds
vmovdqa 16*i(arg1), \T_key
vaesenc \T_key, \XMM1, \XMM1
vaesenc \T_key, \XMM2, \XMM2
@@ -607,11 +1143,10 @@ _get_AAD_done\@:
vaesenc \T_key, \XMM6, \XMM6
vaesenc \T_key, \XMM7, \XMM7
vaesenc \T_key, \XMM8, \XMM8
- i = (i+1)
- setreg
+ i = (i+1)
+ setreg
.endr
-
vmovdqa 16*i(arg1), \T_key
vaesenclast \T_key, \XMM1, \XMM1
vaesenclast \T_key, \XMM2, \XMM2
@@ -622,58 +1157,58 @@ _get_AAD_done\@:
vaesenclast \T_key, \XMM7, \XMM7
vaesenclast \T_key, \XMM8, \XMM8
- vmovdqu (arg3, %r11), \T1
+ vmovdqu (arg4, %r11), \T1
vpxor \T1, \XMM1, \XMM1
- vmovdqu \XMM1, (arg2 , %r11)
+ vmovdqu \XMM1, (arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM1
.endif
- vmovdqu 16*1(arg3, %r11), \T1
+ vmovdqu 16*1(arg4, %r11), \T1
vpxor \T1, \XMM2, \XMM2
- vmovdqu \XMM2, 16*1(arg2 , %r11)
+ vmovdqu \XMM2, 16*1(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM2
.endif
- vmovdqu 16*2(arg3, %r11), \T1
+ vmovdqu 16*2(arg4, %r11), \T1
vpxor \T1, \XMM3, \XMM3
- vmovdqu \XMM3, 16*2(arg2 , %r11)
+ vmovdqu \XMM3, 16*2(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM3
.endif
- vmovdqu 16*3(arg3, %r11), \T1
+ vmovdqu 16*3(arg4, %r11), \T1
vpxor \T1, \XMM4, \XMM4
- vmovdqu \XMM4, 16*3(arg2 , %r11)
+ vmovdqu \XMM4, 16*3(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM4
.endif
- vmovdqu 16*4(arg3, %r11), \T1
+ vmovdqu 16*4(arg4, %r11), \T1
vpxor \T1, \XMM5, \XMM5
- vmovdqu \XMM5, 16*4(arg2 , %r11)
+ vmovdqu \XMM5, 16*4(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM5
.endif
- vmovdqu 16*5(arg3, %r11), \T1
+ vmovdqu 16*5(arg4, %r11), \T1
vpxor \T1, \XMM6, \XMM6
- vmovdqu \XMM6, 16*5(arg2 , %r11)
+ vmovdqu \XMM6, 16*5(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM6
.endif
- vmovdqu 16*6(arg3, %r11), \T1
+ vmovdqu 16*6(arg4, %r11), \T1
vpxor \T1, \XMM7, \XMM7
- vmovdqu \XMM7, 16*6(arg2 , %r11)
+ vmovdqu \XMM7, 16*6(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM7
.endif
- vmovdqu 16*7(arg3, %r11), \T1
+ vmovdqu 16*7(arg4, %r11), \T1
vpxor \T1, \XMM8, \XMM8
- vmovdqu \XMM8, 16*7(arg2 , %r11)
+ vmovdqu \XMM8, 16*7(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM8
.endif
@@ -698,9 +1233,9 @@ _initial_blocks_done\@:
# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg2, arg3 are used as pointers only, not modified
+# arg1, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
vmovdqa \XMM1, \T2
vmovdqa \XMM2, TMP2(%rsp)
@@ -784,14 +1319,14 @@ _initial_blocks_done\@:
#######################################################################
- vmovdqa HashKey_8(arg1), \T5
+ vmovdqu HashKey_8(arg2), \T5
vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
vpshufd $0b01001110, \T2, \T6
vpxor \T2, \T6, \T6
- vmovdqa HashKey_8_k(arg1), \T5
+ vmovdqu HashKey_8_k(arg2), \T5
vpclmulqdq $0x00, \T5, \T6, \T6
vmovdqu 16*3(arg1), \T1
@@ -805,7 +1340,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP2(%rsp), \T1
- vmovdqa HashKey_7(arg1), \T5
+ vmovdqu HashKey_7(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
@@ -813,7 +1348,7 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
- vmovdqa HashKey_7_k(arg1), \T5
+ vmovdqu HashKey_7_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
@@ -830,7 +1365,7 @@ _initial_blocks_done\@:
#######################################################################
vmovdqa TMP3(%rsp), \T1
- vmovdqa HashKey_6(arg1), \T5
+ vmovdqu HashKey_6(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
@@ -838,7 +1373,7 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
- vmovdqa HashKey_6_k(arg1), \T5
+ vmovdqu HashKey_6_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
@@ -853,7 +1388,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP4(%rsp), \T1
- vmovdqa HashKey_5(arg1), \T5
+ vmovdqu HashKey_5(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
@@ -861,7 +1396,7 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
- vmovdqa HashKey_5_k(arg1), \T5
+ vmovdqu HashKey_5_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
@@ -877,7 +1412,7 @@ _initial_blocks_done\@:
vmovdqa TMP5(%rsp), \T1
- vmovdqa HashKey_4(arg1), \T5
+ vmovdqu HashKey_4(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
@@ -885,7 +1420,7 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
- vmovdqa HashKey_4_k(arg1), \T5
+ vmovdqu HashKey_4_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
@@ -900,7 +1435,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP6(%rsp), \T1
- vmovdqa HashKey_3(arg1), \T5
+ vmovdqu HashKey_3(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
@@ -908,7 +1443,7 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
- vmovdqa HashKey_3_k(arg1), \T5
+ vmovdqu HashKey_3_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
@@ -924,7 +1459,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP7(%rsp), \T1
- vmovdqa HashKey_2(arg1), \T5
+ vmovdqu HashKey_2(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
@@ -932,7 +1467,7 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
- vmovdqa HashKey_2_k(arg1), \T5
+ vmovdqu HashKey_2_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
@@ -949,7 +1484,7 @@ _initial_blocks_done\@:
vaesenc \T5, \XMM8, \XMM8
vmovdqa TMP8(%rsp), \T1
- vmovdqa HashKey(arg1), \T5
+ vmovdqu HashKey(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
@@ -957,7 +1492,7 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
- vmovdqa HashKey_k(arg1), \T5
+ vmovdqu HashKey_k(arg2), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
@@ -966,17 +1501,35 @@ _initial_blocks_done\@:
vmovdqu 16*10(arg1), \T5
+ i = 11
+ setreg
+.rep (\REP-9)
+
+ vaesenc \T5, \XMM1, \XMM1
+ vaesenc \T5, \XMM2, \XMM2
+ vaesenc \T5, \XMM3, \XMM3
+ vaesenc \T5, \XMM4, \XMM4
+ vaesenc \T5, \XMM5, \XMM5
+ vaesenc \T5, \XMM6, \XMM6
+ vaesenc \T5, \XMM7, \XMM7
+ vaesenc \T5, \XMM8, \XMM8
+
+ vmovdqu 16*i(arg1), \T5
+ i = i + 1
+ setreg
+.endr
+
i = 0
j = 1
setreg
.rep 8
- vpxor 16*i(arg3, %r11), \T5, \T2
+ vpxor 16*i(arg4, %r11), \T5, \T2
.if \ENC_DEC == ENC
vaesenclast \T2, reg_j, reg_j
.else
vaesenclast \T2, reg_j, \T3
- vmovdqu 16*i(arg3, %r11), reg_j
- vmovdqu \T3, 16*i(arg2, %r11)
+ vmovdqu 16*i(arg4, %r11), reg_j
+ vmovdqu \T3, 16*i(arg3, %r11)
.endif
i = (i+1)
j = (j+1)
@@ -1008,14 +1561,14 @@ _initial_blocks_done\@:
vpxor \T2, \T7, \T7 # first phase of the reduction complete
#######################################################################
.if \ENC_DEC == ENC
- vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
.endif
#######################################################################
@@ -1056,25 +1609,25 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \XMM1, \T2
vpxor \XMM1, \T2, \T2
- vmovdqa HashKey_8(arg1), \T5
+ vmovdqu HashKey_8(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM1, \T6
vpclmulqdq $0x00, \T5, \XMM1, \T7
- vmovdqa HashKey_8_k(arg1), \T3
+ vmovdqu HashKey_8_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \XMM1
######################
vpshufd $0b01001110, \XMM2, \T2
vpxor \XMM2, \T2, \T2
- vmovdqa HashKey_7(arg1), \T5
+ vmovdqu HashKey_7(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM2, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM2, \T4
vpxor \T4, \T7, \T7
- vmovdqa HashKey_7_k(arg1), \T3
+ vmovdqu HashKey_7_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
@@ -1082,14 +1635,14 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \XMM3, \T2
vpxor \XMM3, \T2, \T2
- vmovdqa HashKey_6(arg1), \T5
+ vmovdqu HashKey_6(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM3, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM3, \T4
vpxor \T4, \T7, \T7
- vmovdqa HashKey_6_k(arg1), \T3
+ vmovdqu HashKey_6_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
@@ -1097,14 +1650,14 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \XMM4, \T2
vpxor \XMM4, \T2, \T2
- vmovdqa HashKey_5(arg1), \T5
+ vmovdqu HashKey_5(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM4, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM4, \T4
vpxor \T4, \T7, \T7
- vmovdqa HashKey_5_k(arg1), \T3
+ vmovdqu HashKey_5_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
@@ -1112,14 +1665,14 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \XMM5, \T2
vpxor \XMM5, \T2, \T2
- vmovdqa HashKey_4(arg1), \T5
+ vmovdqu HashKey_4(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM5, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM5, \T4
vpxor \T4, \T7, \T7
- vmovdqa HashKey_4_k(arg1), \T3
+ vmovdqu HashKey_4_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
@@ -1127,14 +1680,14 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \XMM6, \T2
vpxor \XMM6, \T2, \T2
- vmovdqa HashKey_3(arg1), \T5
+ vmovdqu HashKey_3(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM6, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM6, \T4
vpxor \T4, \T7, \T7
- vmovdqa HashKey_3_k(arg1), \T3
+ vmovdqu HashKey_3_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
@@ -1142,14 +1695,14 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \XMM7, \T2
vpxor \XMM7, \T2, \T2
- vmovdqa HashKey_2(arg1), \T5
+ vmovdqu HashKey_2(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM7, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM7, \T4
vpxor \T4, \T7, \T7
- vmovdqa HashKey_2_k(arg1), \T3
+ vmovdqu HashKey_2_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
@@ -1157,14 +1710,14 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \XMM8, \T2
vpxor \XMM8, \T2, \T2
- vmovdqa HashKey(arg1), \T5
+ vmovdqu HashKey(arg2), \T5
vpclmulqdq $0x11, \T5, \XMM8, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM8, \T4
vpxor \T4, \T7, \T7
- vmovdqa HashKey_k(arg1), \T3
+ vmovdqu HashKey_k(arg2), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
@@ -1210,413 +1763,112 @@ _initial_blocks_done\@:
.endm
-
-# combined for GCM encrypt and decrypt functions
-# clobbering all xmm registers
-# clobbering r10, r11, r12, r13, r14, r15
-.macro GCM_ENC_DEC_AVX ENC_DEC
-
- #the number of pushes must equal STACK_OFFSET
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov %rsp, %r14
-
-
-
-
- sub $VARIABLE_OFFSET, %rsp
- and $~63, %rsp # align rsp to 64 bytes
-
-
- vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
-
- mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
- and $-16, %r13 # r13 = r13 - (r13 mod 16)
-
- mov %r13, %r12
- shr $4, %r12
- and $7, %r12
- jz _initial_num_blocks_is_0\@
-
- cmp $7, %r12
- je _initial_num_blocks_is_7\@
- cmp $6, %r12
- je _initial_num_blocks_is_6\@
- cmp $5, %r12
- je _initial_num_blocks_is_5\@
- cmp $4, %r12
- je _initial_num_blocks_is_4\@
- cmp $3, %r12
- je _initial_num_blocks_is_3\@
- cmp $2, %r12
- je _initial_num_blocks_is_2\@
-
- jmp _initial_num_blocks_is_1\@
-
-_initial_num_blocks_is_7\@:
- INITIAL_BLOCKS_AVX 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*7, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_6\@:
- INITIAL_BLOCKS_AVX 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*6, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_5\@:
- INITIAL_BLOCKS_AVX 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*5, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_4\@:
- INITIAL_BLOCKS_AVX 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*4, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_3\@:
- INITIAL_BLOCKS_AVX 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*3, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_2\@:
- INITIAL_BLOCKS_AVX 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*2, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_1\@:
- INITIAL_BLOCKS_AVX 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*1, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_0\@:
- INITIAL_BLOCKS_AVX 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-
-
-_initial_blocks_encrypted\@:
- cmp $0, %r13
- je _zero_cipher_left\@
-
- sub $128, %r13
- je _eight_cipher_left\@
-
-
-
-
- vmovd %xmm9, %r15d
- and $255, %r15d
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
-
-_encrypt_by_8_new\@:
- cmp $(255-8), %r15d
- jg _encrypt_by_8\@
-
-
-
- add $8, %r15b
- GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
- add $128, %r11
- sub $128, %r13
- jne _encrypt_by_8_new\@
-
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- jmp _eight_cipher_left\@
-
-_encrypt_by_8\@:
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- add $8, %r15b
- GHASH_8_ENCRYPT_8_PARALLEL_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- add $128, %r11
- sub $128, %r13
- jne _encrypt_by_8_new\@
-
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
-
-
-
-_eight_cipher_left\@:
- GHASH_LAST_8_AVX %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
-
-
-_zero_cipher_left\@:
- cmp $16, arg4
- jl _only_less_than_16\@
-
- mov arg4, %r13
- and $15, %r13 # r13 = (arg4 mod 16)
-
- je _multiple_of_16_bytes\@
-
- # handle the last <16 Byte block seperately
-
-
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
-
- sub $16, %r11
- add %r13, %r11
- vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
-
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12 # adjust the shuffle mask pointer to be
- # able to shift 16-r13 bytes (r13 is the
- # number of bytes in plaintext mod 16)
- vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
- vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
- jmp _final_ghash_mul\@
-
-_only_less_than_16\@:
- # check for 0 length
- mov arg4, %r13
- and $15, %r13 # r13 = (arg4 mod 16)
-
- je _multiple_of_16_bytes\@
-
- # handle the last <16 Byte block seperately
-
-
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
-
-
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12 # adjust the shuffle mask pointer to be
- # able to shift 16-r13 bytes (r13 is the
- # number of bytes in plaintext mod 16)
-
-_get_last_16_byte_loop\@:
- movb (arg3, %r11), %al
- movb %al, TMP1 (%rsp , %r11)
- add $1, %r11
- cmp %r13, %r11
- jne _get_last_16_byte_loop\@
-
- vmovdqu TMP1(%rsp), %xmm1
-
- sub $16, %r11
-
-_final_ghash_mul\@:
- .if \ENC_DEC == DEC
- vmovdqa %xmm1, %xmm2
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
- # mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm2, %xmm2
- vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
- vpxor %xmm2, %xmm14, %xmm14
- #GHASH computation for the last <16 Byte block
- GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- sub %r13, %r11
- add $16, %r11
- .else
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
- # mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- vpxor %xmm9, %xmm14, %xmm14
- #GHASH computation for the last <16 Byte block
- GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- sub %r13, %r11
- add $16, %r11
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
- .endif
-
-
- #############################
- # output r13 Bytes
- vmovq %xmm9, %rax
- cmp $8, %r13
- jle _less_than_8_bytes_left\@
-
- mov %rax, (arg2 , %r11)
- add $8, %r11
- vpsrldq $8, %xmm9, %xmm9
- vmovq %xmm9, %rax
- sub $8, %r13
-
-_less_than_8_bytes_left\@:
- movb %al, (arg2 , %r11)
- add $1, %r11
- shr $8, %rax
- sub $1, %r13
- jne _less_than_8_bytes_left\@
- #############################
-
-_multiple_of_16_bytes\@:
- mov arg7, %r12 # r12 = aadLen (number of bytes)
- shl $3, %r12 # convert into number of bits
- vmovd %r12d, %xmm15 # len(A) in xmm15
-
- shl $3, arg4 # len(C) in bits (*128)
- vmovq arg4, %xmm1
- vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
- vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
-
- vpxor %xmm15, %xmm14, %xmm14
- GHASH_MUL_AVX %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
- vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
-
- mov arg5, %rax # rax = *Y0
- vmovdqu (%rax), %xmm9 # xmm9 = Y0
-
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
-
- vpxor %xmm14, %xmm9, %xmm9
-
-
-
-_return_T\@:
- mov arg8, %r10 # r10 = authTag
- mov arg9, %r11 # r11 = auth_tag_len
-
- cmp $16, %r11
- je _T_16\@
-
- cmp $8, %r11
- jl _T_4\@
-
-_T_8\@:
- vmovq %xmm9, %rax
- mov %rax, (%r10)
- add $8, %r10
- sub $8, %r11
- vpsrldq $8, %xmm9, %xmm9
- cmp $0, %r11
- je _return_T_done\@
-_T_4\@:
- vmovd %xmm9, %eax
- mov %eax, (%r10)
- add $4, %r10
- sub $4, %r11
- vpsrldq $4, %xmm9, %xmm9
- cmp $0, %r11
- je _return_T_done\@
-_T_123\@:
- vmovd %xmm9, %eax
- cmp $2, %r11
- jl _T_1\@
- mov %ax, (%r10)
- cmp $2, %r11
- je _return_T_done\@
- add $2, %r10
- sar $16, %eax
-_T_1\@:
- mov %al, (%r10)
- jmp _return_T_done\@
-
-_T_16\@:
- vmovdqu %xmm9, (%r10)
-
-_return_T_done\@:
- mov %r14, %rsp
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
-.endm
-
-
#############################################################
#void aesni_gcm_precomp_avx_gen2
# (gcm_data *my_ctx_data,
-# u8 *hash_subkey)# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+# gcm_context_data *data,
+# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
+# (from Security Association) concatenated with 8 byte
+# Initialisation Vector (from IPSec ESP Payload)
+# concatenated with 0x00000001. 16-byte aligned pointer. */
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
+# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
-ENTRY(aesni_gcm_precomp_avx_gen2)
- #the number of pushes must equal STACK_OFFSET
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov %rsp, %r14
-
-
-
- sub $VARIABLE_OFFSET, %rsp
- and $~63, %rsp # align rsp to 64 bytes
-
- vmovdqu (arg2), %xmm6 # xmm6 = HashKey
-
- vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
- ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
- vmovdqa %xmm6, %xmm2
- vpsllq $1, %xmm6, %xmm6
- vpsrlq $63, %xmm2, %xmm2
- vmovdqa %xmm2, %xmm1
- vpslldq $8, %xmm2, %xmm2
- vpsrldq $8, %xmm1, %xmm1
- vpor %xmm2, %xmm6, %xmm6
- #reduction
- vpshufd $0b00100100, %xmm1, %xmm2
- vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
- vpand POLY(%rip), %xmm2, %xmm2
- vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
- #######################################################################
- vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
-
-
- PRECOMPUTE_AVX %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
-
- mov %r14, %rsp
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
+ENTRY(aesni_gcm_init_avx_gen2)
+ FUNC_SAVE
+ INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
+ FUNC_RESTORE
ret
-ENDPROC(aesni_gcm_precomp_avx_gen2)
+ENDPROC(aesni_gcm_init_avx_gen2)
###############################################################################
-#void aesni_gcm_enc_avx_gen2(
+#void aesni_gcm_enc_update_avx_gen2(
# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
# const u8 *in, /* Plaintext input */
-# u64 plaintext_len, /* Length of data in Bytes for encryption. */
-# u8 *iv, /* Pre-counter block j0: 4 byte salt
-# (from Security Association) concatenated with 8 byte
-# Initialisation Vector (from IPSec ESP Payload)
-# concatenated with 0x00000001. 16-byte aligned pointer. */
-# const u8 *aad, /* Additional Authentication Data (AAD)*/
-# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-# u8 *auth_tag, /* Authenticated Tag output. */
-# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
-# Valid values are 16 (most likely), 12 or 8. */
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
###############################################################################
-ENTRY(aesni_gcm_enc_avx_gen2)
- GCM_ENC_DEC_AVX ENC
- ret
-ENDPROC(aesni_gcm_enc_avx_gen2)
+ENTRY(aesni_gcm_enc_update_avx_gen2)
+ FUNC_SAVE
+ mov keysize, %eax
+ cmp $32, %eax
+ je key_256_enc_update
+ cmp $16, %eax
+ je key_128_enc_update
+ # must be 192
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
+ FUNC_RESTORE
+ ret
+key_128_enc_update:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
+ FUNC_RESTORE
+ ret
+key_256_enc_update:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_enc_update_avx_gen2)
###############################################################################
-#void aesni_gcm_dec_avx_gen2(
+#void aesni_gcm_dec_update_avx_gen2(
# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
# const u8 *in, /* Ciphertext input */
-# u64 plaintext_len, /* Length of data in Bytes for encryption. */
-# u8 *iv, /* Pre-counter block j0: 4 byte salt
-# (from Security Association) concatenated with 8 byte
-# Initialisation Vector (from IPSec ESP Payload)
-# concatenated with 0x00000001. 16-byte aligned pointer. */
-# const u8 *aad, /* Additional Authentication Data (AAD)*/
-# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
+###############################################################################
+ENTRY(aesni_gcm_dec_update_avx_gen2)
+ FUNC_SAVE
+ mov keysize,%eax
+ cmp $32, %eax
+ je key_256_dec_update
+ cmp $16, %eax
+ je key_128_dec_update
+ # must be 192
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
+ FUNC_RESTORE
+ ret
+key_128_dec_update:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
+ FUNC_RESTORE
+ ret
+key_256_dec_update:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_dec_update_avx_gen2)
+
+###############################################################################
+#void aesni_gcm_finalize_avx_gen2(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
# u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
# Valid values are 16 (most likely), 12 or 8. */
###############################################################################
-ENTRY(aesni_gcm_dec_avx_gen2)
- GCM_ENC_DEC_AVX DEC
- ret
-ENDPROC(aesni_gcm_dec_avx_gen2)
+ENTRY(aesni_gcm_finalize_avx_gen2)
+ FUNC_SAVE
+ mov keysize,%eax
+ cmp $32, %eax
+ je key_256_finalize
+ cmp $16, %eax
+ je key_128_finalize
+ # must be 192
+ GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
+ FUNC_RESTORE
+ ret
+key_128_finalize:
+ GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
+ FUNC_RESTORE
+ ret
+key_256_finalize:
+ GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_finalize_avx_gen2)
+
#endif /* CONFIG_AS_AVX */
#ifdef CONFIG_AS_AVX2
@@ -1670,113 +1922,42 @@ ENDPROC(aesni_gcm_dec_avx_gen2)
# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
vmovdqa \HK, \T5
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly
- vmovdqa \T5, HashKey_2(arg1) # [HashKey_2] = HashKey^2<<1 mod poly
+ vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly
- vmovdqa \T5, HashKey_3(arg1)
+ vmovdqu \T5, HashKey_3(arg2)
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly
- vmovdqa \T5, HashKey_4(arg1)
+ vmovdqu \T5, HashKey_4(arg2)
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly
- vmovdqa \T5, HashKey_5(arg1)
+ vmovdqu \T5, HashKey_5(arg2)
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly
- vmovdqa \T5, HashKey_6(arg1)
+ vmovdqu \T5, HashKey_6(arg2)
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly
- vmovdqa \T5, HashKey_7(arg1)
+ vmovdqu \T5, HashKey_7(arg2)
GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly
- vmovdqa \T5, HashKey_8(arg1)
+ vmovdqu \T5, HashKey_8(arg2)
.endm
-
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 4#
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
-## arg1, arg2, arg3, r14 are used as a pointer only, not modified
+## arg1, arg3, arg4, r14 are used as a pointer only, not modified
-.macro INITIAL_BLOCKS_AVX2 num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
+.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
i = (8-\num_initial_blocks)
- j = 0
setreg
-
- mov arg6, %r10 # r10 = AAD
- mov arg7, %r12 # r12 = aadLen
-
-
- mov %r12, %r11
-
- vpxor reg_j, reg_j, reg_j
- vpxor reg_i, reg_i, reg_i
-
- cmp $16, %r11
- jl _get_AAD_rest8\@
-_get_AAD_blocks\@:
- vmovdqu (%r10), reg_i
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
- vpxor reg_i, reg_j, reg_j
- GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6
- add $16, %r10
- sub $16, %r12
- sub $16, %r11
- cmp $16, %r11
- jge _get_AAD_blocks\@
- vmovdqu reg_j, reg_i
- cmp $0, %r11
- je _get_AAD_done\@
-
- vpxor reg_i, reg_i, reg_i
-
- /* read the last <16B of AAD. since we have at least 4B of
- data right after the AAD (the ICV, and maybe some CT), we can
- read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\@:
- cmp $4, %r11
- jle _get_AAD_rest4\@
- movq (%r10), \T1
- add $8, %r10
- sub $8, %r11
- vpslldq $8, \T1, \T1
- vpsrldq $8, reg_i, reg_i
- vpxor \T1, reg_i, reg_i
- jmp _get_AAD_rest8\@
-_get_AAD_rest4\@:
- cmp $0, %r11
- jle _get_AAD_rest0\@
- mov (%r10), %eax
- movq %rax, \T1
- add $4, %r10
- sub $4, %r11
- vpslldq $12, \T1, \T1
- vpsrldq $4, reg_i, reg_i
- vpxor \T1, reg_i, reg_i
-_get_AAD_rest0\@:
- /* finalize: shift out the extra bytes we read, and align
- left. since pslldq can only shift by an immediate, we use
- vpshufb and an array of shuffle masks */
- movq %r12, %r11
- salq $4, %r11
- movdqu aad_shift_arr(%r11), \T1
- vpshufb \T1, reg_i, reg_i
-_get_AAD_rest_final\@:
- vpshufb SHUF_MASK(%rip), reg_i, reg_i
- vpxor reg_j, reg_i, reg_i
- GHASH_MUL_AVX2 reg_i, \T2, \T1, \T3, \T4, \T5, \T6
-
-_get_AAD_done\@:
- # initialize the data pointer offset as zero
- xor %r11d, %r11d
+ vmovdqu AadHash(arg2), reg_i
# start AES for num_initial_blocks blocks
- mov arg5, %rax # rax = *Y0
- vmovdqu (%rax), \CTR # CTR = Y0
- vpshufb SHUF_MASK(%rip), \CTR, \CTR
-
+ vmovdqu CurCount(arg2), \CTR
i = (9-\num_initial_blocks)
setreg
@@ -1799,7 +1980,7 @@ _get_AAD_done\@:
j = 1
setreg
-.rep 9
+.rep \REP
vmovdqa 16*j(arg1), \T_key
i = (9-\num_initial_blocks)
setreg
@@ -1814,7 +1995,7 @@ _get_AAD_done\@:
.endr
- vmovdqa 16*10(arg1), \T_key
+ vmovdqa 16*j(arg1), \T_key
i = (9-\num_initial_blocks)
setreg
.rep \num_initial_blocks
@@ -1826,9 +2007,9 @@ _get_AAD_done\@:
i = (9-\num_initial_blocks)
setreg
.rep \num_initial_blocks
- vmovdqu (arg3, %r11), \T1
+ vmovdqu (arg4, %r11), \T1
vpxor \T1, reg_i, reg_i
- vmovdqu reg_i, (arg2 , %r11) # write back ciphertext for
+ vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for
# num_initial_blocks blocks
add $16, %r11
.if \ENC_DEC == DEC
@@ -1905,7 +2086,7 @@ _get_AAD_done\@:
i = 1
setreg
-.rep 9 # do 9 rounds
+.rep \REP # do REP rounds
vmovdqa 16*i(arg1), \T_key
vaesenc \T_key, \XMM1, \XMM1
vaesenc \T_key, \XMM2, \XMM2
@@ -1930,58 +2111,58 @@ _get_AAD_done\@:
vaesenclast \T_key, \XMM7, \XMM7
vaesenclast \T_key, \XMM8, \XMM8
- vmovdqu (arg3, %r11), \T1
+ vmovdqu (arg4, %r11), \T1
vpxor \T1, \XMM1, \XMM1
- vmovdqu \XMM1, (arg2 , %r11)
+ vmovdqu \XMM1, (arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM1
.endif
- vmovdqu 16*1(arg3, %r11), \T1
+ vmovdqu 16*1(arg4, %r11), \T1
vpxor \T1, \XMM2, \XMM2
- vmovdqu \XMM2, 16*1(arg2 , %r11)
+ vmovdqu \XMM2, 16*1(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM2
.endif
- vmovdqu 16*2(arg3, %r11), \T1
+ vmovdqu 16*2(arg4, %r11), \T1
vpxor \T1, \XMM3, \XMM3
- vmovdqu \XMM3, 16*2(arg2 , %r11)
+ vmovdqu \XMM3, 16*2(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM3
.endif
- vmovdqu 16*3(arg3, %r11), \T1
+ vmovdqu 16*3(arg4, %r11), \T1
vpxor \T1, \XMM4, \XMM4
- vmovdqu \XMM4, 16*3(arg2 , %r11)
+ vmovdqu \XMM4, 16*3(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM4
.endif
- vmovdqu 16*4(arg3, %r11), \T1
+ vmovdqu 16*4(arg4, %r11), \T1
vpxor \T1, \XMM5, \XMM5
- vmovdqu \XMM5, 16*4(arg2 , %r11)
+ vmovdqu \XMM5, 16*4(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM5
.endif
- vmovdqu 16*5(arg3, %r11), \T1
+ vmovdqu 16*5(arg4, %r11), \T1
vpxor \T1, \XMM6, \XMM6
- vmovdqu \XMM6, 16*5(arg2 , %r11)
+ vmovdqu \XMM6, 16*5(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM6
.endif
- vmovdqu 16*6(arg3, %r11), \T1
+ vmovdqu 16*6(arg4, %r11), \T1
vpxor \T1, \XMM7, \XMM7
- vmovdqu \XMM7, 16*6(arg2 , %r11)
+ vmovdqu \XMM7, 16*6(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM7
.endif
- vmovdqu 16*7(arg3, %r11), \T1
+ vmovdqu 16*7(arg4, %r11), \T1
vpxor \T1, \XMM8, \XMM8
- vmovdqu \XMM8, 16*7(arg2 , %r11)
+ vmovdqu \XMM8, 16*7(arg3 , %r11)
.if \ENC_DEC == DEC
vmovdqa \T1, \XMM8
.endif
@@ -2010,9 +2191,9 @@ _initial_blocks_done\@:
# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
-# arg1, arg2, arg3 are used as pointers only, not modified
+# arg1, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
+.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC
vmovdqa \XMM1, \T2
vmovdqa \XMM2, TMP2(%rsp)
@@ -2096,7 +2277,7 @@ _initial_blocks_done\@:
#######################################################################
- vmovdqa HashKey_8(arg1), \T5
+ vmovdqu HashKey_8(arg2), \T5
vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1
vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0
vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0
@@ -2114,7 +2295,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP2(%rsp), \T1
- vmovdqa HashKey_7(arg1), \T5
+ vmovdqu HashKey_7(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
@@ -2140,7 +2321,7 @@ _initial_blocks_done\@:
#######################################################################
vmovdqa TMP3(%rsp), \T1
- vmovdqa HashKey_6(arg1), \T5
+ vmovdqu HashKey_6(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
@@ -2164,7 +2345,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP4(%rsp), \T1
- vmovdqa HashKey_5(arg1), \T5
+ vmovdqu HashKey_5(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
@@ -2189,7 +2370,7 @@ _initial_blocks_done\@:
vmovdqa TMP5(%rsp), \T1
- vmovdqa HashKey_4(arg1), \T5
+ vmovdqu HashKey_4(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
@@ -2213,7 +2394,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP6(%rsp), \T1
- vmovdqa HashKey_3(arg1), \T5
+ vmovdqu HashKey_3(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
@@ -2237,7 +2418,7 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP7(%rsp), \T1
- vmovdqa HashKey_2(arg1), \T5
+ vmovdqu HashKey_2(arg2), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
@@ -2264,7 +2445,7 @@ _initial_blocks_done\@:
vaesenc \T5, \XMM8, \XMM8
vmovdqa TMP8(%rsp), \T1
- vmovdqa HashKey(arg1), \T5
+ vmovdqu HashKey(arg2), \T5
vpclmulqdq $0x00, \T5, \T1, \T3
vpxor \T3, \T7, \T7
@@ -2281,17 +2462,34 @@ _initial_blocks_done\@:
vmovdqu 16*10(arg1), \T5
+ i = 11
+ setreg
+.rep (\REP-9)
+ vaesenc \T5, \XMM1, \XMM1
+ vaesenc \T5, \XMM2, \XMM2
+ vaesenc \T5, \XMM3, \XMM3
+ vaesenc \T5, \XMM4, \XMM4
+ vaesenc \T5, \XMM5, \XMM5
+ vaesenc \T5, \XMM6, \XMM6
+ vaesenc \T5, \XMM7, \XMM7
+ vaesenc \T5, \XMM8, \XMM8
+
+ vmovdqu 16*i(arg1), \T5
+ i = i + 1
+ setreg
+.endr
+
i = 0
j = 1
setreg
.rep 8
- vpxor 16*i(arg3, %r11), \T5, \T2
+ vpxor 16*i(arg4, %r11), \T5, \T2
.if \ENC_DEC == ENC
vaesenclast \T2, reg_j, reg_j
.else
vaesenclast \T2, reg_j, \T3
- vmovdqu 16*i(arg3, %r11), reg_j
- vmovdqu \T3, 16*i(arg2, %r11)
+ vmovdqu 16*i(arg4, %r11), reg_j
+ vmovdqu \T3, 16*i(arg3, %r11)
.endif
i = (i+1)
j = (j+1)
@@ -2317,14 +2515,14 @@ _initial_blocks_done\@:
vpxor \T2, \T7, \T7 # first phase of the reduction complete
#######################################################################
.if \ENC_DEC == ENC
- vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
- vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer
+ vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer
.endif
#######################################################################
@@ -2361,7 +2559,7 @@ _initial_blocks_done\@:
## Karatsuba Method
- vmovdqa HashKey_8(arg1), \T5
+ vmovdqu HashKey_8(arg2), \T5
vpshufd $0b01001110, \XMM1, \T2
vpshufd $0b01001110, \T5, \T3
@@ -2375,7 +2573,7 @@ _initial_blocks_done\@:
######################
- vmovdqa HashKey_7(arg1), \T5
+ vmovdqu HashKey_7(arg2), \T5
vpshufd $0b01001110, \XMM2, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM2, \T2, \T2
@@ -2393,7 +2591,7 @@ _initial_blocks_done\@:
######################
- vmovdqa HashKey_6(arg1), \T5
+ vmovdqu HashKey_6(arg2), \T5
vpshufd $0b01001110, \XMM3, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM3, \T2, \T2
@@ -2411,7 +2609,7 @@ _initial_blocks_done\@:
######################
- vmovdqa HashKey_5(arg1), \T5
+ vmovdqu HashKey_5(arg2), \T5
vpshufd $0b01001110, \XMM4, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM4, \T2, \T2
@@ -2429,7 +2627,7 @@ _initial_blocks_done\@:
######################
- vmovdqa HashKey_4(arg1), \T5
+ vmovdqu HashKey_4(arg2), \T5
vpshufd $0b01001110, \XMM5, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM5, \T2, \T2
@@ -2447,7 +2645,7 @@ _initial_blocks_done\@:
######################
- vmovdqa HashKey_3(arg1), \T5
+ vmovdqu HashKey_3(arg2), \T5
vpshufd $0b01001110, \XMM6, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM6, \T2, \T2
@@ -2465,7 +2663,7 @@ _initial_blocks_done\@:
######################
- vmovdqa HashKey_2(arg1), \T5
+ vmovdqu HashKey_2(arg2), \T5
vpshufd $0b01001110, \XMM7, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM7, \T2, \T2
@@ -2483,7 +2681,7 @@ _initial_blocks_done\@:
######################
- vmovdqa HashKey(arg1), \T5
+ vmovdqu HashKey(arg2), \T5
vpshufd $0b01001110, \XMM8, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM8, \T2, \T2
@@ -2536,411 +2734,110 @@ _initial_blocks_done\@:
-# combined for GCM encrypt and decrypt functions
-# clobbering all xmm registers
-# clobbering r10, r11, r12, r13, r14, r15
-.macro GCM_ENC_DEC_AVX2 ENC_DEC
-
- #the number of pushes must equal STACK_OFFSET
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov %rsp, %r14
-
-
-
-
- sub $VARIABLE_OFFSET, %rsp
- and $~63, %rsp # align rsp to 64 bytes
-
-
- vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
-
- mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
- and $-16, %r13 # r13 = r13 - (r13 mod 16)
-
- mov %r13, %r12
- shr $4, %r12
- and $7, %r12
- jz _initial_num_blocks_is_0\@
-
- cmp $7, %r12
- je _initial_num_blocks_is_7\@
- cmp $6, %r12
- je _initial_num_blocks_is_6\@
- cmp $5, %r12
- je _initial_num_blocks_is_5\@
- cmp $4, %r12
- je _initial_num_blocks_is_4\@
- cmp $3, %r12
- je _initial_num_blocks_is_3\@
- cmp $2, %r12
- je _initial_num_blocks_is_2\@
-
- jmp _initial_num_blocks_is_1\@
-
-_initial_num_blocks_is_7\@:
- INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*7, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_6\@:
- INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*6, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_5\@:
- INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*5, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_4\@:
- INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*4, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_3\@:
- INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*3, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_2\@:
- INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*2, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_1\@:
- INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
- sub $16*1, %r13
- jmp _initial_blocks_encrypted\@
-
-_initial_num_blocks_is_0\@:
- INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
-
-
-_initial_blocks_encrypted\@:
- cmp $0, %r13
- je _zero_cipher_left\@
-
- sub $128, %r13
- je _eight_cipher_left\@
-
-
-
-
- vmovd %xmm9, %r15d
- and $255, %r15d
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
-
-_encrypt_by_8_new\@:
- cmp $(255-8), %r15d
- jg _encrypt_by_8\@
-
-
-
- add $8, %r15b
- GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
- add $128, %r11
- sub $128, %r13
- jne _encrypt_by_8_new\@
-
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- jmp _eight_cipher_left\@
-
-_encrypt_by_8\@:
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- add $8, %r15b
- GHASH_8_ENCRYPT_8_PARALLEL_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- add $128, %r11
- sub $128, %r13
- jne _encrypt_by_8_new\@
-
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
-
-
-
-
-_eight_cipher_left\@:
- GHASH_LAST_8_AVX2 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
-
-
-_zero_cipher_left\@:
- cmp $16, arg4
- jl _only_less_than_16\@
-
- mov arg4, %r13
- and $15, %r13 # r13 = (arg4 mod 16)
-
- je _multiple_of_16_bytes\@
-
- # handle the last <16 Byte block seperately
-
-
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
-
- sub $16, %r11
- add %r13, %r11
- vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
-
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12 # adjust the shuffle mask pointer
- # to be able to shift 16-r13 bytes
- # (r13 is the number of bytes in plaintext mod 16)
- vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
- vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
- jmp _final_ghash_mul\@
-
-_only_less_than_16\@:
- # check for 0 length
- mov arg4, %r13
- and $15, %r13 # r13 = (arg4 mod 16)
-
- je _multiple_of_16_bytes\@
-
- # handle the last <16 Byte block seperately
-
-
- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
-
-
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12 # adjust the shuffle mask pointer to be
- # able to shift 16-r13 bytes (r13 is the
- # number of bytes in plaintext mod 16)
-
-_get_last_16_byte_loop\@:
- movb (arg3, %r11), %al
- movb %al, TMP1 (%rsp , %r11)
- add $1, %r11
- cmp %r13, %r11
- jne _get_last_16_byte_loop\@
-
- vmovdqu TMP1(%rsp), %xmm1
-
- sub $16, %r11
-
-_final_ghash_mul\@:
- .if \ENC_DEC == DEC
- vmovdqa %xmm1, %xmm2
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm2, %xmm2
- vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
- vpxor %xmm2, %xmm14, %xmm14
- #GHASH computation for the last <16 Byte block
- GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- sub %r13, %r11
- add $16, %r11
- .else
- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to mask out top 16-r13 bytes of xmm9
- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
- vpxor %xmm9, %xmm14, %xmm14
- #GHASH computation for the last <16 Byte block
- GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
- sub %r13, %r11
- add $16, %r11
- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
- .endif
-
-
- #############################
- # output r13 Bytes
- vmovq %xmm9, %rax
- cmp $8, %r13
- jle _less_than_8_bytes_left\@
-
- mov %rax, (arg2 , %r11)
- add $8, %r11
- vpsrldq $8, %xmm9, %xmm9
- vmovq %xmm9, %rax
- sub $8, %r13
-
-_less_than_8_bytes_left\@:
- movb %al, (arg2 , %r11)
- add $1, %r11
- shr $8, %rax
- sub $1, %r13
- jne _less_than_8_bytes_left\@
- #############################
-
-_multiple_of_16_bytes\@:
- mov arg7, %r12 # r12 = aadLen (number of bytes)
- shl $3, %r12 # convert into number of bits
- vmovd %r12d, %xmm15 # len(A) in xmm15
-
- shl $3, arg4 # len(C) in bits (*128)
- vmovq arg4, %xmm1
- vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
- vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
-
- vpxor %xmm15, %xmm14, %xmm14
- GHASH_MUL_AVX2 %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
- vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
-
- mov arg5, %rax # rax = *Y0
- vmovdqu (%rax), %xmm9 # xmm9 = Y0
-
- ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
-
- vpxor %xmm14, %xmm9, %xmm9
-
-
-
-_return_T\@:
- mov arg8, %r10 # r10 = authTag
- mov arg9, %r11 # r11 = auth_tag_len
-
- cmp $16, %r11
- je _T_16\@
-
- cmp $8, %r11
- jl _T_4\@
-
-_T_8\@:
- vmovq %xmm9, %rax
- mov %rax, (%r10)
- add $8, %r10
- sub $8, %r11
- vpsrldq $8, %xmm9, %xmm9
- cmp $0, %r11
- je _return_T_done\@
-_T_4\@:
- vmovd %xmm9, %eax
- mov %eax, (%r10)
- add $4, %r10
- sub $4, %r11
- vpsrldq $4, %xmm9, %xmm9
- cmp $0, %r11
- je _return_T_done\@
-_T_123\@:
- vmovd %xmm9, %eax
- cmp $2, %r11
- jl _T_1\@
- mov %ax, (%r10)
- cmp $2, %r11
- je _return_T_done\@
- add $2, %r10
- sar $16, %eax
-_T_1\@:
- mov %al, (%r10)
- jmp _return_T_done\@
-
-_T_16\@:
- vmovdqu %xmm9, (%r10)
-
-_return_T_done\@:
- mov %r14, %rsp
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
-.endm
-
-
#############################################################
-#void aesni_gcm_precomp_avx_gen4
+#void aesni_gcm_init_avx_gen4
# (gcm_data *my_ctx_data,
-# u8 *hash_subkey)# /* H, the Hash sub key input.
-# Data starts on a 16-byte boundary. */
+# gcm_context_data *data,
+# u8 *iv, /* Pre-counter block j0: 4 byte salt
+# (from Security Association) concatenated with 8 byte
+# Initialisation Vector (from IPSec ESP Payload)
+# concatenated with 0x00000001. 16-byte aligned pointer. */
+# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
+# const u8 *aad, /* Additional Authentication Data (AAD)*/
+# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
-ENTRY(aesni_gcm_precomp_avx_gen4)
- #the number of pushes must equal STACK_OFFSET
- push %r12
- push %r13
- push %r14
- push %r15
-
- mov %rsp, %r14
-
-
-
- sub $VARIABLE_OFFSET, %rsp
- and $~63, %rsp # align rsp to 64 bytes
-
- vmovdqu (arg2), %xmm6 # xmm6 = HashKey
-
- vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
- ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
- vmovdqa %xmm6, %xmm2
- vpsllq $1, %xmm6, %xmm6
- vpsrlq $63, %xmm2, %xmm2
- vmovdqa %xmm2, %xmm1
- vpslldq $8, %xmm2, %xmm2
- vpsrldq $8, %xmm1, %xmm1
- vpor %xmm2, %xmm6, %xmm6
- #reduction
- vpshufd $0b00100100, %xmm1, %xmm2
- vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
- vpand POLY(%rip), %xmm2, %xmm2
- vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly
- #######################################################################
- vmovdqa %xmm6, HashKey(arg1) # store HashKey<<1 mod poly
-
-
- PRECOMPUTE_AVX2 %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
-
- mov %r14, %rsp
-
- pop %r15
- pop %r14
- pop %r13
- pop %r12
+ENTRY(aesni_gcm_init_avx_gen4)
+ FUNC_SAVE
+ INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
+ FUNC_RESTORE
ret
-ENDPROC(aesni_gcm_precomp_avx_gen4)
-
+ENDPROC(aesni_gcm_init_avx_gen4)
###############################################################################
#void aesni_gcm_enc_avx_gen4(
# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */
# const u8 *in, /* Plaintext input */
-# u64 plaintext_len, /* Length of data in Bytes for encryption. */
-# u8 *iv, /* Pre-counter block j0: 4 byte salt
-# (from Security Association) concatenated with 8 byte
-# Initialisation Vector (from IPSec ESP Payload)
-# concatenated with 0x00000001. 16-byte aligned pointer. */
-# const u8 *aad, /* Additional Authentication Data (AAD)*/
-# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
-# u8 *auth_tag, /* Authenticated Tag output. */
-# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
-# Valid values are 16 (most likely), 12 or 8. */
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
###############################################################################
-ENTRY(aesni_gcm_enc_avx_gen4)
- GCM_ENC_DEC_AVX2 ENC
+ENTRY(aesni_gcm_enc_update_avx_gen4)
+ FUNC_SAVE
+ mov keysize,%eax
+ cmp $32, %eax
+ je key_256_enc_update4
+ cmp $16, %eax
+ je key_128_enc_update4
+ # must be 192
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
+ FUNC_RESTORE
+ ret
+key_128_enc_update4:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
+ FUNC_RESTORE
+ ret
+key_256_enc_update4:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
+ FUNC_RESTORE
ret
-ENDPROC(aesni_gcm_enc_avx_gen4)
+ENDPROC(aesni_gcm_enc_update_avx_gen4)
###############################################################################
-#void aesni_gcm_dec_avx_gen4(
+#void aesni_gcm_dec_update_avx_gen4(
# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */
# const u8 *in, /* Ciphertext input */
-# u64 plaintext_len, /* Length of data in Bytes for encryption. */
-# u8 *iv, /* Pre-counter block j0: 4 byte salt
-# (from Security Association) concatenated with 8 byte
-# Initialisation Vector (from IPSec ESP Payload)
-# concatenated with 0x00000001. 16-byte aligned pointer. */
-# const u8 *aad, /* Additional Authentication Data (AAD)*/
-# u64 aad_len, /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
+# u64 plaintext_len) /* Length of data in Bytes for encryption. */
+###############################################################################
+ENTRY(aesni_gcm_dec_update_avx_gen4)
+ FUNC_SAVE
+ mov keysize,%eax
+ cmp $32, %eax
+ je key_256_dec_update4
+ cmp $16, %eax
+ je key_128_dec_update4
+ # must be 192
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
+ FUNC_RESTORE
+ ret
+key_128_dec_update4:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
+ FUNC_RESTORE
+ ret
+key_256_dec_update4:
+ GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_dec_update_avx_gen4)
+
+###############################################################################
+#void aesni_gcm_finalize_avx_gen4(
+# gcm_data *my_ctx_data, /* aligned to 16 Bytes */
+# gcm_context_data *data,
# u8 *auth_tag, /* Authenticated Tag output. */
# u64 auth_tag_len)# /* Authenticated Tag Length in bytes.
-# Valid values are 16 (most likely), 12 or 8. */
+# Valid values are 16 (most likely), 12 or 8. */
###############################################################################
-ENTRY(aesni_gcm_dec_avx_gen4)
- GCM_ENC_DEC_AVX2 DEC
- ret
-ENDPROC(aesni_gcm_dec_avx_gen4)
+ENTRY(aesni_gcm_finalize_avx_gen4)
+ FUNC_SAVE
+ mov keysize,%eax
+ cmp $32, %eax
+ je key_256_finalize4
+ cmp $16, %eax
+ je key_128_finalize4
+ # must be 192
+ GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
+ FUNC_RESTORE
+ ret
+key_128_finalize4:
+ GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
+ FUNC_RESTORE
+ ret
+key_256_finalize4:
+ GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
+ FUNC_RESTORE
+ ret
+ENDPROC(aesni_gcm_finalize_avx_gen4)
#endif /* CONFIG_AS_AVX2 */
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 661f7daf43da..1321700d6647 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -84,7 +84,7 @@ struct gcm_context_data {
u8 current_counter[GCM_BLOCK_LEN];
u64 partial_block_len;
u64 unused;
- u8 hash_keys[GCM_BLOCK_LEN * 8];
+ u8 hash_keys[GCM_BLOCK_LEN * 16];
};
asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
@@ -175,6 +175,32 @@ asmlinkage void aesni_gcm_finalize(void *ctx,
struct gcm_context_data *gdata,
u8 *auth_tag, unsigned long auth_tag_len);
+static struct aesni_gcm_tfm_s {
+void (*init)(void *ctx,
+ struct gcm_context_data *gdata,
+ u8 *iv,
+ u8 *hash_subkey, const u8 *aad,
+ unsigned long aad_len);
+void (*enc_update)(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in,
+ unsigned long plaintext_len);
+void (*dec_update)(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in,
+ unsigned long ciphertext_len);
+void (*finalize)(void *ctx,
+ struct gcm_context_data *gdata,
+ u8 *auth_tag, unsigned long auth_tag_len);
+} *aesni_gcm_tfm;
+
+struct aesni_gcm_tfm_s aesni_gcm_tfm_sse = {
+ .init = &aesni_gcm_init,
+ .enc_update = &aesni_gcm_enc_update,
+ .dec_update = &aesni_gcm_dec_update,
+ .finalize = &aesni_gcm_finalize,
+};
+
#ifdef CONFIG_AS_AVX
asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
@@ -183,136 +209,94 @@ asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv,
asmlinkage void aes_ctr_enc_256_avx_by8(const u8 *in, u8 *iv,
void *keys, u8 *out, unsigned int num_bytes);
/*
- * asmlinkage void aesni_gcm_precomp_avx_gen2()
+ * asmlinkage void aesni_gcm_init_avx_gen2()
* gcm_data *my_ctx_data, context data
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
*/
-asmlinkage void aesni_gcm_precomp_avx_gen2(void *my_ctx_data, u8 *hash_subkey);
+asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data,
+ struct gcm_context_data *gdata,
+ u8 *iv,
+ u8 *hash_subkey,
+ const u8 *aad,
+ unsigned long aad_len);
+
+asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long plaintext_len);
+asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in,
+ unsigned long ciphertext_len);
+asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx,
+ struct gcm_context_data *gdata,
+ u8 *auth_tag, unsigned long auth_tag_len);
-asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_enc_avx_gen2(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
-asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_dec_avx_gen2(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
-static void aesni_gcm_enc_avx(void *ctx,
- struct gcm_context_data *data, u8 *out,
- const u8 *in, unsigned long plaintext_len, u8 *iv,
- u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len)
-{
- struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
- if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)){
- aesni_gcm_enc(ctx, data, out, in,
- plaintext_len, iv, hash_subkey, aad,
- aad_len, auth_tag, auth_tag_len);
- } else {
- aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
- aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
- aad_len, auth_tag, auth_tag_len);
- }
-}
+struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen2 = {
+ .init = &aesni_gcm_init_avx_gen2,
+ .enc_update = &aesni_gcm_enc_update_avx_gen2,
+ .dec_update = &aesni_gcm_dec_update_avx_gen2,
+ .finalize = &aesni_gcm_finalize_avx_gen2,
+};
-static void aesni_gcm_dec_avx(void *ctx,
- struct gcm_context_data *data, u8 *out,
- const u8 *in, unsigned long ciphertext_len, u8 *iv,
- u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len)
-{
- struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
- if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
- aesni_gcm_dec(ctx, data, out, in,
- ciphertext_len, iv, hash_subkey, aad,
- aad_len, auth_tag, auth_tag_len);
- } else {
- aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
- aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
- aad_len, auth_tag, auth_tag_len);
- }
-}
#endif
#ifdef CONFIG_AS_AVX2
/*
- * asmlinkage void aesni_gcm_precomp_avx_gen4()
+ * asmlinkage void aesni_gcm_init_avx_gen4()
* gcm_data *my_ctx_data, context data
* u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
*/
-asmlinkage void aesni_gcm_precomp_avx_gen4(void *my_ctx_data, u8 *hash_subkey);
+asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data,
+ struct gcm_context_data *gdata,
+ u8 *iv,
+ u8 *hash_subkey,
+ const u8 *aad,
+ unsigned long aad_len);
+
+asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in, unsigned long plaintext_len);
+asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
+ const u8 *in,
+ unsigned long ciphertext_len);
+asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx,
+ struct gcm_context_data *gdata,
+ u8 *auth_tag, unsigned long auth_tag_len);
-asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_enc_avx_gen4(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long plaintext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
-asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx, u8 *out,
+asmlinkage void aesni_gcm_dec_avx_gen4(void *ctx,
+ struct gcm_context_data *gdata, u8 *out,
const u8 *in, unsigned long ciphertext_len, u8 *iv,
const u8 *aad, unsigned long aad_len,
u8 *auth_tag, unsigned long auth_tag_len);
-static void aesni_gcm_enc_avx2(void *ctx,
- struct gcm_context_data *data, u8 *out,
- const u8 *in, unsigned long plaintext_len, u8 *iv,
- u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len)
-{
- struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
- if ((plaintext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
- aesni_gcm_enc(ctx, data, out, in,
- plaintext_len, iv, hash_subkey, aad,
- aad_len, auth_tag, auth_tag_len);
- } else if (plaintext_len < AVX_GEN4_OPTSIZE) {
- aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
- aesni_gcm_enc_avx_gen2(ctx, out, in, plaintext_len, iv, aad,
- aad_len, auth_tag, auth_tag_len);
- } else {
- aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
- aesni_gcm_enc_avx_gen4(ctx, out, in, plaintext_len, iv, aad,
- aad_len, auth_tag, auth_tag_len);
- }
-}
+struct aesni_gcm_tfm_s aesni_gcm_tfm_avx_gen4 = {
+ .init = &aesni_gcm_init_avx_gen4,
+ .enc_update = &aesni_gcm_enc_update_avx_gen4,
+ .dec_update = &aesni_gcm_dec_update_avx_gen4,
+ .finalize = &aesni_gcm_finalize_avx_gen4,
+};
-static void aesni_gcm_dec_avx2(void *ctx,
- struct gcm_context_data *data, u8 *out,
- const u8 *in, unsigned long ciphertext_len, u8 *iv,
- u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
- u8 *auth_tag, unsigned long auth_tag_len)
-{
- struct crypto_aes_ctx *aes_ctx = (struct crypto_aes_ctx*)ctx;
- if ((ciphertext_len < AVX_GEN2_OPTSIZE) || (aes_ctx-> key_length != AES_KEYSIZE_128)) {
- aesni_gcm_dec(ctx, data, out, in,
- ciphertext_len, iv, hash_subkey,
- aad, aad_len, auth_tag, auth_tag_len);
- } else if (ciphertext_len < AVX_GEN4_OPTSIZE) {
- aesni_gcm_precomp_avx_gen2(ctx, hash_subkey);
- aesni_gcm_dec_avx_gen2(ctx, out, in, ciphertext_len, iv, aad,
- aad_len, auth_tag, auth_tag_len);
- } else {
- aesni_gcm_precomp_avx_gen4(ctx, hash_subkey);
- aesni_gcm_dec_avx_gen4(ctx, out, in, ciphertext_len, iv, aad,
- aad_len, auth_tag, auth_tag_len);
- }
-}
#endif
-static void (*aesni_gcm_enc_tfm)(void *ctx,
- struct gcm_context_data *data, u8 *out,
- const u8 *in, unsigned long plaintext_len,
- u8 *iv, u8 *hash_subkey, const u8 *aad,
- unsigned long aad_len, u8 *auth_tag,
- unsigned long auth_tag_len);
-
-static void (*aesni_gcm_dec_tfm)(void *ctx,
- struct gcm_context_data *data, u8 *out,
- const u8 *in, unsigned long ciphertext_len,
- u8 *iv, u8 *hash_subkey, const u8 *aad,
- unsigned long aad_len, u8 *auth_tag,
- unsigned long auth_tag_len);
-
static inline struct
aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
{
@@ -794,6 +778,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
{
struct crypto_aead *tfm = crypto_aead_reqtfm(req);
unsigned long auth_tag_len = crypto_aead_authsize(tfm);
+ struct aesni_gcm_tfm_s *gcm_tfm = aesni_gcm_tfm;
struct gcm_context_data data AESNI_ALIGN_ATTR;
struct scatter_walk dst_sg_walk = {};
unsigned long left = req->cryptlen;
@@ -811,6 +796,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
if (!enc)
left -= auth_tag_len;
+#ifdef CONFIG_AS_AVX2
+ if (left < AVX_GEN4_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen4)
+ gcm_tfm = &aesni_gcm_tfm_avx_gen2;
+#endif
+#ifdef CONFIG_AS_AVX
+ if (left < AVX_GEN2_OPTSIZE && gcm_tfm == &aesni_gcm_tfm_avx_gen2)
+ gcm_tfm = &aesni_gcm_tfm_sse;
+#endif
+
/* Linearize assoc, if not already linear */
if (req->src->length >= assoclen && req->src->length &&
(!PageHighMem(sg_page(req->src)) ||
@@ -835,7 +829,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
}
kernel_fpu_begin();
- aesni_gcm_init(aes_ctx, &data, iv,
+ gcm_tfm->init(aes_ctx, &data, iv,
hash_subkey, assoc, assoclen);
if (req->src != req->dst) {
while (left) {
@@ -846,10 +840,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
len = min(srclen, dstlen);
if (len) {
if (enc)
- aesni_gcm_enc_update(aes_ctx, &data,
+ gcm_tfm->enc_update(aes_ctx, &data,
dst, src, len);
else
- aesni_gcm_dec_update(aes_ctx, &data,
+ gcm_tfm->dec_update(aes_ctx, &data,
dst, src, len);
}
left -= len;
@@ -867,10 +861,10 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
len = scatterwalk_clamp(&src_sg_walk, left);
if (len) {
if (enc)
- aesni_gcm_enc_update(aes_ctx, &data,
+ gcm_tfm->enc_update(aes_ctx, &data,
src, src, len);
else
- aesni_gcm_dec_update(aes_ctx, &data,
+ gcm_tfm->dec_update(aes_ctx, &data,
src, src, len);
}
left -= len;
@@ -879,7 +873,7 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
scatterwalk_done(&src_sg_walk, 1, left);
}
}
- aesni_gcm_finalize(aes_ctx, &data, authTag, auth_tag_len);
+ gcm_tfm->finalize(aes_ctx, &data, authTag, auth_tag_len);
kernel_fpu_end();
if (!assocmem)
@@ -912,147 +906,15 @@ static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req,
static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen,
u8 *hash_subkey, u8 *iv, void *aes_ctx)
{
- u8 one_entry_in_sg = 0;
- u8 *src, *dst, *assoc;
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- unsigned long auth_tag_len = crypto_aead_authsize(tfm);
- struct scatter_walk src_sg_walk;
- struct scatter_walk dst_sg_walk = {};
- struct gcm_context_data data AESNI_ALIGN_ATTR;
-
- if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
- aesni_gcm_enc_tfm == aesni_gcm_enc ||
- req->cryptlen < AVX_GEN2_OPTSIZE) {
- return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
- aes_ctx);
- }
- if (sg_is_last(req->src) &&
- (!PageHighMem(sg_page(req->src)) ||
- req->src->offset + req->src->length <= PAGE_SIZE) &&
- sg_is_last(req->dst) &&
- (!PageHighMem(sg_page(req->dst)) ||
- req->dst->offset + req->dst->length <= PAGE_SIZE)) {
- one_entry_in_sg = 1;
- scatterwalk_start(&src_sg_walk, req->src);
- assoc = scatterwalk_map(&src_sg_walk);
- src = assoc + req->assoclen;
- dst = src;
- if (unlikely(req->src != req->dst)) {
- scatterwalk_start(&dst_sg_walk, req->dst);
- dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
- }
- } else {
- /* Allocate memory for src, dst, assoc */
- assoc = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
- GFP_ATOMIC);
- if (unlikely(!assoc))
- return -ENOMEM;
- scatterwalk_map_and_copy(assoc, req->src, 0,
- req->assoclen + req->cryptlen, 0);
- src = assoc + req->assoclen;
- dst = src;
- }
-
- kernel_fpu_begin();
- aesni_gcm_enc_tfm(aes_ctx, &data, dst, src, req->cryptlen, iv,
- hash_subkey, assoc, assoclen,
- dst + req->cryptlen, auth_tag_len);
- kernel_fpu_end();
-
- /* The authTag (aka the Integrity Check Value) needs to be written
- * back to the packet. */
- if (one_entry_in_sg) {
- if (unlikely(req->src != req->dst)) {
- scatterwalk_unmap(dst - req->assoclen);
- scatterwalk_advance(&dst_sg_walk, req->dst->length);
- scatterwalk_done(&dst_sg_walk, 1, 0);
- }
- scatterwalk_unmap(assoc);
- scatterwalk_advance(&src_sg_walk, req->src->length);
- scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
- } else {
- scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
- req->cryptlen + auth_tag_len, 1);
- kfree(assoc);
- }
- return 0;
+ return gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv,
+ aes_ctx);
}
static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen,
u8 *hash_subkey, u8 *iv, void *aes_ctx)
{
- u8 one_entry_in_sg = 0;
- u8 *src, *dst, *assoc;
- unsigned long tempCipherLen = 0;
- struct crypto_aead *tfm = crypto_aead_reqtfm(req);
- unsigned long auth_tag_len = crypto_aead_authsize(tfm);
- u8 authTag[16];
- struct scatter_walk src_sg_walk;
- struct scatter_walk dst_sg_walk = {};
- struct gcm_context_data data AESNI_ALIGN_ATTR;
- int retval = 0;
-
- if (((struct crypto_aes_ctx *)aes_ctx)->key_length != AES_KEYSIZE_128 ||
- aesni_gcm_enc_tfm == aesni_gcm_enc ||
- req->cryptlen < AVX_GEN2_OPTSIZE) {
- return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
- aes_ctx);
- }
- tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
-
- if (sg_is_last(req->src) &&
- (!PageHighMem(sg_page(req->src)) ||
- req->src->offset + req->src->length <= PAGE_SIZE) &&
- sg_is_last(req->dst) && req->dst->length &&
- (!PageHighMem(sg_page(req->dst)) ||
- req->dst->offset + req->dst->length <= PAGE_SIZE)) {
- one_entry_in_sg = 1;
- scatterwalk_start(&src_sg_walk, req->src);
- assoc = scatterwalk_map(&src_sg_walk);
- src = assoc + req->assoclen;
- dst = src;
- if (unlikely(req->src != req->dst)) {
- scatterwalk_start(&dst_sg_walk, req->dst);
- dst = scatterwalk_map(&dst_sg_walk) + req->assoclen;
- }
- } else {
- /* Allocate memory for src, dst, assoc */
- assoc = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
- if (!assoc)
- return -ENOMEM;
- scatterwalk_map_and_copy(assoc, req->src, 0,
- req->assoclen + req->cryptlen, 0);
- src = assoc + req->assoclen;
- dst = src;
- }
-
-
- kernel_fpu_begin();
- aesni_gcm_dec_tfm(aes_ctx, &data, dst, src, tempCipherLen, iv,
- hash_subkey, assoc, assoclen,
- authTag, auth_tag_len);
- kernel_fpu_end();
-
- /* Compare generated tag with passed in tag. */
- retval = crypto_memneq(src + tempCipherLen, authTag, auth_tag_len) ?
- -EBADMSG : 0;
-
- if (one_entry_in_sg) {
- if (unlikely(req->src != req->dst)) {
- scatterwalk_unmap(dst - req->assoclen);
- scatterwalk_advance(&dst_sg_walk, req->dst->length);
- scatterwalk_done(&dst_sg_walk, 1, 0);
- }
- scatterwalk_unmap(assoc);
- scatterwalk_advance(&src_sg_walk, req->src->length);
- scatterwalk_done(&src_sg_walk, req->src == req->dst, 0);
- } else {
- scatterwalk_map_and_copy(dst, req->dst, req->assoclen,
- tempCipherLen, 1);
- kfree(assoc);
- }
- return retval;
-
+ return gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv,
+ aes_ctx);
}
static int helper_rfc4106_encrypt(struct aead_request *req)
@@ -1420,21 +1282,18 @@ static int __init aesni_init(void)
#ifdef CONFIG_AS_AVX2
if (boot_cpu_has(X86_FEATURE_AVX2)) {
pr_info("AVX2 version of gcm_enc/dec engaged.\n");
- aesni_gcm_enc_tfm = aesni_gcm_enc_avx2;
- aesni_gcm_dec_tfm = aesni_gcm_dec_avx2;
+ aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen4;
} else
#endif
#ifdef CONFIG_AS_AVX
if (boot_cpu_has(X86_FEATURE_AVX)) {
pr_info("AVX version of gcm_enc/dec engaged.\n");
- aesni_gcm_enc_tfm = aesni_gcm_enc_avx;
- aesni_gcm_dec_tfm = aesni_gcm_dec_avx;
+ aesni_gcm_tfm = &aesni_gcm_tfm_avx_gen2;
} else
#endif
{
pr_info("SSE version of gcm_enc/dec engaged.\n");
- aesni_gcm_enc_tfm = aesni_gcm_enc;
- aesni_gcm_dec_tfm = aesni_gcm_dec;
+ aesni_gcm_tfm = &aesni_gcm_tfm_sse;
}
aesni_ctr_enc_tfm = aesni_ctr_enc;
#ifdef CONFIG_AS_AVX
diff --git a/arch/x86/crypto/chacha-avx2-x86_64.S b/arch/x86/crypto/chacha-avx2-x86_64.S
new file mode 100644
index 000000000000..32903fd450af
--- /dev/null
+++ b/arch/x86/crypto/chacha-avx2-x86_64.S
@@ -0,0 +1,1025 @@
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.ROT8, "aM", @progbits, 32
+.align 32
+ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
+ .octa 0x0e0d0c0f0a09080b0605040702010003
+
+.section .rodata.cst32.ROT16, "aM", @progbits, 32
+.align 32
+ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
+ .octa 0x0d0c0f0e09080b0a0504070601000302
+
+.section .rodata.cst32.CTRINC, "aM", @progbits, 32
+.align 32
+CTRINC: .octa 0x00000003000000020000000100000000
+ .octa 0x00000007000000060000000500000004
+
+.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL: .octa 0x00000000000000000000000000000000
+ .octa 0x00000000000000000000000000000001
+
+.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL: .octa 0x00000000000000000000000000000002
+ .octa 0x00000000000000000000000000000003
+
+.text
+
+ENTRY(chacha_2block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 2 data blocks output, o
+ # %rdx: up to 2 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts two ChaCha blocks by loading the state
+ # matrix twice across four AVX registers. It performs matrix operations
+ # on four words in each matrix in parallel, but requires shuffling to
+ # rearrange the words after each round.
+
+ vzeroupper
+
+ # x0..3[0-2] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+
+ vmovdqa %ymm0,%ymm8
+ vmovdqa %ymm1,%ymm9
+ vmovdqa %ymm2,%ymm10
+ vmovdqa %ymm3,%ymm11
+
+ vmovdqa ROT8(%rip),%ymm4
+ vmovdqa ROT16(%rip),%ymm5
+
+ mov %rcx,%rax
+
+.Ldoubleround:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm5,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm6
+ vpslld $12,%ymm6,%ymm6
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm6,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm4,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm7
+ vpslld $7,%ymm7,%ymm7
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm5,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm6
+ vpslld $12,%ymm6,%ymm6
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm6,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm4,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm7
+ vpslld $7,%ymm7,%ymm7
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+
+ sub $2,%r8d
+ jnz .Ldoubleround
+
+ # o0 = i0 ^ (x0 + s0)
+ vpaddd %ymm8,%ymm0,%ymm7
+ cmp $0x10,%rax
+ jl .Lxorpart2
+ vpxor 0x00(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x00(%rsi)
+ vextracti128 $1,%ymm7,%xmm0
+ # o1 = i1 ^ (x1 + s1)
+ vpaddd %ymm9,%ymm1,%ymm7
+ cmp $0x20,%rax
+ jl .Lxorpart2
+ vpxor 0x10(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x10(%rsi)
+ vextracti128 $1,%ymm7,%xmm1
+ # o2 = i2 ^ (x2 + s2)
+ vpaddd %ymm10,%ymm2,%ymm7
+ cmp $0x30,%rax
+ jl .Lxorpart2
+ vpxor 0x20(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x20(%rsi)
+ vextracti128 $1,%ymm7,%xmm2
+ # o3 = i3 ^ (x3 + s3)
+ vpaddd %ymm11,%ymm3,%ymm7
+ cmp $0x40,%rax
+ jl .Lxorpart2
+ vpxor 0x30(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x30(%rsi)
+ vextracti128 $1,%ymm7,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm7
+ cmp $0x50,%rax
+ jl .Lxorpart2
+ vpxor 0x40(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm7
+ cmp $0x60,%rax
+ jl .Lxorpart2
+ vpxor 0x50(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm7
+ cmp $0x70,%rax
+ jl .Lxorpart2
+ vpxor 0x60(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm7
+ cmp $0x80,%rax
+ jl .Lxorpart2
+ vpxor 0x70(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x70(%rsi)
+
+.Ldone2:
+ vzeroupper
+ ret
+
+.Lxorpart2:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone2
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%xmm7,%xmm7
+ vmovdqa %xmm7,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone2
+
+ENDPROC(chacha_2block_xor_avx2)
+
+ENTRY(chacha_4block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts four ChaCha blocks by loading the state
+ # matrix four times across eight AVX registers. It performs matrix
+ # operations on four words in two matrices in parallel, sequentially
+ # to the operations on the four words of the other two matrices. The
+ # required word shuffling has a rather high latency, we can do the
+ # arithmetic on two matrix-pairs without much slowdown.
+
+ vzeroupper
+
+ # x0..3[0-4] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vmovdqa %ymm0,%ymm4
+ vmovdqa %ymm1,%ymm5
+ vmovdqa %ymm2,%ymm6
+ vmovdqa %ymm3,%ymm7
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+ vpaddd CTR4BL(%rip),%ymm7,%ymm7
+
+ vmovdqa %ymm0,%ymm11
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm3,%ymm14
+ vmovdqa %ymm7,%ymm15
+
+ vmovdqa ROT8(%rip),%ymm8
+ vmovdqa ROT16(%rip),%ymm9
+
+ mov %rcx,%rax
+
+.Ldoubleround4:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm9,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ vpshufd $0x39,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+ vpshufd $0x93,%ymm7,%ymm7
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm9,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm9,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $12,%ymm10,%ymm10
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxor %ymm0,%ymm3,%ymm3
+ vpshufb %ymm8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxor %ymm4,%ymm7,%ymm7
+ vpshufb %ymm8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxor %ymm2,%ymm1,%ymm1
+ vmovdqa %ymm1,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm1,%ymm1
+ vpor %ymm10,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxor %ymm6,%ymm5,%ymm5
+ vmovdqa %ymm5,%ymm10
+ vpslld $7,%ymm10,%ymm10
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm10,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ vpshufd $0x93,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+ vpshufd $0x39,%ymm7,%ymm7
+
+ sub $2,%r8d
+ jnz .Ldoubleround4
+
+ # o0 = i0 ^ (x0 + s0), first block
+ vpaddd %ymm11,%ymm0,%ymm10
+ cmp $0x10,%rax
+ jl .Lxorpart4
+ vpxor 0x00(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x00(%rsi)
+ vextracti128 $1,%ymm10,%xmm0
+ # o1 = i1 ^ (x1 + s1), first block
+ vpaddd %ymm12,%ymm1,%ymm10
+ cmp $0x20,%rax
+ jl .Lxorpart4
+ vpxor 0x10(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x10(%rsi)
+ vextracti128 $1,%ymm10,%xmm1
+ # o2 = i2 ^ (x2 + s2), first block
+ vpaddd %ymm13,%ymm2,%ymm10
+ cmp $0x30,%rax
+ jl .Lxorpart4
+ vpxor 0x20(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x20(%rsi)
+ vextracti128 $1,%ymm10,%xmm2
+ # o3 = i3 ^ (x3 + s3), first block
+ vpaddd %ymm14,%ymm3,%ymm10
+ cmp $0x40,%rax
+ jl .Lxorpart4
+ vpxor 0x30(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x30(%rsi)
+ vextracti128 $1,%ymm10,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm10
+ cmp $0x50,%rax
+ jl .Lxorpart4
+ vpxor 0x40(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm10
+ cmp $0x60,%rax
+ jl .Lxorpart4
+ vpxor 0x50(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm10
+ cmp $0x70,%rax
+ jl .Lxorpart4
+ vpxor 0x60(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm10
+ cmp $0x80,%rax
+ jl .Lxorpart4
+ vpxor 0x70(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x70(%rsi)
+
+ # o0 = i0 ^ (x0 + s0), third block
+ vpaddd %ymm11,%ymm4,%ymm10
+ cmp $0x90,%rax
+ jl .Lxorpart4
+ vpxor 0x80(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x80(%rsi)
+ vextracti128 $1,%ymm10,%xmm4
+ # o1 = i1 ^ (x1 + s1), third block
+ vpaddd %ymm12,%ymm5,%ymm10
+ cmp $0xa0,%rax
+ jl .Lxorpart4
+ vpxor 0x90(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x90(%rsi)
+ vextracti128 $1,%ymm10,%xmm5
+ # o2 = i2 ^ (x2 + s2), third block
+ vpaddd %ymm13,%ymm6,%ymm10
+ cmp $0xb0,%rax
+ jl .Lxorpart4
+ vpxor 0xa0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xa0(%rsi)
+ vextracti128 $1,%ymm10,%xmm6
+ # o3 = i3 ^ (x3 + s3), third block
+ vpaddd %ymm15,%ymm7,%ymm10
+ cmp $0xc0,%rax
+ jl .Lxorpart4
+ vpxor 0xb0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xb0(%rsi)
+ vextracti128 $1,%ymm10,%xmm7
+
+ # xor and write fourth block
+ vmovdqa %xmm4,%xmm10
+ cmp $0xd0,%rax
+ jl .Lxorpart4
+ vpxor 0xc0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xc0(%rsi)
+
+ vmovdqa %xmm5,%xmm10
+ cmp $0xe0,%rax
+ jl .Lxorpart4
+ vpxor 0xd0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xd0(%rsi)
+
+ vmovdqa %xmm6,%xmm10
+ cmp $0xf0,%rax
+ jl .Lxorpart4
+ vpxor 0xe0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xe0(%rsi)
+
+ vmovdqa %xmm7,%xmm10
+ cmp $0x100,%rax
+ jl .Lxorpart4
+ vpxor 0xf0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xf0(%rsi)
+
+.Ldone4:
+ vzeroupper
+ ret
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone4
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%xmm10,%xmm10
+ vmovdqa %xmm10,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone4
+
+ENDPROC(chacha_4block_xor_avx2)
+
+ENTRY(chacha_8block_xor_avx2)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 8 data blocks output, o
+ # %rdx: up to 8 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts eight consecutive ChaCha blocks by loading
+ # the state matrix in AVX registers eight times. As we need some
+ # scratch registers, we save the first four registers on the stack. The
+ # algorithm performs each operation on the corresponding word of each
+ # state matrix, hence requires no word shuffling. For final XORing step
+ # we transpose the matrix by interleaving 32-, 64- and then 128-bit
+ # words, which allows us to do XOR in AVX registers. 8/16-bit word
+ # rotation is done with the slightly better performing byte shuffling,
+ # 7/12-bit word rotation uses traditional shift+OR.
+
+ vzeroupper
+ # 4 * 32 byte stack, 32-byte aligned
+ lea 8(%rsp),%r10
+ and $~31, %rsp
+ sub $0x80, %rsp
+ mov %rcx,%rax
+
+ # x0..15[0-7] = s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpbroadcastd 0x04(%rdi),%ymm1
+ vpbroadcastd 0x08(%rdi),%ymm2
+ vpbroadcastd 0x0c(%rdi),%ymm3
+ vpbroadcastd 0x10(%rdi),%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm15
+ # x0..3 on stack
+ vmovdqa %ymm0,0x00(%rsp)
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa %ymm3,0x60(%rsp)
+
+ vmovdqa CTRINC(%rip),%ymm1
+ vmovdqa ROT8(%rip),%ymm2
+ vmovdqa ROT16(%rip),%ymm3
+
+ # x12 += counter values 0-3
+ vpaddd %ymm1,%ymm12,%ymm12
+
+.Ldoubleround8:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ vpaddd 0x00(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm3,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ vpaddd 0x20(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm3,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ vpaddd 0x40(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm3,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vpaddd 0x60(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm3,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $12,%ymm4,%ymm0
+ vpsrld $20,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $12,%ymm5,%ymm0
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $12,%ymm6,%ymm0
+ vpsrld $20,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm11,%ymm7,%ymm7
+ vpslld $12,%ymm7,%ymm0
+ vpsrld $20,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ vpaddd 0x00(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm2,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ vpaddd 0x20(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm2,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ vpaddd 0x40(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm2,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vpaddd 0x60(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm2,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxor %ymm8,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm0
+ vpsrld $25,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxor %ymm9,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm0
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxor %ymm10,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm0
+ vpsrld $25,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxor %ymm11,%ymm7,%ymm7
+ vpslld $7,%ymm7,%ymm0
+ vpsrld $25,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ vpaddd 0x00(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm3,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
+ vpaddd 0x20(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm3,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ vpaddd 0x40(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm3,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vpaddd 0x60(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm3,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxor %ymm10,%ymm5,%ymm5
+ vpslld $12,%ymm5,%ymm0
+ vpsrld $20,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxor %ymm11,%ymm6,%ymm6
+ vpslld $12,%ymm6,%ymm0
+ vpsrld $20,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpslld $12,%ymm7,%ymm0
+ vpsrld $20,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm9,%ymm4,%ymm4
+ vpslld $12,%ymm4,%ymm0
+ vpsrld $20,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ vpaddd 0x00(%rsp),%ymm5,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpxor %ymm0,%ymm15,%ymm15
+ vpshufb %ymm2,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ vpaddd 0x20(%rsp),%ymm6,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpxor %ymm0,%ymm12,%ymm12
+ vpshufb %ymm2,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ vpaddd 0x40(%rsp),%ymm7,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpxor %ymm0,%ymm13,%ymm13
+ vpshufb %ymm2,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vpaddd 0x60(%rsp),%ymm4,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpxor %ymm0,%ymm14,%ymm14
+ vpshufb %ymm2,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxor %ymm10,%ymm5,%ymm5
+ vpslld $7,%ymm5,%ymm0
+ vpsrld $25,%ymm5,%ymm5
+ vpor %ymm0,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxor %ymm11,%ymm6,%ymm6
+ vpslld $7,%ymm6,%ymm0
+ vpsrld $25,%ymm6,%ymm6
+ vpor %ymm0,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxor %ymm8,%ymm7,%ymm7
+ vpslld $7,%ymm7,%ymm0
+ vpsrld $25,%ymm7,%ymm7
+ vpor %ymm0,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxor %ymm9,%ymm4,%ymm4
+ vpslld $7,%ymm4,%ymm0
+ vpsrld $25,%ymm4,%ymm4
+ vpor %ymm0,%ymm4,%ymm4
+
+ sub $2,%r8d
+ jnz .Ldoubleround8
+
+ # x0..15[0-3] += s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpaddd 0x00(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+ vpbroadcastd 0x04(%rdi),%ymm0
+ vpaddd 0x20(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x20(%rsp)
+ vpbroadcastd 0x08(%rdi),%ymm0
+ vpaddd 0x40(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x40(%rsp)
+ vpbroadcastd 0x0c(%rdi),%ymm0
+ vpaddd 0x60(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x60(%rsp)
+ vpbroadcastd 0x10(%rdi),%ymm0
+ vpaddd %ymm0,%ymm4,%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm0
+ vpaddd %ymm0,%ymm5,%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm0
+ vpaddd %ymm0,%ymm6,%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm7,%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm0
+ vpaddd %ymm0,%ymm8,%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm0
+ vpaddd %ymm0,%ymm9,%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm0
+ vpaddd %ymm0,%ymm10,%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm11,%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm0
+ vpaddd %ymm0,%ymm12,%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm0
+ vpaddd %ymm0,%ymm13,%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm0
+ vpaddd %ymm0,%ymm14,%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm0
+ vpaddd %ymm0,%ymm15,%ymm15
+
+ # x12 += counter values 0-3
+ vpaddd %ymm1,%ymm12,%ymm12
+
+ # interleave 32-bit words in state n, n+1
+ vmovdqa 0x00(%rsp),%ymm0
+ vmovdqa 0x20(%rsp),%ymm1
+ vpunpckldq %ymm1,%ymm0,%ymm2
+ vpunpckhdq %ymm1,%ymm0,%ymm1
+ vmovdqa %ymm2,0x00(%rsp)
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa 0x40(%rsp),%ymm0
+ vmovdqa 0x60(%rsp),%ymm1
+ vpunpckldq %ymm1,%ymm0,%ymm2
+ vpunpckhdq %ymm1,%ymm0,%ymm1
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa %ymm1,0x60(%rsp)
+ vmovdqa %ymm4,%ymm0
+ vpunpckldq %ymm5,%ymm0,%ymm4
+ vpunpckhdq %ymm5,%ymm0,%ymm5
+ vmovdqa %ymm6,%ymm0
+ vpunpckldq %ymm7,%ymm0,%ymm6
+ vpunpckhdq %ymm7,%ymm0,%ymm7
+ vmovdqa %ymm8,%ymm0
+ vpunpckldq %ymm9,%ymm0,%ymm8
+ vpunpckhdq %ymm9,%ymm0,%ymm9
+ vmovdqa %ymm10,%ymm0
+ vpunpckldq %ymm11,%ymm0,%ymm10
+ vpunpckhdq %ymm11,%ymm0,%ymm11
+ vmovdqa %ymm12,%ymm0
+ vpunpckldq %ymm13,%ymm0,%ymm12
+ vpunpckhdq %ymm13,%ymm0,%ymm13
+ vmovdqa %ymm14,%ymm0
+ vpunpckldq %ymm15,%ymm0,%ymm14
+ vpunpckhdq %ymm15,%ymm0,%ymm15
+
+ # interleave 64-bit words in state n, n+2
+ vmovdqa 0x00(%rsp),%ymm0
+ vmovdqa 0x40(%rsp),%ymm2
+ vpunpcklqdq %ymm2,%ymm0,%ymm1
+ vpunpckhqdq %ymm2,%ymm0,%ymm2
+ vmovdqa %ymm1,0x00(%rsp)
+ vmovdqa %ymm2,0x40(%rsp)
+ vmovdqa 0x20(%rsp),%ymm0
+ vmovdqa 0x60(%rsp),%ymm2
+ vpunpcklqdq %ymm2,%ymm0,%ymm1
+ vpunpckhqdq %ymm2,%ymm0,%ymm2
+ vmovdqa %ymm1,0x20(%rsp)
+ vmovdqa %ymm2,0x60(%rsp)
+ vmovdqa %ymm4,%ymm0
+ vpunpcklqdq %ymm6,%ymm0,%ymm4
+ vpunpckhqdq %ymm6,%ymm0,%ymm6
+ vmovdqa %ymm5,%ymm0
+ vpunpcklqdq %ymm7,%ymm0,%ymm5
+ vpunpckhqdq %ymm7,%ymm0,%ymm7
+ vmovdqa %ymm8,%ymm0
+ vpunpcklqdq %ymm10,%ymm0,%ymm8
+ vpunpckhqdq %ymm10,%ymm0,%ymm10
+ vmovdqa %ymm9,%ymm0
+ vpunpcklqdq %ymm11,%ymm0,%ymm9
+ vpunpckhqdq %ymm11,%ymm0,%ymm11
+ vmovdqa %ymm12,%ymm0
+ vpunpcklqdq %ymm14,%ymm0,%ymm12
+ vpunpckhqdq %ymm14,%ymm0,%ymm14
+ vmovdqa %ymm13,%ymm0
+ vpunpcklqdq %ymm15,%ymm0,%ymm13
+ vpunpckhqdq %ymm15,%ymm0,%ymm15
+
+ # interleave 128-bit words in state n, n+4
+ # xor/write first four blocks
+ vmovdqa 0x00(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm4,%ymm1,%ymm0
+ cmp $0x0020,%rax
+ jl .Lxorpart8
+ vpxor 0x0000(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0000(%rsi)
+ vperm2i128 $0x31,%ymm4,%ymm1,%ymm4
+
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
+ cmp $0x0040,%rax
+ jl .Lxorpart8
+ vpxor 0x0020(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0020(%rsi)
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+
+ vmovdqa 0x40(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm6,%ymm1,%ymm0
+ cmp $0x0060,%rax
+ jl .Lxorpart8
+ vpxor 0x0040(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0040(%rsi)
+ vperm2i128 $0x31,%ymm6,%ymm1,%ymm6
+
+ vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
+ cmp $0x0080,%rax
+ jl .Lxorpart8
+ vpxor 0x0060(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0060(%rsi)
+ vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
+
+ vmovdqa 0x20(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
+ cmp $0x00a0,%rax
+ jl .Lxorpart8
+ vpxor 0x0080(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0080(%rsi)
+ vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
+
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ cmp $0x00c0,%rax
+ jl .Lxorpart8
+ vpxor 0x00a0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00a0(%rsi)
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+
+ vmovdqa 0x60(%rsp),%ymm1
+ vperm2i128 $0x20,%ymm7,%ymm1,%ymm0
+ cmp $0x00e0,%rax
+ jl .Lxorpart8
+ vpxor 0x00c0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00c0(%rsi)
+ vperm2i128 $0x31,%ymm7,%ymm1,%ymm7
+
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
+ cmp $0x0100,%rax
+ jl .Lxorpart8
+ vpxor 0x00e0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x00e0(%rsi)
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+
+ # xor remaining blocks, write to output
+ vmovdqa %ymm4,%ymm0
+ cmp $0x0120,%rax
+ jl .Lxorpart8
+ vpxor 0x0100(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0100(%rsi)
+
+ vmovdqa %ymm12,%ymm0
+ cmp $0x0140,%rax
+ jl .Lxorpart8
+ vpxor 0x0120(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0120(%rsi)
+
+ vmovdqa %ymm6,%ymm0
+ cmp $0x0160,%rax
+ jl .Lxorpart8
+ vpxor 0x0140(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0140(%rsi)
+
+ vmovdqa %ymm14,%ymm0
+ cmp $0x0180,%rax
+ jl .Lxorpart8
+ vpxor 0x0160(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0160(%rsi)
+
+ vmovdqa %ymm5,%ymm0
+ cmp $0x01a0,%rax
+ jl .Lxorpart8
+ vpxor 0x0180(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x0180(%rsi)
+
+ vmovdqa %ymm13,%ymm0
+ cmp $0x01c0,%rax
+ jl .Lxorpart8
+ vpxor 0x01a0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01a0(%rsi)
+
+ vmovdqa %ymm7,%ymm0
+ cmp $0x01e0,%rax
+ jl .Lxorpart8
+ vpxor 0x01c0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01c0(%rsi)
+
+ vmovdqa %ymm15,%ymm0
+ cmp $0x0200,%rax
+ jl .Lxorpart8
+ vpxor 0x01e0(%rdx),%ymm0,%ymm0
+ vmovdqu %ymm0,0x01e0(%rsi)
+
+.Ldone8:
+ vzeroupper
+ lea -8(%r10),%rsp
+ ret
+
+.Lxorpart8:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x1f,%r9
+ jz .Ldone8
+ and $~0x1f,%rax
+
+ mov %rsi,%r11
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ vpxor 0x00(%rsp),%ymm0,%ymm0
+ vmovdqa %ymm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ jmp .Ldone8
+
+ENDPROC(chacha_8block_xor_avx2)
diff --git a/arch/x86/crypto/chacha-avx512vl-x86_64.S b/arch/x86/crypto/chacha-avx512vl-x86_64.S
new file mode 100644
index 000000000000..848f9c75fd4f
--- /dev/null
+++ b/arch/x86/crypto/chacha-avx512vl-x86_64.S
@@ -0,0 +1,836 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
+ *
+ * Copyright (C) 2018 Martin Willi
+ */
+
+#include <linux/linkage.h>
+
+.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
+.align 32
+CTR2BL: .octa 0x00000000000000000000000000000000
+ .octa 0x00000000000000000000000000000001
+
+.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
+.align 32
+CTR4BL: .octa 0x00000000000000000000000000000002
+ .octa 0x00000000000000000000000000000003
+
+.section .rodata.cst32.CTR8BL, "aM", @progbits, 32
+.align 32
+CTR8BL: .octa 0x00000003000000020000000100000000
+ .octa 0x00000007000000060000000500000004
+
+.text
+
+ENTRY(chacha_2block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 2 data blocks output, o
+ # %rdx: up to 2 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts two ChaCha blocks by loading the state
+ # matrix twice across four AVX registers. It performs matrix operations
+ # on four words in each matrix in parallel, but requires shuffling to
+ # rearrange the words after each round.
+
+ vzeroupper
+
+ # x0..3[0-2] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+
+ vmovdqa %ymm0,%ymm8
+ vmovdqa %ymm1,%ymm9
+ vmovdqa %ymm2,%ymm10
+ vmovdqa %ymm3,%ymm11
+
+.Ldoubleround:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+
+ sub $2,%r8d
+ jnz .Ldoubleround
+
+ # o0 = i0 ^ (x0 + s0)
+ vpaddd %ymm8,%ymm0,%ymm7
+ cmp $0x10,%rcx
+ jl .Lxorpart2
+ vpxord 0x00(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x00(%rsi)
+ vextracti128 $1,%ymm7,%xmm0
+ # o1 = i1 ^ (x1 + s1)
+ vpaddd %ymm9,%ymm1,%ymm7
+ cmp $0x20,%rcx
+ jl .Lxorpart2
+ vpxord 0x10(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x10(%rsi)
+ vextracti128 $1,%ymm7,%xmm1
+ # o2 = i2 ^ (x2 + s2)
+ vpaddd %ymm10,%ymm2,%ymm7
+ cmp $0x30,%rcx
+ jl .Lxorpart2
+ vpxord 0x20(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x20(%rsi)
+ vextracti128 $1,%ymm7,%xmm2
+ # o3 = i3 ^ (x3 + s3)
+ vpaddd %ymm11,%ymm3,%ymm7
+ cmp $0x40,%rcx
+ jl .Lxorpart2
+ vpxord 0x30(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x30(%rsi)
+ vextracti128 $1,%ymm7,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm7
+ cmp $0x50,%rcx
+ jl .Lxorpart2
+ vpxord 0x40(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm7
+ cmp $0x60,%rcx
+ jl .Lxorpart2
+ vpxord 0x50(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm7
+ cmp $0x70,%rcx
+ jl .Lxorpart2
+ vpxord 0x60(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm7
+ cmp $0x80,%rcx
+ jl .Lxorpart2
+ vpxord 0x70(%rdx),%xmm7,%xmm6
+ vmovdqu %xmm6,0x70(%rsi)
+
+.Ldone2:
+ vzeroupper
+ ret
+
+.Lxorpart2:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0xf,%rcx
+ jz .Ldone8
+ mov %rax,%r9
+ and $~0xf,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
+ vpxord %xmm7,%xmm1,%xmm1
+ vmovdqu8 %xmm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone2
+
+ENDPROC(chacha_2block_xor_avx512vl)
+
+ENTRY(chacha_4block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts four ChaCha blocks by loading the state
+ # matrix four times across eight AVX registers. It performs matrix
+ # operations on four words in two matrices in parallel, sequentially
+ # to the operations on the four words of the other two matrices. The
+ # required word shuffling has a rather high latency, we can do the
+ # arithmetic on two matrix-pairs without much slowdown.
+
+ vzeroupper
+
+ # x0..3[0-4] = s0..3
+ vbroadcasti128 0x00(%rdi),%ymm0
+ vbroadcasti128 0x10(%rdi),%ymm1
+ vbroadcasti128 0x20(%rdi),%ymm2
+ vbroadcasti128 0x30(%rdi),%ymm3
+
+ vmovdqa %ymm0,%ymm4
+ vmovdqa %ymm1,%ymm5
+ vmovdqa %ymm2,%ymm6
+ vmovdqa %ymm3,%ymm7
+
+ vpaddd CTR2BL(%rip),%ymm3,%ymm3
+ vpaddd CTR4BL(%rip),%ymm7,%ymm7
+
+ vmovdqa %ymm0,%ymm11
+ vmovdqa %ymm1,%ymm12
+ vmovdqa %ymm2,%ymm13
+ vmovdqa %ymm3,%ymm14
+ vmovdqa %ymm7,%ymm15
+
+.Ldoubleround4:
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $16,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm1,%ymm1
+ vpshufd $0x39,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm3,%ymm3
+ vpshufd $0x93,%ymm7,%ymm7
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $16,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $16,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $12,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+
+ # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
+ vpaddd %ymm1,%ymm0,%ymm0
+ vpxord %ymm0,%ymm3,%ymm3
+ vprold $8,%ymm3,%ymm3
+
+ vpaddd %ymm5,%ymm4,%ymm4
+ vpxord %ymm4,%ymm7,%ymm7
+ vprold $8,%ymm7,%ymm7
+
+ # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
+ vpaddd %ymm3,%ymm2,%ymm2
+ vpxord %ymm2,%ymm1,%ymm1
+ vprold $7,%ymm1,%ymm1
+
+ vpaddd %ymm7,%ymm6,%ymm6
+ vpxord %ymm6,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+
+ # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
+ vpshufd $0x93,%ymm1,%ymm1
+ vpshufd $0x93,%ymm5,%ymm5
+ # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
+ vpshufd $0x4e,%ymm2,%ymm2
+ vpshufd $0x4e,%ymm6,%ymm6
+ # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
+ vpshufd $0x39,%ymm3,%ymm3
+ vpshufd $0x39,%ymm7,%ymm7
+
+ sub $2,%r8d
+ jnz .Ldoubleround4
+
+ # o0 = i0 ^ (x0 + s0), first block
+ vpaddd %ymm11,%ymm0,%ymm10
+ cmp $0x10,%rcx
+ jl .Lxorpart4
+ vpxord 0x00(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x00(%rsi)
+ vextracti128 $1,%ymm10,%xmm0
+ # o1 = i1 ^ (x1 + s1), first block
+ vpaddd %ymm12,%ymm1,%ymm10
+ cmp $0x20,%rcx
+ jl .Lxorpart4
+ vpxord 0x10(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x10(%rsi)
+ vextracti128 $1,%ymm10,%xmm1
+ # o2 = i2 ^ (x2 + s2), first block
+ vpaddd %ymm13,%ymm2,%ymm10
+ cmp $0x30,%rcx
+ jl .Lxorpart4
+ vpxord 0x20(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x20(%rsi)
+ vextracti128 $1,%ymm10,%xmm2
+ # o3 = i3 ^ (x3 + s3), first block
+ vpaddd %ymm14,%ymm3,%ymm10
+ cmp $0x40,%rcx
+ jl .Lxorpart4
+ vpxord 0x30(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x30(%rsi)
+ vextracti128 $1,%ymm10,%xmm3
+
+ # xor and write second block
+ vmovdqa %xmm0,%xmm10
+ cmp $0x50,%rcx
+ jl .Lxorpart4
+ vpxord 0x40(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x40(%rsi)
+
+ vmovdqa %xmm1,%xmm10
+ cmp $0x60,%rcx
+ jl .Lxorpart4
+ vpxord 0x50(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x50(%rsi)
+
+ vmovdqa %xmm2,%xmm10
+ cmp $0x70,%rcx
+ jl .Lxorpart4
+ vpxord 0x60(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x60(%rsi)
+
+ vmovdqa %xmm3,%xmm10
+ cmp $0x80,%rcx
+ jl .Lxorpart4
+ vpxord 0x70(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x70(%rsi)
+
+ # o0 = i0 ^ (x0 + s0), third block
+ vpaddd %ymm11,%ymm4,%ymm10
+ cmp $0x90,%rcx
+ jl .Lxorpart4
+ vpxord 0x80(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x80(%rsi)
+ vextracti128 $1,%ymm10,%xmm4
+ # o1 = i1 ^ (x1 + s1), third block
+ vpaddd %ymm12,%ymm5,%ymm10
+ cmp $0xa0,%rcx
+ jl .Lxorpart4
+ vpxord 0x90(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0x90(%rsi)
+ vextracti128 $1,%ymm10,%xmm5
+ # o2 = i2 ^ (x2 + s2), third block
+ vpaddd %ymm13,%ymm6,%ymm10
+ cmp $0xb0,%rcx
+ jl .Lxorpart4
+ vpxord 0xa0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xa0(%rsi)
+ vextracti128 $1,%ymm10,%xmm6
+ # o3 = i3 ^ (x3 + s3), third block
+ vpaddd %ymm15,%ymm7,%ymm10
+ cmp $0xc0,%rcx
+ jl .Lxorpart4
+ vpxord 0xb0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xb0(%rsi)
+ vextracti128 $1,%ymm10,%xmm7
+
+ # xor and write fourth block
+ vmovdqa %xmm4,%xmm10
+ cmp $0xd0,%rcx
+ jl .Lxorpart4
+ vpxord 0xc0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xc0(%rsi)
+
+ vmovdqa %xmm5,%xmm10
+ cmp $0xe0,%rcx
+ jl .Lxorpart4
+ vpxord 0xd0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xd0(%rsi)
+
+ vmovdqa %xmm6,%xmm10
+ cmp $0xf0,%rcx
+ jl .Lxorpart4
+ vpxord 0xe0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xe0(%rsi)
+
+ vmovdqa %xmm7,%xmm10
+ cmp $0x100,%rcx
+ jl .Lxorpart4
+ vpxord 0xf0(%rdx),%xmm10,%xmm9
+ vmovdqu %xmm9,0xf0(%rsi)
+
+.Ldone4:
+ vzeroupper
+ ret
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0xf,%rcx
+ jz .Ldone8
+ mov %rax,%r9
+ and $~0xf,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%xmm1{%k1}{z}
+ vpxord %xmm10,%xmm1,%xmm1
+ vmovdqu8 %xmm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone4
+
+ENDPROC(chacha_4block_xor_avx512vl)
+
+ENTRY(chacha_8block_xor_avx512vl)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 8 data blocks output, o
+ # %rdx: up to 8 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+
+ # This function encrypts eight consecutive ChaCha blocks by loading
+ # the state matrix in AVX registers eight times. Compared to AVX2, this
+ # mostly benefits from the new rotate instructions in VL and the
+ # additional registers.
+
+ vzeroupper
+
+ # x0..15[0-7] = s[0..15]
+ vpbroadcastd 0x00(%rdi),%ymm0
+ vpbroadcastd 0x04(%rdi),%ymm1
+ vpbroadcastd 0x08(%rdi),%ymm2
+ vpbroadcastd 0x0c(%rdi),%ymm3
+ vpbroadcastd 0x10(%rdi),%ymm4
+ vpbroadcastd 0x14(%rdi),%ymm5
+ vpbroadcastd 0x18(%rdi),%ymm6
+ vpbroadcastd 0x1c(%rdi),%ymm7
+ vpbroadcastd 0x20(%rdi),%ymm8
+ vpbroadcastd 0x24(%rdi),%ymm9
+ vpbroadcastd 0x28(%rdi),%ymm10
+ vpbroadcastd 0x2c(%rdi),%ymm11
+ vpbroadcastd 0x30(%rdi),%ymm12
+ vpbroadcastd 0x34(%rdi),%ymm13
+ vpbroadcastd 0x38(%rdi),%ymm14
+ vpbroadcastd 0x3c(%rdi),%ymm15
+
+ # x12 += counter values 0-3
+ vpaddd CTR8BL(%rip),%ymm12,%ymm12
+
+ vmovdqa64 %ymm0,%ymm16
+ vmovdqa64 %ymm1,%ymm17
+ vmovdqa64 %ymm2,%ymm18
+ vmovdqa64 %ymm3,%ymm19
+ vmovdqa64 %ymm4,%ymm20
+ vmovdqa64 %ymm5,%ymm21
+ vmovdqa64 %ymm6,%ymm22
+ vmovdqa64 %ymm7,%ymm23
+ vmovdqa64 %ymm8,%ymm24
+ vmovdqa64 %ymm9,%ymm25
+ vmovdqa64 %ymm10,%ymm26
+ vmovdqa64 %ymm11,%ymm27
+ vmovdqa64 %ymm12,%ymm28
+ vmovdqa64 %ymm13,%ymm29
+ vmovdqa64 %ymm14,%ymm30
+ vmovdqa64 %ymm15,%ymm31
+
+.Ldoubleround8:
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
+ vpaddd %ymm0,%ymm4,%ymm0
+ vpxord %ymm0,%ymm12,%ymm12
+ vprold $16,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
+ vpaddd %ymm1,%ymm5,%ymm1
+ vpxord %ymm1,%ymm13,%ymm13
+ vprold $16,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
+ vpaddd %ymm2,%ymm6,%ymm2
+ vpxord %ymm2,%ymm14,%ymm14
+ vprold $16,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
+ vpaddd %ymm3,%ymm7,%ymm3
+ vpxord %ymm3,%ymm15,%ymm15
+ vprold $16,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxord %ymm8,%ymm4,%ymm4
+ vprold $12,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxord %ymm9,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxord %ymm10,%ymm6,%ymm6
+ vprold $12,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxord %ymm11,%ymm7,%ymm7
+ vprold $12,%ymm7,%ymm7
+
+ # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
+ vpaddd %ymm0,%ymm4,%ymm0
+ vpxord %ymm0,%ymm12,%ymm12
+ vprold $8,%ymm12,%ymm12
+ # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
+ vpaddd %ymm1,%ymm5,%ymm1
+ vpxord %ymm1,%ymm13,%ymm13
+ vprold $8,%ymm13,%ymm13
+ # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
+ vpaddd %ymm2,%ymm6,%ymm2
+ vpxord %ymm2,%ymm14,%ymm14
+ vprold $8,%ymm14,%ymm14
+ # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
+ vpaddd %ymm3,%ymm7,%ymm3
+ vpxord %ymm3,%ymm15,%ymm15
+ vprold $8,%ymm15,%ymm15
+
+ # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
+ vpaddd %ymm12,%ymm8,%ymm8
+ vpxord %ymm8,%ymm4,%ymm4
+ vprold $7,%ymm4,%ymm4
+ # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
+ vpaddd %ymm13,%ymm9,%ymm9
+ vpxord %ymm9,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+ # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
+ vpaddd %ymm14,%ymm10,%ymm10
+ vpxord %ymm10,%ymm6,%ymm6
+ vprold $7,%ymm6,%ymm6
+ # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
+ vpaddd %ymm15,%ymm11,%ymm11
+ vpxord %ymm11,%ymm7,%ymm7
+ vprold $7,%ymm7,%ymm7
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
+ vpaddd %ymm0,%ymm5,%ymm0
+ vpxord %ymm0,%ymm15,%ymm15
+ vprold $16,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
+ vpaddd %ymm1,%ymm6,%ymm1
+ vpxord %ymm1,%ymm12,%ymm12
+ vprold $16,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
+ vpaddd %ymm2,%ymm7,%ymm2
+ vpxord %ymm2,%ymm13,%ymm13
+ vprold $16,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
+ vpaddd %ymm3,%ymm4,%ymm3
+ vpxord %ymm3,%ymm14,%ymm14
+ vprold $16,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxord %ymm10,%ymm5,%ymm5
+ vprold $12,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxord %ymm11,%ymm6,%ymm6
+ vprold $12,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxord %ymm8,%ymm7,%ymm7
+ vprold $12,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxord %ymm9,%ymm4,%ymm4
+ vprold $12,%ymm4,%ymm4
+
+ # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
+ vpaddd %ymm0,%ymm5,%ymm0
+ vpxord %ymm0,%ymm15,%ymm15
+ vprold $8,%ymm15,%ymm15
+ # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
+ vpaddd %ymm1,%ymm6,%ymm1
+ vpxord %ymm1,%ymm12,%ymm12
+ vprold $8,%ymm12,%ymm12
+ # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
+ vpaddd %ymm2,%ymm7,%ymm2
+ vpxord %ymm2,%ymm13,%ymm13
+ vprold $8,%ymm13,%ymm13
+ # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
+ vpaddd %ymm3,%ymm4,%ymm3
+ vpxord %ymm3,%ymm14,%ymm14
+ vprold $8,%ymm14,%ymm14
+
+ # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
+ vpaddd %ymm15,%ymm10,%ymm10
+ vpxord %ymm10,%ymm5,%ymm5
+ vprold $7,%ymm5,%ymm5
+ # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
+ vpaddd %ymm12,%ymm11,%ymm11
+ vpxord %ymm11,%ymm6,%ymm6
+ vprold $7,%ymm6,%ymm6
+ # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
+ vpaddd %ymm13,%ymm8,%ymm8
+ vpxord %ymm8,%ymm7,%ymm7
+ vprold $7,%ymm7,%ymm7
+ # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
+ vpaddd %ymm14,%ymm9,%ymm9
+ vpxord %ymm9,%ymm4,%ymm4
+ vprold $7,%ymm4,%ymm4
+
+ sub $2,%r8d
+ jnz .Ldoubleround8
+
+ # x0..15[0-3] += s[0..15]
+ vpaddd %ymm16,%ymm0,%ymm0
+ vpaddd %ymm17,%ymm1,%ymm1
+ vpaddd %ymm18,%ymm2,%ymm2
+ vpaddd %ymm19,%ymm3,%ymm3
+ vpaddd %ymm20,%ymm4,%ymm4
+ vpaddd %ymm21,%ymm5,%ymm5
+ vpaddd %ymm22,%ymm6,%ymm6
+ vpaddd %ymm23,%ymm7,%ymm7
+ vpaddd %ymm24,%ymm8,%ymm8
+ vpaddd %ymm25,%ymm9,%ymm9
+ vpaddd %ymm26,%ymm10,%ymm10
+ vpaddd %ymm27,%ymm11,%ymm11
+ vpaddd %ymm28,%ymm12,%ymm12
+ vpaddd %ymm29,%ymm13,%ymm13
+ vpaddd %ymm30,%ymm14,%ymm14
+ vpaddd %ymm31,%ymm15,%ymm15
+
+ # interleave 32-bit words in state n, n+1
+ vpunpckldq %ymm1,%ymm0,%ymm16
+ vpunpckhdq %ymm1,%ymm0,%ymm17
+ vpunpckldq %ymm3,%ymm2,%ymm18
+ vpunpckhdq %ymm3,%ymm2,%ymm19
+ vpunpckldq %ymm5,%ymm4,%ymm20
+ vpunpckhdq %ymm5,%ymm4,%ymm21
+ vpunpckldq %ymm7,%ymm6,%ymm22
+ vpunpckhdq %ymm7,%ymm6,%ymm23
+ vpunpckldq %ymm9,%ymm8,%ymm24
+ vpunpckhdq %ymm9,%ymm8,%ymm25
+ vpunpckldq %ymm11,%ymm10,%ymm26
+ vpunpckhdq %ymm11,%ymm10,%ymm27
+ vpunpckldq %ymm13,%ymm12,%ymm28
+ vpunpckhdq %ymm13,%ymm12,%ymm29
+ vpunpckldq %ymm15,%ymm14,%ymm30
+ vpunpckhdq %ymm15,%ymm14,%ymm31
+
+ # interleave 64-bit words in state n, n+2
+ vpunpcklqdq %ymm18,%ymm16,%ymm0
+ vpunpcklqdq %ymm19,%ymm17,%ymm1
+ vpunpckhqdq %ymm18,%ymm16,%ymm2
+ vpunpckhqdq %ymm19,%ymm17,%ymm3
+ vpunpcklqdq %ymm22,%ymm20,%ymm4
+ vpunpcklqdq %ymm23,%ymm21,%ymm5
+ vpunpckhqdq %ymm22,%ymm20,%ymm6
+ vpunpckhqdq %ymm23,%ymm21,%ymm7
+ vpunpcklqdq %ymm26,%ymm24,%ymm8
+ vpunpcklqdq %ymm27,%ymm25,%ymm9
+ vpunpckhqdq %ymm26,%ymm24,%ymm10
+ vpunpckhqdq %ymm27,%ymm25,%ymm11
+ vpunpcklqdq %ymm30,%ymm28,%ymm12
+ vpunpcklqdq %ymm31,%ymm29,%ymm13
+ vpunpckhqdq %ymm30,%ymm28,%ymm14
+ vpunpckhqdq %ymm31,%ymm29,%ymm15
+
+ # interleave 128-bit words in state n, n+4
+ # xor/write first four blocks
+ vmovdqa64 %ymm0,%ymm16
+ vperm2i128 $0x20,%ymm4,%ymm0,%ymm0
+ cmp $0x0020,%rcx
+ jl .Lxorpart8
+ vpxord 0x0000(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0000(%rsi)
+ vmovdqa64 %ymm16,%ymm0
+ vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
+
+ vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
+ cmp $0x0040,%rcx
+ jl .Lxorpart8
+ vpxord 0x0020(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0020(%rsi)
+ vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
+
+ vperm2i128 $0x20,%ymm6,%ymm2,%ymm0
+ cmp $0x0060,%rcx
+ jl .Lxorpart8
+ vpxord 0x0040(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0040(%rsi)
+ vperm2i128 $0x31,%ymm6,%ymm2,%ymm6
+
+ vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
+ cmp $0x0080,%rcx
+ jl .Lxorpart8
+ vpxord 0x0060(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0060(%rsi)
+ vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
+
+ vperm2i128 $0x20,%ymm5,%ymm1,%ymm0
+ cmp $0x00a0,%rcx
+ jl .Lxorpart8
+ vpxord 0x0080(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0080(%rsi)
+ vperm2i128 $0x31,%ymm5,%ymm1,%ymm5
+
+ vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
+ cmp $0x00c0,%rcx
+ jl .Lxorpart8
+ vpxord 0x00a0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00a0(%rsi)
+ vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
+
+ vperm2i128 $0x20,%ymm7,%ymm3,%ymm0
+ cmp $0x00e0,%rcx
+ jl .Lxorpart8
+ vpxord 0x00c0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00c0(%rsi)
+ vperm2i128 $0x31,%ymm7,%ymm3,%ymm7
+
+ vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
+ cmp $0x0100,%rcx
+ jl .Lxorpart8
+ vpxord 0x00e0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x00e0(%rsi)
+ vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
+
+ # xor remaining blocks, write to output
+ vmovdqa64 %ymm4,%ymm0
+ cmp $0x0120,%rcx
+ jl .Lxorpart8
+ vpxord 0x0100(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0100(%rsi)
+
+ vmovdqa64 %ymm12,%ymm0
+ cmp $0x0140,%rcx
+ jl .Lxorpart8
+ vpxord 0x0120(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0120(%rsi)
+
+ vmovdqa64 %ymm6,%ymm0
+ cmp $0x0160,%rcx
+ jl .Lxorpart8
+ vpxord 0x0140(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0140(%rsi)
+
+ vmovdqa64 %ymm14,%ymm0
+ cmp $0x0180,%rcx
+ jl .Lxorpart8
+ vpxord 0x0160(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0160(%rsi)
+
+ vmovdqa64 %ymm5,%ymm0
+ cmp $0x01a0,%rcx
+ jl .Lxorpart8
+ vpxord 0x0180(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x0180(%rsi)
+
+ vmovdqa64 %ymm13,%ymm0
+ cmp $0x01c0,%rcx
+ jl .Lxorpart8
+ vpxord 0x01a0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01a0(%rsi)
+
+ vmovdqa64 %ymm7,%ymm0
+ cmp $0x01e0,%rcx
+ jl .Lxorpart8
+ vpxord 0x01c0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01c0(%rsi)
+
+ vmovdqa64 %ymm15,%ymm0
+ cmp $0x0200,%rcx
+ jl .Lxorpart8
+ vpxord 0x01e0(%rdx),%ymm0,%ymm0
+ vmovdqu64 %ymm0,0x01e0(%rsi)
+
+.Ldone8:
+ vzeroupper
+ ret
+
+.Lxorpart8:
+ # xor remaining bytes from partial register into output
+ mov %rcx,%rax
+ and $0x1f,%rcx
+ jz .Ldone8
+ mov %rax,%r9
+ and $~0x1f,%r9
+
+ mov $1,%rax
+ shld %cl,%rax,%rax
+ sub $1,%rax
+ kmovq %rax,%k1
+
+ vmovdqu8 (%rdx,%r9),%ymm1{%k1}{z}
+ vpxord %ymm0,%ymm1,%ymm1
+ vmovdqu8 %ymm1,(%rsi,%r9){%k1}
+
+ jmp .Ldone8
+
+ENDPROC(chacha_8block_xor_avx512vl)
diff --git a/arch/x86/crypto/chacha20-ssse3-x86_64.S b/arch/x86/crypto/chacha-ssse3-x86_64.S
index 512a2b500fd1..c05a7a963dc3 100644
--- a/arch/x86/crypto/chacha20-ssse3-x86_64.S
+++ b/arch/x86/crypto/chacha-ssse3-x86_64.S
@@ -1,5 +1,5 @@
/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
+ * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
*
* Copyright (C) 2015 Martin Willi
*
@@ -10,6 +10,7 @@
*/
#include <linux/linkage.h>
+#include <asm/frame.h>
.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
@@ -23,35 +24,25 @@ CTRINC: .octa 0x00000003000000020000000100000000
.text
-ENTRY(chacha20_block_xor_ssse3)
- # %rdi: Input state matrix, s
- # %rsi: 1 data block output, o
- # %rdx: 1 data block input, i
-
- # This function encrypts one ChaCha20 block by loading the state matrix
- # in four SSE registers. It performs matrix operation on four words in
- # parallel, but requireds shuffling to rearrange the words after each
- # round. 8/16-bit word rotation is done with the slightly better
- # performing SSSE3 byte shuffling, 7/12-bit word rotation uses
- # traditional shift+OR.
-
- # x0..3 = s0..3
- movdqa 0x00(%rdi),%xmm0
- movdqa 0x10(%rdi),%xmm1
- movdqa 0x20(%rdi),%xmm2
- movdqa 0x30(%rdi),%xmm3
- movdqa %xmm0,%xmm8
- movdqa %xmm1,%xmm9
- movdqa %xmm2,%xmm10
- movdqa %xmm3,%xmm11
+/*
+ * chacha_permute - permute one block
+ *
+ * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
+ * function performs matrix operations on four words in parallel, but requires
+ * shuffling to rearrange the words after each round. 8/16-bit word rotation is
+ * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
+ * rotation uses traditional shift+OR.
+ *
+ * The round count is given in %r8d.
+ *
+ * Clobbers: %r8d, %xmm4-%xmm7
+ */
+chacha_permute:
movdqa ROT8(%rip),%xmm4
movdqa ROT16(%rip),%xmm5
- mov $10,%ecx
-
.Ldoubleround:
-
# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
@@ -118,39 +109,129 @@ ENTRY(chacha20_block_xor_ssse3)
# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
pshufd $0x39,%xmm3,%xmm3
- dec %ecx
+ sub $2,%r8d
jnz .Ldoubleround
+ ret
+ENDPROC(chacha_permute)
+
+ENTRY(chacha_block_xor_ssse3)
+ # %rdi: Input state matrix, s
+ # %rsi: up to 1 data block output, o
+ # %rdx: up to 1 data block input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
+ FRAME_BEGIN
+
+ # x0..3 = s0..3
+ movdqa 0x00(%rdi),%xmm0
+ movdqa 0x10(%rdi),%xmm1
+ movdqa 0x20(%rdi),%xmm2
+ movdqa 0x30(%rdi),%xmm3
+ movdqa %xmm0,%xmm8
+ movdqa %xmm1,%xmm9
+ movdqa %xmm2,%xmm10
+ movdqa %xmm3,%xmm11
+
+ mov %rcx,%rax
+ call chacha_permute
+
# o0 = i0 ^ (x0 + s0)
- movdqu 0x00(%rdx),%xmm4
paddd %xmm8,%xmm0
+ cmp $0x10,%rax
+ jl .Lxorpart
+ movdqu 0x00(%rdx),%xmm4
pxor %xmm4,%xmm0
movdqu %xmm0,0x00(%rsi)
# o1 = i1 ^ (x1 + s1)
- movdqu 0x10(%rdx),%xmm5
paddd %xmm9,%xmm1
- pxor %xmm5,%xmm1
- movdqu %xmm1,0x10(%rsi)
+ movdqa %xmm1,%xmm0
+ cmp $0x20,%rax
+ jl .Lxorpart
+ movdqu 0x10(%rdx),%xmm0
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x10(%rsi)
# o2 = i2 ^ (x2 + s2)
- movdqu 0x20(%rdx),%xmm6
paddd %xmm10,%xmm2
- pxor %xmm6,%xmm2
- movdqu %xmm2,0x20(%rsi)
+ movdqa %xmm2,%xmm0
+ cmp $0x30,%rax
+ jl .Lxorpart
+ movdqu 0x20(%rdx),%xmm0
+ pxor %xmm2,%xmm0
+ movdqu %xmm0,0x20(%rsi)
# o3 = i3 ^ (x3 + s3)
- movdqu 0x30(%rdx),%xmm7
paddd %xmm11,%xmm3
- pxor %xmm7,%xmm3
- movdqu %xmm3,0x30(%rsi)
+ movdqa %xmm3,%xmm0
+ cmp $0x40,%rax
+ jl .Lxorpart
+ movdqu 0x30(%rdx),%xmm0
+ pxor %xmm3,%xmm0
+ movdqu %xmm0,0x30(%rsi)
+
+.Ldone:
+ FRAME_END
+ ret
+
+.Lxorpart:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea 8(%rsp),%r10
+ sub $0x10,%rsp
+ and $~31,%rsp
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ pxor 0x00(%rsp),%xmm0
+ movdqa %xmm0,0x00(%rsp)
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ lea -8(%r10),%rsp
+ jmp .Ldone
+
+ENDPROC(chacha_block_xor_ssse3)
+
+ENTRY(hchacha_block_ssse3)
+ # %rdi: Input state matrix, s
+ # %rsi: output (8 32-bit words)
+ # %edx: nrounds
+ FRAME_BEGIN
+
+ movdqa 0x00(%rdi),%xmm0
+ movdqa 0x10(%rdi),%xmm1
+ movdqa 0x20(%rdi),%xmm2
+ movdqa 0x30(%rdi),%xmm3
+
+ mov %edx,%r8d
+ call chacha_permute
+
+ movdqu %xmm0,0x00(%rsi)
+ movdqu %xmm3,0x10(%rsi)
+
+ FRAME_END
ret
-ENDPROC(chacha20_block_xor_ssse3)
+ENDPROC(hchacha_block_ssse3)
-ENTRY(chacha20_4block_xor_ssse3)
+ENTRY(chacha_4block_xor_ssse3)
# %rdi: Input state matrix, s
- # %rsi: 4 data blocks output, o
- # %rdx: 4 data blocks input, i
+ # %rsi: up to 4 data blocks output, o
+ # %rdx: up to 4 data blocks input, i
+ # %rcx: input/output length in bytes
+ # %r8d: nrounds
- # This function encrypts four consecutive ChaCha20 blocks by loading the
+ # This function encrypts four consecutive ChaCha blocks by loading the
# the state matrix in SSE registers four times. As we need some scratch
# registers, we save the first four registers on the stack. The
# algorithm performs each operation on the corresponding word of each
@@ -163,6 +244,7 @@ ENTRY(chacha20_4block_xor_ssse3)
lea 8(%rsp),%r10
sub $0x80,%rsp
and $~63,%rsp
+ mov %rcx,%rax
# x0..15[0-3] = s0..3[0..3]
movq 0x00(%rdi),%xmm1
@@ -202,8 +284,6 @@ ENTRY(chacha20_4block_xor_ssse3)
# x12 += counter values 0-3
paddd %xmm1,%xmm12
- mov $10,%ecx
-
.Ldoubleround4:
# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
movdqa 0x00(%rsp),%xmm0
@@ -421,7 +501,7 @@ ENTRY(chacha20_4block_xor_ssse3)
psrld $25,%xmm4
por %xmm0,%xmm4
- dec %ecx
+ sub $2,%r8d
jnz .Ldoubleround4
# x0[0-3] += s0[0]
@@ -573,58 +653,143 @@ ENTRY(chacha20_4block_xor_ssse3)
# xor with corresponding input, write to output
movdqa 0x00(%rsp),%xmm0
+ cmp $0x10,%rax
+ jl .Lxorpart4
movdqu 0x00(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x00(%rsi)
- movdqa 0x10(%rsp),%xmm0
- movdqu 0x80(%rdx),%xmm1
+
+ movdqu %xmm4,%xmm0
+ cmp $0x20,%rax
+ jl .Lxorpart4
+ movdqu 0x10(%rdx),%xmm1
pxor %xmm1,%xmm0
- movdqu %xmm0,0x80(%rsi)
+ movdqu %xmm0,0x10(%rsi)
+
+ movdqu %xmm8,%xmm0
+ cmp $0x30,%rax
+ jl .Lxorpart4
+ movdqu 0x20(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x20(%rsi)
+
+ movdqu %xmm12,%xmm0
+ cmp $0x40,%rax
+ jl .Lxorpart4
+ movdqu 0x30(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x30(%rsi)
+
movdqa 0x20(%rsp),%xmm0
+ cmp $0x50,%rax
+ jl .Lxorpart4
movdqu 0x40(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0x40(%rsi)
+
+ movdqu %xmm6,%xmm0
+ cmp $0x60,%rax
+ jl .Lxorpart4
+ movdqu 0x50(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x50(%rsi)
+
+ movdqu %xmm10,%xmm0
+ cmp $0x70,%rax
+ jl .Lxorpart4
+ movdqu 0x60(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x60(%rsi)
+
+ movdqu %xmm14,%xmm0
+ cmp $0x80,%rax
+ jl .Lxorpart4
+ movdqu 0x70(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x70(%rsi)
+
+ movdqa 0x10(%rsp),%xmm0
+ cmp $0x90,%rax
+ jl .Lxorpart4
+ movdqu 0x80(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x80(%rsi)
+
+ movdqu %xmm5,%xmm0
+ cmp $0xa0,%rax
+ jl .Lxorpart4
+ movdqu 0x90(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0x90(%rsi)
+
+ movdqu %xmm9,%xmm0
+ cmp $0xb0,%rax
+ jl .Lxorpart4
+ movdqu 0xa0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xa0(%rsi)
+
+ movdqu %xmm13,%xmm0
+ cmp $0xc0,%rax
+ jl .Lxorpart4
+ movdqu 0xb0(%rdx),%xmm1
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xb0(%rsi)
+
movdqa 0x30(%rsp),%xmm0
+ cmp $0xd0,%rax
+ jl .Lxorpart4
movdqu 0xc0(%rdx),%xmm1
pxor %xmm1,%xmm0
movdqu %xmm0,0xc0(%rsi)
- movdqu 0x10(%rdx),%xmm1
- pxor %xmm1,%xmm4
- movdqu %xmm4,0x10(%rsi)
- movdqu 0x90(%rdx),%xmm1
- pxor %xmm1,%xmm5
- movdqu %xmm5,0x90(%rsi)
- movdqu 0x50(%rdx),%xmm1
- pxor %xmm1,%xmm6
- movdqu %xmm6,0x50(%rsi)
+
+ movdqu %xmm7,%xmm0
+ cmp $0xe0,%rax
+ jl .Lxorpart4
movdqu 0xd0(%rdx),%xmm1
- pxor %xmm1,%xmm7
- movdqu %xmm7,0xd0(%rsi)
- movdqu 0x20(%rdx),%xmm1
- pxor %xmm1,%xmm8
- movdqu %xmm8,0x20(%rsi)
- movdqu 0xa0(%rdx),%xmm1
- pxor %xmm1,%xmm9
- movdqu %xmm9,0xa0(%rsi)
- movdqu 0x60(%rdx),%xmm1
- pxor %xmm1,%xmm10
- movdqu %xmm10,0x60(%rsi)
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xd0(%rsi)
+
+ movdqu %xmm11,%xmm0
+ cmp $0xf0,%rax
+ jl .Lxorpart4
movdqu 0xe0(%rdx),%xmm1
- pxor %xmm1,%xmm11
- movdqu %xmm11,0xe0(%rsi)
- movdqu 0x30(%rdx),%xmm1
- pxor %xmm1,%xmm12
- movdqu %xmm12,0x30(%rsi)
- movdqu 0xb0(%rdx),%xmm1
- pxor %xmm1,%xmm13
- movdqu %xmm13,0xb0(%rsi)
- movdqu 0x70(%rdx),%xmm1
- pxor %xmm1,%xmm14
- movdqu %xmm14,0x70(%rsi)
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xe0(%rsi)
+
+ movdqu %xmm15,%xmm0
+ cmp $0x100,%rax
+ jl .Lxorpart4
movdqu 0xf0(%rdx),%xmm1
- pxor %xmm1,%xmm15
- movdqu %xmm15,0xf0(%rsi)
+ pxor %xmm1,%xmm0
+ movdqu %xmm0,0xf0(%rsi)
+.Ldone4:
lea -8(%r10),%rsp
ret
-ENDPROC(chacha20_4block_xor_ssse3)
+
+.Lxorpart4:
+ # xor remaining bytes from partial register into output
+ mov %rax,%r9
+ and $0x0f,%r9
+ jz .Ldone4
+ and $~0x0f,%rax
+
+ mov %rsi,%r11
+
+ lea (%rdx,%rax),%rsi
+ mov %rsp,%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ pxor 0x00(%rsp),%xmm0
+ movdqa %xmm0,0x00(%rsp)
+
+ mov %rsp,%rsi
+ lea (%r11,%rax),%rdi
+ mov %r9,%rcx
+ rep movsb
+
+ jmp .Ldone4
+
+ENDPROC(chacha_4block_xor_ssse3)
diff --git a/arch/x86/crypto/chacha20-avx2-x86_64.S b/arch/x86/crypto/chacha20-avx2-x86_64.S
deleted file mode 100644
index f3cd26f48332..000000000000
--- a/arch/x86/crypto/chacha20-avx2-x86_64.S
+++ /dev/null
@@ -1,448 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX2 functions
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <linux/linkage.h>
-
-.section .rodata.cst32.ROT8, "aM", @progbits, 32
-.align 32
-ROT8: .octa 0x0e0d0c0f0a09080b0605040702010003
- .octa 0x0e0d0c0f0a09080b0605040702010003
-
-.section .rodata.cst32.ROT16, "aM", @progbits, 32
-.align 32
-ROT16: .octa 0x0d0c0f0e09080b0a0504070601000302
- .octa 0x0d0c0f0e09080b0a0504070601000302
-
-.section .rodata.cst32.CTRINC, "aM", @progbits, 32
-.align 32
-CTRINC: .octa 0x00000003000000020000000100000000
- .octa 0x00000007000000060000000500000004
-
-.text
-
-ENTRY(chacha20_8block_xor_avx2)
- # %rdi: Input state matrix, s
- # %rsi: 8 data blocks output, o
- # %rdx: 8 data blocks input, i
-
- # This function encrypts eight consecutive ChaCha20 blocks by loading
- # the state matrix in AVX registers eight times. As we need some
- # scratch registers, we save the first four registers on the stack. The
- # algorithm performs each operation on the corresponding word of each
- # state matrix, hence requires no word shuffling. For final XORing step
- # we transpose the matrix by interleaving 32-, 64- and then 128-bit
- # words, which allows us to do XOR in AVX registers. 8/16-bit word
- # rotation is done with the slightly better performing byte shuffling,
- # 7/12-bit word rotation uses traditional shift+OR.
-
- vzeroupper
- # 4 * 32 byte stack, 32-byte aligned
- lea 8(%rsp),%r10
- and $~31, %rsp
- sub $0x80, %rsp
-
- # x0..15[0-7] = s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpbroadcastd 0x04(%rdi),%ymm1
- vpbroadcastd 0x08(%rdi),%ymm2
- vpbroadcastd 0x0c(%rdi),%ymm3
- vpbroadcastd 0x10(%rdi),%ymm4
- vpbroadcastd 0x14(%rdi),%ymm5
- vpbroadcastd 0x18(%rdi),%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm7
- vpbroadcastd 0x20(%rdi),%ymm8
- vpbroadcastd 0x24(%rdi),%ymm9
- vpbroadcastd 0x28(%rdi),%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm11
- vpbroadcastd 0x30(%rdi),%ymm12
- vpbroadcastd 0x34(%rdi),%ymm13
- vpbroadcastd 0x38(%rdi),%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm15
- # x0..3 on stack
- vmovdqa %ymm0,0x00(%rsp)
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa %ymm3,0x60(%rsp)
-
- vmovdqa CTRINC(%rip),%ymm1
- vmovdqa ROT8(%rip),%ymm2
- vmovdqa ROT16(%rip),%ymm3
-
- # x12 += counter values 0-3
- vpaddd %ymm1,%ymm12,%ymm12
-
- mov $10,%ecx
-
-.Ldoubleround8:
- # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
- vpaddd 0x00(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm3,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
- vpaddd 0x20(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm3,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
- vpaddd 0x40(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm3,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
- vpaddd 0x60(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm3,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $12,%ymm4,%ymm0
- vpsrld $20,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $12,%ymm5,%ymm0
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $12,%ymm6,%ymm0
- vpsrld $20,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm11,%ymm7,%ymm7
- vpslld $12,%ymm7,%ymm0
- vpsrld $20,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
-
- # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
- vpaddd 0x00(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm2,%ymm12,%ymm12
- # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
- vpaddd 0x20(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm2,%ymm13,%ymm13
- # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
- vpaddd 0x40(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm2,%ymm14,%ymm14
- # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
- vpaddd 0x60(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm2,%ymm15,%ymm15
-
- # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
- vpaddd %ymm12,%ymm8,%ymm8
- vpxor %ymm8,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm0
- vpsrld $25,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
- # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
- vpaddd %ymm13,%ymm9,%ymm9
- vpxor %ymm9,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm0
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
- vpaddd %ymm14,%ymm10,%ymm10
- vpxor %ymm10,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm0
- vpsrld $25,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
- vpaddd %ymm15,%ymm11,%ymm11
- vpxor %ymm11,%ymm7,%ymm7
- vpslld $7,%ymm7,%ymm0
- vpsrld $25,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
- vpaddd 0x00(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm3,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 16)%ymm0
- vpaddd 0x20(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm3,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
- vpaddd 0x40(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm3,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
- vpaddd 0x60(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm3,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxor %ymm10,%ymm5,%ymm5
- vpslld $12,%ymm5,%ymm0
- vpsrld $20,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxor %ymm11,%ymm6,%ymm6
- vpslld $12,%ymm6,%ymm0
- vpsrld $20,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpslld $12,%ymm7,%ymm0
- vpsrld $20,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm9,%ymm4,%ymm4
- vpslld $12,%ymm4,%ymm0
- vpsrld $20,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
-
- # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
- vpaddd 0x00(%rsp),%ymm5,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpxor %ymm0,%ymm15,%ymm15
- vpshufb %ymm2,%ymm15,%ymm15
- # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
- vpaddd 0x20(%rsp),%ymm6,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpxor %ymm0,%ymm12,%ymm12
- vpshufb %ymm2,%ymm12,%ymm12
- # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
- vpaddd 0x40(%rsp),%ymm7,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpxor %ymm0,%ymm13,%ymm13
- vpshufb %ymm2,%ymm13,%ymm13
- # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
- vpaddd 0x60(%rsp),%ymm4,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpxor %ymm0,%ymm14,%ymm14
- vpshufb %ymm2,%ymm14,%ymm14
-
- # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
- vpaddd %ymm15,%ymm10,%ymm10
- vpxor %ymm10,%ymm5,%ymm5
- vpslld $7,%ymm5,%ymm0
- vpsrld $25,%ymm5,%ymm5
- vpor %ymm0,%ymm5,%ymm5
- # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
- vpaddd %ymm12,%ymm11,%ymm11
- vpxor %ymm11,%ymm6,%ymm6
- vpslld $7,%ymm6,%ymm0
- vpsrld $25,%ymm6,%ymm6
- vpor %ymm0,%ymm6,%ymm6
- # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
- vpaddd %ymm13,%ymm8,%ymm8
- vpxor %ymm8,%ymm7,%ymm7
- vpslld $7,%ymm7,%ymm0
- vpsrld $25,%ymm7,%ymm7
- vpor %ymm0,%ymm7,%ymm7
- # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
- vpaddd %ymm14,%ymm9,%ymm9
- vpxor %ymm9,%ymm4,%ymm4
- vpslld $7,%ymm4,%ymm0
- vpsrld $25,%ymm4,%ymm4
- vpor %ymm0,%ymm4,%ymm4
-
- dec %ecx
- jnz .Ldoubleround8
-
- # x0..15[0-3] += s[0..15]
- vpbroadcastd 0x00(%rdi),%ymm0
- vpaddd 0x00(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x00(%rsp)
- vpbroadcastd 0x04(%rdi),%ymm0
- vpaddd 0x20(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x20(%rsp)
- vpbroadcastd 0x08(%rdi),%ymm0
- vpaddd 0x40(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x40(%rsp)
- vpbroadcastd 0x0c(%rdi),%ymm0
- vpaddd 0x60(%rsp),%ymm0,%ymm0
- vmovdqa %ymm0,0x60(%rsp)
- vpbroadcastd 0x10(%rdi),%ymm0
- vpaddd %ymm0,%ymm4,%ymm4
- vpbroadcastd 0x14(%rdi),%ymm0
- vpaddd %ymm0,%ymm5,%ymm5
- vpbroadcastd 0x18(%rdi),%ymm0
- vpaddd %ymm0,%ymm6,%ymm6
- vpbroadcastd 0x1c(%rdi),%ymm0
- vpaddd %ymm0,%ymm7,%ymm7
- vpbroadcastd 0x20(%rdi),%ymm0
- vpaddd %ymm0,%ymm8,%ymm8
- vpbroadcastd 0x24(%rdi),%ymm0
- vpaddd %ymm0,%ymm9,%ymm9
- vpbroadcastd 0x28(%rdi),%ymm0
- vpaddd %ymm0,%ymm10,%ymm10
- vpbroadcastd 0x2c(%rdi),%ymm0
- vpaddd %ymm0,%ymm11,%ymm11
- vpbroadcastd 0x30(%rdi),%ymm0
- vpaddd %ymm0,%ymm12,%ymm12
- vpbroadcastd 0x34(%rdi),%ymm0
- vpaddd %ymm0,%ymm13,%ymm13
- vpbroadcastd 0x38(%rdi),%ymm0
- vpaddd %ymm0,%ymm14,%ymm14
- vpbroadcastd 0x3c(%rdi),%ymm0
- vpaddd %ymm0,%ymm15,%ymm15
-
- # x12 += counter values 0-3
- vpaddd %ymm1,%ymm12,%ymm12
-
- # interleave 32-bit words in state n, n+1
- vmovdqa 0x00(%rsp),%ymm0
- vmovdqa 0x20(%rsp),%ymm1
- vpunpckldq %ymm1,%ymm0,%ymm2
- vpunpckhdq %ymm1,%ymm0,%ymm1
- vmovdqa %ymm2,0x00(%rsp)
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa 0x40(%rsp),%ymm0
- vmovdqa 0x60(%rsp),%ymm1
- vpunpckldq %ymm1,%ymm0,%ymm2
- vpunpckhdq %ymm1,%ymm0,%ymm1
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa %ymm1,0x60(%rsp)
- vmovdqa %ymm4,%ymm0
- vpunpckldq %ymm5,%ymm0,%ymm4
- vpunpckhdq %ymm5,%ymm0,%ymm5
- vmovdqa %ymm6,%ymm0
- vpunpckldq %ymm7,%ymm0,%ymm6
- vpunpckhdq %ymm7,%ymm0,%ymm7
- vmovdqa %ymm8,%ymm0
- vpunpckldq %ymm9,%ymm0,%ymm8
- vpunpckhdq %ymm9,%ymm0,%ymm9
- vmovdqa %ymm10,%ymm0
- vpunpckldq %ymm11,%ymm0,%ymm10
- vpunpckhdq %ymm11,%ymm0,%ymm11
- vmovdqa %ymm12,%ymm0
- vpunpckldq %ymm13,%ymm0,%ymm12
- vpunpckhdq %ymm13,%ymm0,%ymm13
- vmovdqa %ymm14,%ymm0
- vpunpckldq %ymm15,%ymm0,%ymm14
- vpunpckhdq %ymm15,%ymm0,%ymm15
-
- # interleave 64-bit words in state n, n+2
- vmovdqa 0x00(%rsp),%ymm0
- vmovdqa 0x40(%rsp),%ymm2
- vpunpcklqdq %ymm2,%ymm0,%ymm1
- vpunpckhqdq %ymm2,%ymm0,%ymm2
- vmovdqa %ymm1,0x00(%rsp)
- vmovdqa %ymm2,0x40(%rsp)
- vmovdqa 0x20(%rsp),%ymm0
- vmovdqa 0x60(%rsp),%ymm2
- vpunpcklqdq %ymm2,%ymm0,%ymm1
- vpunpckhqdq %ymm2,%ymm0,%ymm2
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa %ymm2,0x60(%rsp)
- vmovdqa %ymm4,%ymm0
- vpunpcklqdq %ymm6,%ymm0,%ymm4
- vpunpckhqdq %ymm6,%ymm0,%ymm6
- vmovdqa %ymm5,%ymm0
- vpunpcklqdq %ymm7,%ymm0,%ymm5
- vpunpckhqdq %ymm7,%ymm0,%ymm7
- vmovdqa %ymm8,%ymm0
- vpunpcklqdq %ymm10,%ymm0,%ymm8
- vpunpckhqdq %ymm10,%ymm0,%ymm10
- vmovdqa %ymm9,%ymm0
- vpunpcklqdq %ymm11,%ymm0,%ymm9
- vpunpckhqdq %ymm11,%ymm0,%ymm11
- vmovdqa %ymm12,%ymm0
- vpunpcklqdq %ymm14,%ymm0,%ymm12
- vpunpckhqdq %ymm14,%ymm0,%ymm14
- vmovdqa %ymm13,%ymm0
- vpunpcklqdq %ymm15,%ymm0,%ymm13
- vpunpckhqdq %ymm15,%ymm0,%ymm15
-
- # interleave 128-bit words in state n, n+4
- vmovdqa 0x00(%rsp),%ymm0
- vperm2i128 $0x20,%ymm4,%ymm0,%ymm1
- vperm2i128 $0x31,%ymm4,%ymm0,%ymm4
- vmovdqa %ymm1,0x00(%rsp)
- vmovdqa 0x20(%rsp),%ymm0
- vperm2i128 $0x20,%ymm5,%ymm0,%ymm1
- vperm2i128 $0x31,%ymm5,%ymm0,%ymm5
- vmovdqa %ymm1,0x20(%rsp)
- vmovdqa 0x40(%rsp),%ymm0
- vperm2i128 $0x20,%ymm6,%ymm0,%ymm1
- vperm2i128 $0x31,%ymm6,%ymm0,%ymm6
- vmovdqa %ymm1,0x40(%rsp)
- vmovdqa 0x60(%rsp),%ymm0
- vperm2i128 $0x20,%ymm7,%ymm0,%ymm1
- vperm2i128 $0x31,%ymm7,%ymm0,%ymm7
- vmovdqa %ymm1,0x60(%rsp)
- vperm2i128 $0x20,%ymm12,%ymm8,%ymm0
- vperm2i128 $0x31,%ymm12,%ymm8,%ymm12
- vmovdqa %ymm0,%ymm8
- vperm2i128 $0x20,%ymm13,%ymm9,%ymm0
- vperm2i128 $0x31,%ymm13,%ymm9,%ymm13
- vmovdqa %ymm0,%ymm9
- vperm2i128 $0x20,%ymm14,%ymm10,%ymm0
- vperm2i128 $0x31,%ymm14,%ymm10,%ymm14
- vmovdqa %ymm0,%ymm10
- vperm2i128 $0x20,%ymm15,%ymm11,%ymm0
- vperm2i128 $0x31,%ymm15,%ymm11,%ymm15
- vmovdqa %ymm0,%ymm11
-
- # xor with corresponding input, write to output
- vmovdqa 0x00(%rsp),%ymm0
- vpxor 0x0000(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0000(%rsi)
- vmovdqa 0x20(%rsp),%ymm0
- vpxor 0x0080(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0080(%rsi)
- vmovdqa 0x40(%rsp),%ymm0
- vpxor 0x0040(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x0040(%rsi)
- vmovdqa 0x60(%rsp),%ymm0
- vpxor 0x00c0(%rdx),%ymm0,%ymm0
- vmovdqu %ymm0,0x00c0(%rsi)
- vpxor 0x0100(%rdx),%ymm4,%ymm4
- vmovdqu %ymm4,0x0100(%rsi)
- vpxor 0x0180(%rdx),%ymm5,%ymm5
- vmovdqu %ymm5,0x00180(%rsi)
- vpxor 0x0140(%rdx),%ymm6,%ymm6
- vmovdqu %ymm6,0x0140(%rsi)
- vpxor 0x01c0(%rdx),%ymm7,%ymm7
- vmovdqu %ymm7,0x01c0(%rsi)
- vpxor 0x0020(%rdx),%ymm8,%ymm8
- vmovdqu %ymm8,0x0020(%rsi)
- vpxor 0x00a0(%rdx),%ymm9,%ymm9
- vmovdqu %ymm9,0x00a0(%rsi)
- vpxor 0x0060(%rdx),%ymm10,%ymm10
- vmovdqu %ymm10,0x0060(%rsi)
- vpxor 0x00e0(%rdx),%ymm11,%ymm11
- vmovdqu %ymm11,0x00e0(%rsi)
- vpxor 0x0120(%rdx),%ymm12,%ymm12
- vmovdqu %ymm12,0x0120(%rsi)
- vpxor 0x01a0(%rdx),%ymm13,%ymm13
- vmovdqu %ymm13,0x01a0(%rsi)
- vpxor 0x0160(%rdx),%ymm14,%ymm14
- vmovdqu %ymm14,0x0160(%rsi)
- vpxor 0x01e0(%rdx),%ymm15,%ymm15
- vmovdqu %ymm15,0x01e0(%rsi)
-
- vzeroupper
- lea -8(%r10),%rsp
- ret
-ENDPROC(chacha20_8block_xor_avx2)
diff --git a/arch/x86/crypto/chacha20_glue.c b/arch/x86/crypto/chacha20_glue.c
deleted file mode 100644
index dce7c5d39c2f..000000000000
--- a/arch/x86/crypto/chacha20_glue.c
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
- *
- * Copyright (C) 2015 Martin Willi
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- */
-
-#include <crypto/algapi.h>
-#include <crypto/chacha20.h>
-#include <crypto/internal/skcipher.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <asm/fpu/api.h>
-#include <asm/simd.h>
-
-#define CHACHA20_STATE_ALIGN 16
-
-asmlinkage void chacha20_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
-asmlinkage void chacha20_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src);
-#ifdef CONFIG_AS_AVX2
-asmlinkage void chacha20_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src);
-static bool chacha20_use_avx2;
-#endif
-
-static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
- unsigned int bytes)
-{
- u8 buf[CHACHA20_BLOCK_SIZE];
-
-#ifdef CONFIG_AS_AVX2
- if (chacha20_use_avx2) {
- while (bytes >= CHACHA20_BLOCK_SIZE * 8) {
- chacha20_8block_xor_avx2(state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE * 8;
- src += CHACHA20_BLOCK_SIZE * 8;
- dst += CHACHA20_BLOCK_SIZE * 8;
- state[12] += 8;
- }
- }
-#endif
- while (bytes >= CHACHA20_BLOCK_SIZE * 4) {
- chacha20_4block_xor_ssse3(state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE * 4;
- src += CHACHA20_BLOCK_SIZE * 4;
- dst += CHACHA20_BLOCK_SIZE * 4;
- state[12] += 4;
- }
- while (bytes >= CHACHA20_BLOCK_SIZE) {
- chacha20_block_xor_ssse3(state, dst, src);
- bytes -= CHACHA20_BLOCK_SIZE;
- src += CHACHA20_BLOCK_SIZE;
- dst += CHACHA20_BLOCK_SIZE;
- state[12]++;
- }
- if (bytes) {
- memcpy(buf, src, bytes);
- chacha20_block_xor_ssse3(state, buf, buf);
- memcpy(dst, buf, bytes);
- }
-}
-
-static int chacha20_simd(struct skcipher_request *req)
-{
- struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
- struct chacha20_ctx *ctx = crypto_skcipher_ctx(tfm);
- u32 *state, state_buf[16 + 2] __aligned(8);
- struct skcipher_walk walk;
- int err;
-
- BUILD_BUG_ON(CHACHA20_STATE_ALIGN != 16);
- state = PTR_ALIGN(state_buf + 0, CHACHA20_STATE_ALIGN);
-
- if (req->cryptlen <= CHACHA20_BLOCK_SIZE || !may_use_simd())
- return crypto_chacha20_crypt(req);
-
- err = skcipher_walk_virt(&walk, req, true);
-
- crypto_chacha20_init(state, ctx, walk.iv);
-
- kernel_fpu_begin();
-
- while (walk.nbytes >= CHACHA20_BLOCK_SIZE) {
- chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
- rounddown(walk.nbytes, CHACHA20_BLOCK_SIZE));
- err = skcipher_walk_done(&walk,
- walk.nbytes % CHACHA20_BLOCK_SIZE);
- }
-
- if (walk.nbytes) {
- chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
- walk.nbytes);
- err = skcipher_walk_done(&walk, 0);
- }
-
- kernel_fpu_end();
-
- return err;
-}
-
-static struct skcipher_alg alg = {
- .base.cra_name = "chacha20",
- .base.cra_driver_name = "chacha20-simd",
- .base.cra_priority = 300,
- .base.cra_blocksize = 1,
- .base.cra_ctxsize = sizeof(struct chacha20_ctx),
- .base.cra_module = THIS_MODULE,
-
- .min_keysize = CHACHA20_KEY_SIZE,
- .max_keysize = CHACHA20_KEY_SIZE,
- .ivsize = CHACHA20_IV_SIZE,
- .chunksize = CHACHA20_BLOCK_SIZE,
- .setkey = crypto_chacha20_setkey,
- .encrypt = chacha20_simd,
- .decrypt = chacha20_simd,
-};
-
-static int __init chacha20_simd_mod_init(void)
-{
- if (!boot_cpu_has(X86_FEATURE_SSSE3))
- return -ENODEV;
-
-#ifdef CONFIG_AS_AVX2
- chacha20_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
- boot_cpu_has(X86_FEATURE_AVX2) &&
- cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
-#endif
- return crypto_register_skcipher(&alg);
-}
-
-static void __exit chacha20_simd_mod_fini(void)
-{
- crypto_unregister_skcipher(&alg);
-}
-
-module_init(chacha20_simd_mod_init);
-module_exit(chacha20_simd_mod_fini);
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
-MODULE_DESCRIPTION("chacha20 cipher algorithm, SIMD accelerated");
-MODULE_ALIAS_CRYPTO("chacha20");
-MODULE_ALIAS_CRYPTO("chacha20-simd");
diff --git a/arch/x86/crypto/chacha_glue.c b/arch/x86/crypto/chacha_glue.c
new file mode 100644
index 000000000000..45c1c4143176
--- /dev/null
+++ b/arch/x86/crypto/chacha_glue.c
@@ -0,0 +1,304 @@
+/*
+ * x64 SIMD accelerated ChaCha and XChaCha stream ciphers,
+ * including ChaCha20 (RFC7539)
+ *
+ * Copyright (C) 2015 Martin Willi
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <crypto/algapi.h>
+#include <crypto/chacha.h>
+#include <crypto/internal/skcipher.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+#include <asm/simd.h>
+
+#define CHACHA_STATE_ALIGN 16
+
+asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_ssse3(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void hchacha_block_ssse3(const u32 *state, u32 *out, int nrounds);
+#ifdef CONFIG_AS_AVX2
+asmlinkage void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+static bool chacha_use_avx2;
+#ifdef CONFIG_AS_AVX512
+asmlinkage void chacha_2block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_4block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+asmlinkage void chacha_8block_xor_avx512vl(u32 *state, u8 *dst, const u8 *src,
+ unsigned int len, int nrounds);
+static bool chacha_use_avx512vl;
+#endif
+#endif
+
+static unsigned int chacha_advance(unsigned int len, unsigned int maxblocks)
+{
+ len = min(len, maxblocks * CHACHA_BLOCK_SIZE);
+ return round_up(len, CHACHA_BLOCK_SIZE) / CHACHA_BLOCK_SIZE;
+}
+
+static void chacha_dosimd(u32 *state, u8 *dst, const u8 *src,
+ unsigned int bytes, int nrounds)
+{
+#ifdef CONFIG_AS_AVX2
+#ifdef CONFIG_AS_AVX512
+ if (chacha_use_avx512vl) {
+ while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+ chacha_8block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 8;
+ src += CHACHA_BLOCK_SIZE * 8;
+ dst += CHACHA_BLOCK_SIZE * 8;
+ state[12] += 8;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 4) {
+ chacha_8block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ state[12] += chacha_advance(bytes, 8);
+ return;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 2) {
+ chacha_4block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ state[12] += chacha_advance(bytes, 4);
+ return;
+ }
+ if (bytes) {
+ chacha_2block_xor_avx512vl(state, dst, src, bytes,
+ nrounds);
+ state[12] += chacha_advance(bytes, 2);
+ return;
+ }
+ }
+#endif
+ if (chacha_use_avx2) {
+ while (bytes >= CHACHA_BLOCK_SIZE * 8) {
+ chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 8;
+ src += CHACHA_BLOCK_SIZE * 8;
+ dst += CHACHA_BLOCK_SIZE * 8;
+ state[12] += 8;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 4) {
+ chacha_8block_xor_avx2(state, dst, src, bytes, nrounds);
+ state[12] += chacha_advance(bytes, 8);
+ return;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE * 2) {
+ chacha_4block_xor_avx2(state, dst, src, bytes, nrounds);
+ state[12] += chacha_advance(bytes, 4);
+ return;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE) {
+ chacha_2block_xor_avx2(state, dst, src, bytes, nrounds);
+ state[12] += chacha_advance(bytes, 2);
+ return;
+ }
+ }
+#endif
+ while (bytes >= CHACHA_BLOCK_SIZE * 4) {
+ chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+ bytes -= CHACHA_BLOCK_SIZE * 4;
+ src += CHACHA_BLOCK_SIZE * 4;
+ dst += CHACHA_BLOCK_SIZE * 4;
+ state[12] += 4;
+ }
+ if (bytes > CHACHA_BLOCK_SIZE) {
+ chacha_4block_xor_ssse3(state, dst, src, bytes, nrounds);
+ state[12] += chacha_advance(bytes, 4);
+ return;
+ }
+ if (bytes) {
+ chacha_block_xor_ssse3(state, dst, src, bytes, nrounds);
+ state[12]++;
+ }
+}
+
+static int chacha_simd_stream_xor(struct skcipher_walk *walk,
+ struct chacha_ctx *ctx, u8 *iv)
+{
+ u32 *state, state_buf[16 + 2] __aligned(8);
+ int next_yield = 4096; /* bytes until next FPU yield */
+ int err = 0;
+
+ BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+ state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+
+ crypto_chacha_init(state, ctx, iv);
+
+ while (walk->nbytes > 0) {
+ unsigned int nbytes = walk->nbytes;
+
+ if (nbytes < walk->total) {
+ nbytes = round_down(nbytes, walk->stride);
+ next_yield -= nbytes;
+ }
+
+ chacha_dosimd(state, walk->dst.virt.addr, walk->src.virt.addr,
+ nbytes, ctx->nrounds);
+
+ if (next_yield <= 0) {
+ /* temporarily allow preemption */
+ kernel_fpu_end();
+ kernel_fpu_begin();
+ next_yield = 4096;
+ }
+
+ err = skcipher_walk_done(walk, walk->nbytes - nbytes);
+ }
+
+ return err;
+}
+
+static int chacha_simd(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ int err;
+
+ if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
+ return crypto_chacha_crypt(req);
+
+ err = skcipher_walk_virt(&walk, req, true);
+ if (err)
+ return err;
+
+ kernel_fpu_begin();
+ err = chacha_simd_stream_xor(&walk, ctx, req->iv);
+ kernel_fpu_end();
+ return err;
+}
+
+static int xchacha_simd(struct skcipher_request *req)
+{
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req);
+ struct chacha_ctx *ctx = crypto_skcipher_ctx(tfm);
+ struct skcipher_walk walk;
+ struct chacha_ctx subctx;
+ u32 *state, state_buf[16 + 2] __aligned(8);
+ u8 real_iv[16];
+ int err;
+
+ if (req->cryptlen <= CHACHA_BLOCK_SIZE || !irq_fpu_usable())
+ return crypto_xchacha_crypt(req);
+
+ err = skcipher_walk_virt(&walk, req, true);
+ if (err)
+ return err;
+
+ BUILD_BUG_ON(CHACHA_STATE_ALIGN != 16);
+ state = PTR_ALIGN(state_buf + 0, CHACHA_STATE_ALIGN);
+ crypto_chacha_init(state, ctx, req->iv);
+
+ kernel_fpu_begin();
+
+ hchacha_block_ssse3(state, subctx.key, ctx->nrounds);
+ subctx.nrounds = ctx->nrounds;
+
+ memcpy(&real_iv[0], req->iv + 24, 8);
+ memcpy(&real_iv[8], req->iv + 16, 8);
+ err = chacha_simd_stream_xor(&walk, &subctx, real_iv);
+
+ kernel_fpu_end();
+
+ return err;
+}
+
+static struct skcipher_alg algs[] = {
+ {
+ .base.cra_name = "chacha20",
+ .base.cra_driver_name = "chacha20-simd",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = CHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = chacha_simd,
+ .decrypt = chacha_simd,
+ }, {
+ .base.cra_name = "xchacha20",
+ .base.cra_driver_name = "xchacha20-simd",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = crypto_chacha20_setkey,
+ .encrypt = xchacha_simd,
+ .decrypt = xchacha_simd,
+ }, {
+ .base.cra_name = "xchacha12",
+ .base.cra_driver_name = "xchacha12-simd",
+ .base.cra_priority = 300,
+ .base.cra_blocksize = 1,
+ .base.cra_ctxsize = sizeof(struct chacha_ctx),
+ .base.cra_module = THIS_MODULE,
+
+ .min_keysize = CHACHA_KEY_SIZE,
+ .max_keysize = CHACHA_KEY_SIZE,
+ .ivsize = XCHACHA_IV_SIZE,
+ .chunksize = CHACHA_BLOCK_SIZE,
+ .setkey = crypto_chacha12_setkey,
+ .encrypt = xchacha_simd,
+ .decrypt = xchacha_simd,
+ },
+};
+
+static int __init chacha_simd_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_SSSE3))
+ return -ENODEV;
+
+#ifdef CONFIG_AS_AVX2
+ chacha_use_avx2 = boot_cpu_has(X86_FEATURE_AVX) &&
+ boot_cpu_has(X86_FEATURE_AVX2) &&
+ cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL);
+#ifdef CONFIG_AS_AVX512
+ chacha_use_avx512vl = chacha_use_avx2 &&
+ boot_cpu_has(X86_FEATURE_AVX512VL) &&
+ boot_cpu_has(X86_FEATURE_AVX512BW); /* kmovq */
+#endif
+#endif
+ return crypto_register_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit chacha_simd_mod_fini(void)
+{
+ crypto_unregister_skciphers(algs, ARRAY_SIZE(algs));
+}
+
+module_init(chacha_simd_mod_init);
+module_exit(chacha_simd_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Martin Willi <martin@strongswan.org>");
+MODULE_DESCRIPTION("ChaCha and XChaCha stream ciphers (x64 SIMD accelerated)");
+MODULE_ALIAS_CRYPTO("chacha20");
+MODULE_ALIAS_CRYPTO("chacha20-simd");
+MODULE_ALIAS_CRYPTO("xchacha20");
+MODULE_ALIAS_CRYPTO("xchacha20-simd");
+MODULE_ALIAS_CRYPTO("xchacha12");
+MODULE_ALIAS_CRYPTO("xchacha12-simd");
diff --git a/arch/x86/crypto/nh-avx2-x86_64.S b/arch/x86/crypto/nh-avx2-x86_64.S
new file mode 100644
index 000000000000..f7946ea1b704
--- /dev/null
+++ b/arch/x86/crypto/nh-avx2-x86_64.S
@@ -0,0 +1,157 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, x86_64 AVX2 accelerated
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#define PASS0_SUMS %ymm0
+#define PASS1_SUMS %ymm1
+#define PASS2_SUMS %ymm2
+#define PASS3_SUMS %ymm3
+#define K0 %ymm4
+#define K0_XMM %xmm4
+#define K1 %ymm5
+#define K1_XMM %xmm5
+#define K2 %ymm6
+#define K2_XMM %xmm6
+#define K3 %ymm7
+#define K3_XMM %xmm7
+#define T0 %ymm8
+#define T1 %ymm9
+#define T2 %ymm10
+#define T2_XMM %xmm10
+#define T3 %ymm11
+#define T3_XMM %xmm11
+#define T4 %ymm12
+#define T5 %ymm13
+#define T6 %ymm14
+#define T7 %ymm15
+#define KEY %rdi
+#define MESSAGE %rsi
+#define MESSAGE_LEN %rdx
+#define HASH %rcx
+
+.macro _nh_2xstride k0, k1, k2, k3
+
+ // Add message words to key words
+ vpaddd \k0, T3, T0
+ vpaddd \k1, T3, T1
+ vpaddd \k2, T3, T2
+ vpaddd \k3, T3, T3
+
+ // Multiply 32x32 => 64 and accumulate
+ vpshufd $0x10, T0, T4
+ vpshufd $0x32, T0, T0
+ vpshufd $0x10, T1, T5
+ vpshufd $0x32, T1, T1
+ vpshufd $0x10, T2, T6
+ vpshufd $0x32, T2, T2
+ vpshufd $0x10, T3, T7
+ vpshufd $0x32, T3, T3
+ vpmuludq T4, T0, T0
+ vpmuludq T5, T1, T1
+ vpmuludq T6, T2, T2
+ vpmuludq T7, T3, T3
+ vpaddq T0, PASS0_SUMS, PASS0_SUMS
+ vpaddq T1, PASS1_SUMS, PASS1_SUMS
+ vpaddq T2, PASS2_SUMS, PASS2_SUMS
+ vpaddq T3, PASS3_SUMS, PASS3_SUMS
+.endm
+
+/*
+ * void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+ * u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_avx2)
+
+ vmovdqu 0x00(KEY), K0
+ vmovdqu 0x10(KEY), K1
+ add $0x20, KEY
+ vpxor PASS0_SUMS, PASS0_SUMS, PASS0_SUMS
+ vpxor PASS1_SUMS, PASS1_SUMS, PASS1_SUMS
+ vpxor PASS2_SUMS, PASS2_SUMS, PASS2_SUMS
+ vpxor PASS3_SUMS, PASS3_SUMS, PASS3_SUMS
+
+ sub $0x40, MESSAGE_LEN
+ jl .Lloop4_done
+.Lloop4:
+ vmovdqu (MESSAGE), T3
+ vmovdqu 0x00(KEY), K2
+ vmovdqu 0x10(KEY), K3
+ _nh_2xstride K0, K1, K2, K3
+
+ vmovdqu 0x20(MESSAGE), T3
+ vmovdqu 0x20(KEY), K0
+ vmovdqu 0x30(KEY), K1
+ _nh_2xstride K2, K3, K0, K1
+
+ add $0x40, MESSAGE
+ add $0x40, KEY
+ sub $0x40, MESSAGE_LEN
+ jge .Lloop4
+
+.Lloop4_done:
+ and $0x3f, MESSAGE_LEN
+ jz .Ldone
+
+ cmp $0x20, MESSAGE_LEN
+ jl .Llast
+
+ // 2 or 3 strides remain; do 2 more.
+ vmovdqu (MESSAGE), T3
+ vmovdqu 0x00(KEY), K2
+ vmovdqu 0x10(KEY), K3
+ _nh_2xstride K0, K1, K2, K3
+ add $0x20, MESSAGE
+ add $0x20, KEY
+ sub $0x20, MESSAGE_LEN
+ jz .Ldone
+ vmovdqa K2, K0
+ vmovdqa K3, K1
+.Llast:
+ // Last stride. Zero the high 128 bits of the message and keys so they
+ // don't affect the result when processing them like 2 strides.
+ vmovdqu (MESSAGE), T3_XMM
+ vmovdqa K0_XMM, K0_XMM
+ vmovdqa K1_XMM, K1_XMM
+ vmovdqu 0x00(KEY), K2_XMM
+ vmovdqu 0x10(KEY), K3_XMM
+ _nh_2xstride K0, K1, K2, K3
+
+.Ldone:
+ // Sum the accumulators for each pass, then store the sums to 'hash'
+
+ // PASS0_SUMS is (0A 0B 0C 0D)
+ // PASS1_SUMS is (1A 1B 1C 1D)
+ // PASS2_SUMS is (2A 2B 2C 2D)
+ // PASS3_SUMS is (3A 3B 3C 3D)
+ // We need the horizontal sums:
+ // (0A + 0B + 0C + 0D,
+ // 1A + 1B + 1C + 1D,
+ // 2A + 2B + 2C + 2D,
+ // 3A + 3B + 3C + 3D)
+ //
+
+ vpunpcklqdq PASS1_SUMS, PASS0_SUMS, T0 // T0 = (0A 1A 0C 1C)
+ vpunpckhqdq PASS1_SUMS, PASS0_SUMS, T1 // T1 = (0B 1B 0D 1D)
+ vpunpcklqdq PASS3_SUMS, PASS2_SUMS, T2 // T2 = (2A 3A 2C 3C)
+ vpunpckhqdq PASS3_SUMS, PASS2_SUMS, T3 // T3 = (2B 3B 2D 3D)
+
+ vinserti128 $0x1, T2_XMM, T0, T4 // T4 = (0A 1A 2A 3A)
+ vinserti128 $0x1, T3_XMM, T1, T5 // T5 = (0B 1B 2B 3B)
+ vperm2i128 $0x31, T2, T0, T0 // T0 = (0C 1C 2C 3C)
+ vperm2i128 $0x31, T3, T1, T1 // T1 = (0D 1D 2D 3D)
+
+ vpaddq T5, T4, T4
+ vpaddq T1, T0, T0
+ vpaddq T4, T0, T0
+ vmovdqu T0, (HASH)
+ ret
+ENDPROC(nh_avx2)
diff --git a/arch/x86/crypto/nh-sse2-x86_64.S b/arch/x86/crypto/nh-sse2-x86_64.S
new file mode 100644
index 000000000000..51f52d4ab4bb
--- /dev/null
+++ b/arch/x86/crypto/nh-sse2-x86_64.S
@@ -0,0 +1,123 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * NH - ε-almost-universal hash function, x86_64 SSE2 accelerated
+ *
+ * Copyright 2018 Google LLC
+ *
+ * Author: Eric Biggers <ebiggers@google.com>
+ */
+
+#include <linux/linkage.h>
+
+#define PASS0_SUMS %xmm0
+#define PASS1_SUMS %xmm1
+#define PASS2_SUMS %xmm2
+#define PASS3_SUMS %xmm3
+#define K0 %xmm4
+#define K1 %xmm5
+#define K2 %xmm6
+#define K3 %xmm7
+#define T0 %xmm8
+#define T1 %xmm9
+#define T2 %xmm10
+#define T3 %xmm11
+#define T4 %xmm12
+#define T5 %xmm13
+#define T6 %xmm14
+#define T7 %xmm15
+#define KEY %rdi
+#define MESSAGE %rsi
+#define MESSAGE_LEN %rdx
+#define HASH %rcx
+
+.macro _nh_stride k0, k1, k2, k3, offset
+
+ // Load next message stride
+ movdqu \offset(MESSAGE), T1
+
+ // Load next key stride
+ movdqu \offset(KEY), \k3
+
+ // Add message words to key words
+ movdqa T1, T2
+ movdqa T1, T3
+ paddd T1, \k0 // reuse k0 to avoid a move
+ paddd \k1, T1
+ paddd \k2, T2
+ paddd \k3, T3
+
+ // Multiply 32x32 => 64 and accumulate
+ pshufd $0x10, \k0, T4
+ pshufd $0x32, \k0, \k0
+ pshufd $0x10, T1, T5
+ pshufd $0x32, T1, T1
+ pshufd $0x10, T2, T6
+ pshufd $0x32, T2, T2
+ pshufd $0x10, T3, T7
+ pshufd $0x32, T3, T3
+ pmuludq T4, \k0
+ pmuludq T5, T1
+ pmuludq T6, T2
+ pmuludq T7, T3
+ paddq \k0, PASS0_SUMS
+ paddq T1, PASS1_SUMS
+ paddq T2, PASS2_SUMS
+ paddq T3, PASS3_SUMS
+.endm
+
+/*
+ * void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+ * u8 hash[NH_HASH_BYTES])
+ *
+ * It's guaranteed that message_len % 16 == 0.
+ */
+ENTRY(nh_sse2)
+
+ movdqu 0x00(KEY), K0
+ movdqu 0x10(KEY), K1
+ movdqu 0x20(KEY), K2
+ add $0x30, KEY
+ pxor PASS0_SUMS, PASS0_SUMS
+ pxor PASS1_SUMS, PASS1_SUMS
+ pxor PASS2_SUMS, PASS2_SUMS
+ pxor PASS3_SUMS, PASS3_SUMS
+
+ sub $0x40, MESSAGE_LEN
+ jl .Lloop4_done
+.Lloop4:
+ _nh_stride K0, K1, K2, K3, 0x00
+ _nh_stride K1, K2, K3, K0, 0x10
+ _nh_stride K2, K3, K0, K1, 0x20
+ _nh_stride K3, K0, K1, K2, 0x30
+ add $0x40, KEY
+ add $0x40, MESSAGE
+ sub $0x40, MESSAGE_LEN
+ jge .Lloop4
+
+.Lloop4_done:
+ and $0x3f, MESSAGE_LEN
+ jz .Ldone
+ _nh_stride K0, K1, K2, K3, 0x00
+
+ sub $0x10, MESSAGE_LEN
+ jz .Ldone
+ _nh_stride K1, K2, K3, K0, 0x10
+
+ sub $0x10, MESSAGE_LEN
+ jz .Ldone
+ _nh_stride K2, K3, K0, K1, 0x20
+
+.Ldone:
+ // Sum the accumulators for each pass, then store the sums to 'hash'
+ movdqa PASS0_SUMS, T0
+ movdqa PASS2_SUMS, T1
+ punpcklqdq PASS1_SUMS, T0 // => (PASS0_SUM_A PASS1_SUM_A)
+ punpcklqdq PASS3_SUMS, T1 // => (PASS2_SUM_A PASS3_SUM_A)
+ punpckhqdq PASS1_SUMS, PASS0_SUMS // => (PASS0_SUM_B PASS1_SUM_B)
+ punpckhqdq PASS3_SUMS, PASS2_SUMS // => (PASS2_SUM_B PASS3_SUM_B)
+ paddq PASS0_SUMS, T0
+ paddq PASS2_SUMS, T1
+ movdqu T0, 0x00(HASH)
+ movdqu T1, 0x10(HASH)
+ ret
+ENDPROC(nh_sse2)
diff --git a/arch/x86/crypto/nhpoly1305-avx2-glue.c b/arch/x86/crypto/nhpoly1305-avx2-glue.c
new file mode 100644
index 000000000000..20d815ea4b6a
--- /dev/null
+++ b/arch/x86/crypto/nhpoly1305-avx2-glue.c
@@ -0,0 +1,77 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (AVX2 accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+
+asmlinkage void nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+ u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_avx2(const u32 *key, const u8 *message, size_t message_len,
+ __le64 hash[NH_NUM_PASSES])
+{
+ nh_avx2(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_avx2_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ if (srclen < 64 || !irq_fpu_usable())
+ return crypto_nhpoly1305_update(desc, src, srclen);
+
+ do {
+ unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+
+ kernel_fpu_begin();
+ crypto_nhpoly1305_update_helper(desc, src, n, _nh_avx2);
+ kernel_fpu_end();
+ src += n;
+ srclen -= n;
+ } while (srclen);
+ return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+ .base.cra_name = "nhpoly1305",
+ .base.cra_driver_name = "nhpoly1305-avx2",
+ .base.cra_priority = 300,
+ .base.cra_ctxsize = sizeof(struct nhpoly1305_key),
+ .base.cra_module = THIS_MODULE,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = crypto_nhpoly1305_init,
+ .update = nhpoly1305_avx2_update,
+ .final = crypto_nhpoly1305_final,
+ .setkey = crypto_nhpoly1305_setkey,
+ .descsize = sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_AVX2) ||
+ !boot_cpu_has(X86_FEATURE_OSXSAVE))
+ return -ENODEV;
+
+ return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+ crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (AVX2-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-avx2");
diff --git a/arch/x86/crypto/nhpoly1305-sse2-glue.c b/arch/x86/crypto/nhpoly1305-sse2-glue.c
new file mode 100644
index 000000000000..ed68d164ce14
--- /dev/null
+++ b/arch/x86/crypto/nhpoly1305-sse2-glue.c
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NHPoly1305 - ε-almost-∆-universal hash function for Adiantum
+ * (SSE2 accelerated version)
+ *
+ * Copyright 2018 Google LLC
+ */
+
+#include <crypto/internal/hash.h>
+#include <crypto/nhpoly1305.h>
+#include <linux/module.h>
+#include <asm/fpu/api.h>
+
+asmlinkage void nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+ u8 hash[NH_HASH_BYTES]);
+
+/* wrapper to avoid indirect call to assembly, which doesn't work with CFI */
+static void _nh_sse2(const u32 *key, const u8 *message, size_t message_len,
+ __le64 hash[NH_NUM_PASSES])
+{
+ nh_sse2(key, message, message_len, (u8 *)hash);
+}
+
+static int nhpoly1305_sse2_update(struct shash_desc *desc,
+ const u8 *src, unsigned int srclen)
+{
+ if (srclen < 64 || !irq_fpu_usable())
+ return crypto_nhpoly1305_update(desc, src, srclen);
+
+ do {
+ unsigned int n = min_t(unsigned int, srclen, PAGE_SIZE);
+
+ kernel_fpu_begin();
+ crypto_nhpoly1305_update_helper(desc, src, n, _nh_sse2);
+ kernel_fpu_end();
+ src += n;
+ srclen -= n;
+ } while (srclen);
+ return 0;
+}
+
+static struct shash_alg nhpoly1305_alg = {
+ .base.cra_name = "nhpoly1305",
+ .base.cra_driver_name = "nhpoly1305-sse2",
+ .base.cra_priority = 200,
+ .base.cra_ctxsize = sizeof(struct nhpoly1305_key),
+ .base.cra_module = THIS_MODULE,
+ .digestsize = POLY1305_DIGEST_SIZE,
+ .init = crypto_nhpoly1305_init,
+ .update = nhpoly1305_sse2_update,
+ .final = crypto_nhpoly1305_final,
+ .setkey = crypto_nhpoly1305_setkey,
+ .descsize = sizeof(struct nhpoly1305_state),
+};
+
+static int __init nhpoly1305_mod_init(void)
+{
+ if (!boot_cpu_has(X86_FEATURE_XMM2))
+ return -ENODEV;
+
+ return crypto_register_shash(&nhpoly1305_alg);
+}
+
+static void __exit nhpoly1305_mod_exit(void)
+{
+ crypto_unregister_shash(&nhpoly1305_alg);
+}
+
+module_init(nhpoly1305_mod_init);
+module_exit(nhpoly1305_mod_exit);
+
+MODULE_DESCRIPTION("NHPoly1305 ε-almost-∆-universal hash function (SSE2-accelerated)");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Eric Biggers <ebiggers@google.com>");
+MODULE_ALIAS_CRYPTO("nhpoly1305");
+MODULE_ALIAS_CRYPTO("nhpoly1305-sse2");
diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c
index f012b7e28ad1..88cc01506c84 100644
--- a/arch/x86/crypto/poly1305_glue.c
+++ b/arch/x86/crypto/poly1305_glue.c
@@ -83,35 +83,37 @@ static unsigned int poly1305_simd_blocks(struct poly1305_desc_ctx *dctx,
if (poly1305_use_avx2 && srclen >= POLY1305_BLOCK_SIZE * 4) {
if (unlikely(!sctx->wset)) {
if (!sctx->uset) {
- memcpy(sctx->u, dctx->r, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u, dctx->r);
+ memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+ poly1305_simd_mult(sctx->u, dctx->r.r);
sctx->uset = true;
}
memcpy(sctx->u + 5, sctx->u, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u + 5, dctx->r);
+ poly1305_simd_mult(sctx->u + 5, dctx->r.r);
memcpy(sctx->u + 10, sctx->u + 5, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u + 10, dctx->r);
+ poly1305_simd_mult(sctx->u + 10, dctx->r.r);
sctx->wset = true;
}
blocks = srclen / (POLY1305_BLOCK_SIZE * 4);
- poly1305_4block_avx2(dctx->h, src, dctx->r, blocks, sctx->u);
+ poly1305_4block_avx2(dctx->h.h, src, dctx->r.r, blocks,
+ sctx->u);
src += POLY1305_BLOCK_SIZE * 4 * blocks;
srclen -= POLY1305_BLOCK_SIZE * 4 * blocks;
}
#endif
if (likely(srclen >= POLY1305_BLOCK_SIZE * 2)) {
if (unlikely(!sctx->uset)) {
- memcpy(sctx->u, dctx->r, sizeof(sctx->u));
- poly1305_simd_mult(sctx->u, dctx->r);
+ memcpy(sctx->u, dctx->r.r, sizeof(sctx->u));
+ poly1305_simd_mult(sctx->u, dctx->r.r);
sctx->uset = true;
}
blocks = srclen / (POLY1305_BLOCK_SIZE * 2);
- poly1305_2block_sse2(dctx->h, src, dctx->r, blocks, sctx->u);
+ poly1305_2block_sse2(dctx->h.h, src, dctx->r.r, blocks,
+ sctx->u);
src += POLY1305_BLOCK_SIZE * 2 * blocks;
srclen -= POLY1305_BLOCK_SIZE * 2 * blocks;
}
if (srclen >= POLY1305_BLOCK_SIZE) {
- poly1305_block_sse2(dctx->h, src, dctx->r, 1);
+ poly1305_block_sse2(dctx->h.h, src, dctx->r.r, 1);
srclen -= POLY1305_BLOCK_SIZE;
}
return srclen;