/* SPDX-License-Identifier: GPL-2.0-only OR BSD-3-Clause */
/*
 * AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Contact Information:
 * James Guilford
 * Sean Gulley
 * Chandramouli Narayanan
 */
/*
 * This is AES128/192/256 CTR mode optimization implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 *    https://github.com/intel/intel-ipsec-mb
 */

/*
 * Syntax: GNU as, AT&T operand order, run through the C preprocessor.
 * linux/linkage.h supplies SYM_FUNC_START, SYM_FUNC_END and RET.
 */
#include <linux/linkage.h>

#define VMOVDQ		vmovdqu

/*
 * Note: the "x" prefix in these aliases means "this is an xmm register".  The
 * alias prefixes have no relation to XCTR where the "X" prefix means "XOR
 * counter".
 */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8	/* CTR mode only */
#define xiv		%xmm8	/* XCTR mode only */
#define xbyteswap	%xmm9	/* CTR mode only */
#define xtmp		%xmm9	/* XCTR mode only */
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15

/*
 * Argument registers, in the order the entry points below receive them.
 *
 * NOTE(review): num_bytes and counter are used as full 64-bit registers
 * although the prototypes below declare them "unsigned int"; this assumes
 * the caller passes them zero-extended - confirm against the callers.
 */
#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8
#define counter		%r9	/* XCTR mode only */
#define tmp		%r10

#define	DDQ_DATA	0
#define	XDATA		1

/* Values accepted for the key_len macro argument. */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3

.section .rodata
.align 16

/* vpshufb mask that reverses the 16 bytes of an xmm register. */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* Selects the low 64 bits; used with vptest to detect a low-qword wrap. */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* Adds 1 to the high 64 bits (carry out of the low qword). */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/*
 * ddq_add_N is addressed as (ddq_add_1 + 16 * (N - 1)) by do_aes, so these
 * eight constants must stay contiguous and in ascending order.
 */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text

/* generate a unique variable for xmm register: var_xdata = %xmm<n> */
.macro setxdata n
	var_xdata = %xmm\n
.endm

/*
 * club the numeric 'id' to the symbol 'name'
 *
 * With name == XDATA this points var_xdata at %xmm<id>.  .altmacro is
 * enabled only around the call so that %\id is evaluated to a number.
 */
.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm

/*
 * do_aes num_in_par load_keys key_len xctr
 *
 * Encrypt \b counter blocks in parallel (xdata0 .. xdata<b-1>), XOR them
 * with \b input blocks and store the result at p_out.
 *
 *   b        number of 16-byte blocks to process
 *   k        1: also load the cached round keys xkey0/xkey4/xkey8/xkey12
 *               from p_keys; 0: they are already loaded
 *   key_len  KEY_128, KEY_192 or KEY_256
 *   xctr     0: CTR mode, 1: XCTR mode
 *
 * Round keys are read with vmovdqa at 16-byte strides from p_keys.  For
 * KEY_128 the cached registers xkey4/xkey8/xkey12 hold round keys 3/6/9,
 * otherwise they hold round keys 4/8/12; the remaining round keys are
 * streamed through xkeyA/xkeyB.
 *
 * This increments p_in, but not p_out.  It also advances xcounter (CTR) or
 * counter (XCTR) by \b and clobbers xkeyA, xkeyB and the flags.
 */
.macro do_aes b, k, key_len, xctr
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	.if \xctr
		/* block i = xiv ^ (counter + 1 + i), counter in the low qword */
		movq	counter, xtmp
		.set i, 0
		.rept (by)
			club XDATA, i
			vpaddq	(ddq_add_1 + 16 * i)(%rip), xtmp, var_xdata
			.set i, (i +1)
		.endr
		.set i, 0
		.rept (by)
			club XDATA, i
			vpxor	xiv, var_xdata, var_xdata
			.set i, (i +1)
		.endr
	.else
		/* block 0 is the current counter, swapped back to IV byte order */
		vpshufb	xbyteswap, xcounter, xdata0
		.set i, 1
		.rept (by - 1)
			club XDATA, i
			vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
			/* low qword became zero => carry into the high qword */
			vptest	ddq_low_msk(%rip), var_xdata
			jnz	1f
			vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
			vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
			1:
			vpshufb	xbyteswap, var_xdata, var_xdata
			.set i, (i +1)
		.endr
	.endif

	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0

	/* advance the running counter past the blocks generated above */
	.if \xctr
		add	$by, counter
	.else
		vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
		vptest	ddq_low_msk(%rip), xcounter
		jnz	1f
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
	.endif

	/* round 0 (whitening) for the remaining blocks */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	/* the input is read further down with negative offsets from p_in */
	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10: last round for KEY_128 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			/* key 12: last round for KEY_192 */
			.if (klen == KEY_256)
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14: last round for KEY_256 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR the keystream with the input, two blocks per step.  p_in has
	 * already been advanced by 16*by, hence the negative offsets.  The
	 * round keys in xkeyA/xkeyB are no longer needed, so they are reused
	 * as input staging registers.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd block count: one block is left over */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm

/* do_aes that also loads the cached round keys */
.macro do_aes_load val, key_len, xctr
	do_aes \val, 1, \key_len, \xctr
.endm

/* do_aes that relies on the cached round keys being loaded already */
.macro do_aes_noload val, key_len, xctr
	do_aes \val, 0, \key_len, \xctr
.endm

/*
 * main body of aes ctr load
 *
 * Expands to a complete function body for the given key length and mode:
 *   1. return immediately, without touching the output or the IV, when
 *      fewer than 16 bytes are requested;
 *   2. process the (block count mod 8) leading blocks in one do_aes_load;
 *   3. process the rest eight blocks at a time in .Lmain_loop2;
 *   4. in CTR mode, store the advanced counter back to p_iv.
 * Only whole 16-byte blocks are processed; any trailing partial block in
 * num_bytes is ignored.
 *
 * Labels are suffixed with \xctr\key_len so each expansion is unique.
 */
.macro do_aes_ctrmain key_len, xctr
	cmp	$16, num_bytes
	/*
	 * Nothing to do.  Skip the IV write-back as well: xcounter and
	 * xbyteswap have not been loaded yet, so storing them would
	 * overwrite the caller's IV with stale register contents.
	 */
	jb	.Ldo_return_noiv\xctr\key_len

	.if \xctr
		/* byte offset -> block index */
		shr	$4, counter
		vmovdqu	(p_iv), xiv
	.else
		/* keep the counter byte-reversed so vpaddq can increment it */
		vmovdqa	byteswap_const(%rip), xbyteswap
		vmovdqu	(p_iv), xcounter
		vpshufb	xbyteswap, xcounter, xcounter
	.endif

	/* tmp = 16 * (number of whole blocks mod 8) */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\xctr\key_len

	/* 1 <= tmp / 16 <= 7 */
	cmp	$(4*16), tmp
	jg	.Lgt4\xctr\key_len
	je	.Leq4\xctr\key_len

.Llt4\xctr\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\xctr\key_len
	je	.Leq2\xctr\key_len

	/*
	 * Each remainder case below also rounds num_bytes down to a multiple
	 * of 8 blocks (128 bytes) before entering the main loop.
	 */
.Leq1\xctr\key_len:
	do_aes_load	1, \key_len, \xctr
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq2\xctr\key_len:
	do_aes_load	2, \key_len, \xctr
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq3\xctr\key_len:
	do_aes_load	3, \key_len, \xctr
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq4\xctr\key_len:
	do_aes_load	4, \key_len, \xctr
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lgt4\xctr\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\xctr\key_len
	je	.Leq6\xctr\key_len

.Leq5\xctr\key_len:
	do_aes_load	5, \key_len, \xctr
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq6\xctr\key_len:
	do_aes_load	6, \key_len, \xctr
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Leq7\xctr\key_len:
	do_aes_load	7, \key_len, \xctr
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\xctr\key_len
	jmp	.Lmain_loop2\xctr\key_len

.Lmult_of_8_blks\xctr\key_len:
	/*
	 * Drop any trailing partial block, as the remainder cases above do.
	 * Without this a length that is not a multiple of 16 would never
	 * bring num_bytes to exactly zero in the loop below.  num_bytes is
	 * at least 8 blocks here, so the result is non-zero.
	 */
	and	$(~7*16), num_bytes

	/* no do_aes_load ran on this path: load the cached round keys here */
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\xctr\key_len:
	/* num_bytes is a multiple of 8 blocks (8*16 bytes) and >0 */
	do_aes_noload	8, \key_len, \xctr
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\xctr\key_len

.Ldo_return2\xctr\key_len:
	.if !\xctr
		/* return updated IV */
		vpshufb	xbyteswap, xcounter, xcounter
		vmovdqu	xcounter, (p_iv)
	.endif
.Ldo_return_noiv\xctr\key_len:
	RET
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * In:    rdi = in, rsi = iv, rdx = keys, rcx = out, r8 = num_bytes
 * Out:   *iv holds the advanced counter when num_bytes >= 16
 * Clobb: rdi, rcx, r8, r10, xmm0-xmm15, flags
 */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128 0

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Register usage is the same as for aes_ctr_enc_128_avx_by8.
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192 0

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Register usage is the same as for aes_ctr_enc_128_avx_by8.
 */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256 0

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)

/*
 * routine to do AES128 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_128_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *	u8* out, unsigned int num_bytes, unsigned int byte_ctr)
 *
 * In:    rdi = in, rsi = iv, rdx = keys, rcx = out, r8 = num_bytes,
 *        r9 = byte_ctr (converted to a block index with a shift by 4)
 * Out:   nothing is written back to *iv
 * Clobb: rdi, rcx, r8, r9, r10, xmm0-xmm15, flags
 */
SYM_FUNC_START(aes_xctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128 1

SYM_FUNC_END(aes_xctr_enc_128_avx_by8)

/*
 * routine to do AES192 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *	u8* out, unsigned int num_bytes, unsigned int byte_ctr)
 *
 * Register usage is the same as for aes_xctr_enc_128_avx_by8.
 */
SYM_FUNC_START(aes_xctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192 1

SYM_FUNC_END(aes_xctr_enc_192_avx_by8)

/*
 * routine to do AES256 XCTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, const void *keys,
 *	u8* out, unsigned int num_bytes, unsigned int byte_ctr)
 *
 * Register usage is the same as for aes_xctr_enc_128_avx_by8.
 */
SYM_FUNC_START(aes_xctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256 1

SYM_FUNC_END(aes_xctr_enc_256_avx_by8)