From a30a7c3f678c1e829c6381acc6ab2f43f0e331f8 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld"
Date: Thu, 1 Feb 2018 21:55:50 +0100
Subject: Check for CPU features

---
 curve25519-precomp.c | 444 ++++++++++++++++++++++++++++++++++++++++-----------
 main.c               |  39 +++--
 2 files changed, 382 insertions(+), 101 deletions(-)

diff --git a/curve25519-precomp.c b/curve25519-precomp.c
index 1ac8380..45addfa 100644
--- a/curve25519-precomp.c
+++ b/curve25519-precomp.c
@@ -10,25 +10,49 @@ enum { CURVE25519_POINT_SIZE = 32 };
 
+static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE])
+{
+	secret[0] &= 248;
+	secret[31] &= 127;
+	secret[31] |= 64;
+}
+
 #define NUM_WORDS_ELTFP25519_X64 4
 typedef __aligned(32) u64 EltFp25519_1w_x64[NUM_WORDS_ELTFP25519_X64];
 typedef __aligned(32) u64 EltFp25519_1w_Buffer_x64[2 * NUM_WORDS_ELTFP25519_X64];
 
-#define mul_EltFp25519_1w_x64(c, a, b) \
-	mul_256x256_integer_x64(buffer_1w, a, b); \
-	red_EltFp25519_1w_x64(c, buffer_1w);
+#define mul_EltFp25519_1w_x64_adx(c, a, b) \
+	mul_256x256_integer_x64_adx(buffer_1w, a, b); \
+	red_EltFp25519_1w_x64_adx(c, buffer_1w);
+
+#define sqr_EltFp25519_1w_x64_adx(a) \
+	sqr_256x256_integer_x64(buffer_1w, a); \
+	red_EltFp25519_1w_x64_adx(a, buffer_1w);
+
+#define mul_EltFp25519_2w_x64_adx(c, a, b) \
+	mul2_256x256_integer_x64_adx(buffer_2w, a, b); \
+	red_EltFp25519_2w_x64_adx(c, buffer_2w);
+
+#define sqr_EltFp25519_2w_x64_adx(a) \
+	sqr2_256x256_integer_x64(buffer_2w, a); \
+	red_EltFp25519_2w_x64_adx(a, buffer_2w);
+
+
+#define mul_EltFp25519_1w_x64_bmi2(c, a, b) \
+	mul_256x256_integer_x64_bmi2(buffer_1w, a, b); \
+	red_EltFp25519_1w_x64_bmi2(c, buffer_1w);
 
-#define sqr_EltFp25519_1w_x64(a) \
+#define sqr_EltFp25519_1w_x64_bmi2(a) \
 	sqr_256x256_integer_x64(buffer_1w, a); \
-	red_EltFp25519_1w_x64(a, buffer_1w);
+	red_EltFp25519_1w_x64_bmi2(a, buffer_1w);
 
-#define mul_EltFp25519_2w_x64(c, a, b) \
-	mul2_256x256_integer_x64(buffer_2w, a, b); \
-	red_EltFp25519_2w_x64(c, buffer_2w);
+#define mul_EltFp25519_2w_x64_bmi2(c, a, b) \
+	mul2_256x256_integer_x64_bmi2(buffer_2w, a, b); \
+	red_EltFp25519_2w_x64_bmi2(c, buffer_2w);
 
-#define sqr_EltFp25519_2w_x64(a) \
+#define sqr_EltFp25519_2w_x64_bmi2(a) \
 	sqr2_256x256_integer_x64(buffer_2w, a); \
-	red_EltFp25519_2w_x64(a, buffer_2w);
+	red_EltFp25519_2w_x64_bmi2(a, buffer_2w);
 
 #define copy_EltFp25519_1w_x64(C, A) \
 	(C)[0] = (A)[0]; \
@@ -297,9 +321,8 @@ __aligned(32) static const u64 Table_Ladder_8k[252 * NUM_WORDS_ELTFP25519_X64] =
 	/* 252 */ 0xccdfcf2fc18b6d68, 0xa8ebcba8b7806167, 0x980697f95e2937e3, 0x02fbba1cd0126e8c
 };
 
-static void mul2_256x256_integer_x64(u64 *const c, u64 *const a, u64 *const b)
+static void mul2_256x256_integer_x64_adx(u64 *const c, u64 *const a, u64 *const b)
 {
-#ifdef __ADX__
 	__asm__ __volatile__(
 		"movq (%1), %%rdx  # A[0] \n\t"
 		"mulx (%2), %%r8, %%r9  # A[0]*B[0] \n\t"
@@ -459,7 +482,11 @@ static void mul2_256x256_integer_x64(u64 *const c, u64 *const a, u64 *const b)
 		: "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10",
 		"%r11", "%r12", "%r13", "%r14");
-#else
+}
+
+
+static void mul2_256x256_integer_x64_bmi2(u64 *const c, u64 *const a, u64 *const b)
+{
 	__asm__ __volatile__(
 		"movq (%1), %%rdx  # A[0] \n\t"
 		"mulx (%2), %%r8, %%r9  # A[0]*B[0] \n\t"
@@ -602,7 +629,6 @@ static void mul2_256x256_integer_x64(u64 *const c, u64 *const a, u64 *const b)
 		: "r"(c), "r"(a), "r"(b)
 		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
 		"%r10", "%r11", "%r12", "%r13", "%r14");
-#endif
 }
 
 static void sqr2_256x256_integer_x64(u64 *const c, u64 *const a)
@@ -749,9 +775,8 @@ static void sqr2_256x256_integer_x64(u64 *const c, u64 *const a)
 		"%r12", "%r13", "%r14");
 }
 
-static void red_EltFp25519_2w_x64(u64 *const c, u64 *const a)
+static void red_EltFp25519_2w_x64_adx(u64 *const c, u64 *const a)
 {
-#ifdef __ADX__
 	__asm__ __volatile__(
 		"movl $38, %%edx  # 2*c = 38 = 2^256 \n\t"
 		"mulx 32(%1), %%r8, %%r10  # c*C[4] \n\t"
@@ -802,7 +827,10 @@
 		:
 		: "r"(c), "r"(a)
 		: "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
-#else
+}
+
+static void red_EltFp25519_2w_x64_bmi2(u64 *const c, u64 *const a)
+{
 	__asm__ __volatile__(
 		"movl $38, %%edx  # 2*c = 38 = 2^256 \n\t"
 		"mulx 32(%1), %%r8, %%r9  # c*C[4] \n\t"
@@ -854,12 +882,10 @@
 		:
 		: "r"(c), "r"(a)
 		: "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13");
-#endif
 }
 
-static void mul_256x256_integer_x64(u64 *const c, u64 *const a, u64 *const b)
+static void mul_256x256_integer_x64_adx(u64 *const c, u64 *const a, u64 *const b)
 {
-#ifdef __ADX__
 	__asm__ __volatile__(
 		"movq (%1), %%rdx  # A[0] \n\t"
 		"mulx (%2), %%r8, %%r9  # A[0]*B[0] \n\t"
@@ -942,7 +968,10 @@
 		: "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10",
 		"%r11", "%r12", "%r13", "%r14");
-#else
+}
+
+static void mul_256x256_integer_x64_bmi2(u64 *const c, u64 *const a, u64 *const b)
+{
 	__asm__ __volatile__(
 		"movq (%1), %%rdx  # A[0] \n\t"
 		"mulx (%2), %%r8, %%r9  # A[0]*B[0] \n\t"
@@ -1016,7 +1045,6 @@ static void mul_256x256_integer_x64(u64 *const c, u64 *const a, u64 *const b)
 		: "r"(c), "r"(a), "r"(b)
 		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
 		"%r10", "%r11", "%r12", "%r13", "%r14");
-#endif
 }
 
 static void sqr_256x256_integer_x64(u64 *const c, u64 *const a)
@@ -1095,9 +1123,8 @@
 		"%r12", "%r13", "%r14");
 }
 
-static void red_EltFp25519_1w_x64(u64 *const c, u64 *const a)
+static void red_EltFp25519_1w_x64_adx(u64 *const c, u64 *const a)
 {
-#ifdef __ADX__
 	__asm__ __volatile__(
 		"movl $38, %%edx  # 2*c = 38 = 2^256 \n\t"
 		"mulx 32(%1), %%r8, %%r10  # c*C[4] \n\t"
@@ -1125,7 +1152,10 @@
 		:
 		: "r"(c), "r"(a)
 		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
-#else
+}
+
+static void red_EltFp25519_1w_x64_bmi2(u64 *const c, u64 *const a)
+{
 	__asm__ __volatile__(
 		"movl $38, %%edx  # 2*c = 38 = 2^256 \n\t"
 		"mulx 32(%1), %%r8, %%r9  # c*C[4] \n\t"
@@ -1153,12 +1183,10 @@
 		:
 		: "r"(c), "r"(a)
 		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13");
-#endif
 }
 
-static inline void add_EltFp25519_1w_x64(u64 *const c, u64 *const a, u64 *const b)
+static inline void add_EltFp25519_1w_x64_adx(u64 *const c, u64 *const a, u64 *const b)
 {
-#ifdef __ADX__
 	__asm__ __volatile__(
 		"movq (%2), %%rax \n\t"
 		"movq 8(%2), %%rcx \n\t"
@@ -1180,7 +1208,10 @@
 		:
 		: "r"(c), "r"(a), "r"(b)
 		: "memory", "cc", "%rax", "%rcx", "%r8", "%r9");
-#else
+}
+
+static inline void add_EltFp25519_1w_x64_bmi2(u64 *const c, u64 *const a, u64 *const b)
+{
 	__asm__ __volatile__(
 		"movq (%2), %%rax \n\t"
 		"movq 8(%2), %%rcx \n\t"
@@ -1201,7 +1232,6 @@ static inline void add_EltFp25519_1w_x64(u64 *const c, u64 *const a, u64 *const
 		:
 		: "r"(c), "r"(a), "r"(b)
 		: "memory", "cc", "%rax", "%rcx", "%r8", "%r9");
-#endif
 }
 
 static inline void sub_EltFp25519_1w_x64(u64 *const __restrict c, u64 *const __restrict a, u64 *const __restrict b)
@@ -1253,12 +1283,64 @@ static inline void mul_a24_EltFp25519_1w_x64(u64 *const c, u64 *const a)
 		: "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
 }
 
-static void inv_EltFp25519_1w_x64(u64 *const pC, u64 *const pA)
+static void inv_EltFp25519_1w_x64_adx(u64 *const pC, u64 *const pA)
+{
+#define sqrn_EltFp25519_1w_x64(a, times) \
+	counter = times; \
+	while (counter-- > 0) { \
+		sqr_EltFp25519_1w_x64_adx(a); \
+	}
+
+	EltFp25519_1w_Buffer_x64 buffer_1w;
+	EltFp25519_1w_x64 x0, x1, x2;
+	u64 *T[5];
+	u64 counter;
+
+	T[0] = x0;
+	T[1] = pC; /* x^(-1) */
+	T[2] = x1;
+	T[3] = x2;
+	T[4] = pA; /* x */
+
+	copy_EltFp25519_1w_x64(T[1], pA);
+	sqrn_EltFp25519_1w_x64(T[1], 1);
+	copy_EltFp25519_1w_x64(T[2], T[1]);
+	sqrn_EltFp25519_1w_x64(T[2], 2);
+	mul_EltFp25519_1w_x64_adx(T[0], pA, T[2]);
+	mul_EltFp25519_1w_x64_adx(T[1], T[1], T[0]);
+	copy_EltFp25519_1w_x64(T[2], T[1]);
+	sqrn_EltFp25519_1w_x64(T[2], 1);
+	mul_EltFp25519_1w_x64_adx(T[0], T[0], T[2]);
+	copy_EltFp25519_1w_x64(T[2], T[0]);
+	sqrn_EltFp25519_1w_x64(T[2], 5);
+	mul_EltFp25519_1w_x64_adx(T[0], T[0], T[2]);
+	copy_EltFp25519_1w_x64(T[2], T[0]);
+	sqrn_EltFp25519_1w_x64(T[2], 10);
+	mul_EltFp25519_1w_x64_adx(T[2], T[2], T[0]);
+	copy_EltFp25519_1w_x64(T[3], T[2]);
+	sqrn_EltFp25519_1w_x64(T[3], 20);
+	mul_EltFp25519_1w_x64_adx(T[3], T[3], T[2]);
+	sqrn_EltFp25519_1w_x64(T[3], 10);
+	mul_EltFp25519_1w_x64_adx(T[3], T[3], T[0]);
+	copy_EltFp25519_1w_x64(T[0], T[3]);
+	sqrn_EltFp25519_1w_x64(T[0], 50);
+	mul_EltFp25519_1w_x64_adx(T[0], T[0], T[3]);
+	copy_EltFp25519_1w_x64(T[2], T[0]);
+	sqrn_EltFp25519_1w_x64(T[2], 100);
+	mul_EltFp25519_1w_x64_adx(T[2], T[2], T[0]);
+	sqrn_EltFp25519_1w_x64(T[2], 50);
+	mul_EltFp25519_1w_x64_adx(T[2], T[2], T[3]);
+	sqrn_EltFp25519_1w_x64(T[2], 5);
+	mul_EltFp25519_1w_x64_adx(T[1], T[1], T[2]);
+#undef sqrn_EltFp25519_1w_x64
+}
+
+static void inv_EltFp25519_1w_x64_bmi2(u64 *const pC, u64 *const pA)
 {
-#define sqrn_EltFp25519_1w_x64(a, times) \
-	counter = times; \
-	while (counter-- > 0) { \
-		sqr_EltFp25519_1w_x64(a); \
+#define sqrn_EltFp25519_1w_x64(a, times) \
+	counter = times; \
+	while (counter-- > 0) { \
+		sqr_EltFp25519_1w_x64_bmi2(a); \
 	}
 
 	EltFp25519_1w_Buffer_x64 buffer_1w;
@@ -1276,32 +1358,32 @@ static void inv_EltFp25519_1w_x64(u64 *const pC, u64 *const pA)
 	sqrn_EltFp25519_1w_x64(T[1], 1);
 	copy_EltFp25519_1w_x64(T[2], T[1]);
 	sqrn_EltFp25519_1w_x64(T[2], 2);
-	mul_EltFp25519_1w_x64(T[0], pA, T[2]);
-	mul_EltFp25519_1w_x64(T[1], T[1], T[0]);
+	mul_EltFp25519_1w_x64_bmi2(T[0], pA, T[2]);
+	mul_EltFp25519_1w_x64_bmi2(T[1], T[1], T[0]);
 	copy_EltFp25519_1w_x64(T[2], T[1]);
 	sqrn_EltFp25519_1w_x64(T[2], 1);
-	mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
+	mul_EltFp25519_1w_x64_bmi2(T[0], T[0], T[2]);
 	copy_EltFp25519_1w_x64(T[2], T[0]);
 	sqrn_EltFp25519_1w_x64(T[2], 5);
-	mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
+	mul_EltFp25519_1w_x64_bmi2(T[0], T[0], T[2]);
 	copy_EltFp25519_1w_x64(T[2], T[0]);
 	sqrn_EltFp25519_1w_x64(T[2], 10);
-	mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
+	mul_EltFp25519_1w_x64_bmi2(T[2], T[2], T[0]);
 	copy_EltFp25519_1w_x64(T[3], T[2]);
 	sqrn_EltFp25519_1w_x64(T[3], 20);
-	mul_EltFp25519_1w_x64(T[3], T[3], T[2]);
+	mul_EltFp25519_1w_x64_bmi2(T[3], T[3], T[2]);
 	sqrn_EltFp25519_1w_x64(T[3], 10);
-	mul_EltFp25519_1w_x64(T[3], T[3], T[0]);
+	mul_EltFp25519_1w_x64_bmi2(T[3], T[3], T[0]);
 	copy_EltFp25519_1w_x64(T[0], T[3]);
 	sqrn_EltFp25519_1w_x64(T[0], 50);
-	mul_EltFp25519_1w_x64(T[0], T[0], T[3]);
+	mul_EltFp25519_1w_x64_bmi2(T[0], T[0], T[3]);
 	copy_EltFp25519_1w_x64(T[2], T[0]);
 	sqrn_EltFp25519_1w_x64(T[2], 100);
-	mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
+	mul_EltFp25519_1w_x64_bmi2(T[2], T[2], T[0]);
 	sqrn_EltFp25519_1w_x64(T[2], 50);
-	mul_EltFp25519_1w_x64(T[2], T[2], T[3]);
+	mul_EltFp25519_1w_x64_bmi2(T[2], T[2], T[3]);
 	sqrn_EltFp25519_1w_x64(T[2], 5);
-	mul_EltFp25519_1w_x64(T[1], T[1], T[2]);
+	mul_EltFp25519_1w_x64_bmi2(T[1], T[1], T[2]);
 #undef sqrn_EltFp25519_1w_x64
 }
 
@@ -1325,7 +1407,7 @@ static inline void cswap_x64(u64 bit, u64 *const px, u64 *const py)
 
 static __always_inline void reduce_point_mod_2_255_19(u64 *p)
 {
-	__asm__ __volatile__ (
+	__asm__ __volatile__(
 	"cmpq $-19, %0\n"
 	"setaeb %%al\n"
 	"cmpq $-1, %1\n"
@@ -1347,18 +1429,202 @@ static __always_inline void reduce_point_mod_2_255_19(u64 *p)
 	"btrq $63, %3\n"
 	: "+r"(p[0]), "+r"(p[1]), "+r"(p[2]), "+r"(p[3])
 	:
-	: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx"
-	);
+	: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx");
 }
 
-static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE])
+bool curve25519_precomp_bmi2(u8 shared[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE], const u8 session[CURVE25519_POINT_SIZE])
 {
-	secret[0] &= 248;
-	secret[31] &= 127;
-	secret[31] |= 64;
+	__aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519_X64];
+	__aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519_X64];
+	__aligned(32) u64 workspace[6 * NUM_WORDS_ELTFP25519_X64];
+	__aligned(32) u8 private_key[CURVE25519_POINT_SIZE];
+	__aligned(32) u8 session_key[CURVE25519_POINT_SIZE];
+
+	int i = 0, j = 0;
+	u64 prev = 0;
+	u64 *const X1 = (u64 *)session_key;
+	u64 *const key = (u64 *)private_key;
+	u64 *const Px = coordinates + 0;
+	u64 *const Pz = coordinates + 4;
+	u64 *const Qx = coordinates + 8;
+	u64 *const Qz = coordinates + 12;
+	u64 *const X2 = Qx;
+	u64 *const Z2 = Qz;
+	u64 *const X3 = Px;
+	u64 *const Z3 = Pz;
+	u64 *const X2Z2 = Qx;
+	u64 *const X3Z3 = Px;
+
+	u64 *const A = workspace + 0;
+	u64 *const B = workspace + 4;
+	u64 *const D = workspace + 8;
+	u64 *const C = workspace + 12;
+	u64 *const DA = workspace + 16;
+	u64 *const CB = workspace + 20;
+	u64 *const AB = A;
+	u64 *const DC = D;
+	u64 *const DACB = DA;
+	u64 *const buffer_1w = buffer;
+	u64 *const buffer_2w = buffer;
+
+	memcpy(session_key, session, sizeof(session_key));
+	memcpy(private_key, private, sizeof(private_key));
+	normalize_secret(private_key);
+
+	/* As in the draft:
+	 * When receiving such an array, implementations of curve25519
+	 * MUST mask the most-significant bit in the final byte. This
+	 * is done to preserve compatibility with point formats which
+	 * reserve the sign bit for use in other protocols and to
+	 * increase resistance to implementation fingerprinting
+	 */
+	session_key[CURVE25519_POINT_SIZE - 1] &= (1 << (255 % 8)) - 1;
+	reduce_point_mod_2_255_19((u64 *)session_key);
+	copy_EltFp25519_1w_x64(Px, (u64 *)session_key);
+
+	setzero_EltFp25519_1w_x64(Pz);
+	setzero_EltFp25519_1w_x64(Qx);
+	setzero_EltFp25519_1w_x64(Qz);
+
+	Pz[0] = 1;
+	Qx[0] = 1;
+
+	/* main-loop */
+	prev = 0;
+	j = 62;
+	for (i = 3; i >= 0; --i) {
+		while (j >= 0) {
+			u64 bit = (key[i] >> j) & 0x1;
+			u64 swap = bit ^ prev;
+			prev = bit;
+
+			add_EltFp25519_1w_x64_bmi2(A, X2, Z2);      /* A = (X2+Z2) */
+			sub_EltFp25519_1w_x64(B, X2, Z2);           /* B = (X2-Z2) */
+			add_EltFp25519_1w_x64_bmi2(C, X3, Z3);      /* C = (X3+Z3) */
+			sub_EltFp25519_1w_x64(D, X3, Z3);           /* D = (X3-Z3) */
+			mul_EltFp25519_2w_x64_bmi2(DACB, AB, DC);   /* [DA|CB] = [A|B]*[D|C] */
+
+			cswap_x64(swap, A, C);
+			cswap_x64(swap, B, D);
+
+			sqr_EltFp25519_2w_x64_bmi2(AB);             /* [AA|BB] = [A^2|B^2] */
+			add_EltFp25519_1w_x64_bmi2(X3, DA, CB);     /* X3 = (DA+CB) */
+			sub_EltFp25519_1w_x64(Z3, DA, CB);          /* Z3 = (DA-CB) */
+			sqr_EltFp25519_2w_x64_bmi2(X3Z3);           /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
+
+			copy_EltFp25519_1w_x64(X2, B);              /* X2 = B^2 */
+			sub_EltFp25519_1w_x64(Z2, A, B);            /* Z2 = E = AA-BB */
+			mul_a24_EltFp25519_1w_x64(B, Z2);           /* B = a24*E */
+			add_EltFp25519_1w_x64_bmi2(B, B, X2);       /* B = a24*E+B */
+			mul_EltFp25519_2w_x64_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
+			mul_EltFp25519_1w_x64_bmi2(Z3, Z3, X1);     /* Z3 = Z3*X1 */
+
+			--j;
+		}
+		j = 63;
+	}
+
+	inv_EltFp25519_1w_x64_bmi2(A, Qz);
+	mul_EltFp25519_1w_x64_bmi2((u64 *)shared, Qx, A);
+	fred_EltFp25519_1w_x64((u64 *)shared);
+
+	return true;
+}
+
+bool curve25519_precomp_bmi2_base(u8 session_key[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE])
+{
+	__aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519_X64];
+	__aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519_X64];
+	__aligned(32) u64 workspace[4 * NUM_WORDS_ELTFP25519_X64];
+	__aligned(32) u8 private_key[CURVE25519_POINT_SIZE];
+
+	int i = 0, j = 0, k = 0;
+	u64 *const key = (u64 *)private_key;
+	u64 *const Ur1 = coordinates + 0;
+	u64 *const Zr1 = coordinates + 4;
+	u64 *const Ur2 = coordinates + 8;
+	u64 *const Zr2 = coordinates + 12;
+
+	u64 *const UZr1 = coordinates + 0;
+	u64 *const ZUr2 = coordinates + 8;
+
+	u64 *const A = workspace + 0;
+	u64 *const B = workspace + 4;
+	u64 *const C = workspace + 8;
+	u64 *const D = workspace + 12;
+
+	u64 *const AB = workspace + 0;
+	u64 *const CD = workspace + 8;
+
+	u64 *const buffer_1w = buffer;
+	u64 *const buffer_2w = buffer;
+	u64 *P = (u64 *)Table_Ladder_8k;
+
+	const int ite[4] = { 64, 64, 64, 63 };
+	const int q = 3;
+	u64 swap = 1;
+
+	memcpy(private_key, private, sizeof(private_key));
+	normalize_secret(private_key);
+
+	setzero_EltFp25519_1w_x64(Ur1);
+	setzero_EltFp25519_1w_x64(Zr1);
+	setzero_EltFp25519_1w_x64(Zr2);
+	Ur1[0] = 1;
+	Zr1[0] = 1;
+	Zr2[0] = 1;
+
+	/* G-S */
+	Ur2[3] = 0x1eaecdeee27cab34ULL;
+	Ur2[2] = 0xadc7a0b9235d48e2ULL;
+	Ur2[1] = 0xbbf095ae14b2edf8ULL;
+	Ur2[0] = 0x7e94e1fec82faabdULL;
+
+	/* main-loop */
+	j = q;
+	for (i = 0; i < NUM_WORDS_ELTFP25519_X64; ++i) {
+		while (j < ite[i]) {
+			u64 bit;
+			k = (64 * i + j - q);
+			bit = (key[i] >> j) & 0x1;
+			swap = swap ^ bit;
+			cswap_x64(swap, Ur1, Ur2);
+			cswap_x64(swap, Zr1, Zr2);
+			swap = bit;
+			/* Addition */
+			sub_EltFp25519_1w_x64(B, Ur1, Zr1);          /* B = Ur1-Zr1 */
+			add_EltFp25519_1w_x64_bmi2(A, Ur1, Zr1);     /* A = Ur1+Zr1 */
+			mul_EltFp25519_1w_x64_bmi2(C, &P[4 * k], B); /* C = M0-B */
+			sub_EltFp25519_1w_x64(B, A, C);              /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
+			add_EltFp25519_1w_x64_bmi2(A, A, C);         /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
+			sqr_EltFp25519_2w_x64_bmi2(AB);              /* A = A^2 | B = B^2 */
+			mul_EltFp25519_2w_x64_bmi2(UZr1, ZUr2, AB);  /* Ur1 = Zr2*A | Zr1 = Ur2*B */
+			++j;
+		}
+		j = 0;
+	}
+
+	/* Doubling */
+	for (i = 0; i < q; ++i) {
+		add_EltFp25519_1w_x64_bmi2(A, Ur1, Zr1);  /* A = Ur1+Zr1 */
+		sub_EltFp25519_1w_x64(B, Ur1, Zr1);       /* B = Ur1-Zr1 */
+		sqr_EltFp25519_2w_x64_bmi2(AB);           /* A = A**2 B = B**2 */
+		copy_EltFp25519_1w_x64(C, B);             /* C = B */
+		sub_EltFp25519_1w_x64(B, A, B);           /* B = A-B */
+		mul_a24_EltFp25519_1w_x64(D, B);          /* D = my_a24*B */
+		add_EltFp25519_1w_x64_bmi2(D, D, C);      /* D = D+C */
+		mul_EltFp25519_2w_x64_bmi2(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
+	}
+
+	/* Convert to affine coordinates */
+	inv_EltFp25519_1w_x64_bmi2(A, Zr1);
+	mul_EltFp25519_1w_x64_bmi2((u64 *)session_key, Ur1, A);
+	fred_EltFp25519_1w_x64((u64 *)session_key);
+
+	return true;
+}
 
-bool curve25519_precomp(u8 shared[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE], const u8 session[CURVE25519_POINT_SIZE])
+bool curve25519_precomp_adx(u8 shared[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE], const u8 session[CURVE25519_POINT_SIZE])
 {
 	__aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519_X64];
 	__aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519_X64];
@@ -1424,40 +1690,40 @@ bool curve25519_precomp(u8 shared[CURVE25519_POINT_SIZE], const u8 private[CURVE
 			u64 swap = bit ^ prev;
 			prev = bit;
 
-			add_EltFp25519_1w_x64(A, X2, Z2);      /* A = (X2+Z2) */
-			sub_EltFp25519_1w_x64(B, X2, Z2);      /* B = (X2-Z2) */
-			add_EltFp25519_1w_x64(C, X3, Z3);      /* C = (X3+Z3) */
-			sub_EltFp25519_1w_x64(D, X3, Z3);      /* D = (X3-Z3) */
-			mul_EltFp25519_2w_x64(DACB, AB, DC);   /* [DA|CB] = [A|B]*[D|C] */
+			add_EltFp25519_1w_x64_adx(A, X2, Z2);      /* A = (X2+Z2) */
+			sub_EltFp25519_1w_x64(B, X2, Z2);          /* B = (X2-Z2) */
+			add_EltFp25519_1w_x64_adx(C, X3, Z3);      /* C = (X3+Z3) */
+			sub_EltFp25519_1w_x64(D, X3, Z3);          /* D = (X3-Z3) */
+			mul_EltFp25519_2w_x64_adx(DACB, AB, DC);   /* [DA|CB] = [A|B]*[D|C] */
 
 			cswap_x64(swap, A, C);
 			cswap_x64(swap, B, D);
 
-			sqr_EltFp25519_2w_x64(AB);             /* [AA|BB] = [A^2|B^2] */
-			add_EltFp25519_1w_x64(X3, DA, CB);     /* X3 = (DA+CB) */
-			sub_EltFp25519_1w_x64(Z3, DA, CB);     /* Z3 = (DA-CB) */
-			sqr_EltFp25519_2w_x64(X3Z3);           /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
+			sqr_EltFp25519_2w_x64_adx(AB);             /* [AA|BB] = [A^2|B^2] */
+			add_EltFp25519_1w_x64_adx(X3, DA, CB);     /* X3 = (DA+CB) */
+			sub_EltFp25519_1w_x64(Z3, DA, CB);         /* Z3 = (DA-CB) */
+			sqr_EltFp25519_2w_x64_adx(X3Z3);           /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
 
-			copy_EltFp25519_1w_x64(X2, B);         /* X2 = B^2 */
-			sub_EltFp25519_1w_x64(Z2, A, B);       /* Z2 = E = AA-BB */
-			mul_a24_EltFp25519_1w_x64(B, Z2);      /* B = a24*E */
-			add_EltFp25519_1w_x64(B, B, X2);       /* B = a24*E+B */
-			mul_EltFp25519_2w_x64(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
-			mul_EltFp25519_1w_x64(Z3, Z3, X1);     /* Z3 = Z3*X1 */
+			copy_EltFp25519_1w_x64(X2, B);             /* X2 = B^2 */
+			sub_EltFp25519_1w_x64(Z2, A, B);           /* Z2 = E = AA-BB */
+			mul_a24_EltFp25519_1w_x64(B, Z2);          /* B = a24*E */
+			add_EltFp25519_1w_x64_adx(B, B, X2);       /* B = a24*E+B */
+			mul_EltFp25519_2w_x64_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
+			mul_EltFp25519_1w_x64_adx(Z3, Z3, X1);     /* Z3 = Z3*X1 */
 
 			--j;
 		}
 		j = 63;
 	}
 
-	inv_EltFp25519_1w_x64(A, Qz);
-	mul_EltFp25519_1w_x64((u64 *)shared, Qx, A);
+	inv_EltFp25519_1w_x64_adx(A, Qz);
+	mul_EltFp25519_1w_x64_adx((u64 *)shared, Qx, A);
 	fred_EltFp25519_1w_x64((u64 *)shared);
 
 	return true;
 }
 
-bool curve25519_precomp_generate_public(u8 session_key[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE])
+bool curve25519_precomp_adx_base(u8 session_key[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE])
 {
 	__aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519_X64];
 	__aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519_X64];
@@ -1518,13 +1784,13 @@ bool curve25519_precomp_generate_public(u8 session_key[CURVE25519_POINT_SIZE], c
 			cswap_x64(swap, Zr1, Zr2);
 			swap = bit;
 			/* Addition */
-			sub_EltFp25519_1w_x64(B, Ur1, Zr1);     /* B = Ur1-Zr1 */
-			add_EltFp25519_1w_x64(A, Ur1, Zr1);     /* A = Ur1+Zr1 */
-			mul_EltFp25519_1w_x64(C, &P[4 * k], B); /* C = M0-B */
-			sub_EltFp25519_1w_x64(B, A, C);         /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
-			add_EltFp25519_1w_x64(A, A, C);         /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
-			sqr_EltFp25519_2w_x64(AB);              /* A = A^2 | B = B^2 */
-			mul_EltFp25519_2w_x64(UZr1, ZUr2, AB);  /* Ur1 = Zr2*A | Zr1 = Ur2*B */
+			sub_EltFp25519_1w_x64(B, Ur1, Zr1);          /* B = Ur1-Zr1 */
+			add_EltFp25519_1w_x64_adx(A, Ur1, Zr1);      /* A = Ur1+Zr1 */
+			mul_EltFp25519_1w_x64_adx(C, &P[4 * k], B);  /* C = M0-B */
+			sub_EltFp25519_1w_x64(B, A, C);              /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
+			add_EltFp25519_1w_x64_adx(A, A, C);          /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
+			sqr_EltFp25519_2w_x64_adx(AB);               /* A = A^2 | B = B^2 */
+			mul_EltFp25519_2w_x64_adx(UZr1, ZUr2, AB);   /* Ur1 = Zr2*A | Zr1 = Ur2*B */
 			++j;
 		}
 		j = 0;
 	}
@@ -1532,19 +1798,19 @@ bool curve25519_precomp_generate_public(u8 session_key[CURVE25519_POINT_SIZE], c
 
 	/* Doubling */
 	for (i = 0; i < q; ++i) {
-		add_EltFp25519_1w_x64(A, Ur1, Zr1);  /* A = Ur1+Zr1 */
-		sub_EltFp25519_1w_x64(B, Ur1, Zr1);  /* B = Ur1-Zr1 */
-		sqr_EltFp25519_2w_x64(AB);           /* A = A**2 B = B**2 */
-		copy_EltFp25519_1w_x64(C, B);        /* C = B */
-		sub_EltFp25519_1w_x64(B, A, B);      /* B = A-B */
-		mul_a24_EltFp25519_1w_x64(D, B);     /* D = my_a24*B */
-		add_EltFp25519_1w_x64(D, D, C);      /* D = D+C */
-		mul_EltFp25519_2w_x64(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
+		add_EltFp25519_1w_x64_adx(A, Ur1, Zr1);  /* A = Ur1+Zr1 */
+		sub_EltFp25519_1w_x64(B, Ur1, Zr1);      /* B = Ur1-Zr1 */
+		sqr_EltFp25519_2w_x64_adx(AB);           /* A = A**2 B = B**2 */
+		copy_EltFp25519_1w_x64(C, B);            /* C = B */
+		sub_EltFp25519_1w_x64(B, A, B);          /* B = A-B */
+		mul_a24_EltFp25519_1w_x64(D, B);         /* D = my_a24*B */
+		add_EltFp25519_1w_x64_adx(D, D, C);      /* D = D+C */
+		mul_EltFp25519_2w_x64_adx(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
 	}
 
 	/* Convert to affine coordinates */
-	inv_EltFp25519_1w_x64(A, Zr1);
-	mul_EltFp25519_1w_x64((u64 *)session_key, Ur1, A);
+	inv_EltFp25519_1w_x64_adx(A, Zr1);
+	mul_EltFp25519_1w_x64_adx((u64 *)session_key, Ur1, A);
 	fred_EltFp25519_1w_x64((u64 *)session_key);
 
 	return true;
diff --git a/main.c b/main.c
index 09c7376..14eb66e 100644
--- a/main.c
+++ b/main.c
@@ -48,7 +48,7 @@ static __always_inline int name(void) \
 } while (0)
 
 #define report_it(name) do { \
-	pr_err("%lu: %7s: %llu cycles per call\n", stamp, #name, (end_ ## name - start_ ## name) / TRIALS); \
+	pr_err("%lu: %12s: %6llu cycles per call\n", stamp, #name, (end_ ## name - start_ ## name) / TRIALS); \
 } while (0)
 
@@ -57,7 +57,8 @@ declare_it(hacl64)
 declare_it(fiat64)
 declare_it(sandy2x)
 declare_it(amd64)
-declare_it(precomp)
+declare_it(precomp_bmi2)
+declare_it(precomp_adx)
 declare_it(fiat32)
 declare_it(donna32)
 
@@ -71,9 +72,13 @@ static bool verify(void)
 		test_it(donna64, {}, {});
 		test_it(hacl64, {}, {});
 		test_it(fiat64, {}, {});
-		test_it(sandy2x, kernel_fpu_begin(), kernel_fpu_end());
+		if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+			test_it(sandy2x, kernel_fpu_begin(), kernel_fpu_end());
+		if (boot_cpu_has(X86_FEATURE_BMI2))
+			test_it(precomp_bmi2, {}, {});
+		if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
+			test_it(precomp_adx, {}, {});
 		test_it(amd64, {}, {});
-		test_it(precomp, {}, {});
 		test_it(fiat32, {}, {});
 		test_it(donna32, {}, {});
 	}
@@ -87,9 +92,10 @@ static int __init mod_init(void)
 	cycles_t start_donna64, end_donna64;
 	cycles_t start_hacl64, end_hacl64;
 	cycles_t start_fiat64, end_fiat64;
-	cycles_t start_sandy2x, end_sandy2x;
+	cycles_t start_sandy2x = 0, end_sandy2x = 0;
 	cycles_t start_amd64, end_amd64;
-	cycles_t start_precomp, end_precomp;
+	cycles_t start_precomp_bmi2 = 0, end_precomp_bmi2 = 0;
+	cycles_t start_precomp_adx = 0, end_precomp_adx = 0;
 	cycles_t start_fiat32, end_fiat32;
 	cycles_t start_donna32, end_donna32;
 	unsigned long flags;
@@ -105,11 +111,16 @@ static int __init mod_init(void)
 	do_it(donna64);
 	do_it(hacl64);
 	do_it(fiat64);
-	kernel_fpu_begin();
-	do_it(sandy2x);
-	kernel_fpu_end();
+	if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+		kernel_fpu_begin();
+		do_it(sandy2x);
+		kernel_fpu_end();
+	}
+	if (boot_cpu_has(X86_FEATURE_BMI2))
+		do_it(precomp_bmi2);
+	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
+		do_it(precomp_adx);
 	do_it(amd64);
-	do_it(precomp);
 	do_it(fiat32);
 	do_it(donna32);
 
@@ -118,9 +129,13 @@ static int __init mod_init(void)
 	report_it(donna64);
 	report_it(hacl64);
 	report_it(fiat64);
-	report_it(sandy2x);
+	if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+		report_it(sandy2x);
+	if (boot_cpu_has(X86_FEATURE_BMI2))
+		report_it(precomp_bmi2);
+	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
+		report_it(precomp_adx);
 	report_it(amd64);
-	report_it(precomp);
 	report_it(fiat32);
 	report_it(donna32);
 
-- 
cgit v1.2.3-59-g8ed1b
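
Note on usage (an illustration added in editing, not part of the patch): with the compile-time #ifdef __ADX__ selection gone, a caller is expected to pick an implementation at runtime the same way main.c does, gating the ADX path on both BMI2 and ADX being present and the BMI2 path on BMI2 alone. Below is a minimal caller-side sketch. curve25519_dispatch() is a hypothetical wrapper invented here for illustration; the boot_cpu_has() feature tests and the curve25519_precomp_* functions are the ones the patch itself uses.

#include <linux/types.h>    /* u8, bool */
#include <asm/cpufeature.h> /* boot_cpu_has(), X86_FEATURE_* */

enum { CURVE25519_POINT_SIZE = 32 };

/* Implemented in curve25519-precomp.c (this patch). */
bool curve25519_precomp_adx(u8 shared[CURVE25519_POINT_SIZE],
			    const u8 private[CURVE25519_POINT_SIZE],
			    const u8 session[CURVE25519_POINT_SIZE]);
bool curve25519_precomp_bmi2(u8 shared[CURVE25519_POINT_SIZE],
			     const u8 private[CURVE25519_POINT_SIZE],
			     const u8 session[CURVE25519_POINT_SIZE]);

/*
 * Hypothetical wrapper: prefer the ADX+BMI2 code (mulx/adcx/adox),
 * fall back to the BMI2-only code (mulx), and report failure when
 * the CPU supports neither, so the caller can use a generic
 * implementation instead.
 */
static bool curve25519_dispatch(u8 shared[CURVE25519_POINT_SIZE],
				const u8 private[CURVE25519_POINT_SIZE],
				const u8 session[CURVE25519_POINT_SIZE])
{
	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
		return curve25519_precomp_adx(shared, private, session);
	if (boot_cpu_has(X86_FEATURE_BMI2))
		return curve25519_precomp_bmi2(shared, private, session);
	return false;
}

Since the CPU feature flags do not change after boot, a real user would more likely resolve the choice once at module init, as the checks in main.c suggest, rather than branching on every call.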