From a30a7c3f678c1e829c6381acc6ab2f43f0e331f8 Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld"
Date: Thu, 1 Feb 2018 21:55:50 +0100
Subject: Check for CPU features

---
 curve25519-precomp.c | 444 ++++++++++++++++++++++++++++++++++++++++-----------
 main.c               |  39 +++--
 2 files changed, 382 insertions(+), 101 deletions(-)

diff --git a/curve25519-precomp.c b/curve25519-precomp.c
index 1ac8380..45addfa 100644
--- a/curve25519-precomp.c
+++ b/curve25519-precomp.c
@@ -10,25 +10,49 @@ enum { CURVE25519_POINT_SIZE = 32 };
 
+static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE])
+{
+	secret[0] &= 248;
+	secret[31] &= 127;
+	secret[31] |= 64;
+}
+
 #define NUM_WORDS_ELTFP25519_X64 4
 typedef __aligned(32) u64 EltFp25519_1w_x64[NUM_WORDS_ELTFP25519_X64];
 typedef __aligned(32) u64 EltFp25519_1w_Buffer_x64[2 * NUM_WORDS_ELTFP25519_X64];
 
-#define mul_EltFp25519_1w_x64(c, a, b) \
-	mul_256x256_integer_x64(buffer_1w, a, b); \
-	red_EltFp25519_1w_x64(c, buffer_1w);
+#define mul_EltFp25519_1w_x64_adx(c, a, b) \
+	mul_256x256_integer_x64_adx(buffer_1w, a, b); \
+	red_EltFp25519_1w_x64_adx(c, buffer_1w);
+
+#define sqr_EltFp25519_1w_x64_adx(a) \
+	sqr_256x256_integer_x64(buffer_1w, a); \
+	red_EltFp25519_1w_x64_adx(a, buffer_1w);
+
+#define mul_EltFp25519_2w_x64_adx(c, a, b) \
+	mul2_256x256_integer_x64_adx(buffer_2w, a, b); \
+	red_EltFp25519_2w_x64_adx(c, buffer_2w);
+
+#define sqr_EltFp25519_2w_x64_adx(a) \
+	sqr2_256x256_integer_x64(buffer_2w, a); \
+	red_EltFp25519_2w_x64_adx(a, buffer_2w);
+
+
+#define mul_EltFp25519_1w_x64_bmi2(c, a, b) \
+	mul_256x256_integer_x64_bmi2(buffer_1w, a, b); \
+	red_EltFp25519_1w_x64_bmi2(c, buffer_1w);
 
-#define sqr_EltFp25519_1w_x64(a) \
+#define sqr_EltFp25519_1w_x64_bmi2(a) \
 	sqr_256x256_integer_x64(buffer_1w, a); \
-	red_EltFp25519_1w_x64(a, buffer_1w);
+	red_EltFp25519_1w_x64_bmi2(a, buffer_1w);
 
-#define mul_EltFp25519_2w_x64(c, a, b) \
-	mul2_256x256_integer_x64(buffer_2w, a, b); \
-	red_EltFp25519_2w_x64(c, buffer_2w);
+#define mul_EltFp25519_2w_x64_bmi2(c, a, b) \
+	mul2_256x256_integer_x64_bmi2(buffer_2w, a, b); \
+	red_EltFp25519_2w_x64_bmi2(c, buffer_2w);
 
-#define sqr_EltFp25519_2w_x64(a) \
+#define sqr_EltFp25519_2w_x64_bmi2(a) \
 	sqr2_256x256_integer_x64(buffer_2w, a); \
-	red_EltFp25519_2w_x64(a, buffer_2w);
+	red_EltFp25519_2w_x64_bmi2(a, buffer_2w);
 
 #define copy_EltFp25519_1w_x64(C, A) \
 	(C)[0] = (A)[0]; \
@@ -297,9 +321,8 @@ __aligned(32) static const u64 Table_Ladder_8k[252 * NUM_WORDS_ELTFP25519_X64] =
 	/* 252 */ 0xccdfcf2fc18b6d68, 0xa8ebcba8b7806167, 0x980697f95e2937e3, 0x02fbba1cd0126e8c
 };
 
-static void mul2_256x256_integer_x64(u64 *const c, u64 *const a, u64 *const b)
+static void mul2_256x256_integer_x64_adx(u64 *const c, u64 *const a, u64 *const b)
 {
-#ifdef __ADX__
 	__asm__ __volatile__(
 		"movq (%1), %%rdx  # A[0] \n\t"
 		"mulx (%2), %%r8, %%r9  # A[0]*B[0] \n\t"
@@ -459,7 +482,11 @@ static void mul2_256x256_integer_x64(u64 *const c, u64 *const a, u64 *const b)
 		: "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10",
 		"%r11", "%r12", "%r13", "%r14");
-#else
+}
+
+
+static void mul2_256x256_integer_x64_bmi2(u64 *const c, u64 *const a, u64 *const b)
+{
 	__asm__ __volatile__(
 		"movq (%1), %%rdx  # A[0] \n\t"
 		"mulx (%2), %%r8, %%r9  # A[0]*B[0] \n\t"
@@ -602,7 +629,6 @@ static void mul2_256x256_integer_x64(u64 *const c, u64 *const a, u64 *const b)
 		: "r"(c), "r"(a), "r"(b)
 		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
 		"%r10", "%r11", "%r12", "%r13", "%r14");
-#endif
 }
 
 static void sqr2_256x256_integer_x64(u64 *const c, u64 *const a)
@@ -749,9 +775,8 @@ static void sqr2_256x256_integer_x64(u64 *const c, u64 *const a)
 		"%r12", "%r13", "%r14");
 }
 
-static void red_EltFp25519_2w_x64(u64 *const c, u64 *const a)
+static void red_EltFp25519_2w_x64_adx(u64 *const c, u64 *const a)
 {
-#ifdef __ADX__
 	__asm__ __volatile__(
 		"movl $38, %%edx  # 2*c = 38 = 2^256 \n\t"
 		"mulx 32(%1), %%r8, %%r10  # c*C[4] \n\t"
@@ -802,7 +827,10 @@
 		:
 		: "r"(c), "r"(a)
 		: "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
-#else
+}
+
+static void red_EltFp25519_2w_x64_bmi2(u64 *const c, u64 *const a)
+{
 	__asm__ __volatile__(
 		"movl $38, %%edx  # 2*c = 38 = 2^256 \n\t"
 		"mulx 32(%1), %%r8, %%r9  # c*C[4] \n\t"
@@ -854,12 +882,10 @@
 		:
 		: "r"(c), "r"(a)
 		: "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13");
-#endif
 }
 
-static void mul_256x256_integer_x64(u64 *const c, u64 *const a, u64 *const b)
+static void mul_256x256_integer_x64_adx(u64 *const c, u64 *const a, u64 *const b)
 {
-#ifdef __ADX__
 	__asm__ __volatile__(
 		"movq (%1), %%rdx  # A[0] \n\t"
 		"mulx (%2), %%r8, %%r9  # A[0]*B[0] \n\t"
@@ -942,7 +968,10 @@
 		: "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10",
 		"%r11", "%r12", "%r13", "%r14");
-#else
+}
+
+static void mul_256x256_integer_x64_bmi2(u64 *const c, u64 *const a, u64 *const b)
+{
 	__asm__ __volatile__(
 		"movq (%1), %%rdx  # A[0] \n\t"
 		"mulx (%2), %%r8, %%r9  # A[0]*B[0] \n\t"
@@ -1016,7 +1045,6 @@ static void mul_256x256_integer_x64(u64 *const c, u64 *const a, u64 *const b)
 		: "r"(c), "r"(a), "r"(b)
 		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
 		"%r10", "%r11", "%r12", "%r13", "%r14");
-#endif
 }
 
 static void sqr_256x256_integer_x64(u64 *const c, u64 *const a)
@@ -1095,9 +1123,8 @@
 		"%r12", "%r13", "%r14");
 }
 
-static void red_EltFp25519_1w_x64(u64 *const c, u64 *const a)
+static void red_EltFp25519_1w_x64_adx(u64 *const c, u64 *const a)
 {
-#ifdef __ADX__
 	__asm__ __volatile__(
 		"movl $38, %%edx  # 2*c = 38 = 2^256 \n\t"
 		"mulx 32(%1), %%r8, %%r10  # c*C[4] \n\t"
@@ -1125,7 +1152,10 @@
 		:
 		: "r"(c), "r"(a)
 		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
-#else
+}
+
+static void red_EltFp25519_1w_x64_bmi2(u64 *const c, u64 *const a)
+{
 	__asm__ __volatile__(
 		"movl $38, %%edx  # 2*c = 38 = 2^256 \n\t"
 		"mulx 32(%1), %%r8, %%r9  # c*C[4] \n\t"
@@ -1153,12 +1183,10 @@
 		:
 		: "r"(c), "r"(a)
 		: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13");
-#endif
 }
 
-static inline void add_EltFp25519_1w_x64(u64 *const c, u64 *const a, u64 *const b)
+static inline void add_EltFp25519_1w_x64_adx(u64 *const c, u64 *const a, u64 *const b)
 {
-#ifdef __ADX__
 	__asm__ __volatile__(
 		"movq (%2), %%rax \n\t"
 		"movq 8(%2), %%rcx \n\t"
@@ -1180,7 +1208,10 @@
 		:
 		: "r"(c), "r"(a), "r"(b)
 		: "memory", "cc", "%rax", "%rcx", "%r8", "%r9");
-#else
+}
+
+static inline void add_EltFp25519_1w_x64_bmi2(u64 *const c, u64 *const a, u64 *const b)
+{
 	__asm__ __volatile__(
 		"movq (%2), %%rax \n\t"
 		"movq 8(%2), %%rcx \n\t"
@@ -1201,7 +1232,6 @@ static inline void add_EltFp25519_1w_x64(u64 *const c, u64 *const a, u64 *const
 		:
 		: "r"(c), "r"(a), "r"(b)
 		: "memory", "cc", "%rax", "%rcx", "%r8", "%r9");
-#endif
 }
 
 static inline void sub_EltFp25519_1w_x64(u64 *const __restrict c, u64 *const __restrict a, u64 *const __restrict b)
@@ -1253,12 +1283,64 @@ static inline void mul_a24_EltFp25519_1w_x64(u64 *const c, u64 *const a)
 		: "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
 }
 
-static void inv_EltFp25519_1w_x64(u64 *const pC, u64 *const pA)
+static void inv_EltFp25519_1w_x64_adx(u64 *const pC, u64 *const pA)
+{
+#define sqrn_EltFp25519_1w_x64(a, times) \
+	counter = times; \
+	while (counter-- > 0) { \
+		sqr_EltFp25519_1w_x64_adx(a); \
+	}
+
+	EltFp25519_1w_Buffer_x64 buffer_1w;
+	EltFp25519_1w_x64 x0, x1, x2;
+	u64 *T[5];
+	u64 counter;
+
+	T[0] = x0;
+	T[1] = pC; /* x^(-1) */
+	T[2] = x1;
+	T[3] = x2;
+	T[4] = pA; /* x */
+
+	copy_EltFp25519_1w_x64(T[1], pA);
+	sqrn_EltFp25519_1w_x64(T[1], 1);
+	copy_EltFp25519_1w_x64(T[2], T[1]);
+	sqrn_EltFp25519_1w_x64(T[2], 2);
+	mul_EltFp25519_1w_x64_adx(T[0], pA, T[2]);
+	mul_EltFp25519_1w_x64_adx(T[1], T[1], T[0]);
+	copy_EltFp25519_1w_x64(T[2], T[1]);
+	sqrn_EltFp25519_1w_x64(T[2], 1);
+	mul_EltFp25519_1w_x64_adx(T[0], T[0], T[2]);
+	copy_EltFp25519_1w_x64(T[2], T[0]);
+	sqrn_EltFp25519_1w_x64(T[2], 5);
+	mul_EltFp25519_1w_x64_adx(T[0], T[0], T[2]);
+	copy_EltFp25519_1w_x64(T[2], T[0]);
+	sqrn_EltFp25519_1w_x64(T[2], 10);
+	mul_EltFp25519_1w_x64_adx(T[2], T[2], T[0]);
+	copy_EltFp25519_1w_x64(T[3], T[2]);
+	sqrn_EltFp25519_1w_x64(T[3], 20);
+	mul_EltFp25519_1w_x64_adx(T[3], T[3], T[2]);
+	sqrn_EltFp25519_1w_x64(T[3], 10);
+	mul_EltFp25519_1w_x64_adx(T[3], T[3], T[0]);
+	copy_EltFp25519_1w_x64(T[0], T[3]);
+	sqrn_EltFp25519_1w_x64(T[0], 50);
+	mul_EltFp25519_1w_x64_adx(T[0], T[0], T[3]);
+	copy_EltFp25519_1w_x64(T[2], T[0]);
+	sqrn_EltFp25519_1w_x64(T[2], 100);
+	mul_EltFp25519_1w_x64_adx(T[2], T[2], T[0]);
+	sqrn_EltFp25519_1w_x64(T[2], 50);
+	mul_EltFp25519_1w_x64_adx(T[2], T[2], T[3]);
+	sqrn_EltFp25519_1w_x64(T[2], 5);
+	mul_EltFp25519_1w_x64_adx(T[1], T[1], T[2]);
+#undef sqrn_EltFp25519_1w_x64
+}
+
+static void inv_EltFp25519_1w_x64_bmi2(u64 *const pC, u64 *const pA)
 {
-#define sqrn_EltFp25519_1w_x64(a, times) \
-	counter = times; \
-	while (counter-- > 0) { \
-		sqr_EltFp25519_1w_x64(a); \
+#define sqrn_EltFp25519_1w_x64(a, times) \
+	counter = times; \
+	while (counter-- > 0) { \
+		sqr_EltFp25519_1w_x64_bmi2(a); \
 	}
 
 	EltFp25519_1w_Buffer_x64 buffer_1w;
@@ -1276,32 +1358,32 @@ static void inv_EltFp25519_1w_x64(u64 *const pC, u64 *const pA)
 	sqrn_EltFp25519_1w_x64(T[1], 1);
 	copy_EltFp25519_1w_x64(T[2], T[1]);
 	sqrn_EltFp25519_1w_x64(T[2], 2);
-	mul_EltFp25519_1w_x64(T[0], pA, T[2]);
-	mul_EltFp25519_1w_x64(T[1], T[1], T[0]);
+	mul_EltFp25519_1w_x64_bmi2(T[0], pA, T[2]);
+	mul_EltFp25519_1w_x64_bmi2(T[1], T[1], T[0]);
 	copy_EltFp25519_1w_x64(T[2], T[1]);
 	sqrn_EltFp25519_1w_x64(T[2], 1);
-	mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
+	mul_EltFp25519_1w_x64_bmi2(T[0], T[0], T[2]);
 	copy_EltFp25519_1w_x64(T[2], T[0]);
 	sqrn_EltFp25519_1w_x64(T[2], 5);
-	mul_EltFp25519_1w_x64(T[0], T[0], T[2]);
+	mul_EltFp25519_1w_x64_bmi2(T[0], T[0], T[2]);
 	copy_EltFp25519_1w_x64(T[2], T[0]);
 	sqrn_EltFp25519_1w_x64(T[2], 10);
-	mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
+	mul_EltFp25519_1w_x64_bmi2(T[2], T[2], T[0]);
 	copy_EltFp25519_1w_x64(T[3], T[2]);
 	sqrn_EltFp25519_1w_x64(T[3], 20);
-	mul_EltFp25519_1w_x64(T[3], T[3], T[2]);
+	mul_EltFp25519_1w_x64_bmi2(T[3], T[3], T[2]);
 	sqrn_EltFp25519_1w_x64(T[3], 10);
-	mul_EltFp25519_1w_x64(T[3], T[3], T[0]);
+	mul_EltFp25519_1w_x64_bmi2(T[3], T[3], T[0]);
 	copy_EltFp25519_1w_x64(T[0], T[3]);
 	sqrn_EltFp25519_1w_x64(T[0], 50);
-	mul_EltFp25519_1w_x64(T[0], T[0], T[3]);
+	mul_EltFp25519_1w_x64_bmi2(T[0], T[0], T[3]);
 	copy_EltFp25519_1w_x64(T[2], T[0]);
 	sqrn_EltFp25519_1w_x64(T[2], 100);
-	mul_EltFp25519_1w_x64(T[2], T[2], T[0]);
+	mul_EltFp25519_1w_x64_bmi2(T[2], T[2], T[0]);
 	sqrn_EltFp25519_1w_x64(T[2], 50);
-	mul_EltFp25519_1w_x64(T[2], T[2], T[3]);
+	mul_EltFp25519_1w_x64_bmi2(T[2], T[2], T[3]);
 	sqrn_EltFp25519_1w_x64(T[2], 5);
-	mul_EltFp25519_1w_x64(T[1], T[1], T[2]);
+	mul_EltFp25519_1w_x64_bmi2(T[1], T[1], T[2]);
 #undef sqrn_EltFp25519_1w_x64
 }
 
@@ -1325,7 +1407,7 @@ static inline void cswap_x64(u64 bit, u64 *const px, u64 *const py)
 
 static __always_inline void reduce_point_mod_2_255_19(u64 *p)
 {
-	__asm__ __volatile__ (
+	__asm__ __volatile__(
 	"cmpq $-19, %0\n"
 	"setaeb %%al\n"
 	"cmpq $-1, %1\n"
@@ -1347,18 +1429,202 @@ static __always_inline void reduce_point_mod_2_255_19(u64 *p)
 	"btrq $63, %3\n"
 	: "+r"(p[0]), "+r"(p[1]), "+r"(p[2]), "+r"(p[3])
 	:
-	: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx"
-	);
+	: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx");
 }
 
-static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE])
+bool curve25519_precomp_bmi2(u8 shared[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE], const u8 session[CURVE25519_POINT_SIZE])
 {
-	secret[0] &= 248;
-	secret[31] &= 127;
-	secret[31] |= 64;
+	__aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519_X64];
+	__aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519_X64];
+	__aligned(32) u64 workspace[6 * NUM_WORDS_ELTFP25519_X64];
+	__aligned(32) u8 private_key[CURVE25519_POINT_SIZE];
+	__aligned(32) u8 session_key[CURVE25519_POINT_SIZE];
+
+	int i = 0, j = 0;
+	u64 prev = 0;
+	u64 *const X1 = (u64 *)session_key;
+	u64 *const key = (u64 *)private_key;
+	u64 *const Px = coordinates + 0;
+	u64 *const Pz = coordinates + 4;
+	u64 *const Qx = coordinates + 8;
+	u64 *const Qz = coordinates + 12;
+	u64 *const X2 = Qx;
+	u64 *const Z2 = Qz;
+	u64 *const X3 = Px;
+	u64 *const Z3 = Pz;
+	u64 *const X2Z2 = Qx;
+	u64 *const X3Z3 = Px;
+
+	u64 *const A = workspace + 0;
+	u64 *const B = workspace + 4;
+	u64 *const D = workspace + 8;
+	u64 *const C = workspace + 12;
+	u64 *const DA = workspace + 16;
+	u64 *const CB = workspace + 20;
+	u64 *const AB = A;
+	u64 *const DC = D;
+	u64 *const DACB = DA;
+	u64 *const buffer_1w = buffer;
+	u64 *const buffer_2w = buffer;
+
+	memcpy(session_key, session, sizeof(session_key));
+	memcpy(private_key, private, sizeof(private_key));
+	normalize_secret(private_key);
+
+	/* As in the draft:
+	 * When receiving such an array, implementations of curve25519
+	 * MUST mask the most-significant bit in the final byte. This
+	 * is done to preserve compatibility with point formats which
+	 * reserve the sign bit for use in other protocols and to
+	 * increase resistance to implementation fingerprinting
+	 */
+	session_key[CURVE25519_POINT_SIZE - 1] &= (1 << (255 % 8)) - 1;
+	reduce_point_mod_2_255_19((u64 *)session_key);
+	copy_EltFp25519_1w_x64(Px, (u64 *)session_key);
+
+	setzero_EltFp25519_1w_x64(Pz);
+	setzero_EltFp25519_1w_x64(Qx);
+	setzero_EltFp25519_1w_x64(Qz);
+
+	Pz[0] = 1;
+	Qx[0] = 1;
+
+	/* main-loop */
+	prev = 0;
+	j = 62;
+	for (i = 3; i >= 0; --i) {
+		while (j >= 0) {
+			u64 bit = (key[i] >> j) & 0x1;
+			u64 swap = bit ^ prev;
+			prev = bit;
+
+			add_EltFp25519_1w_x64_bmi2(A, X2, Z2);      /* A = (X2+Z2) */
+			sub_EltFp25519_1w_x64(B, X2, Z2);           /* B = (X2-Z2) */
+			add_EltFp25519_1w_x64_bmi2(C, X3, Z3);      /* C = (X3+Z3) */
+			sub_EltFp25519_1w_x64(D, X3, Z3);           /* D = (X3-Z3) */
+			mul_EltFp25519_2w_x64_bmi2(DACB, AB, DC);   /* [DA|CB] = [A|B]*[D|C] */
+
+			cswap_x64(swap, A, C);
+			cswap_x64(swap, B, D);
+
+			sqr_EltFp25519_2w_x64_bmi2(AB);             /* [AA|BB] = [A^2|B^2] */
+			add_EltFp25519_1w_x64_bmi2(X3, DA, CB);     /* X3 = (DA+CB) */
+			sub_EltFp25519_1w_x64(Z3, DA, CB);          /* Z3 = (DA-CB) */
+			sqr_EltFp25519_2w_x64_bmi2(X3Z3);           /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
+
+			copy_EltFp25519_1w_x64(X2, B);              /* X2 = B^2 */
+			sub_EltFp25519_1w_x64(Z2, A, B);            /* Z2 = E = AA-BB */
+			mul_a24_EltFp25519_1w_x64(B, Z2);           /* B = a24*E */
+			add_EltFp25519_1w_x64_bmi2(B, B, X2);       /* B = a24*E+B */
+			mul_EltFp25519_2w_x64_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
+			mul_EltFp25519_1w_x64_bmi2(Z3, Z3, X1);     /* Z3 = Z3*X1 */
+
+			--j;
+		}
+		j = 63;
+	}
+
+	inv_EltFp25519_1w_x64_bmi2(A, Qz);
+	mul_EltFp25519_1w_x64_bmi2((u64 *)shared, Qx, A);
+	fred_EltFp25519_1w_x64((u64 *)shared);
+
+	return true;
+}
+
+bool curve25519_precomp_bmi2_base(u8 session_key[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE])
+{
+	__aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519_X64];
+	__aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519_X64];
+	__aligned(32) u64 workspace[4 * NUM_WORDS_ELTFP25519_X64];
+	__aligned(32) u8 private_key[CURVE25519_POINT_SIZE];
+
+	int i = 0, j = 0, k = 0;
+	u64 *const key = (u64 *)private_key;
+	u64 *const Ur1 = coordinates + 0;
+	u64 *const Zr1 = coordinates + 4;
+	u64 *const Ur2 = coordinates + 8;
+	u64 *const Zr2 = coordinates + 12;
+
+	u64 *const UZr1 = coordinates + 0;
+	u64 *const ZUr2 = coordinates + 8;
+
+	u64 *const A = workspace + 0;
+	u64 *const B = workspace + 4;
+	u64 *const C = workspace + 8;
+	u64 *const D = workspace + 12;
+
+	u64 *const AB = workspace + 0;
+	u64 *const CD = workspace + 8;
+
+	u64 *const buffer_1w = buffer;
+	u64 *const buffer_2w = buffer;
+	u64 *P = (u64 *)Table_Ladder_8k;
+
+	const int ite[4] = { 64, 64, 64, 63 };
+	const int q = 3;
+	u64 swap = 1;
+
+	memcpy(private_key, private, sizeof(private_key));
+	normalize_secret(private_key);
+
+	setzero_EltFp25519_1w_x64(Ur1);
+	setzero_EltFp25519_1w_x64(Zr1);
+	setzero_EltFp25519_1w_x64(Zr2);
+	Ur1[0] = 1;
+	Zr1[0] = 1;
+	Zr2[0] = 1;
+
+	/* G-S */
+	Ur2[3] = 0x1eaecdeee27cab34ULL;
+	Ur2[2] = 0xadc7a0b9235d48e2ULL;
+	Ur2[1] = 0xbbf095ae14b2edf8ULL;
+	Ur2[0] = 0x7e94e1fec82faabdULL;
+
+	/* main-loop */
+	j = q;
+	for (i = 0; i < NUM_WORDS_ELTFP25519_X64; ++i) {
+		while (j < ite[i]) {
+			u64 bit;
+			k = (64 * i + j - q);
+			bit = (key[i] >> j) & 0x1;
+			swap = swap ^ bit;
+			cswap_x64(swap, Ur1, Ur2);
+			cswap_x64(swap, Zr1, Zr2);
+			swap = bit;
+			/* Addition */
+			sub_EltFp25519_1w_x64(B, Ur1, Zr1);          /* B = Ur1-Zr1 */
+			add_EltFp25519_1w_x64_bmi2(A, Ur1, Zr1);     /* A = Ur1+Zr1 */
+			mul_EltFp25519_1w_x64_bmi2(C, &P[4 * k], B); /* C = M0-B */
+			sub_EltFp25519_1w_x64(B, A, C);              /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
+			add_EltFp25519_1w_x64_bmi2(A, A, C);         /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
+			sqr_EltFp25519_2w_x64_bmi2(AB);              /* A = A^2 | B = B^2 */
+			mul_EltFp25519_2w_x64_bmi2(UZr1, ZUr2, AB);  /* Ur1 = Zr2*A | Zr1 = Ur2*B */
+			++j;
+		}
+		j = 0;
+	}
+
+	/* Doubling */
+	for (i = 0; i < q; ++i) {
+		add_EltFp25519_1w_x64_bmi2(A, Ur1, Zr1);  /* A = Ur1+Zr1 */
+		sub_EltFp25519_1w_x64(B, Ur1, Zr1);       /* B = Ur1-Zr1 */
+		sqr_EltFp25519_2w_x64_bmi2(AB);           /* A = A**2 B = B**2 */
+		copy_EltFp25519_1w_x64(C, B);             /* C = B */
+		sub_EltFp25519_1w_x64(B, A, B);           /* B = A-B */
+		mul_a24_EltFp25519_1w_x64(D, B);          /* D = my_a24*B */
+		add_EltFp25519_1w_x64_bmi2(D, D, C);      /* D = D+C */
+		mul_EltFp25519_2w_x64_bmi2(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
+	}
+
+	/* Convert to affine coordinates */
+	inv_EltFp25519_1w_x64_bmi2(A, Zr1);
+	mul_EltFp25519_1w_x64_bmi2((u64 *)session_key, Ur1, A);
+	fred_EltFp25519_1w_x64((u64 *)session_key);
+
+	return true;
+}
 
-bool curve25519_precomp(u8 shared[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE], const u8 session[CURVE25519_POINT_SIZE])
+bool curve25519_precomp_adx(u8 shared[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE], const u8 session[CURVE25519_POINT_SIZE])
 {
 	__aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519_X64];
 	__aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519_X64];
@@ -1424,40 +1690,40 @@ bool curve25519_precomp(u8 shared[CURVE25519_POINT_SIZE], const u8 private[CURVE
 			u64 swap = bit ^ prev;
 			prev = bit;
 
-			add_EltFp25519_1w_x64(A, X2, Z2);      /* A = (X2+Z2) */
-			sub_EltFp25519_1w_x64(B, X2, Z2);      /* B = (X2-Z2) */
-			add_EltFp25519_1w_x64(C, X3, Z3);      /* C = (X3+Z3) */
-			sub_EltFp25519_1w_x64(D, X3, Z3);      /* D = (X3-Z3) */
-			mul_EltFp25519_2w_x64(DACB, AB, DC);   /* [DA|CB] = [A|B]*[D|C] */
+			add_EltFp25519_1w_x64_adx(A, X2, Z2);      /* A = (X2+Z2) */
+			sub_EltFp25519_1w_x64(B, X2, Z2);          /* B = (X2-Z2) */
+			add_EltFp25519_1w_x64_adx(C, X3, Z3);      /* C = (X3+Z3) */
+			sub_EltFp25519_1w_x64(D, X3, Z3);          /* D = (X3-Z3) */
+			mul_EltFp25519_2w_x64_adx(DACB, AB, DC);   /* [DA|CB] = [A|B]*[D|C] */
 
 			cswap_x64(swap, A, C);
 			cswap_x64(swap, B, D);
 
-			sqr_EltFp25519_2w_x64(AB);             /* [AA|BB] = [A^2|B^2] */
-			add_EltFp25519_1w_x64(X3, DA, CB);     /* X3 = (DA+CB) */
-			sub_EltFp25519_1w_x64(Z3, DA, CB);     /* Z3 = (DA-CB) */
-			sqr_EltFp25519_2w_x64(X3Z3);           /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
+			sqr_EltFp25519_2w_x64_adx(AB);             /* [AA|BB] = [A^2|B^2] */
+			add_EltFp25519_1w_x64_adx(X3, DA, CB);     /* X3 = (DA+CB) */
+			sub_EltFp25519_1w_x64(Z3, DA, CB);         /* Z3 = (DA-CB) */
+			sqr_EltFp25519_2w_x64_adx(X3Z3);           /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
 
-			copy_EltFp25519_1w_x64(X2, B);         /* X2 = B^2 */
-			sub_EltFp25519_1w_x64(Z2, A, B);       /* Z2 = E = AA-BB */
-			mul_a24_EltFp25519_1w_x64(B, Z2);      /* B = a24*E */
-			add_EltFp25519_1w_x64(B, B, X2);       /* B = a24*E+B */
-			mul_EltFp25519_2w_x64(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
-			mul_EltFp25519_1w_x64(Z3, Z3, X1);     /* Z3 = Z3*X1 */
+			copy_EltFp25519_1w_x64(X2, B);             /* X2 = B^2 */
+			sub_EltFp25519_1w_x64(Z2, A, B);           /* Z2 = E = AA-BB */
+			mul_a24_EltFp25519_1w_x64(B, Z2);          /* B = a24*E */
+			add_EltFp25519_1w_x64_adx(B, B, X2);       /* B = a24*E+B */
+			mul_EltFp25519_2w_x64_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
+			mul_EltFp25519_1w_x64_adx(Z3, Z3, X1);     /* Z3 = Z3*X1 */
 
 			--j;
 		}
 		j = 63;
 	}
 
-	inv_EltFp25519_1w_x64(A, Qz);
-	mul_EltFp25519_1w_x64((u64 *)shared, Qx, A);
+	inv_EltFp25519_1w_x64_adx(A, Qz);
+	mul_EltFp25519_1w_x64_adx((u64 *)shared, Qx, A);
 	fred_EltFp25519_1w_x64((u64 *)shared);
 
 	return true;
 }
 
-bool curve25519_precomp_generate_public(u8 session_key[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE])
+bool curve25519_precomp_adx_base(u8 session_key[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE])
 {
 	__aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519_X64];
 	__aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519_X64];
@@ -1518,13 +1784,13 @@ bool curve25519_precomp_generate_public(u8 session_key[CURVE25519_POINT_SIZE], c
 			cswap_x64(swap, Zr1, Zr2);
 			swap = bit;
 			/* Addition */
-			sub_EltFp25519_1w_x64(B, Ur1, Zr1);     /* B = Ur1-Zr1 */
-			add_EltFp25519_1w_x64(A, Ur1, Zr1);     /* A = Ur1+Zr1 */
-			mul_EltFp25519_1w_x64(C, &P[4 * k], B); /* C = M0-B */
-			sub_EltFp25519_1w_x64(B, A, C);         /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
-			add_EltFp25519_1w_x64(A, A, C);         /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
-			sqr_EltFp25519_2w_x64(AB);              /* A = A^2 | B = B^2 */
-			mul_EltFp25519_2w_x64(UZr1, ZUr2, AB);  /* Ur1 = Zr2*A | Zr1 = Ur2*B */
+			sub_EltFp25519_1w_x64(B, Ur1, Zr1);          /* B = Ur1-Zr1 */
+			add_EltFp25519_1w_x64_adx(A, Ur1, Zr1);      /* A = Ur1+Zr1 */
+			mul_EltFp25519_1w_x64_adx(C, &P[4 * k], B);  /* C = M0-B */
+			sub_EltFp25519_1w_x64(B, A, C);              /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
+			add_EltFp25519_1w_x64_adx(A, A, C);          /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
+			sqr_EltFp25519_2w_x64_adx(AB);               /* A = A^2 | B = B^2 */
+			mul_EltFp25519_2w_x64_adx(UZr1, ZUr2, AB);   /* Ur1 = Zr2*A | Zr1 = Ur2*B */
 			++j;
 		}
 		j = 0;
 	}
@@ -1532,19 +1798,19 @@ bool curve25519_precomp_generate_public(u8 session_key[CURVE25519_POINT_SIZE], c
 
 	/* Doubling */
 	for (i = 0; i < q; ++i) {
-		add_EltFp25519_1w_x64(A, Ur1, Zr1);  /* A = Ur1+Zr1 */
-		sub_EltFp25519_1w_x64(B, Ur1, Zr1);  /* B = Ur1-Zr1 */
-		sqr_EltFp25519_2w_x64(AB);           /* A = A**2 B = B**2 */
-		copy_EltFp25519_1w_x64(C, B);        /* C = B */
-		sub_EltFp25519_1w_x64(B, A, B);      /* B = A-B */
-		mul_a24_EltFp25519_1w_x64(D, B);     /* D = my_a24*B */
-		add_EltFp25519_1w_x64(D, D, C);      /* D = D+C */
-		mul_EltFp25519_2w_x64(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
+		add_EltFp25519_1w_x64_adx(A, Ur1, Zr1);  /* A = Ur1+Zr1 */
+		sub_EltFp25519_1w_x64(B, Ur1, Zr1);      /* B = Ur1-Zr1 */
+		sqr_EltFp25519_2w_x64_adx(AB);           /* A = A**2 B = B**2 */
+		copy_EltFp25519_1w_x64(C, B);            /* C = B */
+		sub_EltFp25519_1w_x64(B, A, B);          /* B = A-B */
+		mul_a24_EltFp25519_1w_x64(D, B);         /* D = my_a24*B */
+		add_EltFp25519_1w_x64_adx(D, D, C);      /* D = D+C */
+		mul_EltFp25519_2w_x64_adx(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
 	}
 
 	/* Convert to affine coordinates */
-	inv_EltFp25519_1w_x64(A, Zr1);
-	mul_EltFp25519_1w_x64((u64 *)session_key, Ur1, A);
+	inv_EltFp25519_1w_x64_adx(A, Zr1);
+	mul_EltFp25519_1w_x64_adx((u64 *)session_key, Ur1, A);
 	fred_EltFp25519_1w_x64((u64 *)session_key);
 
 	return true;
diff --git a/main.c b/main.c
index 09c7376..14eb66e 100644
--- a/main.c
+++ b/main.c
@@ -48,7 +48,7 @@ static __always_inline int name(void) \
 } while (0)
 
 #define report_it(name) do { \
-	pr_err("%lu: %7s: %llu cycles per call\n", stamp, #name, (end_ ## name - start_ ## name) / TRIALS); \
+	pr_err("%lu: %12s: %6llu cycles per call\n", stamp, #name, (end_ ## name - start_ ## name) / TRIALS); \
 } while (0)
 
@@ -57,7 +57,8 @@ declare_it(hacl64)
 declare_it(fiat64)
 declare_it(sandy2x)
 declare_it(amd64)
-declare_it(precomp)
+declare_it(precomp_bmi2)
+declare_it(precomp_adx)
 declare_it(fiat32)
 declare_it(donna32)
 
@@ -71,9 +72,13 @@ static bool verify(void)
 		test_it(donna64, {}, {});
 		test_it(hacl64, {}, {});
 		test_it(fiat64, {}, {});
-		test_it(sandy2x, kernel_fpu_begin(), kernel_fpu_end());
+		if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+			test_it(sandy2x, kernel_fpu_begin(), kernel_fpu_end());
+		if (boot_cpu_has(X86_FEATURE_BMI2))
+			test_it(precomp_bmi2, {}, {});
+		if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
+			test_it(precomp_adx, {}, {});
 		test_it(amd64, {}, {});
-		test_it(precomp, {}, {});
 		test_it(fiat32, {}, {});
 		test_it(donna32, {}, {});
 	}
@@ -87,9 +92,10 @@ static int __init mod_init(void)
 	cycles_t start_donna64, end_donna64;
 	cycles_t start_hacl64, end_hacl64;
 	cycles_t start_fiat64, end_fiat64;
-	cycles_t start_sandy2x, end_sandy2x;
+	cycles_t start_sandy2x = 0, end_sandy2x = 0;
 	cycles_t start_amd64, end_amd64;
-	cycles_t start_precomp, end_precomp;
+	cycles_t start_precomp_bmi2 = 0, end_precomp_bmi2 = 0;
+	cycles_t start_precomp_adx = 0, end_precomp_adx = 0;
 	cycles_t start_fiat32, end_fiat32;
 	cycles_t start_donna32, end_donna32;
 	unsigned long flags;
@@ -105,11 +111,16 @@ static int __init mod_init(void)
 	do_it(donna64);
 	do_it(hacl64);
 	do_it(fiat64);
-	kernel_fpu_begin();
-	do_it(sandy2x);
-	kernel_fpu_end();
+	if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
+		kernel_fpu_begin();
+		do_it(sandy2x);
+		kernel_fpu_end();
+	}
+	if (boot_cpu_has(X86_FEATURE_BMI2))
+		do_it(precomp_bmi2);
+	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
+		do_it(precomp_adx);
 	do_it(amd64);
-	do_it(precomp);
 	do_it(fiat32);
 	do_it(donna32);
 
@@ -118,9 +129,13 @@ static int __init mod_init(void)
 	report_it(donna64);
 	report_it(hacl64);
 	report_it(fiat64);
-	report_it(sandy2x);
+	if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
+		report_it(sandy2x);
+	if (boot_cpu_has(X86_FEATURE_BMI2))
+		report_it(precomp_bmi2);
+	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
+		report_it(precomp_adx);
 	report_it(amd64);
-	report_it(precomp);
 	report_it(fiat32);
 	report_it(donna32);
 
-- 
cgit v1.2.3-59-g8ed1b
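
Note on usage (an illustration added in editing, not part of the patch): with the compile-time #ifdef __ADX__ selection gone, a caller is expected to pick an implementation at runtime the same way main.c does, gating the ADX path on both BMI2 and ADX being present and the BMI2 path on BMI2 alone. Below is a minimal caller-side sketch. curve25519_dispatch() is a hypothetical wrapper invented here for illustration; the boot_cpu_has() feature tests and the curve25519_precomp_* functions are the ones the patch itself uses.

#include <linux/types.h>    /* u8, bool */
#include <asm/cpufeature.h> /* boot_cpu_has(), X86_FEATURE_* */

enum { CURVE25519_POINT_SIZE = 32 };

/* Implemented in curve25519-precomp.c (this patch). */
bool curve25519_precomp_adx(u8 shared[CURVE25519_POINT_SIZE],
			    const u8 private[CURVE25519_POINT_SIZE],
			    const u8 session[CURVE25519_POINT_SIZE]);
bool curve25519_precomp_bmi2(u8 shared[CURVE25519_POINT_SIZE],
			     const u8 private[CURVE25519_POINT_SIZE],
			     const u8 session[CURVE25519_POINT_SIZE]);

/*
 * Hypothetical wrapper: prefer the ADX+BMI2 code (mulx/adcx/adox),
 * fall back to the BMI2-only code (mulx), and report failure when
 * the CPU supports neither, so the caller can use a generic
 * implementation instead.
 */
static bool curve25519_dispatch(u8 shared[CURVE25519_POINT_SIZE],
				const u8 private[CURVE25519_POINT_SIZE],
				const u8 session[CURVE25519_POINT_SIZE])
{
	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
		return curve25519_precomp_adx(shared, private, session);
	if (boot_cpu_has(X86_FEATURE_BMI2))
		return curve25519_precomp_bmi2(shared, private, session);
	return false;
}

Since the CPU feature flags do not change after boot, a real user would more likely resolve the choice once at module init, as the checks in main.c suggest, rather than branching on every call.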