author     Jason A. Donenfeld <Jason@zx2c4.com>    2018-02-23 22:21:23 +0100
committer  Jason A. Donenfeld <Jason@zx2c4.com>    2018-02-25 22:08:42 +0100
commit     02af27da8b1b61d4a1397cfb1fc5184db3983ac2 (patch)
tree       3e4dda28231df067aa73c3f95f0931c798550f54
parent     Disable turbo via MSR (diff)
Precomp changes from upstream
-rw-r--r--   curve25519-precomp.c   2627
-rw-r--r--   test_vectors.h            3
2 files changed, 1336 insertions, 1294 deletions
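The hunks below track upstream's renaming of the *_x64 primitives into explicit _adx and _bmi2 variants: the ADX paths use the adcx/adox dual carry chains, the BMI2 paths rely on mulx alone with plain addq/adcq, and squaring gains separate adx and bmi2 implementations in place of the single shared sqr_256x256_integer_x64. A caller is expected to select a variant once, based on CPU features; a minimal sketch of such a dispatch, assuming the stock Linux boot_cpu_has()/X86_FEATURE_* API (the function-pointer and selector names are hypothetical, not from this commit):

static void (*curve25519_mul)(u64 *const c, u64 *const a, u64 *const b);

static void select_curve25519_impl(void)
{
	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
		curve25519_mul = mul_256x256_integer_adx;  /* mulx + adcx/adox */
	else if (boot_cpu_has(X86_FEATURE_BMI2))
		curve25519_mul = mul_256x256_integer_bmi2; /* mulx only */
	/* else: fall back to a generic C implementation (not shown) */
}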
diff --git a/curve25519-precomp.c b/curve25519-precomp.c
index 45addfa..a794bf9 100644
--- a/curve25519-precomp.c
+++ b/curve25519-precomp.c
@@ -1,6 +1,6 @@
-/* SPDX-License-Identifier: GPL-3+, but GPL-2 requested from authors; awaiting feedback.
+/* SPDX-License-Identifier: GPL-2.0
*
- * Copyright (C) 2017 Armando Faz <armfazh@ic.unicamp.br>.
+ * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved.
* Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
*/
@@ -17,56 +17,56 @@ static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE])
secret[31] |= 64;
}
-#define NUM_WORDS_ELTFP25519_X64 4
-typedef __aligned(32) u64 EltFp25519_1w_x64[NUM_WORDS_ELTFP25519_X64];
-typedef __aligned(32) u64 EltFp25519_1w_Buffer_x64[2 * NUM_WORDS_ELTFP25519_X64];
+enum { NUM_WORDS_ELTFP25519 = 4 };
+typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519];
+typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519];
-#define mul_EltFp25519_1w_x64_adx(c, a, b) \
- mul_256x256_integer_x64_adx(buffer_1w, a, b); \
- red_EltFp25519_1w_x64_adx(c, buffer_1w);
+#define mul_eltfp25519_1w_adx(c, a, b) \
+ mul_256x256_integer_adx(buffer_1w, a, b); \
+ red_eltfp25519_1w_adx(c, buffer_1w);
-#define sqr_EltFp25519_1w_x64_adx(a) \
- sqr_256x256_integer_x64(buffer_1w, a); \
- red_EltFp25519_1w_x64_adx(a, buffer_1w);
+#define mul_eltfp25519_1w_bmi2(c, a, b) \
+ mul_256x256_integer_bmi2(buffer_1w, a, b); \
+ red_eltfp25519_1w_bmi2(c, buffer_1w);
-#define mul_EltFp25519_2w_x64_adx(c, a, b) \
- mul2_256x256_integer_x64_adx(buffer_2w, a, b); \
- red_EltFp25519_2w_x64_adx(c, buffer_2w);
+#define sqr_eltfp25519_1w_adx(a) \
+ sqr_256x256_integer_adx(buffer_1w, a); \
+ red_eltfp25519_1w_adx(a, buffer_1w);
-#define sqr_EltFp25519_2w_x64_adx(a) \
- sqr2_256x256_integer_x64(buffer_2w, a); \
- red_EltFp25519_2w_x64_adx(a, buffer_2w);
+#define sqr_eltfp25519_1w_bmi2(a) \
+ sqr_256x256_integer_bmi2(buffer_1w, a); \
+ red_eltfp25519_1w_bmi2(a, buffer_1w);
+#define mul_eltfp25519_2w_adx(c, a, b) \
+ mul2_256x256_integer_adx(buffer_2w, a, b); \
+ red_eltfp25519_2w_adx(c, buffer_2w);
-#define mul_EltFp25519_1w_x64_bmi2(c, a, b) \
- mul_256x256_integer_x64_bmi2(buffer_1w, a, b); \
- red_EltFp25519_1w_x64_bmi2(c, buffer_1w);
+#define mul_eltfp25519_2w_bmi2(c, a, b) \
+ mul2_256x256_integer_bmi2(buffer_2w, a, b); \
+ red_eltfp25519_2w_bmi2(c, buffer_2w);
-#define sqr_EltFp25519_1w_x64_bmi2(a) \
- sqr_256x256_integer_x64(buffer_1w, a); \
- red_EltFp25519_1w_x64_bmi2(a, buffer_1w);
+#define sqr_eltfp25519_2w_adx(a) \
+ sqr2_256x256_integer_adx(buffer_2w, a); \
+ red_eltfp25519_2w_adx(a, buffer_2w);
-#define mul_EltFp25519_2w_x64_bmi2(c, a, b) \
- mul2_256x256_integer_x64_bmi2(buffer_2w, a, b); \
- red_EltFp25519_2w_x64_bmi2(c, buffer_2w);
+#define sqr_eltfp25519_2w_bmi2(a) \
+ sqr2_256x256_integer_bmi2(buffer_2w, a); \
+ red_eltfp25519_2w_bmi2(a, buffer_2w);
-#define sqr_EltFp25519_2w_x64_bmi2(a) \
- sqr2_256x256_integer_x64(buffer_2w, a); \
- red_EltFp25519_2w_x64_bmi2(a, buffer_2w);
-
-#define copy_EltFp25519_1w_x64(C, A) \
- (C)[0] = (A)[0]; \
- (C)[1] = (A)[1]; \
- (C)[2] = (A)[2]; \
+#define copy_eltfp25519_1w(C, A) \
+ (C)[0] = (A)[0]; \
+ (C)[1] = (A)[1]; \
+ (C)[2] = (A)[2]; \
(C)[3] = (A)[3];
-#define setzero_EltFp25519_1w_x64(C) \
- (C)[0] = 0; \
- (C)[1] = 0; \
- (C)[2] = 0; \
+#define setzero_eltfp25519_1w(C) \
+ (C)[0] = 0; \
+ (C)[1] = 0; \
+ (C)[2] = 0; \
(C)[3] = 0;
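Note that the mul/sqr macros above expand inline and write the raw double-width product into a local named buffer_1w or buffer_2w, so whatever function instantiates them must have a buffer of the matching type in scope. A minimal usage sketch (the wrapper name is hypothetical):

static void fp25519_mul_adx_example(u64 *const c, u64 *const a, u64 *const b)
{
	eltfp25519_1w_buffer buffer_1w;	/* 512-bit scratch the macro writes into */

	/* expands to mul_256x256_integer_adx(buffer_1w, a, b);
	 * followed by red_eltfp25519_1w_adx(c, buffer_1w); */
	mul_eltfp25519_1w_adx(c, a, b);
}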
-__aligned(32) static const u64 Table_Ladder_8k[252 * NUM_WORDS_ELTFP25519_X64] = {
+
+__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = {
/* 1 */ 0xfffffffffffffff3, 0xffffffffffffffff, 0xffffffffffffffff, 0x5fffffffffffffff,
/* 2 */ 0x6b8220f416aafe96, 0x82ebeb2b4f566a34, 0xd5a9a5b075a5950f, 0x5142b2cf4b2488f4,
/* 3 */ 0x6aaebc750069680c, 0x89cf7820a0f99c41, 0x2a58d9183b56d0f4, 0x4b5aca80e36011a4,
@@ -321,1129 +321,1356 @@ __aligned(32) static const u64 Table_Ladder_8k[252 * NUM_WORDS_ELTFP25519_X64] =
/* 252 */ 0xccdfcf2fc18b6d68, 0xa8ebcba8b7806167, 0x980697f95e2937e3, 0x02fbba1cd0126e8c
};
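As an aside on sizing: 252 rows of NUM_WORDS_ELTFP25519 = 4 u64 limbs come to 252 * 4 * 8 = 8064 bytes, which is presumably where the _8k suffix comes from. A compile-time check one could drop in (not in the commit):

_Static_assert(sizeof(table_ladder_8k) == 252 * NUM_WORDS_ELTFP25519 * sizeof(u64),
	       "table_ladder_8k is 8064 bytes (~8 KiB)");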
-static void mul2_256x256_integer_x64_adx(u64 *const c, u64 *const a, u64 *const b)
+/*
+ * c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7]
+ * a is two 256-bit integers: a0[0:3] and a1[4:7]
+ * b is two 256-bit integers: b0[0:3] and b1[4:7]
+ */
+static void mul2_256x256_integer_adx(u64 *const c, u64 *const a, u64 *const b)
{
__asm__ __volatile__(
- "movq (%1), %%rdx # A[0] \n\t"
- "mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t"
- "xorl %%r10d, %%r10d \n\t"
- "movq %%r8, (%0) \n\t"
- "mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t"
- "adox %%r9, %%r10 \n\t"
- "movq %%r10, 8(%0) \n\t"
- "mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t"
- "adox %%r11, %%r12 \n\t"
- "mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t"
- "adox %%r13, %%r14 \n\t"
- "movq $0, %%rax \n\t"
- "adox %%rdx, %%rax \n\t"
-
- "movq 8(%1), %%rdx # A[1] \n\t"
- "mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
- "xorl %%r10d, %%r10d \n\t"
- "adcx 8(%0), %%r8 \n\t"
- "movq %%r8, 8(%0) \n\t"
- "mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t"
- "adox %%r9, %%r10 \n\t"
- "adcx %%r12, %%r10 \n\t"
- "movq %%r10, 16(%0) \n\t"
- "mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t"
- "adox %%r11, %%r12 \n\t"
- "adcx %%r14, %%r12 \n\t"
- "movq $0, %%r8 \n\t"
- "mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t"
- "adox %%r13, %%r14 \n\t"
- "adcx %%rax, %%r14 \n\t"
- "movq $0, %%rax \n\t"
- "adox %%rdx, %%rax \n\t"
- "adcx %%r8, %%rax \n\t"
-
- "movq 16(%1), %%rdx # A[2] \n\t"
- "mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
- "xorl %%r10d, %%r10d \n\t"
- "adcx 16(%0), %%r8 \n\t"
- "movq %%r8, 16(%0) \n\t"
- "mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t"
- "adox %%r9, %%r10 \n\t"
- "adcx %%r12, %%r10 \n\t"
- "movq %%r10, 24(%0) \n\t"
- "mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t"
- "adox %%r11, %%r12 \n\t"
- "adcx %%r14, %%r12 \n\t"
- "movq $0, %%r8 \n\t"
- "mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t"
- "adox %%r13, %%r14 \n\t"
- "adcx %%rax, %%r14 \n\t"
- "movq $0, %%rax \n\t"
- "adox %%rdx, %%rax \n\t"
- "adcx %%r8, %%rax \n\t"
-
- "movq 24(%1), %%rdx # A[3] \n\t"
- "mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
- "xorl %%r10d, %%r10d \n\t"
- "adcx 24(%0), %%r8 \n\t"
- "movq %%r8, 24(%0) \n\t"
- "mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t"
- "adox %%r9, %%r10 \n\t"
- "adcx %%r12, %%r10 \n\t"
- "movq %%r10, 32(%0) \n\t"
- "mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t"
- "adox %%r11, %%r12 \n\t"
- "adcx %%r14, %%r12 \n\t"
- "movq %%r12, 40(%0) \n\t"
- "movq $0, %%r8 \n\t"
- "mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t"
- "adox %%r13, %%r14 \n\t"
- "adcx %%rax, %%r14 \n\t"
- "movq %%r14, 48(%0) \n\t"
- "movq $0, %%rax \n\t"
- "adox %%rdx, %%rax \n\t"
- "adcx %%r8, %%rax \n\t"
- "movq %%rax, 56(%0) \n\t"
-
- "movq 32(%1), %%rdx # A[0] \n\t"
- "mulx 32(%2), %%r8, %%r9 # A[0]*B[0] \n\t"
- "xorl %%r10d, %%r10d \n\t"
- "movq %%r8, 64(%0) \n\t"
- "mulx 40(%2), %%r10, %%r11 # A[0]*B[1] \n\t"
- "adox %%r9, %%r10 \n\t"
- "movq %%r10, 72(%0) \n\t"
- "mulx 48(%2), %%r12, %%r13 # A[0]*B[2] \n\t"
- "adox %%r11, %%r12 \n\t"
- "mulx 56(%2), %%r14, %%rdx # A[0]*B[3] \n\t"
- "adox %%r13, %%r14 \n\t"
- "movq $0, %%rax \n\t"
- "adox %%rdx, %%rax \n\t"
-
- "movq 40(%1), %%rdx # A[1] \n\t"
- "mulx 32(%2), %%r8, %%r9 # A[1]*B[0] \n\t"
- "xorl %%r10d, %%r10d \n\t"
- "adcx 72(%0), %%r8 \n\t"
- "movq %%r8, 72(%0) \n\t"
- "mulx 40(%2), %%r10, %%r11 # A[1]*B[1] \n\t"
- "adox %%r9, %%r10 \n\t"
- "adcx %%r12, %%r10 \n\t"
- "movq %%r10, 80(%0) \n\t"
- "mulx 48(%2), %%r12, %%r13 # A[1]*B[2] \n\t"
- "adox %%r11, %%r12 \n\t"
- "adcx %%r14, %%r12 \n\t"
- "movq $0, %%r8 \n\t"
- "mulx 56(%2), %%r14, %%rdx # A[1]*B[3] \n\t"
- "adox %%r13, %%r14 \n\t"
- "adcx %%rax, %%r14 \n\t"
- "movq $0, %%rax \n\t"
- "adox %%rdx, %%rax \n\t"
- "adcx %%r8, %%rax \n\t"
-
- "movq 48(%1), %%rdx # A[2] \n\t"
- "mulx 32(%2), %%r8, %%r9 # A[2]*B[0] \n\t"
- "xorl %%r10d, %%r10d \n\t"
- "adcx 80(%0), %%r8 \n\t"
- "movq %%r8, 80(%0) \n\t"
- "mulx 40(%2), %%r10, %%r11 # A[2]*B[1] \n\t"
- "adox %%r9, %%r10 \n\t"
- "adcx %%r12, %%r10 \n\t"
- "movq %%r10, 88(%0) \n\t"
- "mulx 48(%2), %%r12, %%r13 # A[2]*B[2] \n\t"
- "adox %%r11, %%r12 \n\t"
- "adcx %%r14, %%r12 \n\t"
- "movq $0, %%r8 \n\t"
- "mulx 56(%2), %%r14, %%rdx # A[2]*B[3] \n\t"
- "adox %%r13, %%r14 \n\t"
- "adcx %%rax, %%r14 \n\t"
- "movq $0, %%rax \n\t"
- "adox %%rdx, %%rax \n\t"
- "adcx %%r8, %%rax \n\t"
-
- "movq 56(%1), %%rdx # A[3] \n\t"
- "mulx 32(%2), %%r8, %%r9 # A[3]*B[0] \n\t"
- "xorl %%r10d, %%r10d \n\t"
- "adcx 88(%0), %%r8 \n\t"
- "movq %%r8, 88(%0) \n\t"
- "mulx 40(%2), %%r10, %%r11 # A[3]*B[1] \n\t"
- "adox %%r9, %%r10 \n\t"
- "adcx %%r12, %%r10 \n\t"
- "movq %%r10, 96(%0) \n\t"
- "mulx 48(%2), %%r12, %%r13 # A[3]*B[2] \n\t"
- "adox %%r11, %%r12 \n\t"
- "adcx %%r14, %%r12 \n\t"
- "movq %%r12, 104(%0) \n\t"
- "movq $0, %%r8 \n\t"
- "mulx 56(%2), %%r14, %%rdx # A[3]*B[3] \n\t"
- "adox %%r13, %%r14 \n\t"
- "adcx %%rax, %%r14 \n\t"
- "movq %%r14, 112(%0) \n\t"
- "movq $0, %%rax \n\t"
- "adox %%rdx, %%rax \n\t"
- "adcx %%r8, %%rax \n\t"
- "movq %%rax, 120(%0) \n\t"
+ "xorl %%r14d, %%r14d ;"
+ "movq (%1), %%rdx; " /* A[0] */
+ "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */
+ "xorl %%r10d, %%r10d ;"
+ "movq %%r8, (%0) ;"
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
+ "adox %%r10, %%r12 ;"
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
+ "adox %%r8, %%rax ;"
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
+ "adox %%r10, %%rbx ;"
+ /******************************************/
+ "adox %%r14, %%rcx ;"
+
+ "movq 8(%1), %%rdx; " /* A[1] */
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
+ "adox %%r12, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rax ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%rbx ;"
+ "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%rcx ;"
+ /******************************************/
+ "adox %%r14, %%r12 ;"
+ "adcx %%r14, %%r12 ;"
+
+ "movq 16(%1), %%rdx; " /* A[2] */
+ "xorl %%r10d, %%r10d ;"
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
+ "adox %%rax, %%r8 ;"
+ "movq %%r8, 16(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rbx ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%rcx ;"
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%r12 ;"
+ /******************************************/
+ "adox %%r14, %%rax ;"
+ "adcx %%r14, %%rax ;"
+
+ "movq 24(%1), %%rdx; " /* A[3] */
+ "xorl %%r10d, %%r10d ;"
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
+ "adox %%rbx, %%r8 ;"
+ "movq %%r8, 24(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rcx ;"
+ "movq %%rcx, 32(%0) ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%rax ;"
+ "movq %%rax, 48(%0) ;"
+ /******************************************/
+ "adox %%r14, %%rbx ;"
+ "adcx %%r14, %%rbx ;"
+ "movq %%rbx, 56(%0) ;"
+
+ "movq 32(%1), %%rdx; " /* C[0] */
+ "mulx 32(%2), %%r8, %%r12; " /* C[0]*D[0] */
+ "xorl %%r10d, %%r10d ;"
+ "movq %%r8, 64(%0);"
+ "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
+ "adox %%r10, %%r12 ;"
+ "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
+ "adox %%r8, %%rax ;"
+ "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
+ "adox %%r10, %%rbx ;"
+ /******************************************/
+ "adox %%r14, %%rcx ;"
+
+ "movq 40(%1), %%rdx; " /* C[1] */
+ "xorl %%r10d, %%r10d ;"
+ "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
+ "adox %%r12, %%r8 ;"
+ "movq %%r8, 72(%0);"
+ "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rax ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%rbx ;"
+ "mulx 56(%2), %%r10, %%r12; " /* C[1]*D[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%rcx ;"
+ /******************************************/
+ "adox %%r14, %%r12 ;"
+ "adcx %%r14, %%r12 ;"
+
+ "movq 48(%1), %%rdx; " /* C[2] */
+ "xorl %%r10d, %%r10d ;"
+ "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
+ "adox %%rax, %%r8 ;"
+ "movq %%r8, 80(%0);"
+ "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rbx ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%rcx ;"
+ "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%r12 ;"
+ /******************************************/
+ "adox %%r14, %%rax ;"
+ "adcx %%r14, %%rax ;"
+
+ "movq 56(%1), %%rdx; " /* C[3] */
+ "xorl %%r10d, %%r10d ;"
+ "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
+ "adox %%rbx, %%r8 ;"
+ "movq %%r8, 88(%0);"
+ "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
+ "adox %%r10, %%r9 ;"
+ "adcx %%r9, %%rcx ;"
+ "movq %%rcx, 96(%0) ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
+ "adox %%r8, %%r11 ;"
+ "adcx %%r11, %%r12 ;"
+ "movq %%r12, 104(%0) ;"
+ "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
+ "adox %%r10, %%r13 ;"
+ "adcx %%r13, %%rax ;"
+ "movq %%rax, 112(%0) ;"
+ /******************************************/
+ "adox %%r14, %%rbx ;"
+ "adcx %%r14, %%rbx ;"
+ "movq %%rbx, 120(%0) ;"
:
: "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rdx",
- "%r8", "%r9", "%r10", "%r11",
- "%r12", "%r13", "%r14");
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14");
}
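For reference, the behaviour documented in the comment above — two independent schoolbook 256x256->512 multiplications over the low and high halves of a, b, and c — can be modelled in portable C with a 128-bit accumulator. This is only a semantic model of what the adx routine computes (assuming unsigned __int128 and memset are available), not code from the commit:

static void mul2_256x256_integer_ref(u64 *const c, const u64 *const a,
				     const u64 *const b)
{
	int n, i, j;

	for (n = 0; n < 2; ++n) {	/* low half, then high half */
		const u64 *x = a + 4 * n, *y = b + 4 * n;
		u64 *z = c + 8 * n;

		memset(z, 0, 8 * sizeof(u64));
		for (i = 0; i < 4; ++i) {
			u64 carry = 0;

			for (j = 0; j < 4; ++j) {
				unsigned __int128 t;

				t = (unsigned __int128)x[i] * y[j] + z[i + j] + carry;
				z[i + j] = (u64)t;
				carry = (u64)(t >> 64);
			}
			z[i + 4] = carry;
		}
	}
}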
-
-static void mul2_256x256_integer_x64_bmi2(u64 *const c, u64 *const a, u64 *const b)
+static void mul2_256x256_integer_bmi2(u64 *const c, u64 *const a, u64 *const b)
{
__asm__ __volatile__(
- "movq (%1), %%rdx # A[0] \n\t"
- "mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t"
- "movq %%r8, (%0) \n\t"
- "mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t"
- "addq %%r10, %%r9 \n\t"
- "movq %%r9, 8(%0) \n\t"
- "mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t"
- "adcq %%r12, %%rax \n\t"
- "mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t"
- "adcq %%r14, %%rbx \n\t"
- "adcq $0, %%rcx \n\t"
-
- "movq 8(%1), %%rdx # A[1] \n\t"
- "mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
- "mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t"
- "addq %%r10, %%r9 \n\t"
- "mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t"
- "adcq %%r12, %%r11 \n\t"
- "mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t"
- "adcq %%r14, %%r13 \n\t"
- "adcq $0, %%rdx \n\t"
-
- "addq %%r8, 8(%0) \n\t"
- "adcq %%rax, %%r9 \n\t"
- "movq %%r9, 16(%0) \n\t"
- "movq $0, %%rax \n\t"
- "adcq %%r11, %%rbx \n\t"
- "adcq %%r13, %%rcx \n\t"
- "adcq %%rdx, %%rax \n\t"
-
- "movq 16(%1), %%rdx # A[2] \n\t"
- "mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
- "mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t"
- "addq %%r10, %%r9 \n\t"
- "mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t"
- "adcq %%r12, %%r11 \n\t"
- "mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t"
- "adcq %%r14, %%r13 \n\t"
- "adcq $0, %%rdx \n\t"
-
- "addq %%r8, 16(%0) \n\t"
- "adcq %%rbx, %%r9 \n\t"
- "movq %%r9, 24(%0) \n\t"
- "movq $0, %%rbx \n\t"
- "adcq %%r11, %%rcx \n\t"
- "adcq %%r13, %%rax \n\t"
- "adcq %%rdx, %%rbx \n\t"
-
- "movq 24(%1), %%rdx # A[3] \n\t"
- "mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
- "mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t"
- "addq %%r10, %%r9 \n\t"
- "mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t"
- "adcq %%r12, %%r11 \n\t"
- "mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t"
- "adcq %%r14, %%r13 \n\t"
- "adcq $0, %%rdx \n\t"
-
- "addq %%r8, 24(%0) \n\t"
- "adcq %%rcx, %%r9 \n\t"
- "movq %%r9, 32(%0) \n\t"
- "movq $0, %%rcx \n\t"
- "adcq %%r11, %%rax \n\t"
- "movq %%rax, 40(%0) \n\t"
- "adcq %%r13, %%rbx \n\t"
- "movq %%rbx, 48(%0) \n\t"
- "adcq %%rdx, %%rcx \n\t"
- "movq %%rcx, 56(%0) \n\t"
-
- "movq 32(%1), %%rdx # A[0] \n\t"
- "mulx 32(%2), %%r8, %%r9 # A[0]*B[0] \n\t"
- "movq %%r8, 64(%0) \n\t"
- "mulx 40(%2), %%r10, %%rax # A[0]*B[1] \n\t"
- "addq %%r10, %%r9 \n\t"
- "movq %%r9, 72(%0) \n\t"
- "mulx 48(%2), %%r12, %%rbx # A[0]*B[2] \n\t"
- "adcq %%r12, %%rax \n\t"
- "mulx 56(%2), %%r14, %%rcx # A[0]*B[3] \n\t"
- "adcq %%r14, %%rbx \n\t"
- "adcq $0, %%rcx \n\t"
-
- "movq 40(%1), %%rdx # A[1] \n\t"
- "mulx 32(%2), %%r8, %%r9 # A[1]*B[0] \n\t"
- "mulx 40(%2), %%r10, %%r11 # A[1]*B[1] \n\t"
- "addq %%r10, %%r9 \n\t"
- "mulx 48(%2), %%r12, %%r13 # A[1]*B[2] \n\t"
- "adcq %%r12, %%r11 \n\t"
- "mulx 56(%2), %%r14, %%rdx # A[1]*B[3] \n\t"
- "adcq %%r14, %%r13 \n\t"
- "adcq $0, %%rdx \n\t"
-
- "addq %%r8, 72(%0) \n\t"
- "adcq %%rax, %%r9 \n\t"
- "movq %%r9, 80(%0) \n\t"
- "movq $0, %%rax \n\t"
- "adcq %%r11, %%rbx \n\t"
- "adcq %%r13, %%rcx \n\t"
- "adcq %%rdx, %%rax \n\t"
-
- "movq 48(%1), %%rdx # A[2] \n\t"
- "mulx 32(%2), %%r8, %%r9 # A[2]*B[0] \n\t"
- "mulx 40(%2), %%r10, %%r11 # A[2]*B[1] \n\t"
- "addq %%r10, %%r9 \n\t"
- "mulx 48(%2), %%r12, %%r13 # A[2]*B[2] \n\t"
- "adcq %%r12, %%r11 \n\t"
- "mulx 56(%2), %%r14, %%rdx # A[2]*B[3] \n\t"
- "adcq %%r14, %%r13 \n\t"
- "adcq $0, %%rdx \n\t"
-
- "addq %%r8, 80(%0) \n\t"
- "adcq %%rbx, %%r9 \n\t"
- "movq %%r9, 88(%0) \n\t"
- "movq $0, %%rbx \n\t"
- "adcq %%r11, %%rcx \n\t"
- "adcq %%r13, %%rax \n\t"
- "adcq %%rdx, %%rbx \n\t"
-
- "movq 56(%1), %%rdx # A[3] \n\t"
- "mulx 32(%2), %%r8, %%r9 # A[3]*B[0] \n\t"
- "mulx 40(%2), %%r10, %%r11 # A[3]*B[1] \n\t"
- "addq %%r10, %%r9 \n\t"
- "mulx 48(%2), %%r12, %%r13 # A[3]*B[2] \n\t"
- "adcq %%r12, %%r11 \n\t"
- "mulx 56(%2), %%r14, %%rdx # A[3]*B[3] \n\t"
- "adcq %%r14, %%r13 \n\t"
- "adcq $0, %%rdx \n\t"
-
- "addq %%r8, 88(%0) \n\t"
- "adcq %%rcx, %%r9 \n\t"
- "movq %%r9, 96(%0) \n\t"
- "movq $0, %%rcx \n\t"
- "adcq %%r11, %%rax \n\t"
- "movq %%rax, 104(%0) \n\t"
- "adcq %%r13, %%rbx \n\t"
- "movq %%rbx, 112(%0) \n\t"
- "adcq %%rdx, %%rcx \n\t"
- "movq %%rcx, 120(%0) \n\t"
+ "movq (%1), %%rdx; " /* A[0] */
+ "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */
+ "movq %%r8, (%0) ;"
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
+ "addq %%r10, %%r12 ;"
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
+ "adcq %%r8, %%rax ;"
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
+ "adcq %%r10, %%rbx ;"
+ /******************************************/
+ "adcq $0, %%rcx ;"
+
+ "movq 8(%1), %%rdx; " /* A[1] */
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
+ "addq %%r12, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%r12 ;"
+
+ "addq %%r9, %%rax ;"
+ "adcq %%r11, %%rbx ;"
+ "adcq %%r13, %%rcx ;"
+ "adcq $0, %%r12 ;"
+
+ "movq 16(%1), %%rdx; " /* A[2] */
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
+ "addq %%rax, %%r8 ;"
+ "movq %%r8, 16(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rax ;"
+
+ "addq %%r9, %%rbx ;"
+ "adcq %%r11, %%rcx ;"
+ "adcq %%r13, %%r12 ;"
+ "adcq $0, %%rax ;"
+
+ "movq 24(%1), %%rdx; " /* A[3] */
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
+ "addq %%rbx, %%r8 ;"
+ "movq %%r8, 24(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rbx ;"
+
+ "addq %%r9, %%rcx ;"
+ "movq %%rcx, 32(%0) ;"
+ "adcq %%r11, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "adcq %%r13, %%rax ;"
+ "movq %%rax, 48(%0) ;"
+ "adcq $0, %%rbx ;"
+ "movq %%rbx, 56(%0) ;"
+
+ "movq 32(%1), %%rdx; " /* C[0] */
+ "mulx 32(%2), %%r8, %%r12; " /* C[0]*D[0] */
+ "movq %%r8, 64(%0) ;"
+ "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
+ "addq %%r10, %%r12 ;"
+ "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
+ "adcq %%r8, %%rax ;"
+ "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
+ "adcq %%r10, %%rbx ;"
+ /******************************************/
+ "adcq $0, %%rcx ;"
+
+ "movq 40(%1), %%rdx; " /* C[1] */
+ "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
+ "addq %%r12, %%r8 ;"
+ "movq %%r8, 72(%0) ;"
+ "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 56(%2), %%r10, %%r12; " /* C[1]*D[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%r12 ;"
+
+ "addq %%r9, %%rax ;"
+ "adcq %%r11, %%rbx ;"
+ "adcq %%r13, %%rcx ;"
+ "adcq $0, %%r12 ;"
+
+ "movq 48(%1), %%rdx; " /* C[2] */
+ "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
+ "addq %%rax, %%r8 ;"
+ "movq %%r8, 80(%0) ;"
+ "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rax ;"
+
+ "addq %%r9, %%rbx ;"
+ "adcq %%r11, %%rcx ;"
+ "adcq %%r13, %%r12 ;"
+ "adcq $0, %%rax ;"
+
+ "movq 56(%1), %%rdx; " /* C[3] */
+ "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
+ "addq %%rbx, %%r8 ;"
+ "movq %%r8, 88(%0) ;"
+ "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rbx ;"
+
+ "addq %%r9, %%rcx ;"
+ "movq %%rcx, 96(%0) ;"
+ "adcq %%r11, %%r12 ;"
+ "movq %%r12, 104(%0) ;"
+ "adcq %%r13, %%rax ;"
+ "movq %%rax, 112(%0) ;"
+ "adcq $0, %%rbx ;"
+ "movq %%rbx, 120(%0) ;"
:
: "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
- "%r9", "%r10", "%r11", "%r12", "%r13", "%r14");
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13");
}
-static void sqr2_256x256_integer_x64(u64 *const c, u64 *const a)
+static void sqr2_256x256_integer_adx(u64 *const c, u64 *const a)
{
__asm__ __volatile__(
- "movq (%1), %%rdx # A[0] \n\t"
- "mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
- "movq 8(%1), %%rdx # A[1] \n\t"
- "mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
- "movq %%r8, (%0) \n\t"
- "movq %%r9, 8(%0) \n\t"
- "movq %%r10, 16(%0) \n\t"
- "movq %%r11, 24(%0) \n\t"
-
- "movq 16(%1), %%rdx # A[2] \n\t"
- "mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
- "movq 24(%1), %%rdx # A[3] \n\t"
- "mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
- "movq %%r8, 32(%0) \n\t"
- "movq %%r9, 40(%0) \n\t"
- "movq %%r10, 48(%0) \n\t"
- "movq %%r11, 56(%0) \n\t"
-
- "movq 8(%1), %%rdx # A[1] \n\t"
- "mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
- "mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
- "mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
-
- "movq 16(%1), %%rdx # A[2] \n\t"
- "mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
- "mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"
-
- "addq %%rax, %%r9 \n\t"
- "adcq %%rdx, %%r10 \n\t"
- "adcq %%rcx, %%r11 \n\t"
- "adcq %%r14, %%r12 \n\t"
- "adcq $0, %%r13 \n\t"
- "movq $0, %%r14 \n\t"
- "adcq $0, %%r14 \n\t"
-
- "movq (%1), %%rdx # A[0] \n\t"
- "mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
-
- "addq %%rax, %%r10 \n\t"
- "adcq %%rdx, %%r11 \n\t"
- "adcq $0, %%r12 \n\t"
- "adcq $0, %%r13 \n\t"
- "adcq $0, %%r14 \n\t"
-
- "shldq $1, %%r13, %%r14 \n\t"
- "shldq $1, %%r12, %%r13 \n\t"
- "shldq $1, %%r11, %%r12 \n\t"
- "shldq $1, %%r10, %%r11 \n\t"
- "shldq $1, %%r9, %%r10 \n\t"
- "shldq $1, %%r8, %%r9 \n\t"
- "shlq $1, %%r8 \n\t"
-
- "addq 8(%0), %%r8 \n\t"
- "movq %%r8, 8(%0) \n\t"
- "adcq 16(%0), %%r9 \n\t"
- "movq %%r9, 16(%0) \n\t"
- "adcq 24(%0), %%r10 \n\t"
- "movq %%r10, 24(%0) \n\t"
- "adcq 32(%0), %%r11 \n\t"
- "movq %%r11, 32(%0) \n\t"
- "adcq 40(%0), %%r12 \n\t"
- "movq %%r12, 40(%0) \n\t"
- "adcq 48(%0), %%r13 \n\t"
- "movq %%r13, 48(%0) \n\t"
- "adcq 56(%0), %%r14 \n\t"
- "movq %%r14, 56(%0) \n\t"
-
-
- "movq 32(%1), %%rdx # A[0] \n\t"
- "mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
- "movq 40(%1), %%rdx # A[1] \n\t"
- "mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
- "movq %%r8, 64(%0) \n\t"
- "movq %%r9, 72(%0) \n\t"
- "movq %%r10, 80(%0) \n\t"
- "movq %%r11, 88(%0) \n\t"
-
- "movq 48(%1), %%rdx # A[2] \n\t"
- "mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
- "movq 56(%1), %%rdx # A[3] \n\t"
- "mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
- "movq %%r8, 96(%0) \n\t"
- "movq %%r9, 104(%0) \n\t"
- "movq %%r10, 112(%0) \n\t"
- "movq %%r11, 120(%0) \n\t"
-
- "movq 40(%1), %%rdx # A[1] \n\t"
- "mulx 32(%1), %%r8, %%r9 # A[0]*A[1] \n\t"
- "mulx 48(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
- "mulx 56(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
-
- "movq 48(%1), %%rdx # A[2] \n\t"
- "mulx 56(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
- "mulx 32(%1), %%rax, %%rdx # A[0]*A[2] \n\t"
-
- "addq %%rax, %%r9 \n\t"
- "adcq %%rdx, %%r10 \n\t"
- "adcq %%rcx, %%r11 \n\t"
- "adcq %%r14, %%r12 \n\t"
- "adcq $0, %%r13 \n\t"
- "movq $0, %%r14 \n\t"
- "adcq $0, %%r14 \n\t"
-
- "movq 32(%1), %%rdx # A[0] \n\t"
- "mulx 56(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
-
- "addq %%rax, %%r10 \n\t"
- "adcq %%rdx, %%r11 \n\t"
- "adcq $0, %%r12 \n\t"
- "adcq $0, %%r13 \n\t"
- "adcq $0, %%r14 \n\t"
-
- "shldq $1, %%r13, %%r14 \n\t"
- "shldq $1, %%r12, %%r13 \n\t"
- "shldq $1, %%r11, %%r12 \n\t"
- "shldq $1, %%r10, %%r11 \n\t"
- "shldq $1, %%r9, %%r10 \n\t"
- "shldq $1, %%r8, %%r9 \n\t"
- "shlq $1, %%r8 \n\t"
-
- "addq 72(%0), %%r8 \n\t"
- "movq %%r8, 72(%0) \n\t"
- "adcq 80(%0), %%r9 \n\t"
- "movq %%r9, 80(%0) \n\t"
- "adcq 88(%0), %%r10 \n\t"
- "movq %%r10, 88(%0) \n\t"
- "adcq 96(%0), %%r11 \n\t"
- "movq %%r11, 96(%0) \n\t"
- "adcq 104(%0), %%r12 \n\t"
- "movq %%r12, 104(%0) \n\t"
- "adcq 112(%0), %%r13 \n\t"
- "movq %%r13, 112(%0) \n\t"
- "adcq 120(%0), %%r14 \n\t"
- "movq %%r14, 120(%0) \n\t"
+ "movq (%1), %%rdx ;" /* A[0] */
+ "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
+ "xorl %%r15d, %%r15d;"
+ "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
+ "adcx %%r14, %%r9 ;"
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
+ "adcx %%rax, %%r10 ;"
+ "movq 24(%1), %%rdx ;" /* A[3] */
+ "mulx 8(%1), %%r11, %%r12 ;" /* A[1]*A[3] */
+ "adcx %%rcx, %%r11 ;"
+ "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
+ "adcx %%rax, %%r12 ;"
+ "movq 8(%1), %%rdx ;" /* A[1] */
+ "adcx %%r15, %%r13 ;"
+ "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
+ "movq $0, %%r14 ;"
+ /******************************************/
+ "adcx %%r15, %%r14 ;"
+
+ "xorl %%r15d, %%r15d;"
+ "adox %%rax, %%r10 ;"
+ "adcx %%r8, %%r8 ;"
+ "adox %%rcx, %%r11 ;"
+ "adcx %%r9, %%r9 ;"
+ "adox %%r15, %%r12 ;"
+ "adcx %%r10, %%r10 ;"
+ "adox %%r15, %%r13 ;"
+ "adcx %%r11, %%r11 ;"
+ "adox %%r15, %%r14 ;"
+ "adcx %%r12, %%r12 ;"
+ "adcx %%r13, %%r13 ;"
+ "adcx %%r14, %%r14 ;"
+
+ "movq (%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+ /*******************/
+ "movq %%rax, 0(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "movq 8(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 16(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 24(%0) ;"
+ "movq 16(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 32(%0) ;"
+ "adcq %%rcx, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "movq 24(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 48(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 56(%0) ;"
+
+
+ "movq 32(%1), %%rdx ;" /* B[0] */
+ "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */
+ "xorl %%r15d, %%r15d;"
+ "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */
+ "adcx %%r14, %%r9 ;"
+ "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */
+ "adcx %%rax, %%r10 ;"
+ "movq 56(%1), %%rdx ;" /* B[3] */
+ "mulx 40(%1), %%r11, %%r12 ;" /* B[1]*B[3] */
+ "adcx %%rcx, %%r11 ;"
+ "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */
+ "adcx %%rax, %%r12 ;"
+ "movq 40(%1), %%rdx ;" /* B[1] */
+ "adcx %%r15, %%r13 ;"
+ "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */
+ "movq $0, %%r14 ;"
+ /******************************************/
+ "adcx %%r15, %%r14 ;"
+
+ "xorl %%r15d, %%r15d;"
+ "adox %%rax, %%r10 ;"
+ "adcx %%r8, %%r8 ;"
+ "adox %%rcx, %%r11 ;"
+ "adcx %%r9, %%r9 ;"
+ "adox %%r15, %%r12 ;"
+ "adcx %%r10, %%r10 ;"
+ "adox %%r15, %%r13 ;"
+ "adcx %%r11, %%r11 ;"
+ "adox %%r15, %%r14 ;"
+ "adcx %%r12, %%r12 ;"
+ "adcx %%r13, %%r13 ;"
+ "adcx %%r14, %%r14 ;"
+
+ "movq 32(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
+ /*******************/
+ "movq %%rax, 64(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 72(%0) ;"
+ "movq 40(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 80(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 88(%0) ;"
+ "movq 48(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 96(%0) ;"
+ "adcq %%rcx, %%r12 ;"
+ "movq %%r12, 104(%0) ;"
+ "movq 56(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 112(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 120(%0) ;"
:
: "r"(c), "r"(a)
- : "cc", "%rax", "%rcx", "%rdx",
- "%r8", "%r9", "%r10", "%r11",
- "%r12", "%r13", "%r14");
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15");
}
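The adx squaring computes only the six cross products a[i]*a[j] with i < j, doubles them in registers via the adcx self-additions (the bmi2 version below does the doubling with a shldq cascade instead), and then adds the four diagonal squares a[i]^2. This works because (sum a_i 2^(64 i))^2 = sum a_i^2 2^(128 i) + 2 * sum_{i<j} a_i a_j 2^(64 (i+j)). A portable model of one 256-bit half — sqr2 simply applies it to both halves — illustrative only:

static void sqr_256x256_integer_ref(u64 *const c, const u64 *const a)
{
	u64 cross[8] = { 0 }, carry;
	unsigned __int128 t;
	int i, j;

	/* six cross products a[i]*a[j], i < j */
	for (i = 0; i < 4; ++i) {
		carry = 0;
		for (j = i + 1; j < 4; ++j) {
			t = (unsigned __int128)a[i] * a[j] + cross[i + j] + carry;
			cross[i + j] = (u64)t;
			carry = (u64)(t >> 64);
		}
		cross[i + 4] = carry;
	}

	/* double the cross terms: the adcx rX,rX (or shldq) chains */
	carry = 0;
	for (i = 0; i < 8; ++i) {
		u64 v = cross[i];

		cross[i] = (v << 1) | carry;
		carry = v >> 63;
	}

	/* add the diagonal squares a[i]^2 at limb offset 2*i */
	carry = 0;
	for (i = 0; i < 4; ++i) {
		u64 lo, hi;

		t = (unsigned __int128)a[i] * a[i];
		lo = (u64)t;
		hi = (u64)(t >> 64);
		t = (unsigned __int128)cross[2 * i] + lo + carry;
		c[2 * i] = (u64)t;
		t = (unsigned __int128)cross[2 * i + 1] + hi + (u64)(t >> 64);
		c[2 * i + 1] = (u64)t;
		carry = (u64)(t >> 64);
	}
}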
-static void red_EltFp25519_2w_x64_adx(u64 *const c, u64 *const a)
+static void sqr2_256x256_integer_bmi2(u64 *const c, u64 *const a)
{
__asm__ __volatile__(
- "movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
- "mulx 32(%1), %%r8, %%r10 # c*C[4] \n\t"
- "xorl %%ebx, %%ebx \n\t"
- "adox (%1), %%r8 \n\t"
- "mulx 40(%1), %%r9, %%r11 # c*C[5] \n\t"
- "adcx %%r10, %%r9 \n\t"
- "adox 8(%1), %%r9 \n\t"
- "mulx 48(%1), %%r10, %%rax # c*C[6] \n\t"
- "adcx %%r11, %%r10 \n\t"
- "adox 16(%1), %%r10 \n\t"
- "movq %%r10, 16(%0) \n\t"
- "mulx 56(%1), %%r11, %%rcx # c*C[7] \n\t"
- "adcx %%rax, %%r11 \n\t"
- "adox 24(%1), %%r11 \n\t"
- "movq %%r11, 24(%0) \n\t"
- "adcx %%rbx, %%rcx \n\t"
- "adox %%rbx, %%rcx \n\t"
- "xorl %%ebx, %%ebx \n\t"
- "mulx %%rcx, %%rax, %%rcx \n\t"
- "adcx %%rax, %%r8 \n\t"
- "movq %%r8, (%0) \n\t"
- "adcx %%rcx, %%r9 \n\t"
- "movq %%r9, 8(%0) \n\t"
-
- "mulx 96(%1), %%r8, %%r10 # c*C[4] \n\t"
- "xorl %%ebx, %%ebx \n\t"
- "adox 64(%1), %%r8 \n\t"
- "mulx 104(%1), %%r9, %%r11 # c*C[5] \n\t"
- "adcx %%r10, %%r9 \n\t"
- "adox 72(%1), %%r9 \n\t"
- "mulx 112(%1), %%r10, %%rax # c*C[6] \n\t"
- "adcx %%r11, %%r10 \n\t"
- "adox 80(%1), %%r10 \n\t"
- "movq %%r10, 48(%0) \n\t"
- "mulx 120(%1), %%r11, %%rcx # c*C[7] \n\t"
- "adcx %%rax, %%r11 \n\t"
- "adox 88(%1), %%r11 \n\t"
- "movq %%r11, 56(%0) \n\t"
- "adcx %%rbx, %%rcx \n\t"
- "adox %%rbx, %%rcx \n\t"
- "xorl %%ebx, %%ebx \n\t"
- "mulx %%rcx, %%rax, %%rcx \n\t"
- "adcx %%rax, %%r8 \n\t"
- "movq %%r8, 32(%0) \n\t"
- "adcx %%rcx, %%r9 \n\t"
- "movq %%r9, 40(%0) \n\t"
+ "movq 8(%1), %%rdx ;" /* A[1] */
+ "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
+ "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
+ "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
+
+ "movq 16(%1), %%rdx ;" /* A[2] */
+ "mulx 24(%1), %%r12, %%r13 ;" /* A[3]*A[2] */
+ "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
+
+ "addq %%rax, %%r9 ;"
+ "adcq %%rdx, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq %%r14, %%r12 ;"
+ "adcq $0, %%r13 ;"
+ "movq $0, %%r14 ;"
+ "adcq $0, %%r14 ;"
+
+ "movq (%1), %%rdx ;" /* A[0] */
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
+
+ "addq %%rax, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq $0, %%r12 ;"
+ "adcq $0, %%r13 ;"
+ "adcq $0, %%r14 ;"
+
+ "shldq $1, %%r13, %%r14 ;"
+ "shldq $1, %%r12, %%r13 ;"
+ "shldq $1, %%r11, %%r12 ;"
+ "shldq $1, %%r10, %%r11 ;"
+ "shldq $1, %%r9, %%r10 ;"
+ "shldq $1, %%r8, %%r9 ;"
+ "shlq $1, %%r8 ;"
+
+ /*******************/
+ "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
+ /*******************/
+ "movq %%rax, 0(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "movq 8(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 16(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 24(%0) ;"
+ "movq 16(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 32(%0) ;"
+ "adcq %%rcx, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "movq 24(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 48(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 56(%0) ;"
+
+ "movq 40(%1), %%rdx ;" /* B[1] */
+ "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */
+ "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
+ "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
+
+ "movq 48(%1), %%rdx ;" /* B[2] */
+ "mulx 56(%1), %%r12, %%r13 ;" /* B[3]*B[2] */
+ "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
+
+ "addq %%rax, %%r9 ;"
+ "adcq %%rdx, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq %%r14, %%r12 ;"
+ "adcq $0, %%r13 ;"
+ "movq $0, %%r14 ;"
+ "adcq $0, %%r14 ;"
+
+ "movq 32(%1), %%rdx ;" /* B[0] */
+ "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
+
+ "addq %%rax, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq $0, %%r12 ;"
+ "adcq $0, %%r13 ;"
+ "adcq $0, %%r14 ;"
+
+ "shldq $1, %%r13, %%r14 ;"
+ "shldq $1, %%r12, %%r13 ;"
+ "shldq $1, %%r11, %%r12 ;"
+ "shldq $1, %%r10, %%r11 ;"
+ "shldq $1, %%r9, %%r10 ;"
+ "shldq $1, %%r8, %%r9 ;"
+ "shlq $1, %%r8 ;"
+
+ /*******************/
+ "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
+ /*******************/
+ "movq %%rax, 64(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 72(%0) ;"
+ "movq 40(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 80(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 88(%0) ;"
+ "movq 48(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 96(%0) ;"
+ "adcq %%rcx, %%r12 ;"
+ "movq %%r12, 104(%0) ;"
+ "movq 56(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 112(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 120(%0) ;"
:
: "r"(c), "r"(a)
- : "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14");
}
-static void red_EltFp25519_2w_x64_bmi2(u64 *const c, u64 *const a)
+void red_eltfp25519_2w_adx(u64 *const c, u64 *const a)
{
__asm__ __volatile__(
- "movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
- "mulx 32(%1), %%r8, %%r9 # c*C[4] \n\t"
- "mulx 40(%1), %%r10, %%r11 # c*C[5] \n\t"
- "addq %%r9, %%r10 \n\t"
- "mulx 48(%1), %%r12, %%r13 # c*C[6] \n\t"
- "adcq %%r11, %%r12 \n\t"
- "mulx 56(%1), %%rax, %%rcx # c*C[7] \n\t"
- "adcq %%r13, %%rax \n\t"
- "adcq $0, %%rcx \n\t"
-
- "addq (%1), %%r8 \n\t"
- "adcq 8(%1), %%r10 \n\t"
- "adcq 16(%1), %%r12 \n\t"
- "movq %%r12, 16(%0) \n\t"
- "adcq 24(%1), %%rax \n\t"
- "movq %%rax, 24(%0) \n\t"
- "adcq $0, %%rcx \n\t"
-
- "mulx %%rcx, %%rax, %%rcx \n\t"
- "addq %%rax, %%r8 \n\t"
- "movq %%r8, (%0) \n\t"
- "adcq %%rcx, %%r10 \n\t"
- "movq %%r10, 8(%0) \n\t"
-
- "mulx 96(%1), %%r8, %%r9 # c*C[4] \n\t"
- "mulx 104(%1), %%r10, %%r11 # c*C[5] \n\t"
- "addq %%r9, %%r10 \n\t"
- "mulx 112(%1), %%r12, %%r13 # c*C[6] \n\t"
- "adcq %%r11, %%r12 \n\t"
- "mulx 120(%1), %%rax, %%rcx # c*C[7] \n\t"
- "adcq %%r13, %%rax \n\t"
- "adcq $0, %%rcx \n\t"
-
- "addq 64(%1), %%r8 \n\t"
- "adcq 72(%1), %%r10 \n\t"
- "adcq 80(%1), %%r12 \n\t"
- "movq %%r12, 48(%0) \n\t"
- "adcq 88(%1), %%rax \n\t"
- "movq %%rax, 56(%0) \n\t"
- "adcq $0, %%rcx \n\t"
-
- "mulx %%rcx, %%rax, %%rcx \n\t"
- "addq %%rax, %%r8 \n\t"
- "movq %%r8, 32(%0) \n\t"
- "adcq %%rcx, %%r10 \n\t"
- "movq %%r10, 40(%0) \n\t"
+ "movl $38, %%edx; " /* 2*c = 38 = 2^256 */
+ "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */
+ "xorl %%ebx, %%ebx ;"
+ "adox (%1), %%r8 ;"
+ "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */
+ "adcx %%r10, %%r9 ;"
+ "adox 8(%1), %%r9 ;"
+ "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */
+ "adcx %%r11, %%r10 ;"
+ "adox 16(%1), %%r10 ;"
+ "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */
+ "adcx %%rax, %%r11 ;"
+ "adox 24(%1), %%r11 ;"
+ /***************************************/
+ "adcx %%rbx, %%rcx ;"
+ "adox %%rbx, %%rcx ;"
+ "clc ;"
+ "mulx %%rcx, %%rax, %%rcx ; " /* c*C[4] */
+ "adcx %%rax, %%r8 ;"
+ "adcx %%rcx, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcx %%rbx, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcx %%rbx, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+
+ "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */
+ "xorl %%ebx, %%ebx ;"
+ "adox 64(%1), %%r8 ;"
+ "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */
+ "adcx %%r10, %%r9 ;"
+ "adox 72(%1), %%r9 ;"
+ "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */
+ "adcx %%r11, %%r10 ;"
+ "adox 80(%1), %%r10 ;"
+ "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */
+ "adcx %%rax, %%r11 ;"
+ "adox 88(%1), %%r11 ;"
+ /****************************************/
+ "adcx %%rbx, %%rcx ;"
+ "adox %%rbx, %%rcx ;"
+ "clc ;"
+ "mulx %%rcx, %%rax, %%rcx ; " /* c*C[4] */
+ "adcx %%rax, %%r8 ;"
+ "adcx %%rcx, %%r9 ;"
+ "movq %%r9, 40(%0) ;"
+ "adcx %%rbx, %%r10 ;"
+ "movq %%r10, 48(%0) ;"
+ "adcx %%rbx, %%r11 ;"
+ "movq %%r11, 56(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 32(%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
+}
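The rewritten reduction uses 2^256 == 38 (mod 2^255 - 19): the upper four limbs are multiplied by 38 (held in rdx for mulx) and added to the lower four, the small excess that escapes is folded once more with another mulx, and instead of the old xorl/adcx tail the final single-bit carry is turned into a conditional +38 with cmovc. A portable, non-constant-time model of one 512->256 fold; like the asm, it leaves the result only weakly reduced (below 2^256, congruent mod p):

static void red_eltfp25519_1w_ref(u64 *const c, const u64 *const a)
{
	unsigned __int128 t;
	u64 carry = 0, top;
	int i;

	/* c = a[0..3] + 38 * a[4..7], since 2^256 = 38 (mod 2^255 - 19) */
	for (i = 0; i < 4; ++i) {
		t = (unsigned __int128)a[4 + i] * 38 + a[i] + carry;
		c[i] = (u64)t;
		carry = (u64)(t >> 64);
	}

	/* the excess is at most 38, so fold carry * 38 back in once more */
	top = carry * 38;
	for (i = 0; i < 4; ++i) {
		t = (unsigned __int128)c[i] + top;
		c[i] = (u64)t;
		top = (u64)(t >> 64);
	}

	/* a final carry can occur at most once more: the cmovc +38 */
	c[0] += top * 38;
}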
+void red_eltfp25519_2w_bmi2(u64 *const c, u64 *const a)
+{
+ __asm__ __volatile__(
+ "movl $38, %%edx ; " /* 2*c = 38 = 2^256 */
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
+ "addq %%r10, %%r9 ;"
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
+ "adcq %%r11, %%r10 ;"
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
+ "adcq %%rax, %%r11 ;"
+ /***************************************/
+ "adcq $0, %%rcx ;"
+ "addq (%1), %%r8 ;"
+ "adcq 8(%1), %%r9 ;"
+ "adcq 16(%1), %%r10 ;"
+ "adcq 24(%1), %%r11 ;"
+ "adcq $0, %%rcx ;"
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
+ "addq %%rax, %%r8 ;"
+ "adcq %%rcx, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcq $0, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcq $0, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
+
+ "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */
+ "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */
+ "addq %%r10, %%r9 ;"
+ "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */
+ "adcq %%r11, %%r10 ;"
+ "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */
+ "adcq %%rax, %%r11 ;"
+ /****************************************/
+ "adcq $0, %%rcx ;"
+ "addq 64(%1), %%r8 ;"
+ "adcq 72(%1), %%r9 ;"
+ "adcq 80(%1), %%r10 ;"
+ "adcq 88(%1), %%r11 ;"
+ "adcq $0, %%rcx ;"
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
+ "addq %%rax, %%r8 ;"
+ "adcq %%rcx, %%r9 ;"
+ "movq %%r9, 40(%0) ;"
+ "adcq $0, %%r10 ;"
+ "movq %%r10, 48(%0) ;"
+ "adcq $0, %%r11 ;"
+ "movq %%r11, 56(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 32(%0) ;"
:
: "r"(c), "r"(a)
- : "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13");
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
}
-static void mul_256x256_integer_x64_adx(u64 *const c, u64 *const a, u64 *const b)
+static void mul_256x256_integer_adx(u64 *const c, u64 *const a, u64 *const b)
{
__asm__ __volatile__(
- "movq (%1), %%rdx # A[0] \n\t"
- "mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t"
- "xorl %%r10d, %%r10d \n\t"
- "movq %%r8, (%0) \n\t"
- "mulx 8(%2), %%r10, %%r11 # A[0]*B[1] \n\t"
- "adox %%r9, %%r10 \n\t"
- "movq %%r10, 8(%0) \n\t"
- "mulx 16(%2), %%r12, %%r13 # A[0]*B[2] \n\t"
- "adox %%r11, %%r12 \n\t"
- "mulx 24(%2), %%r14, %%rdx # A[0]*B[3] \n\t"
- "adox %%r13, %%r14 \n\t"
- "movq $0, %%rax \n\t"
- "adox %%rdx, %%rax \n\t"
-
- "movq 8(%1), %%rdx # A[1] \n\t"
- "mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
- "xorl %%r10d, %%r10d \n\t"
- "adcx 8(%0), %%r8 \n\t"
- "movq %%r8, 8(%0) \n\t"
- "mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t"
- "adox %%r9, %%r10 \n\t"
- "adcx %%r12, %%r10 \n\t"
- "movq %%r10, 16(%0) \n\t"
- "mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t"
- "adox %%r11, %%r12 \n\t"
- "adcx %%r14, %%r12 \n\t"
- "movq $0, %%r8 \n\t"
- "mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t"
- "adox %%r13, %%r14 \n\t"
- "adcx %%rax, %%r14 \n\t"
- "movq $0, %%rax \n\t"
- "adox %%rdx, %%rax \n\t"
- "adcx %%r8, %%rax \n\t"
-
- "movq 16(%1), %%rdx # A[2] \n\t"
- "mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
- "xorl %%r10d, %%r10d \n\t"
- "adcx 16(%0), %%r8 \n\t"
- "movq %%r8, 16(%0) \n\t"
- "mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t"
- "adox %%r9, %%r10 \n\t"
- "adcx %%r12, %%r10 \n\t"
- "movq %%r10, 24(%0) \n\t"
- "mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t"
- "adox %%r11, %%r12 \n\t"
- "adcx %%r14, %%r12 \n\t"
- "movq $0, %%r8 \n\t"
- "mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t"
- "adox %%r13, %%r14 \n\t"
- "adcx %%rax, %%r14 \n\t"
- "movq $0, %%rax \n\t"
- "adox %%rdx, %%rax \n\t"
- "adcx %%r8, %%rax \n\t"
-
- "movq 24(%1), %%rdx # A[3] \n\t"
- "mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
- "xorl %%r10d, %%r10d \n\t"
- "adcx 24(%0), %%r8 \n\t"
- "movq %%r8, 24(%0) \n\t"
- "mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t"
- "adox %%r9, %%r10 \n\t"
- "adcx %%r12, %%r10 \n\t"
- "movq %%r10, 32(%0) \n\t"
- "mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t"
- "adox %%r11, %%r12 \n\t"
- "adcx %%r14, %%r12 \n\t"
- "movq %%r12, 40(%0) \n\t"
- "movq $0, %%r8 \n\t"
- "mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t"
- "adox %%r13, %%r14 \n\t"
- "adcx %%rax, %%r14 \n\t"
- "movq %%r14, 48(%0) \n\t"
- "movq $0, %%rax \n\t"
- "adox %%rdx, %%rax \n\t"
- "adcx %%r8, %%rax \n\t"
- "movq %%rax, 56(%0) \n\t"
+ "movq (%1), %%rdx; " /* A[0] */
+ "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */
+ "xorl %%r10d, %%r10d ;"
+ "movq %%r8, (%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */
+ "adox %%r9, %%r10 ;"
+ "movq %%r10, 8(%0) ;"
+ "mulx 16(%2), %%r12, %%r13; " /* A[0]*B[2] */
+ "adox %%r11, %%r12 ;"
+ "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */
+ "adox %%r13, %%r14 ;"
+ "movq $0, %%rax ;"
+ /******************************************/
+ "adox %%rdx, %%rax ;"
+
+ "movq 8(%1), %%rdx; " /* A[1] */
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
+ "xorl %%r10d, %%r10d ;"
+ "adcx 8(%0), %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+ "adox %%r9, %%r10 ;"
+ "adcx %%r12, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "mulx 16(%2), %%r12, %%r13; " /* A[1]*B[2] */
+ "adox %%r11, %%r12 ;"
+ "adcx %%r14, %%r12 ;"
+ "movq $0, %%r8 ;"
+ "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */
+ "adox %%r13, %%r14 ;"
+ "adcx %%rax, %%r14 ;"
+ "movq $0, %%rax ;"
+ /******************************************/
+ "adox %%rdx, %%rax ;"
+ "adcx %%r8, %%rax ;"
+
+ "movq 16(%1), %%rdx; " /* A[2] */
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
+ "xorl %%r10d, %%r10d ;"
+ "adcx 16(%0), %%r8 ;"
+ "movq %%r8, 16(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+ "adox %%r9, %%r10 ;"
+ "adcx %%r12, %%r10 ;"
+ "movq %%r10, 24(%0) ;"
+ "mulx 16(%2), %%r12, %%r13; " /* A[2]*B[2] */
+ "adox %%r11, %%r12 ;"
+ "adcx %%r14, %%r12 ;"
+ "movq $0, %%r8 ;"
+ "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */
+ "adox %%r13, %%r14 ;"
+ "adcx %%rax, %%r14 ;"
+ "movq $0, %%rax ;"
+ /******************************************/
+ "adox %%rdx, %%rax ;"
+ "adcx %%r8, %%rax ;"
+
+ "movq 24(%1), %%rdx; " /* A[3] */
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
+ "xorl %%r10d, %%r10d ;"
+ "adcx 24(%0), %%r8 ;"
+ "movq %%r8, 24(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+ "adox %%r9, %%r10 ;"
+ "adcx %%r12, %%r10 ;"
+ "movq %%r10, 32(%0) ;"
+ "mulx 16(%2), %%r12, %%r13; " /* A[3]*B[2] */
+ "adox %%r11, %%r12 ;"
+ "adcx %%r14, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "movq $0, %%r8 ;"
+ "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */
+ "adox %%r13, %%r14 ;"
+ "adcx %%rax, %%r14 ;"
+ "movq %%r14, 48(%0) ;"
+ "movq $0, %%rax ;"
+ /******************************************/
+ "adox %%rdx, %%rax ;"
+ "adcx %%r8, %%rax ;"
+ "movq %%rax, 56(%0) ;"
:
: "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rdx",
- "%r8", "%r9", "%r10", "%r11",
- "%r12", "%r13", "%r14");
+ : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14");
}
-static void mul_256x256_integer_x64_bmi2(u64 *const c, u64 *const a, u64 *const b)
+static void mul_256x256_integer_bmi2(u64 *const c, u64 *const a, u64 *const b)
{
__asm__ __volatile__(
- "movq (%1), %%rdx # A[0] \n\t"
- "mulx (%2), %%r8, %%r9 # A[0]*B[0] \n\t"
- "movq %%r8, (%0) \n\t"
- "mulx 8(%2), %%r10, %%rax # A[0]*B[1] \n\t"
- "addq %%r10, %%r9 \n\t"
- "movq %%r9, 8(%0) \n\t"
- "mulx 16(%2), %%r12, %%rbx # A[0]*B[2] \n\t"
- "adcq %%r12, %%rax \n\t"
- "mulx 24(%2), %%r14, %%rcx # A[0]*B[3] \n\t"
- "adcq %%r14, %%rbx \n\t"
- "adcq $0, %%rcx \n\t"
-
- "movq 8(%1), %%rdx # A[1] \n\t"
- "mulx (%2), %%r8, %%r9 # A[1]*B[0] \n\t"
- "mulx 8(%2), %%r10, %%r11 # A[1]*B[1] \n\t"
- "addq %%r10, %%r9 \n\t"
- "mulx 16(%2), %%r12, %%r13 # A[1]*B[2] \n\t"
- "adcq %%r12, %%r11 \n\t"
- "mulx 24(%2), %%r14, %%rdx # A[1]*B[3] \n\t"
- "adcq %%r14, %%r13 \n\t"
- "adcq $0, %%rdx \n\t"
-
- "addq %%r8, 8(%0) \n\t"
- "adcq %%rax, %%r9 \n\t"
- "movq %%r9, 16(%0) \n\t"
- "movq $0, %%rax \n\t"
- "adcq %%r11, %%rbx \n\t"
- "adcq %%r13, %%rcx \n\t"
- "adcq %%rdx, %%rax \n\t"
-
- "movq 16(%1), %%rdx # A[2] \n\t"
- "mulx (%2), %%r8, %%r9 # A[2]*B[0] \n\t"
- "mulx 8(%2), %%r10, %%r11 # A[2]*B[1] \n\t"
- "addq %%r10, %%r9 \n\t"
- "mulx 16(%2), %%r12, %%r13 # A[2]*B[2] \n\t"
- "adcq %%r12, %%r11 \n\t"
- "mulx 24(%2), %%r14, %%rdx # A[2]*B[3] \n\t"
- "adcq %%r14, %%r13 \n\t"
- "adcq $0, %%rdx \n\t"
-
- "addq %%r8, 16(%0) \n\t"
- "adcq %%rbx, %%r9 \n\t"
- "movq %%r9, 24(%0) \n\t"
- "movq $0, %%rbx \n\t"
- "adcq %%r11, %%rcx \n\t"
- "adcq %%r13, %%rax \n\t"
- "adcq %%rdx, %%rbx \n\t"
-
- "movq 24(%1), %%rdx # A[3] \n\t"
- "mulx (%2), %%r8, %%r9 # A[3]*B[0] \n\t"
- "mulx 8(%2), %%r10, %%r11 # A[3]*B[1] \n\t"
- "addq %%r10, %%r9 \n\t"
- "mulx 16(%2), %%r12, %%r13 # A[3]*B[2] \n\t"
- "adcq %%r12, %%r11 \n\t"
- "mulx 24(%2), %%r14, %%rdx # A[3]*B[3] \n\t"
- "adcq %%r14, %%r13 \n\t"
- "adcq $0, %%rdx \n\t"
-
- "addq %%r8, 24(%0) \n\t"
- "adcq %%rcx, %%r9 \n\t"
- "movq %%r9, 32(%0) \n\t"
- "movq $0, %%rcx \n\t"
- "adcq %%r11, %%rax \n\t"
- "movq %%rax, 40(%0) \n\t"
- "adcq %%r13, %%rbx \n\t"
- "movq %%rbx, 48(%0) \n\t"
- "adcq %%rdx, %%rcx \n\t"
- "movq %%rcx, 56(%0) \n\t"
+ "movq (%1), %%rdx; " /* A[0] */
+ "mulx (%2), %%r8, %%r12; " /* A[0]*B[0] */
+ "movq %%r8, (%0) ;"
+ "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
+ "addq %%r10, %%r12 ;"
+ "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
+ "adcq %%r8, %%rax ;"
+ "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
+ "adcq %%r10, %%rbx ;"
+ /******************************************/
+ "adcq $0, %%rcx ;"
+
+ "movq 8(%1), %%rdx; " /* A[1] */
+ "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
+ "addq %%r12, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%r12; " /* A[1]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%r12 ;"
+
+ "addq %%r9, %%rax ;"
+ "adcq %%r11, %%rbx ;"
+ "adcq %%r13, %%rcx ;"
+ "adcq $0, %%r12 ;"
+
+ "movq 16(%1), %%rdx; " /* A[2] */
+ "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
+ "addq %%rax, %%r8 ;"
+ "movq %%r8, 16(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rax ;"
+
+ "addq %%r9, %%rbx ;"
+ "adcq %%r11, %%rcx ;"
+ "adcq %%r13, %%r12 ;"
+ "adcq $0, %%rax ;"
+
+ "movq 24(%1), %%rdx; " /* A[3] */
+ "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
+ "addq %%rbx, %%r8 ;"
+ "movq %%r8, 24(%0) ;"
+ "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
+ "adcq %%r10, %%r9 ;"
+ "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
+ "adcq %%r8, %%r11 ;"
+ "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
+ "adcq %%r10, %%r13 ;"
+ /******************************************/
+ "adcq $0, %%rbx ;"
+
+ "addq %%r9, %%rcx ;"
+ "movq %%rcx, 32(%0) ;"
+ "adcq %%r11, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "adcq %%r13, %%rax ;"
+ "movq %%rax, 48(%0) ;"
+ "adcq $0, %%rbx ;"
+ "movq %%rbx, 56(%0) ;"
:
: "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8",
- "%r9", "%r10", "%r11", "%r12", "%r13", "%r14");
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13");
+}
+
+static void sqr_256x256_integer_adx(u64 *const c, u64 *const a)
+{
+ __asm__ __volatile__(
+ "movq (%1), %%rdx ;" /* A[0] */
+ "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
+ "xorl %%r15d, %%r15d;"
+ "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
+ "adcx %%r14, %%r9 ;"
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
+ "adcx %%rax, %%r10 ;"
+ "movq 24(%1), %%rdx ;" /* A[3] */
+ "mulx 8(%1), %%r11, %%r12 ;" /* A[1]*A[3] */
+ "adcx %%rcx, %%r11 ;"
+ "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
+ "adcx %%rax, %%r12 ;"
+ "movq 8(%1), %%rdx ;" /* A[1] */
+ "adcx %%r15, %%r13 ;"
+ "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
+ "movq $0, %%r14 ;"
+ /******************************************/
+ "adcx %%r15, %%r14 ;"
+
+ "xorl %%r15d, %%r15d;"
+ "adox %%rax, %%r10 ;"
+ "adcx %%r8, %%r8 ;"
+ "adox %%rcx, %%r11 ;"
+ "adcx %%r9, %%r9 ;"
+ "adox %%r15, %%r12 ;"
+ "adcx %%r10, %%r10 ;"
+ "adox %%r15, %%r13 ;"
+ "adcx %%r11, %%r11 ;"
+ "adox %%r15, %%r14 ;"
+ "adcx %%r12, %%r12 ;"
+ "adcx %%r13, %%r13 ;"
+ "adcx %%r14, %%r14 ;"
+
+ "movq (%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+ /*******************/
+ "movq %%rax, 0(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "movq 8(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 16(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 24(%0) ;"
+ "movq 16(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 32(%0) ;"
+ "adcq %%rcx, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "movq 24(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 48(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 56(%0) ;"
+ :
+ : "r"(c), "r"(a)
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15");
}
-static void sqr_256x256_integer_x64(u64 *const c, u64 *const a)
+static void sqr_256x256_integer_bmi2(u64 *const c, u64 *const a)
{
__asm__ __volatile__(
- "movq (%1), %%rdx # A[0] \n\t"
- "mulx %%rdx, %%r8, %%r9 # A[0]^2 \n\t"
- "movq 8(%1), %%rdx # A[1] \n\t"
- "mulx %%rdx, %%r10, %%r11 # A[1]^2 \n\t"
- "movq %%r8, (%0) \n\t"
- "movq %%r9, 8(%0) \n\t"
- "movq %%r10, 16(%0) \n\t"
- "movq %%r11, 24(%0) \n\t"
-
- "movq 16(%1), %%rdx # A[2] \n\t"
- "mulx %%rdx, %%r8, %%r9 # A[2]^2 \n\t"
- "movq 24(%1), %%rdx # A[3] \n\t"
- "mulx %%rdx, %%r10, %%r11 # A[3]^2 \n\t"
- "movq %%r8, 32(%0) \n\t"
- "movq %%r9, 40(%0) \n\t"
- "movq %%r10, 48(%0) \n\t"
- "movq %%r11, 56(%0) \n\t"
-
- "movq 8(%1), %%rdx # A[1] \n\t"
- "mulx (%1), %%r8, %%r9 # A[0]*A[1] \n\t"
- "mulx 16(%1), %%r10, %%r11 # A[2]*A[1] \n\t"
- "mulx 24(%1), %%rcx, %%r14 # A[3]*A[1] \n\t"
-
- "movq 16(%1), %%rdx # A[2] \n\t"
- "mulx 24(%1), %%r12, %%r13 # A[3]*A[2] \n\t"
- "mulx (%1), %%rax, %%rdx # A[0]*A[2] \n\t"
-
- " addq %%rax, %%r9 \n\t"
- " adcq %%rdx, %%r10 \n\t"
- " adcq %%rcx, %%r11 \n\t"
- " adcq %%r14, %%r12 \n\t"
- " adcq $0, %%r13 \n\t"
- " movq $0, %%r14 \n\t"
- " adcq $0, %%r14 \n\t"
-
- " movq (%1), %%rdx # A[0] \n\t"
- " mulx 24(%1), %%rax, %%rdx # A[0]*A[3] \n\t"
-
- " addq %%rax, %%r10 \n\t"
- " adcq %%rdx, %%r11 \n\t"
- " adcq $0, %%r12 \n\t"
- " adcq $0, %%r13 \n\t"
- " adcq $0, %%r14 \n\t"
-
- " shldq $1, %%r13, %%r14 \n\t"
- " shldq $1, %%r12, %%r13 \n\t"
- " shldq $1, %%r11, %%r12 \n\t"
- " shldq $1, %%r10, %%r11 \n\t"
- " shldq $1, %%r9, %%r10 \n\t"
- " shldq $1, %%r8, %%r9 \n\t"
- " shlq $1, %%r8 \n\t"
-
- " addq 8(%0), %%r8 \n\t"
- " movq %%r8, 8(%0) \n\t"
- " adcq 16(%0), %%r9 \n\t"
- " movq %%r9, 16(%0) \n\t"
- " adcq 24(%0), %%r10 \n\t"
- " movq %%r10, 24(%0) \n\t"
- " adcq 32(%0), %%r11 \n\t"
- " movq %%r11, 32(%0) \n\t"
- " adcq 40(%0), %%r12 \n\t"
- " movq %%r12, 40(%0) \n\t"
- " adcq 48(%0), %%r13 \n\t"
- " movq %%r13, 48(%0) \n\t"
- " adcq 56(%0), %%r14 \n\t"
- " movq %%r14, 56(%0) \n\t"
+ "movq 8(%1), %%rdx ;" /* A[1] */
+ "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
+ "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
+ "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
+
+ "movq 16(%1), %%rdx ;" /* A[2] */
+ "mulx 24(%1), %%r12, %%r13 ;" /* A[3]*A[2] */
+ "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
+
+ "addq %%rax, %%r9 ;"
+ "adcq %%rdx, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq %%r14, %%r12 ;"
+ "adcq $0, %%r13 ;"
+ "movq $0, %%r14 ;"
+ "adcq $0, %%r14 ;"
+
+ "movq (%1), %%rdx ;" /* A[0] */
+ "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
+
+ "addq %%rax, %%r10 ;"
+ "adcq %%rcx, %%r11 ;"
+ "adcq $0, %%r12 ;"
+ "adcq $0, %%r13 ;"
+ "adcq $0, %%r14 ;"
+
+ "shldq $1, %%r13, %%r14 ;"
+ "shldq $1, %%r12, %%r13 ;"
+ "shldq $1, %%r11, %%r12 ;"
+ "shldq $1, %%r10, %%r11 ;"
+ "shldq $1, %%r9, %%r10 ;"
+ "shldq $1, %%r8, %%r9 ;"
+ "shlq $1, %%r8 ;"
+
+ /*******************/
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
+ /*******************/
+ "movq %%rax, 0(%0) ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, 8(%0) ;"
+ "movq 8(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
+ "adcq %%rax, %%r9 ;"
+ "movq %%r9, 16(%0) ;"
+ "adcq %%rcx, %%r10 ;"
+ "movq %%r10, 24(%0) ;"
+ "movq 16(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
+ "adcq %%rax, %%r11 ;"
+ "movq %%r11, 32(%0) ;"
+ "adcq %%rcx, %%r12 ;"
+ "movq %%r12, 40(%0) ;"
+ "movq 24(%1), %%rdx ;"
+ "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
+ "adcq %%rax, %%r13 ;"
+ "movq %%r13, 48(%0) ;"
+ "adcq %%rcx, %%r14 ;"
+ "movq %%r14, 56(%0) ;"
:
: "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rcx", "%rdx",
- "%r8", "%r9", "%r10", "%r11",
- "%r12", "%r13", "%r14");
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14");
}
-static void red_EltFp25519_1w_x64_adx(u64 *const c, u64 *const a)
+static void red_eltfp25519_1w_adx(u64 *const c, u64 *const a)
{
__asm__ __volatile__(
- "movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
- "mulx 32(%1), %%r8, %%r10 # c*C[4] \n\t"
- "xorl %%ebx, %%ebx \n\t"
- "adox (%1), %%r8 \n\t"
- "mulx 40(%1), %%r9, %%r11 # c*C[5] \n\t"
- "adcx %%r10, %%r9 \n\t"
- "adox 8(%1), %%r9 \n\t"
- "mulx 48(%1), %%r10, %%rax # c*C[6] \n\t"
- "adcx %%r11, %%r10 \n\t"
- "adox 16(%1), %%r10 \n\t"
- "movq %%r10, 16(%0) \n\t"
- "mulx 56(%1), %%r11, %%rcx # c*C[7] \n\t"
- "adcx %%rax, %%r11 \n\t"
- "adox 24(%1), %%r11 \n\t"
- "movq %%r11, 24(%0) \n\t"
- "adcx %%rbx, %%rcx \n\t"
- "adox %%rbx, %%rcx \n\t"
- "xorl %%ebx, %%ebx \n\t"
- "mulx %%rcx, %%rax, %%rcx \n\t"
- "adcx %%rax, %%r8 \n\t"
- "movq %%r8, (%0) \n\t"
- "adcx %%rcx, %%r9 \n\t"
- "movq %%r9, 8(%0) \n\t"
+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
+ "xorl %%ebx, %%ebx ;"
+ "adox (%1), %%r8 ;"
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
+ "adcx %%r10, %%r9 ;"
+ "adox 8(%1), %%r9 ;"
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
+ "adcx %%r11, %%r10 ;"
+ "adox 16(%1), %%r10 ;"
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
+ "adcx %%rax, %%r11 ;"
+ "adox 24(%1), %%r11 ;"
+ /***************************************/
+ "adcx %%rbx, %%rcx ;"
+ "adox %%rbx, %%rcx ;"
+ "clc ;"
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
+ "adcx %%rax, %%r8 ;"
+ "adcx %%rcx, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcx %%rbx, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcx %%rbx, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
:
: "r"(c), "r"(a)
: "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
}
-static void red_EltFp25519_1w_x64_bmi2(u64 *const c, u64 *const a)
+static void red_eltfp25519_1w_bmi2(u64 *const c, u64 *const a)
{
__asm__ __volatile__(
- "movl $38, %%edx # 2*c = 38 = 2^256 \n\t"
- "mulx 32(%1), %%r8, %%r9 # c*C[4] \n\t"
- "mulx 40(%1), %%r10, %%r11 # c*C[5] \n\t"
- "addq %%r9, %%r10 \n\t"
- "mulx 48(%1), %%r12, %%r13 # c*C[6] \n\t"
- "adcq %%r11, %%r12 \n\t"
- "mulx 56(%1), %%rax, %%rcx # c*C[7] \n\t"
- "adcq %%r13, %%rax \n\t"
- "adcq $0, %%rcx \n\t"
-
- "addq (%1), %%r8 \n\t"
- "adcq 8(%1), %%r10 \n\t"
- "adcq 16(%1), %%r12 \n\t"
- "movq %%r12, 16(%0) \n\t"
- "adcq 24(%1), %%rax \n\t"
- "movq %%rax, 24(%0) \n\t"
- "adcq $0, %%rcx \n\t"
-
- "mulx %%rcx, %%rax, %%rcx \n\t"
- "addq %%rax, %%r8 \n\t"
- "movq %%r8, (%0) \n\t"
- "adcq %%rcx, %%r10 \n\t"
- "movq %%r10, 8(%0) \n\t"
+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
+ "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
+ "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
+ "addq %%r10, %%r9 ;"
+ "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
+ "adcq %%r11, %%r10 ;"
+ "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
+ "adcq %%rax, %%r11 ;"
+ /***************************************/
+ "adcq $0, %%rcx ;"
+ "addq (%1), %%r8 ;"
+ "adcq 8(%1), %%r9 ;"
+ "adcq 16(%1), %%r10 ;"
+ "adcq 24(%1), %%r11 ;"
+ "adcq $0, %%rcx ;"
+ "mulx %%rcx, %%rax, %%rcx ;" /* c*C[4] */
+ "addq %%rax, %%r8 ;"
+ "adcq %%rcx, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcq $0, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcq $0, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
:
: "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13");
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
}
-static inline void add_EltFp25519_1w_x64_adx(u64 *const c, u64 *const a, u64 *const b)
+static __always_inline void add_eltfp25519_1w_adx(u64 *const c, u64 *const a, u64 *const b)
{
__asm__ __volatile__(
- "movq (%2), %%rax \n\t"
- "movq 8(%2), %%rcx \n\t"
- "movq 16(%2), %%r8 \n\t"
- "movq 24(%2), %%r9 \n\t"
- "clc \n\t"
- "adcx (%1), %%rax \n\t"
- "adcx 8(%1), %%rcx \n\t"
- "adcx 16(%1), %%r8 \n\t"
- "adcx 24(%1), %%r9 \n\t"
- "movq %%rcx, 8(%0) \n\t"
- "movq %%r8 , 16(%0) \n\t"
- "movq %%r9 , 24(%0) \n\t"
- "setc %%cl \n\t"
- "neg %%rcx \n\t"
- "andq $38, %%rcx \n\t"
- "addq %%rcx, %%rax \n\t"
- "movq %%rax, (%0) \n\t"
+ "mov $38, %%eax ;"
+ "xorl %%ecx, %%ecx ;"
+ "movq (%2), %%r8 ;"
+ "adcx (%1), %%r8 ;"
+ "movq 8(%2), %%r9 ;"
+ "adcx 8(%1), %%r9 ;"
+ "movq 16(%2), %%r10 ;"
+ "adcx 16(%1), %%r10 ;"
+ "movq 24(%2), %%r11 ;"
+ "adcx 24(%1), %%r11 ;"
+ "cmovc %%eax, %%ecx ;"
+ "xorl %%eax, %%eax ;"
+ "adcx %%rcx, %%r8 ;"
+ "adcx %%rax, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcx %%rax, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcx %%rax, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $38, %%ecx ;"
+ "cmovc %%ecx, %%eax ;"
+ "addq %%rax, %%r8 ;"
+ "movq %%r8, (%0) ;"
:
: "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rcx", "%r8", "%r9");
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
}
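
Addition applies the same trick at a smaller scale: add the four words, and if the sum carries out of bit 256, fold that carry back in as +38, with a second conditional +38 (the cmovc above) catching the rare carry produced by the first fold. A branchless C sketch under the same assumptions as the earlier ones:

	#include <stdint.h>

	static void add_sketch(uint64_t c[4], const uint64_t a[4], const uint64_t b[4])
	{
		unsigned __int128 t = 0;
		uint64_t carry;
		int i;

		for (i = 0; i < 4; ++i) {
			t += (unsigned __int128)a[i] + b[i];
			c[i] = (uint64_t)t;
			t >>= 64;
		}
		/* Carry-out is worth 2^256 = 38 (mod 2^255 - 19); fold it in. */
		carry = 38 * (uint64_t)t;            /* 0 or 38, branchlessly */
		for (i = 0; i < 4; ++i) {
			t = (unsigned __int128)c[i] + carry;
			c[i] = (uint64_t)t;
			carry = (uint64_t)(t >> 64);
		}
		c[0] += 38 * carry;                  /* rare second carry */
	}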
-static inline void add_EltFp25519_1w_x64_bmi2(u64 *const c, u64 *const a, u64 *const b)
+static __always_inline void add_eltfp25519_1w_bmi2(u64 *const c, u64 *const a, u64 *const b)
{
__asm__ __volatile__(
- "movq (%2), %%rax \n\t"
- "movq 8(%2), %%rcx \n\t"
- "movq 16(%2), %%r8 \n\t"
- "movq 24(%2), %%r9 \n\t"
- "add (%1), %%rax \n\t"
- "adc 8(%1), %%rcx \n\t"
- "adc 16(%1), %%r8 \n\t"
- "adc 24(%1), %%r9 \n\t"
- "movq %%rcx, 8(%0) \n\t"
- "movq %%r8 , 16(%0) \n\t"
- "movq %%r9 , 24(%0) \n\t"
- "setc %%cl \n\t"
- "neg %%rcx \n\t"
- "andq $38, %%rcx \n\t"
- "addq %%rcx, %%rax \n\t"
- "movq %%rax, (%0) \n\t"
+ "mov $38, %%eax ;"
+ "movq (%2), %%r8 ;"
+ "addq (%1), %%r8 ;"
+ "movq 8(%2), %%r9 ;"
+ "adcq 8(%1), %%r9 ;"
+ "movq 16(%2), %%r10 ;"
+ "adcq 16(%1), %%r10 ;"
+ "movq 24(%2), %%r11 ;"
+ "adcq 24(%1), %%r11 ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%eax, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "adcq $0, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcq $0, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcq $0, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%eax, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
:
: "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rcx", "%r8", "%r9");
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
}
-static inline void sub_EltFp25519_1w_x64(u64 *const __restrict c, u64 *const __restrict a, u64 *const __restrict b)
+static __always_inline void sub_eltfp25519_1w(u64 *const c, u64 *const a, u64 *const b)
{
__asm__ __volatile__(
- "movq (%1), %%rax \n\t"
- "movq 8(%1), %%rcx \n\t"
- "movq 16(%1), %%r8 \n\t"
- "movq 24(%1), %%r9 \n\t"
- "subq (%2), %%rax \n\t"
- "sbbq 8(%2), %%rcx \n\t"
- "sbbq 16(%2), %%r8 \n\t"
- "sbbq 24(%2), %%r9 \n\t"
- "movq %%rcx, 8(%0) \n\t"
- "movq %%r8 , 16(%0) \n\t"
- "movq %%r9 , 24(%0) \n\t"
- "setc %%cl \n\t"
- "neg %%rcx \n\t"
- "andq $38, %%rcx \n\t"
- "subq %%rcx, %%rax \n\t"
- "movq %%rax, (%0) \n\t"
+ "mov $38, %%eax ;"
+ "movq (%1), %%r8 ;"
+ "subq (%2), %%r8 ;"
+ "movq 8(%1), %%r9 ;"
+ "sbbq 8(%2), %%r9 ;"
+ "movq 16(%1), %%r10 ;"
+ "sbbq 16(%2), %%r10 ;"
+ "movq 24(%1), %%r11 ;"
+ "sbbq 24(%2), %%r11 ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%eax, %%ecx ;"
+ "subq %%rcx, %%r8 ;"
+ "sbbq $0, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "sbbq $0, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "sbbq $0, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%eax, %%ecx ;"
+ "subq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
:
: "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rcx", "%r8", "%r9");
+ : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
}
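
Subtraction mirrors this with borrows: a borrow out of bit 256 is repaid by subtracting 38, twice in the worst case. The sketch below uses the all-ones-mask idiom (38 & (0 - borrow)) that the removed setc/neg/and sequence used, where the new asm prefers cmovc; it assumes arithmetic right shift of a negative __int128, which GCC and Clang provide:

	#include <stdint.h>

	static void sub_sketch(uint64_t c[4], const uint64_t a[4], const uint64_t b[4])
	{
		__int128 t = 0;
		uint64_t adj;
		int i;

		for (i = 0; i < 4; ++i) {
			t += (__int128)a[i] - b[i];
			c[i] = (uint64_t)t;
			t >>= 64;             /* 0 or -1: ripple borrow */
		}
		/* A borrow-out means the result is 2^256 too high mod 2^256,
		 * and -2^256 = -38 (mod 2^255 - 19), so subtract 38. */
		adj = 38 & (uint64_t)t;       /* t is 0 or all-ones */
		t = -(__int128)adj;
		for (i = 0; i < 4; ++i) {
			t += c[i];
			c[i] = (uint64_t)t;
			t >>= 64;
		}
		c[0] -= 38 & (uint64_t)t;     /* rare second borrow */
	}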
-static inline void mul_a24_EltFp25519_1w_x64(u64 *const c, u64 *const a)
+/*
+ * Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666
+ */
+static __always_inline void mul_a24_eltfp25519_1w(u64 *const c, u64 *const a)
{
- /* a24 = (A+2)/4 = (486662+2)/4 = 121666 */
const u64 a24 = 121666;
__asm__ __volatile__(
- "movq %2, %%rdx \n\t"
- "mulx (%1), %%rax, %%r8 \n\t"
- "mulx 8(%1), %%rcx, %%r9 \n\t"
- "movq %%rax, (%0) \n\t"
- "movq %%rcx, 8(%0) \n\t"
- "mulx 16(%1), %%rax, %%r10 \n\t"
- "mulx 24(%1), %%rcx, %%r11 \n\t"
- "movq %%rax, 16(%0) \n\t"
- "movq %%rcx, 24(%0) \n\t"
- "movq $38, %%rdx \n\t"
- "mulx %%r11, %%rax, %%rcx \n\t"
- "addq %%rax, (%0) \n\t"
- "adcq %%r8, 8(%0) \n\t"
- "adcq %%r9, 16(%0) \n\t"
- "adcq %%r10, 24(%0) \n\t"
+ "movq %2, %%rdx ;"
+ "mulx (%1), %%r8, %%r10 ;"
+ "mulx 8(%1), %%r9, %%r11 ;"
+ "addq %%r10, %%r9 ;"
+ "mulx 16(%1), %%r10, %%rax ;"
+ "adcq %%r11, %%r10 ;"
+ "mulx 24(%1), %%r11, %%rcx ;"
+ "adcq %%rax, %%r11 ;"
+ /**************************/
+ "adcq $0, %%rcx ;"
+ "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
+ "mulx %%rcx, %%rax, %%rcx ;"
+ "addq %%rax, %%r8 ;"
+ "adcq %%rcx, %%r9 ;"
+ "movq %%r9, 8(%0) ;"
+ "adcq $0, %%r10 ;"
+ "movq %%r10, 16(%0) ;"
+ "adcq $0, %%r11 ;"
+ "movq %%r11, 24(%0) ;"
+ "mov $0, %%ecx ;"
+ "cmovc %%edx, %%ecx ;"
+ "addq %%rcx, %%r8 ;"
+ "movq %%r8, (%0) ;"
:
: "r"(c), "r"(a), "r"(a24)
- : "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
+ : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11");
}
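
Since a24 = 121666 fits in 17 bits, the product of a 256-bit element by a24 spills into at most one extra word, which the same 2^256 ≡ 38 folding absorbs. A sketch under the previous assumptions (illustrative names only):

	#include <stdint.h>

	static void mul_a24_sketch(uint64_t c[4], const uint64_t a[4])
	{
		const uint64_t a24 = 121666;
		unsigned __int128 t = 0;
		uint64_t carry;
		int i;

		for (i = 0; i < 4; ++i) {
			t += (unsigned __int128)a24 * a[i];
			c[i] = (uint64_t)t;
			t >>= 64;
		}
		carry = 38 * (uint64_t)t;          /* t < 2^17 here */
		for (i = 0; i < 4; ++i) {
			t = (unsigned __int128)c[i] + carry;
			c[i] = (uint64_t)t;
			carry = (uint64_t)(t >> 64);
		}
		c[0] += 38 * carry;
	}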
-static void inv_EltFp25519_1w_x64_adx(u64 *const pC, u64 *const pA)
+static void inv_eltfp25519_1w_adx(u64 *const c, u64 *const a)
{
-#define sqrn_EltFp25519_1w_x64(a, times) \
- counter = times; \
- while (counter-- > 0) { \
- sqr_EltFp25519_1w_x64_adx(a); \
+#define sqrn_eltfp25519_1w_adx(A, times) \
+ counter = times; \
+ while (counter-- > 0) { \
+ sqr_eltfp25519_1w_adx(A); \
}
- EltFp25519_1w_Buffer_x64 buffer_1w;
- EltFp25519_1w_x64 x0, x1, x2;
+ eltfp25519_1w_buffer buffer_1w;
+ eltfp25519_1w x0, x1, x2;
u64 *T[5];
u64 counter;
T[0] = x0;
- T[1] = pC; /* x^(-1) */
+ T[1] = c; /* x^(-1) */
T[2] = x1;
T[3] = x2;
- T[4] = pA; /* x */
-
- copy_EltFp25519_1w_x64(T[1], pA);
- sqrn_EltFp25519_1w_x64(T[1], 1);
- copy_EltFp25519_1w_x64(T[2], T[1]);
- sqrn_EltFp25519_1w_x64(T[2], 2);
- mul_EltFp25519_1w_x64_adx(T[0], pA, T[2]);
- mul_EltFp25519_1w_x64_adx(T[1], T[1], T[0]);
- copy_EltFp25519_1w_x64(T[2], T[1]);
- sqrn_EltFp25519_1w_x64(T[2], 1);
- mul_EltFp25519_1w_x64_adx(T[0], T[0], T[2]);
- copy_EltFp25519_1w_x64(T[2], T[0]);
- sqrn_EltFp25519_1w_x64(T[2], 5);
- mul_EltFp25519_1w_x64_adx(T[0], T[0], T[2]);
- copy_EltFp25519_1w_x64(T[2], T[0]);
- sqrn_EltFp25519_1w_x64(T[2], 10);
- mul_EltFp25519_1w_x64_adx(T[2], T[2], T[0]);
- copy_EltFp25519_1w_x64(T[3], T[2]);
- sqrn_EltFp25519_1w_x64(T[3], 20);
- mul_EltFp25519_1w_x64_adx(T[3], T[3], T[2]);
- sqrn_EltFp25519_1w_x64(T[3], 10);
- mul_EltFp25519_1w_x64_adx(T[3], T[3], T[0]);
- copy_EltFp25519_1w_x64(T[0], T[3]);
- sqrn_EltFp25519_1w_x64(T[0], 50);
- mul_EltFp25519_1w_x64_adx(T[0], T[0], T[3]);
- copy_EltFp25519_1w_x64(T[2], T[0]);
- sqrn_EltFp25519_1w_x64(T[2], 100);
- mul_EltFp25519_1w_x64_adx(T[2], T[2], T[0]);
- sqrn_EltFp25519_1w_x64(T[2], 50);
- mul_EltFp25519_1w_x64_adx(T[2], T[2], T[3]);
- sqrn_EltFp25519_1w_x64(T[2], 5);
- mul_EltFp25519_1w_x64_adx(T[1], T[1], T[2]);
-#undef sqrn_EltFp25519_1w_x64
+ T[4] = a; /* x */
+
+ copy_eltfp25519_1w(T[1], a);
+ sqrn_eltfp25519_1w_adx(T[1], 1);
+ copy_eltfp25519_1w(T[2], T[1]);
+ sqrn_eltfp25519_1w_adx(T[2], 2);
+ mul_eltfp25519_1w_adx(T[0], a, T[2]);
+ mul_eltfp25519_1w_adx(T[1], T[1], T[0]);
+ copy_eltfp25519_1w(T[2], T[1]);
+ sqrn_eltfp25519_1w_adx(T[2], 1);
+ mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_adx(T[2], 5);
+ mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_adx(T[2], 10);
+ mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
+ copy_eltfp25519_1w(T[3], T[2]);
+ sqrn_eltfp25519_1w_adx(T[3], 20);
+ mul_eltfp25519_1w_adx(T[3], T[3], T[2]);
+ sqrn_eltfp25519_1w_adx(T[3], 10);
+ mul_eltfp25519_1w_adx(T[3], T[3], T[0]);
+ copy_eltfp25519_1w(T[0], T[3]);
+ sqrn_eltfp25519_1w_adx(T[0], 50);
+ mul_eltfp25519_1w_adx(T[0], T[0], T[3]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_adx(T[2], 100);
+ mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
+ sqrn_eltfp25519_1w_adx(T[2], 50);
+ mul_eltfp25519_1w_adx(T[2], T[2], T[3]);
+ sqrn_eltfp25519_1w_adx(T[2], 5);
+ mul_eltfp25519_1w_adx(T[1], T[1], T[2]);
+#undef sqrn_eltfp25519_1w_adx
}
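
The inversion is Fermat's little theorem made constant-time: instead of a variable-time extended Euclid, it computes

	x^(-1) = x^(p-2) (mod p),    p = 2^255 - 19
	p - 2  = 2^255 - 21 = 2^5 * (2^250 - 1) + 11

The first few steps leave x^11 in T[1]; the T[0]/T[2]/T[3] ping-pong then climbs the chain x^(2^5-1) -> x^(2^10-1) -> x^(2^20-1) -> x^(2^40-1) -> x^(2^50-1) -> x^(2^100-1) -> x^(2^200-1) -> x^(2^250-1); five final squarings and one multiplication by T[1] yield x^(2^255-21). In total: 254 squarings and 11 multiplications.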
-static void inv_EltFp25519_1w_x64_bmi2(u64 *const pC, u64 *const pA)
+static void inv_eltfp25519_1w_bmi2(u64 *const c, u64 *const a)
{
-#define sqrn_EltFp25519_1w_x64(a, times) \
- counter = times; \
- while (counter-- > 0) { \
- sqr_EltFp25519_1w_x64_bmi2(a); \
+#define sqrn_eltfp25519_1w_bmi2(A, times) \
+ counter = times; \
+ while (counter-- > 0) { \
+ sqr_eltfp25519_1w_bmi2(A); \
}
- EltFp25519_1w_Buffer_x64 buffer_1w;
- EltFp25519_1w_x64 x0, x1, x2;
+ eltfp25519_1w_buffer buffer_1w;
+ eltfp25519_1w x0, x1, x2;
u64 *T[5];
u64 counter;
T[0] = x0;
- T[1] = pC; /* x^(-1) */
+ T[1] = c; /* x^(-1) */
T[2] = x1;
T[3] = x2;
- T[4] = pA; /* x */
-
- copy_EltFp25519_1w_x64(T[1], pA);
- sqrn_EltFp25519_1w_x64(T[1], 1);
- copy_EltFp25519_1w_x64(T[2], T[1]);
- sqrn_EltFp25519_1w_x64(T[2], 2);
- mul_EltFp25519_1w_x64_bmi2(T[0], pA, T[2]);
- mul_EltFp25519_1w_x64_bmi2(T[1], T[1], T[0]);
- copy_EltFp25519_1w_x64(T[2], T[1]);
- sqrn_EltFp25519_1w_x64(T[2], 1);
- mul_EltFp25519_1w_x64_bmi2(T[0], T[0], T[2]);
- copy_EltFp25519_1w_x64(T[2], T[0]);
- sqrn_EltFp25519_1w_x64(T[2], 5);
- mul_EltFp25519_1w_x64_bmi2(T[0], T[0], T[2]);
- copy_EltFp25519_1w_x64(T[2], T[0]);
- sqrn_EltFp25519_1w_x64(T[2], 10);
- mul_EltFp25519_1w_x64_bmi2(T[2], T[2], T[0]);
- copy_EltFp25519_1w_x64(T[3], T[2]);
- sqrn_EltFp25519_1w_x64(T[3], 20);
- mul_EltFp25519_1w_x64_bmi2(T[3], T[3], T[2]);
- sqrn_EltFp25519_1w_x64(T[3], 10);
- mul_EltFp25519_1w_x64_bmi2(T[3], T[3], T[0]);
- copy_EltFp25519_1w_x64(T[0], T[3]);
- sqrn_EltFp25519_1w_x64(T[0], 50);
- mul_EltFp25519_1w_x64_bmi2(T[0], T[0], T[3]);
- copy_EltFp25519_1w_x64(T[2], T[0]);
- sqrn_EltFp25519_1w_x64(T[2], 100);
- mul_EltFp25519_1w_x64_bmi2(T[2], T[2], T[0]);
- sqrn_EltFp25519_1w_x64(T[2], 50);
- mul_EltFp25519_1w_x64_bmi2(T[2], T[2], T[3]);
- sqrn_EltFp25519_1w_x64(T[2], 5);
- mul_EltFp25519_1w_x64_bmi2(T[1], T[1], T[2]);
-#undef sqrn_EltFp25519_1w_x64
+ T[4] = a; /* x */
+
+ copy_eltfp25519_1w(T[1], a);
+ sqrn_eltfp25519_1w_bmi2(T[1], 1);
+ copy_eltfp25519_1w(T[2], T[1]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 2);
+ mul_eltfp25519_1w_bmi2(T[0], a, T[2]);
+ mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]);
+ copy_eltfp25519_1w(T[2], T[1]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 1);
+ mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 5);
+ mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 10);
+ mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
+ copy_eltfp25519_1w(T[3], T[2]);
+ sqrn_eltfp25519_1w_bmi2(T[3], 20);
+ mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]);
+ sqrn_eltfp25519_1w_bmi2(T[3], 10);
+ mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]);
+ copy_eltfp25519_1w(T[0], T[3]);
+ sqrn_eltfp25519_1w_bmi2(T[0], 50);
+ mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]);
+ copy_eltfp25519_1w(T[2], T[0]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 100);
+ mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 50);
+ mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]);
+ sqrn_eltfp25519_1w_bmi2(T[2], 5);
+ mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]);
+#undef sqrn_eltfp25519_1w_bmi2
}
-static inline void fred_EltFp25519_1w_x64(u64 *const c)
+/*
+ * Given C, a 256-bit number, fred_eltfp25519_1w reduces C in place
+ * to its canonical representative, satisfying 0 <= C < 2**255-19.
+ */
+static __always_inline void fred_eltfp25519_1w(u64 *const c)
{
- s64 last = (((s64 *)c)[3]) >> 63;
- c[3] &= (1ULL << 63) - 1;
- c[0] += 19 & last;
+ __asm__ __volatile__(
+ /* First, obtain a number less than 2^255. */
+ "btrq $63, 24(%0) ;"
+ "sbbl %%ecx, %%ecx ;"
+ "andq $19, %%rcx ;"
+ "addq %%rcx, (%0) ;"
+ "adcq $0, 8(%0) ;"
+ "adcq $0, 16(%0) ;"
+ "adcq $0, 24(%0) ;"
+
+ "btrq $63, 24(%0) ;"
+ "sbbl %%ecx, %%ecx ;"
+ "andq $19, %%rcx ;"
+ "addq %%rcx, (%0) ;"
+ "adcq $0, 8(%0) ;"
+ "adcq $0, 16(%0) ;"
+ "adcq $0, 24(%0) ;"
+
+ /* Then, in case the number falls within [2^255-19, 2^255-1] */
+ "cmpq $-19, (%0) ;"
+ "setaeb %%al ;"
+ "cmpq $-1, 8(%0) ;"
+ "setzb %%bl ;"
+ "cmpq $-1, 16(%0) ;"
+ "setzb %%cl ;"
+ "movq 24(%0), %%rdx ;"
+ "addq $1, %%rdx ;"
+ "shrq $63, %%rdx ;"
+ "andb %%bl, %%al ;"
+ "andb %%dl, %%cl ;"
+ "test %%cl, %%al ;"
+ "movl $0, %%eax ;"
+ "movl $19, %%ecx ;"
+ "cmovnz %%rcx, %%rax ;"
+ "addq %%rax, (%0) ;"
+ "adcq $0, 8(%0) ;"
+ "adcq $0, 16(%0) ;"
+ "adcq $0, 24(%0) ;"
+ "btrq $63, 24(%0) ;"
+ :
+ : "r"(c)
+ : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx");
}
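
fred_eltfp25519_1w produces the canonical representative just before a value leaves the field arithmetic. An equivalent branchless canonicalisation in portable C, not a transliteration of the asm above, under the same __int128 assumption: fold bit 255 twice (2^255 ≡ 19 mod p), then observe that c >= p exactly when c + 19 reaches bit 255:

	#include <stdint.h>

	static void fred_sketch(uint64_t c[4])
	{
		unsigned __int128 t;
		uint64_t d[4], top, mask;
		int i, k;

		/* Fold bit 255 twice: 2^255 = 19 (mod 2^255 - 19). */
		for (k = 0; k < 2; ++k) {
			top = c[3] >> 63;
			c[3] &= ~(1ULL << 63);
			t = (unsigned __int128)19 * top;
			for (i = 0; i < 4; ++i) {
				t += c[i];
				c[i] = (uint64_t)t;
				t >>= 64;
			}
		}
		/* Now c < 2^255, and c >= p exactly when c + 19 sets bit 255. */
		t = 19;
		for (i = 0; i < 4; ++i) {
			t += c[i];
			d[i] = (uint64_t)t;
			t >>= 64;
		}
		mask = 0 - (d[3] >> 63);          /* all-ones iff c >= p */
		d[3] &= ~(1ULL << 63);            /* d = c - p in that case */
		for (i = 0; i < 4; ++i)
			c[i] = (c[i] & ~mask) | (d[i] & mask);
	}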
-static inline void cswap_x64(u64 bit, u64 *const px, u64 *const py)
+static __always_inline void cswap(u64 bit, u64 *const px, u64 *const py)
{
- int i = 0;
+ int i;
u64 mask = 0ULL - bit;
- for (i = 0; i < NUM_WORDS_ELTFP25519_X64; ++i) {
+
+ for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
u64 t = mask & (px[i] ^ py[i]);
px[i] = px[i] ^ t;
py[i] = py[i] ^ t;
}
}
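
cswap is branch-free on purpose: 0ULL - bit is all-ones when bit is 1 and zero when bit is 0, so the XOR-swap either happens in full or is a no-op, executing the identical instruction sequence either way and leaking nothing about the secret bit through timing or branch prediction. A standalone demonstration of the mask identity:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t x = 5, y = 9, bit = 1;
		uint64_t mask = 0ULL - bit;       /* all-ones when bit == 1 */
		uint64_t t = mask & (x ^ y);

		x ^= t;
		y ^= t;
		printf("%llu %llu\n", (unsigned long long)x,
		       (unsigned long long)y);    /* prints "9 5" */
		return 0;
	}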
-static __always_inline void reduce_point_mod_2_255_19(u64 *p)
+bool curve25519_precomp_adx(u8 shared[CURVE25519_POINT_SIZE], const u8 private_key[CURVE25519_POINT_SIZE], const u8 session_key[CURVE25519_POINT_SIZE])
{
- __asm__ __volatile__(
- "cmpq $-19, %0\n"
- "setaeb %%al\n"
- "cmpq $-1, %1\n"
- "setzb %%bl\n"
- "cmpq $-1, %2\n"
- "setzb %%cl\n"
- "leaq 1(%3), %%rdx\n"
- "shrq $63, %%rdx\n"
- "andb %%bl, %%al\n"
- "andb %%dl, %%cl\n"
- "testb %%cl, %%al\n"
- "movl $0, %%eax\n"
- "movl $19, %%ecx\n"
- "cmovnzq %%rcx, %%rax\n"
- "addq %%rax, %0\n"
- "adcq $0, %1\n"
- "adcq $0, %2\n"
- "adcq $0, %3\n"
- "btrq $63, %3\n"
- : "+r"(p[0]), "+r"(p[1]), "+r"(p[2]), "+r"(p[3])
- :
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx");
-}
-
-bool curve25519_precomp_bmi2(u8 shared[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE], const u8 session[CURVE25519_POINT_SIZE])
-{
- __aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519_X64];
- __aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519_X64];
- __aligned(32) u64 workspace[6 * NUM_WORDS_ELTFP25519_X64];
- __aligned(32) u8 private_key[CURVE25519_POINT_SIZE];
- __aligned(32) u8 session_key[CURVE25519_POINT_SIZE];
+ __aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u64 workspace[6 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u8 session[CURVE25519_POINT_SIZE];
+ __aligned(32) u8 private[CURVE25519_POINT_SIZE];
int i = 0, j = 0;
u64 prev = 0;
- u64 *const X1 = (u64 *)session_key;
- u64 *const key = (u64 *)private_key;
+ u64 *const X1 = (u64 *)session;
+ u64 *const key = (u64 *)private;
u64 *const Px = coordinates + 0;
u64 *const Pz = coordinates + 4;
u64 *const Qx = coordinates + 8;
@@ -1467,24 +1694,25 @@ bool curve25519_precomp_bmi2(u8 shared[CURVE25519_POINT_SIZE], const u8 private[
u64 *const buffer_1w = buffer;
u64 *const buffer_2w = buffer;
- memcpy(session_key, session, sizeof(session_key));
- memcpy(private_key, private, sizeof(private_key));
- normalize_secret(private_key);
+ memcpy(private, private_key, sizeof(private));
+ memcpy(session, session_key, sizeof(session));
+
+ normalize_secret(private);
- /* As in the draft:
+ /*
+ * As in the draft:
* When receiving such an array, implementations of curve25519
* MUST mask the most-significant bit in the final byte. This
* is done to preserve compatibility with point formats which
* reserve the sign bit for use in other protocols and to
* increase resistance to implementation fingerprinting
*/
- session_key[CURVE25519_POINT_SIZE - 1] &= (1 << (255 % 8)) - 1;
- reduce_point_mod_2_255_19((u64 *)session_key);
- copy_EltFp25519_1w_x64(Px, (u64 *)session_key);
+ session[CURVE25519_POINT_SIZE - 1] &= (1 << (255 % 8)) - 1;
- setzero_EltFp25519_1w_x64(Pz);
- setzero_EltFp25519_1w_x64(Qx);
- setzero_EltFp25519_1w_x64(Qz);
+ copy_eltfp25519_1w(Px, X1);
+ setzero_eltfp25519_1w(Pz);
+ setzero_eltfp25519_1w(Qx);
+ setzero_eltfp25519_1w(Qz);
Pz[0] = 1;
Qx[0] = 1;
@@ -1498,144 +1726,51 @@ bool curve25519_precomp_bmi2(u8 shared[CURVE25519_POINT_SIZE], const u8 private[
u64 swap = bit ^ prev;
prev = bit;
- add_EltFp25519_1w_x64_bmi2(A, X2, Z2); /* A = (X2+Z2) */
- sub_EltFp25519_1w_x64(B, X2, Z2); /* B = (X2-Z2) */
- add_EltFp25519_1w_x64_bmi2(C, X3, Z3); /* C = (X3+Z3) */
- sub_EltFp25519_1w_x64(D, X3, Z3); /* D = (X3-Z3) */
- mul_EltFp25519_2w_x64_bmi2(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
+ add_eltfp25519_1w_adx(A, X2, Z2); /* A = (X2+Z2) */
+ sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
+ add_eltfp25519_1w_adx(C, X3, Z3); /* C = (X3+Z3) */
+ sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
+ mul_eltfp25519_2w_adx(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
- cswap_x64(swap, A, C);
- cswap_x64(swap, B, D);
+ cswap(swap, A, C);
+ cswap(swap, B, D);
- sqr_EltFp25519_2w_x64_bmi2(AB); /* [AA|BB] = [A^2|B^2] */
- add_EltFp25519_1w_x64_bmi2(X3, DA, CB); /* X3 = (DA+CB) */
- sub_EltFp25519_1w_x64(Z3, DA, CB); /* Z3 = (DA-CB) */
- sqr_EltFp25519_2w_x64_bmi2(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
+ sqr_eltfp25519_2w_adx(AB); /* [AA|BB] = [A^2|B^2] */
+ add_eltfp25519_1w_adx(X3, DA, CB); /* X3 = (DA+CB) */
+ sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
+ sqr_eltfp25519_2w_adx(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
- copy_EltFp25519_1w_x64(X2, B); /* X2 = B^2 */
- sub_EltFp25519_1w_x64(Z2, A, B); /* Z2 = E = AA-BB */
- mul_a24_EltFp25519_1w_x64(B, Z2); /* B = a24*E */
- add_EltFp25519_1w_x64_bmi2(B, B, X2); /* B = a24*E+B */
- mul_EltFp25519_2w_x64_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
- mul_EltFp25519_1w_x64_bmi2(Z3, Z3, X1); /* Z3 = Z3*X1 */
+ copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
+ sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
+ mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
+ add_eltfp25519_1w_adx(B, B, X2); /* B = a24*E+B */
+ mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
+ mul_eltfp25519_1w_adx(Z3, Z3, X1); /* Z3 = Z3*X1 */
--j;
}
j = 63;
}
- inv_EltFp25519_1w_x64_bmi2(A, Qz);
- mul_EltFp25519_1w_x64_bmi2((u64 *)shared, Qx, A);
- fred_EltFp25519_1w_x64((u64 *)shared);
+ inv_eltfp25519_1w_adx(A, Qz);
+ mul_eltfp25519_1w_adx((u64 *)shared, Qx, A);
+ fred_eltfp25519_1w((u64 *)shared);
return true;
}
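
In both ladder loops, "u64 swap = bit ^ prev" implements a deferred conditional swap: rather than swapping the working points before each ladder step and swapping them back afterwards, the loop swaps only when the current key bit differs from the previous one, halving the cswap traffic. Schematically (uint64_t standing in for u64, field arithmetic elided, ladder_sketch and its loop bounds are illustrative, not the exact loop structure above):

	/* Deferred conditional swap driving a Montgomery ladder;
	 * cswap is the helper defined earlier. Sketch only. */
	static void ladder_sketch(const uint64_t key[4], uint64_t P[8], uint64_t Q[8])
	{
		uint64_t prev = 0;
		int i;

		for (i = 254; i >= 0; --i) {
			uint64_t bit = (key[i / 64] >> (i % 64)) & 1;

			cswap(bit ^ prev, P, Q);          /* X coordinates */
			cswap(bit ^ prev, P + 4, Q + 4);  /* Z coordinates */
			prev = bit;
			/* ... one Montgomery double-and-add step on (P, Q) ... */
		}
		cswap(prev, P, Q);                        /* settle the last swap */
		cswap(prev, P + 4, Q + 4);
	}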
-bool curve25519_precomp_bmi2_base(u8 session_key[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE])
+bool curve25519_precomp_bmi2(u8 shared[CURVE25519_POINT_SIZE], const u8 private_key[CURVE25519_POINT_SIZE], const u8 session_key[CURVE25519_POINT_SIZE])
{
- __aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519_X64];
- __aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519_X64];
- __aligned(32) u64 workspace[4 * NUM_WORDS_ELTFP25519_X64];
- __aligned(32) u8 private_key[CURVE25519_POINT_SIZE];
-
- int i = 0, j = 0, k = 0;
- u64 *const key = (u64 *)private_key;
- u64 *const Ur1 = coordinates + 0;
- u64 *const Zr1 = coordinates + 4;
- u64 *const Ur2 = coordinates + 8;
- u64 *const Zr2 = coordinates + 12;
-
- u64 *const UZr1 = coordinates + 0;
- u64 *const ZUr2 = coordinates + 8;
-
- u64 *const A = workspace + 0;
- u64 *const B = workspace + 4;
- u64 *const C = workspace + 8;
- u64 *const D = workspace + 12;
-
- u64 *const AB = workspace + 0;
- u64 *const CD = workspace + 8;
-
- u64 *const buffer_1w = buffer;
- u64 *const buffer_2w = buffer;
- u64 *P = (u64 *)Table_Ladder_8k;
-
- const int ite[4] = { 64, 64, 64, 63 };
- const int q = 3;
- u64 swap = 1;
-
- memcpy(private_key, private, sizeof(private_key));
- normalize_secret(private_key);
-
- setzero_EltFp25519_1w_x64(Ur1);
- setzero_EltFp25519_1w_x64(Zr1);
- setzero_EltFp25519_1w_x64(Zr2);
- Ur1[0] = 1;
- Zr1[0] = 1;
- Zr2[0] = 1;
-
- /* G-S */
- Ur2[3] = 0x1eaecdeee27cab34ULL;
- Ur2[2] = 0xadc7a0b9235d48e2ULL;
- Ur2[1] = 0xbbf095ae14b2edf8ULL;
- Ur2[0] = 0x7e94e1fec82faabdULL;
-
- /* main-loop */
- j = q;
- for (i = 0; i < NUM_WORDS_ELTFP25519_X64; ++i) {
- while (j < ite[i]) {
- u64 bit;
- k = (64 * i + j - q);
- bit = (key[i] >> j) & 0x1;
- swap = swap ^ bit;
- cswap_x64(swap, Ur1, Ur2);
- cswap_x64(swap, Zr1, Zr2);
- swap = bit;
- /* Addition */
- sub_EltFp25519_1w_x64(B, Ur1, Zr1); /* B = Ur1-Zr1 */
- add_EltFp25519_1w_x64_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
- mul_EltFp25519_1w_x64_bmi2(C, &P[4 * k], B); /* C = M0-B */
- sub_EltFp25519_1w_x64(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
- add_EltFp25519_1w_x64_bmi2(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
- sqr_EltFp25519_2w_x64_bmi2(AB); /* A = A^2 | B = B^2 */
- mul_EltFp25519_2w_x64_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
- ++j;
- }
- j = 0;
- }
-
- /* Doubling */
- for (i = 0; i < q; ++i) {
- add_EltFp25519_1w_x64_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
- sub_EltFp25519_1w_x64(B, Ur1, Zr1); /* B = Ur1-Zr1 */
- sqr_EltFp25519_2w_x64_bmi2(AB); /* A = A**2 B = B**2 */
- copy_EltFp25519_1w_x64(C, B); /* C = B */
- sub_EltFp25519_1w_x64(B, A, B); /* B = A-B */
- mul_a24_EltFp25519_1w_x64(D, B); /* D = my_a24*B */
- add_EltFp25519_1w_x64_bmi2(D, D, C); /* D = D+C */
- mul_EltFp25519_2w_x64_bmi2(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
- }
-
- /* Convert to affine coordinates */
- inv_EltFp25519_1w_x64_bmi2(A, Zr1);
- mul_EltFp25519_1w_x64_bmi2((u64 *)session_key, Ur1, A);
- fred_EltFp25519_1w_x64((u64 *)session_key);
-
- return true;
-}
-
-bool curve25519_precomp_adx(u8 shared[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE], const u8 session[CURVE25519_POINT_SIZE])
-{
- __aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519_X64];
- __aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519_X64];
- __aligned(32) u64 workspace[6 * NUM_WORDS_ELTFP25519_X64];
- __aligned(32) u8 private_key[CURVE25519_POINT_SIZE];
- __aligned(32) u8 session_key[CURVE25519_POINT_SIZE];
+ __aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u64 workspace[6 * NUM_WORDS_ELTFP25519];
+ __aligned(32) u8 session[CURVE25519_POINT_SIZE];
+ __aligned(32) u8 private[CURVE25519_POINT_SIZE];
int i = 0, j = 0;
u64 prev = 0;
- u64 *const X1 = (u64 *)session_key;
- u64 *const key = (u64 *)private_key;
+ u64 *const X1 = (u64 *)session;
+ u64 *const key = (u64 *)private;
u64 *const Px = coordinates + 0;
u64 *const Pz = coordinates + 4;
u64 *const Qx = coordinates + 8;
@@ -1659,24 +1794,25 @@ bool curve25519_precomp_adx(u8 shared[CURVE25519_POINT_SIZE], const u8 private[C
u64 *const buffer_1w = buffer;
u64 *const buffer_2w = buffer;
- memcpy(session_key, session, sizeof(session_key));
- memcpy(private_key, private, sizeof(private_key));
- normalize_secret(private_key);
+ memcpy(private, private_key, sizeof(private));
+ memcpy(session, session_key, sizeof(session));
+
+ normalize_secret(private);
- /* As in the draft:
+ /*
+ * As in the draft:
* When receiving such an array, implementations of curve25519
* MUST mask the most-significant bit in the final byte. This
* is done to preserve compatibility with point formats which
* reserve the sign bit for use in other protocols and to
* increase resistance to implementation fingerprinting
*/
- session_key[CURVE25519_POINT_SIZE - 1] &= (1 << (255 % 8)) - 1;
- reduce_point_mod_2_255_19((u64 *)session_key);
- copy_EltFp25519_1w_x64(Px, (u64 *)session_key);
+ session[CURVE25519_POINT_SIZE - 1] &= (1 << (255 % 8)) - 1;
- setzero_EltFp25519_1w_x64(Pz);
- setzero_EltFp25519_1w_x64(Qx);
- setzero_EltFp25519_1w_x64(Qz);
+ copy_eltfp25519_1w(Px, X1);
+ setzero_eltfp25519_1w(Pz);
+ setzero_eltfp25519_1w(Qx);
+ setzero_eltfp25519_1w(Qz);
Pz[0] = 1;
Qx[0] = 1;
@@ -1690,128 +1826,35 @@ bool curve25519_precomp_adx(u8 shared[CURVE25519_POINT_SIZE], const u8 private[C
u64 swap = bit ^ prev;
prev = bit;
- add_EltFp25519_1w_x64_adx(A, X2, Z2); /* A = (X2+Z2) */
- sub_EltFp25519_1w_x64(B, X2, Z2); /* B = (X2-Z2) */
- add_EltFp25519_1w_x64_adx(C, X3, Z3); /* C = (X3+Z3) */
- sub_EltFp25519_1w_x64(D, X3, Z3); /* D = (X3-Z3) */
- mul_EltFp25519_2w_x64_adx(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
+ add_eltfp25519_1w_bmi2(A, X2, Z2); /* A = (X2+Z2) */
+ sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
+ add_eltfp25519_1w_bmi2(C, X3, Z3); /* C = (X3+Z3) */
+ sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
+ mul_eltfp25519_2w_bmi2(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
- cswap_x64(swap, A, C);
- cswap_x64(swap, B, D);
+ cswap(swap, A, C);
+ cswap(swap, B, D);
- sqr_EltFp25519_2w_x64_adx(AB); /* [AA|BB] = [A^2|B^2] */
- add_EltFp25519_1w_x64_adx(X3, DA, CB); /* X3 = (DA+CB) */
- sub_EltFp25519_1w_x64(Z3, DA, CB); /* Z3 = (DA-CB) */
- sqr_EltFp25519_2w_x64_adx(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
+ sqr_eltfp25519_2w_bmi2(AB); /* [AA|BB] = [A^2|B^2] */
+ add_eltfp25519_1w_bmi2(X3, DA, CB); /* X3 = (DA+CB) */
+ sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
+ sqr_eltfp25519_2w_bmi2(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
- copy_EltFp25519_1w_x64(X2, B); /* X2 = B^2 */
- sub_EltFp25519_1w_x64(Z2, A, B); /* Z2 = E = AA-BB */
- mul_a24_EltFp25519_1w_x64(B, Z2); /* B = a24*E */
- add_EltFp25519_1w_x64_adx(B, B, X2); /* B = a24*E+B */
- mul_EltFp25519_2w_x64_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
- mul_EltFp25519_1w_x64_adx(Z3, Z3, X1); /* Z3 = Z3*X1 */
+ copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
+ sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
+ mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
+ add_eltfp25519_1w_bmi2(B, B, X2); /* B = a24*E+B */
+ mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
+ mul_eltfp25519_1w_bmi2(Z3, Z3, X1); /* Z3 = Z3*X1 */
--j;
}
j = 63;
}
- inv_EltFp25519_1w_x64_adx(A, Qz);
- mul_EltFp25519_1w_x64_adx((u64 *)shared, Qx, A);
- fred_EltFp25519_1w_x64((u64 *)shared);
-
- return true;
-}
-
-bool curve25519_precomp_adx_base(u8 session_key[CURVE25519_POINT_SIZE], const u8 private[CURVE25519_POINT_SIZE])
-{
- __aligned(32) u64 buffer[4 * NUM_WORDS_ELTFP25519_X64];
- __aligned(32) u64 coordinates[4 * NUM_WORDS_ELTFP25519_X64];
- __aligned(32) u64 workspace[4 * NUM_WORDS_ELTFP25519_X64];
- __aligned(32) u8 private_key[CURVE25519_POINT_SIZE];
-
- int i = 0, j = 0, k = 0;
- u64 *const key = (u64 *)private_key;
- u64 *const Ur1 = coordinates + 0;
- u64 *const Zr1 = coordinates + 4;
- u64 *const Ur2 = coordinates + 8;
- u64 *const Zr2 = coordinates + 12;
-
- u64 *const UZr1 = coordinates + 0;
- u64 *const ZUr2 = coordinates + 8;
-
- u64 *const A = workspace + 0;
- u64 *const B = workspace + 4;
- u64 *const C = workspace + 8;
- u64 *const D = workspace + 12;
-
- u64 *const AB = workspace + 0;
- u64 *const CD = workspace + 8;
-
- u64 *const buffer_1w = buffer;
- u64 *const buffer_2w = buffer;
- u64 *P = (u64 *)Table_Ladder_8k;
-
- const int ite[4] = { 64, 64, 64, 63 };
- const int q = 3;
- u64 swap = 1;
-
- memcpy(private_key, private, sizeof(private_key));
- normalize_secret(private_key);
-
- setzero_EltFp25519_1w_x64(Ur1);
- setzero_EltFp25519_1w_x64(Zr1);
- setzero_EltFp25519_1w_x64(Zr2);
- Ur1[0] = 1;
- Zr1[0] = 1;
- Zr2[0] = 1;
-
- /* G-S */
- Ur2[3] = 0x1eaecdeee27cab34ULL;
- Ur2[2] = 0xadc7a0b9235d48e2ULL;
- Ur2[1] = 0xbbf095ae14b2edf8ULL;
- Ur2[0] = 0x7e94e1fec82faabdULL;
-
- /* main-loop */
- j = q;
- for (i = 0; i < NUM_WORDS_ELTFP25519_X64; ++i) {
- while (j < ite[i]) {
- u64 bit;
- k = (64 * i + j - q);
- bit = (key[i] >> j) & 0x1;
- swap = swap ^ bit;
- cswap_x64(swap, Ur1, Ur2);
- cswap_x64(swap, Zr1, Zr2);
- swap = bit;
- /* Addition */
- sub_EltFp25519_1w_x64(B, Ur1, Zr1); /* B = Ur1-Zr1 */
- add_EltFp25519_1w_x64_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
- mul_EltFp25519_1w_x64_adx(C, &P[4 * k], B); /* C = M0-B */
- sub_EltFp25519_1w_x64(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
- add_EltFp25519_1w_x64_adx(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
- sqr_EltFp25519_2w_x64_adx(AB); /* A = A^2 | B = B^2 */
- mul_EltFp25519_2w_x64_adx(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
- ++j;
- }
- j = 0;
- }
-
- /* Doubling */
- for (i = 0; i < q; ++i) {
- add_EltFp25519_1w_x64_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
- sub_EltFp25519_1w_x64(B, Ur1, Zr1); /* B = Ur1-Zr1 */
- sqr_EltFp25519_2w_x64_adx(AB); /* A = A**2 B = B**2 */
- copy_EltFp25519_1w_x64(C, B); /* C = B */
- sub_EltFp25519_1w_x64(B, A, B); /* B = A-B */
- mul_a24_EltFp25519_1w_x64(D, B); /* D = my_a24*B */
- add_EltFp25519_1w_x64_adx(D, D, C); /* D = D+C */
- mul_EltFp25519_2w_x64_adx(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
- }
-
- /* Convert to affine coordinates */
- inv_EltFp25519_1w_x64_adx(A, Zr1);
- mul_EltFp25519_1w_x64_adx((u64 *)session_key, Ur1, A);
- fred_EltFp25519_1w_x64((u64 *)session_key);
+ inv_eltfp25519_1w_bmi2(A, Qz);
+ mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A);
+ fred_eltfp25519_1w((u64 *)shared);
return true;
}
diff --git a/test_vectors.h b/test_vectors.h
index fa0523d..61067cc 100644
--- a/test_vectors.h
+++ b/test_vectors.h
@@ -45,7 +45,6 @@ static const struct curve25519_test_vector curve25519_test_vectors[] __initconst
.public = { 0xe0, 0xeb, 0x7a, 0x7c, 0x3b, 0x41, 0xb8, 0xae, 0x16, 0x56, 0xe3, 0xfa, 0xf1, 0x9f, 0xc4, 0x6a, 0xda, 0x09, 0x8d, 0xeb, 0x9c, 0x32, 0xb1, 0xfd, 0x86, 0x62, 0x05, 0x16, 0x5f, 0x49, 0xb8 },
.result = { 0 }
},
- /* These currently break precomp, so removing, but obviously this means that precomp is _incorrect_.
{
.private = { 0xff, 0xff, 0xff, 0xff, 0x0a, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
.public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x0a, 0x00, 0xfb, 0x9f },
@@ -55,5 +54,5 @@ static const struct curve25519_test_vector curve25519_test_vectors[] __initconst
.private = { 0x8e, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
.public = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8e, 0x06 },
.result = { 0x5a, 0xdf, 0xaa, 0x25, 0x86, 0x8e, 0x32, 0x3d, 0xae, 0x49, 0x62, 0xc1, 0x01, 0x5c, 0xb3, 0x12, 0xe1, 0xc5, 0xc7, 0x9e, 0x95, 0x3f, 0x03, 0x99, 0xb0, 0xba, 0x16, 0x22, 0xf3, 0xb6, 0xf7, 0x0c }
- }*/
+ }
};