author     Jason A. Donenfeld <Jason@zx2c4.com>  2020-01-21 16:07:52 +0100
committer  Jason A. Donenfeld <Jason@zx2c4.com>  2020-01-21 16:07:52 +0100
commit     55df79c6c08e31f24370f3ddc69e37729b4c5676 (patch)
tree       a1efdab7f7202a86cbda4ece13d9a38e7a722c79
parent     device: skb_list_walk_safe moved upstream (diff)
download   wireguard-linux-compat-55df79c6c08e31f24370f3ddc69e37729b4c5676.tar.xz
           wireguard-linux-compat-55df79c6c08e31f24370f3ddc69e37729b4c5676.zip
curve25519: x86_64: replace with formally verified implementation
This comes from INRIA's HACL*/Vale. It implements the same algorithm and
implementation strategy as the code it replaces, only this code has been
formally verified, sans the base point multiplication, which uses code
similar to prior, only it uses the formally verified field arithmetic
alongside reproducible ladder generation steps. This doesn't have a
pure-bmi2 version, which means Haswell no longer benefits, but the
increased (doubled) code complexity is not worth it for a single
generation of chips that's already old.

Performance-wise, this is around 1% slower on older microarchitectures,
and slightly faster on newer microarchitectures, mainly 10nm ones or
backports of 10nm to 14nm. This implementation is "everest" below:

Xeon E5-2680 v4 (Broadwell)
  armfazh: 133340 cycles per call
  everest: 133436 cycles per call

Xeon Gold 5120 (Sky Lake Server)
  armfazh: 112636 cycles per call
  everest: 113906 cycles per call

Core i5-6300U (Sky Lake Client)
  armfazh: 116810 cycles per call
  everest: 117916 cycles per call

Core i7-7600U (Kaby Lake)
  armfazh: 119523 cycles per call
  everest: 119040 cycles per call

Core i7-8750H (Coffee Lake)
  armfazh: 113914 cycles per call
  everest: 113650 cycles per call

Core i9-9880H (Coffee Lake Refresh)
  armfazh: 112616 cycles per call
  everest: 114082 cycles per call

Core i3-8121U (Cannon Lake)
  armfazh: 113202 cycles per call
  everest: 111382 cycles per call

Core i7-8265U (Whiskey Lake)
  armfazh: 127307 cycles per call
  everest: 127697 cycles per call

Core i7-8550U (Kaby Lake Refresh)
  armfazh: 127522 cycles per call
  everest: 127083 cycles per call

Xeon Platinum 8275CL (Cascade Lake)
  armfazh: 114380 cycles per call
  everest: 114656 cycles per call

Achieving these kinds of results with formally verified code is quite
remarkable, especially considering that performance is favorable for
newer chips.

Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
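The ladder generation mentioned above relies on branch-free masks rather
than secret-dependent branches, in the style of the constant-time helpers
(eq_mask, gte_mask) this patch adds. A minimal standalone sketch of that
idiom, with cswap as a hypothetical stand-in for the patch's actual swap
step, not a line-for-line excerpt:

    #include <stdint.h>
    typedef uint64_t u64;

    /* Hypothetical illustration (not code from this patch): a mask of
     * all ones or all zeros swaps two 4-limb field elements using the
     * exact same instruction sequence either way. */
    static void cswap(u64 mask, u64 p1[4], u64 p2[4])
    {
            int i;
            for (i = 0; i < 4; ++i) {
                    u64 t = mask & (p1[i] ^ p2[i]);
                    p1[i] ^= t;
                    p2[i] ^= t;
            }
    }

    /* In a Montgomery ladder the mask is derived from the current
     * scalar bit, e.g. mask = 0 - (u64)bit, so every iteration runs the
     * same instructions regardless of the secret scalar. */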
-rw-r--r--   src/crypto/zinc/curve25519/curve25519-hacl64.c          17
-rw-r--r--   src/crypto/zinc/curve25519/curve25519-x86_64-glue.c     30
-rw-r--r--   src/crypto/zinc/curve25519/curve25519-x86_64.c        3561
3 files changed, 1300 insertions, 2308 deletions
diff --git a/src/crypto/zinc/curve25519/curve25519-hacl64.c b/src/crypto/zinc/curve25519/curve25519-hacl64.c
index 0f729ec..d6dcd0c 100644
--- a/src/crypto/zinc/curve25519/curve25519-hacl64.c
+++ b/src/crypto/zinc/curve25519/curve25519-hacl64.c
@@ -427,11 +427,6 @@ static __always_inline void fscalar(u64 *output, u64 *b, u64 s)
fproduct_copy_from_wide_(output, tmp);
}
-static __always_inline void fmul(u64 *output, u64 *a, u64 *b)
-{
- fmul_fmul(output, a, b);
-}
-
static __always_inline void crecip(u64 *output, u64 *input)
{
crecip_crecip(output, input);
@@ -498,8 +493,8 @@ static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
memcpy(origxprime0, xprime, 5 * sizeof(*xprime));
fsum(xprime, zprime);
fdifference(zprime, origxprime0);
- fmul(xxprime0, xprime, z);
- fmul(zzprime0, x, zprime);
+ fmul_fmul(xxprime0, xprime, z);
+ fmul_fmul(zzprime0, x, zprime);
origxprime = buf + 5;
{
u64 *xx0;
@@ -517,7 +512,7 @@ static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
fdifference(zzprime, origxprime);
fsquare_fsquare_times(x3, xxprime, 1);
fsquare_fsquare_times(zzzprime, zzprime, 1);
- fmul(z3, zzzprime, qx);
+ fmul_fmul(z3, zzzprime, qx);
fsquare_fsquare_times(xx0, x, 1);
fsquare_fsquare_times(zz0, z, 1);
{
@@ -528,12 +523,12 @@ static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
zzz = buf + 10;
xx = buf + 15;
zz = buf + 20;
- fmul(x2, xx, zz);
+ fmul_fmul(x2, xx, zz);
fdifference(zz, xx);
scalar = 121665;
fscalar(zzz, zz, scalar);
fsum(zzz, xx);
- fmul(z2, zzz, zz);
+ fmul_fmul(z2, zzz, zz);
}
}
}
@@ -748,7 +743,7 @@ static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point)
u64 *zmone = buf;
u64 *sc = buf + 5;
crecip(zmone, z);
- fmul(sc, x, zmone);
+ fmul_fmul(sc, x, zmone);
format_fcontract(scalar, sc);
}
diff --git a/src/crypto/zinc/curve25519/curve25519-x86_64-glue.c b/src/crypto/zinc/curve25519/curve25519-x86_64-glue.c
index d62bd37..e08cc2b 100644
--- a/src/crypto/zinc/curve25519/curve25519-x86_64-glue.c
+++ b/src/crypto/zinc/curve25519/curve25519-x86_64-glue.c
@@ -8,29 +8,25 @@
#include "curve25519-x86_64.c"
-static bool curve25519_use_bmi2 __ro_after_init;
-static bool curve25519_use_adx __ro_after_init;
+static bool curve25519_use_bmi2_adx __ro_after_init;
static bool *const curve25519_nobs[] __initconst = {
- &curve25519_use_bmi2, &curve25519_use_adx };
+ &curve25519_use_bmi2_adx };
static void __init curve25519_fpu_init(void)
{
- curve25519_use_bmi2 = IS_ENABLED(CONFIG_AS_BMI2) &&
- boot_cpu_has(X86_FEATURE_BMI2);
- curve25519_use_adx = IS_ENABLED(CONFIG_AS_ADX) &&
- boot_cpu_has(X86_FEATURE_BMI2) &&
- boot_cpu_has(X86_FEATURE_ADX);
+ curve25519_use_bmi2_adx = IS_ENABLED(CONFIG_AS_BMI2) &&
+ IS_ENABLED(CONFIG_AS_ADX) &&
+ boot_cpu_has(X86_FEATURE_BMI2) &&
+ boot_cpu_has(X86_FEATURE_ADX);
}
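/* Both CPU features are required together: the new ever64 code freely
 * mixes BMI2's mulx with ADX's adcx/adox carry chains, so the separate
 * BMI2-only path is gone. */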
static inline bool curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
const u8 secret[CURVE25519_KEY_SIZE],
const u8 basepoint[CURVE25519_KEY_SIZE])
{
- if (IS_ENABLED(CONFIG_AS_ADX) && curve25519_use_adx) {
- curve25519_adx(mypublic, secret, basepoint);
- return true;
- } else if (IS_ENABLED(CONFIG_AS_BMI2) && curve25519_use_bmi2) {
- curve25519_bmi2(mypublic, secret, basepoint);
+ if (IS_ENABLED(CONFIG_AS_ADX) && IS_ENABLED(CONFIG_AS_BMI2) &&
+ curve25519_use_bmi2_adx) {
+ curve25519_ever64(mypublic, secret, basepoint);
return true;
}
return false;
@@ -39,11 +35,9 @@ static inline bool curve25519_arch(u8 mypublic[CURVE25519_KEY_SIZE],
static inline bool curve25519_base_arch(u8 pub[CURVE25519_KEY_SIZE],
const u8 secret[CURVE25519_KEY_SIZE])
{
- if (IS_ENABLED(CONFIG_AS_ADX) && curve25519_use_adx) {
- curve25519_adx_base(pub, secret);
- return true;
- } else if (IS_ENABLED(CONFIG_AS_BMI2) && curve25519_use_bmi2) {
- curve25519_bmi2_base(pub, secret);
+ if (IS_ENABLED(CONFIG_AS_ADX) && IS_ENABLED(CONFIG_AS_BMI2) &&
+ curve25519_use_bmi2_adx) {
+ curve25519_ever64_base(pub, secret);
return true;
}
return false;
diff --git a/src/crypto/zinc/curve25519/curve25519-x86_64.c b/src/crypto/zinc/curve25519/curve25519-x86_64.c
index e8af3e2..74570cb 100644
--- a/src/crypto/zinc/curve25519/curve25519-x86_64.c
+++ b/src/crypto/zinc/curve25519/curve25519-x86_64.c
@@ -1,2366 +1,1369 @@
-// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
+// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
- * Copyright (c) 2017 Armando Faz <armfazh@ic.unicamp.br>. All Rights Reserved.
- * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- * Copyright (C) 2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
+ * Copyright (C) 2020 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
+ * Copyright (c) 2016-2020 INRIA, CMU and Microsoft Corporation
*/
-enum { NUM_WORDS_ELTFP25519 = 4 };
-typedef __aligned(32) u64 eltfp25519_1w[NUM_WORDS_ELTFP25519];
-typedef __aligned(32) u64 eltfp25519_1w_buffer[2 * NUM_WORDS_ELTFP25519];
-
-#define mul_eltfp25519_1w_adx(c, a, b) do { \
- mul_256x256_integer_adx(m.buffer, a, b); \
- red_eltfp25519_1w_adx(c, m.buffer); \
-} while (0)
-
-#define mul_eltfp25519_1w_bmi2(c, a, b) do { \
- mul_256x256_integer_bmi2(m.buffer, a, b); \
- red_eltfp25519_1w_bmi2(c, m.buffer); \
-} while (0)
-
-#define sqr_eltfp25519_1w_adx(a) do { \
- sqr_256x256_integer_adx(m.buffer, a); \
- red_eltfp25519_1w_adx(a, m.buffer); \
-} while (0)
-
-#define sqr_eltfp25519_1w_bmi2(a) do { \
- sqr_256x256_integer_bmi2(m.buffer, a); \
- red_eltfp25519_1w_bmi2(a, m.buffer); \
-} while (0)
-
-#define mul_eltfp25519_2w_adx(c, a, b) do { \
- mul2_256x256_integer_adx(m.buffer, a, b); \
- red_eltfp25519_2w_adx(c, m.buffer); \
-} while (0)
-
-#define mul_eltfp25519_2w_bmi2(c, a, b) do { \
- mul2_256x256_integer_bmi2(m.buffer, a, b); \
- red_eltfp25519_2w_bmi2(c, m.buffer); \
-} while (0)
-
-#define sqr_eltfp25519_2w_adx(a) do { \
- sqr2_256x256_integer_adx(m.buffer, a); \
- red_eltfp25519_2w_adx(a, m.buffer); \
-} while (0)
-
-#define sqr_eltfp25519_2w_bmi2(a) do { \
- sqr2_256x256_integer_bmi2(m.buffer, a); \
- red_eltfp25519_2w_bmi2(a, m.buffer); \
-} while (0)
-
-#define sqrn_eltfp25519_1w_adx(a, times) do { \
- int ____counter = (times); \
- while (____counter-- > 0) \
- sqr_eltfp25519_1w_adx(a); \
-} while (0)
-
-#define sqrn_eltfp25519_1w_bmi2(a, times) do { \
- int ____counter = (times); \
- while (____counter-- > 0) \
- sqr_eltfp25519_1w_bmi2(a); \
-} while (0)
-
-#define copy_eltfp25519_1w(C, A) do { \
- (C)[0] = (A)[0]; \
- (C)[1] = (A)[1]; \
- (C)[2] = (A)[2]; \
- (C)[3] = (A)[3]; \
-} while (0)
-
-#define setzero_eltfp25519_1w(C) do { \
- (C)[0] = 0; \
- (C)[1] = 0; \
- (C)[2] = 0; \
- (C)[3] = 0; \
-} while (0)
-
-__aligned(32) static const u64 table_ladder_8k[252 * NUM_WORDS_ELTFP25519] = {
- /* 1 */ 0xfffffffffffffff3UL, 0xffffffffffffffffUL,
- 0xffffffffffffffffUL, 0x5fffffffffffffffUL,
- /* 2 */ 0x6b8220f416aafe96UL, 0x82ebeb2b4f566a34UL,
- 0xd5a9a5b075a5950fUL, 0x5142b2cf4b2488f4UL,
- /* 3 */ 0x6aaebc750069680cUL, 0x89cf7820a0f99c41UL,
- 0x2a58d9183b56d0f4UL, 0x4b5aca80e36011a4UL,
- /* 4 */ 0x329132348c29745dUL, 0xf4a2e616e1642fd7UL,
- 0x1e45bb03ff67bc34UL, 0x306912d0f42a9b4aUL,
- /* 5 */ 0xff886507e6af7154UL, 0x04f50e13dfeec82fUL,
- 0xaa512fe82abab5ceUL, 0x174e251a68d5f222UL,
- /* 6 */ 0xcf96700d82028898UL, 0x1743e3370a2c02c5UL,
- 0x379eec98b4e86eaaUL, 0x0c59888a51e0482eUL,
- /* 7 */ 0xfbcbf1d699b5d189UL, 0xacaef0d58e9fdc84UL,
- 0xc1c20d06231f7614UL, 0x2938218da274f972UL,
- /* 8 */ 0xf6af49beff1d7f18UL, 0xcc541c22387ac9c2UL,
- 0x96fcc9ef4015c56bUL, 0x69c1627c690913a9UL,
- /* 9 */ 0x7a86fd2f4733db0eUL, 0xfdb8c4f29e087de9UL,
- 0x095e4b1a8ea2a229UL, 0x1ad7a7c829b37a79UL,
- /* 10 */ 0x342d89cad17ea0c0UL, 0x67bedda6cced2051UL,
- 0x19ca31bf2bb42f74UL, 0x3df7b4c84980acbbUL,
- /* 11 */ 0xa8c6444dc80ad883UL, 0xb91e440366e3ab85UL,
- 0xc215cda00164f6d8UL, 0x3d867c6ef247e668UL,
- /* 12 */ 0xc7dd582bcc3e658cUL, 0xfd2c4748ee0e5528UL,
- 0xa0fd9b95cc9f4f71UL, 0x7529d871b0675ddfUL,
- /* 13 */ 0xb8f568b42d3cbd78UL, 0x1233011b91f3da82UL,
- 0x2dce6ccd4a7c3b62UL, 0x75e7fc8e9e498603UL,
- /* 14 */ 0x2f4f13f1fcd0b6ecUL, 0xf1a8ca1f29ff7a45UL,
- 0xc249c1a72981e29bUL, 0x6ebe0dbb8c83b56aUL,
- /* 15 */ 0x7114fa8d170bb222UL, 0x65a2dcd5bf93935fUL,
- 0xbdc41f68b59c979aUL, 0x2f0eef79a2ce9289UL,
- /* 16 */ 0x42ecbf0c083c37ceUL, 0x2930bc09ec496322UL,
- 0xf294b0c19cfeac0dUL, 0x3780aa4bedfabb80UL,
- /* 17 */ 0x56c17d3e7cead929UL, 0xe7cb4beb2e5722c5UL,
- 0x0ce931732dbfe15aUL, 0x41b883c7621052f8UL,
- /* 18 */ 0xdbf75ca0c3d25350UL, 0x2936be086eb1e351UL,
- 0xc936e03cb4a9b212UL, 0x1d45bf82322225aaUL,
- /* 19 */ 0xe81ab1036a024cc5UL, 0xe212201c304c9a72UL,
- 0xc5d73fba6832b1fcUL, 0x20ffdb5a4d839581UL,
- /* 20 */ 0xa283d367be5d0fadUL, 0x6c2b25ca8b164475UL,
- 0x9d4935467caaf22eUL, 0x5166408eee85ff49UL,
- /* 21 */ 0x3c67baa2fab4e361UL, 0xb3e433c67ef35cefUL,
- 0x5259729241159b1cUL, 0x6a621892d5b0ab33UL,
- /* 22 */ 0x20b74a387555cdcbUL, 0x532aa10e1208923fUL,
- 0xeaa17b7762281dd1UL, 0x61ab3443f05c44bfUL,
- /* 23 */ 0x257a6c422324def8UL, 0x131c6c1017e3cf7fUL,
- 0x23758739f630a257UL, 0x295a407a01a78580UL,
- /* 24 */ 0xf8c443246d5da8d9UL, 0x19d775450c52fa5dUL,
- 0x2afcfc92731bf83dUL, 0x7d10c8e81b2b4700UL,
- /* 25 */ 0xc8e0271f70baa20bUL, 0x993748867ca63957UL,
- 0x5412efb3cb7ed4bbUL, 0x3196d36173e62975UL,
- /* 26 */ 0xde5bcad141c7dffcUL, 0x47cc8cd2b395c848UL,
- 0xa34cd942e11af3cbUL, 0x0256dbf2d04ecec2UL,
- /* 27 */ 0x875ab7e94b0e667fUL, 0xcad4dd83c0850d10UL,
- 0x47f12e8f4e72c79fUL, 0x5f1a87bb8c85b19bUL,
- /* 28 */ 0x7ae9d0b6437f51b8UL, 0x12c7ce5518879065UL,
- 0x2ade09fe5cf77aeeUL, 0x23a05a2f7d2c5627UL,
- /* 29 */ 0x5908e128f17c169aUL, 0xf77498dd8ad0852dUL,
- 0x74b4c4ceab102f64UL, 0x183abadd10139845UL,
- /* 30 */ 0xb165ba8daa92aaacUL, 0xd5c5ef9599386705UL,
- 0xbe2f8f0cf8fc40d1UL, 0x2701e635ee204514UL,
- /* 31 */ 0x629fa80020156514UL, 0xf223868764a8c1ceUL,
- 0x5b894fff0b3f060eUL, 0x60d9944cf708a3faUL,
- /* 32 */ 0xaeea001a1c7a201fUL, 0xebf16a633ee2ce63UL,
- 0x6f7709594c7a07e1UL, 0x79b958150d0208cbUL,
- /* 33 */ 0x24b55e5301d410e7UL, 0xe3a34edff3fdc84dUL,
- 0xd88768e4904032d8UL, 0x131384427b3aaeecUL,
- /* 34 */ 0x8405e51286234f14UL, 0x14dc4739adb4c529UL,
- 0xb8a2b5b250634ffdUL, 0x2fe2a94ad8a7ff93UL,
- /* 35 */ 0xec5c57efe843faddUL, 0x2843ce40f0bb9918UL,
- 0xa4b561d6cf3d6305UL, 0x743629bde8fb777eUL,
- /* 36 */ 0x343edd46bbaf738fUL, 0xed981828b101a651UL,
- 0xa401760b882c797aUL, 0x1fc223e28dc88730UL,
- /* 37 */ 0x48604e91fc0fba0eUL, 0xb637f78f052c6fa4UL,
- 0x91ccac3d09e9239cUL, 0x23f7eed4437a687cUL,
- /* 38 */ 0x5173b1118d9bd800UL, 0x29d641b63189d4a7UL,
- 0xfdbf177988bbc586UL, 0x2959894fcad81df5UL,
- /* 39 */ 0xaebc8ef3b4bbc899UL, 0x4148995ab26992b9UL,
- 0x24e20b0134f92cfbUL, 0x40d158894a05dee8UL,
- /* 40 */ 0x46b00b1185af76f6UL, 0x26bac77873187a79UL,
- 0x3dc0bf95ab8fff5fUL, 0x2a608bd8945524d7UL,
- /* 41 */ 0x26449588bd446302UL, 0x7c4bc21c0388439cUL,
- 0x8e98a4f383bd11b2UL, 0x26218d7bc9d876b9UL,
- /* 42 */ 0xe3081542997c178aUL, 0x3c2d29a86fb6606fUL,
- 0x5c217736fa279374UL, 0x7dde05734afeb1faUL,
- /* 43 */ 0x3bf10e3906d42babUL, 0xe4f7803e1980649cUL,
- 0xe6053bf89595bf7aUL, 0x394faf38da245530UL,
- /* 44 */ 0x7a8efb58896928f4UL, 0xfbc778e9cc6a113cUL,
- 0x72670ce330af596fUL, 0x48f222a81d3d6cf7UL,
- /* 45 */ 0xf01fce410d72caa7UL, 0x5a20ecc7213b5595UL,
- 0x7bc21165c1fa1483UL, 0x07f89ae31da8a741UL,
- /* 46 */ 0x05d2c2b4c6830ff9UL, 0xd43e330fc6316293UL,
- 0xa5a5590a96d3a904UL, 0x705edb91a65333b6UL,
- /* 47 */ 0x048ee15e0bb9a5f7UL, 0x3240cfca9e0aaf5dUL,
- 0x8f4b71ceedc4a40bUL, 0x621c0da3de544a6dUL,
- /* 48 */ 0x92872836a08c4091UL, 0xce8375b010c91445UL,
- 0x8a72eb524f276394UL, 0x2667fcfa7ec83635UL,
- /* 49 */ 0x7f4c173345e8752aUL, 0x061b47feee7079a5UL,
- 0x25dd9afa9f86ff34UL, 0x3780cef5425dc89cUL,
- /* 50 */ 0x1a46035a513bb4e9UL, 0x3e1ef379ac575adaUL,
- 0xc78c5f1c5fa24b50UL, 0x321a967634fd9f22UL,
- /* 51 */ 0x946707b8826e27faUL, 0x3dca84d64c506fd0UL,
- 0xc189218075e91436UL, 0x6d9284169b3b8484UL,
- /* 52 */ 0x3a67e840383f2ddfUL, 0x33eec9a30c4f9b75UL,
- 0x3ec7c86fa783ef47UL, 0x26ec449fbac9fbc4UL,
- /* 53 */ 0x5c0f38cba09b9e7dUL, 0x81168cc762a3478cUL,
- 0x3e23b0d306fc121cUL, 0x5a238aa0a5efdcddUL,
- /* 54 */ 0x1ba26121c4ea43ffUL, 0x36f8c77f7c8832b5UL,
- 0x88fbea0b0adcf99aUL, 0x5ca9938ec25bebf9UL,
- /* 55 */ 0xd5436a5e51fccda0UL, 0x1dbc4797c2cd893bUL,
- 0x19346a65d3224a08UL, 0x0f5034e49b9af466UL,
- /* 56 */ 0xf23c3967a1e0b96eUL, 0xe58b08fa867a4d88UL,
- 0xfb2fabc6a7341679UL, 0x2a75381eb6026946UL,
- /* 57 */ 0xc80a3be4c19420acUL, 0x66b1f6c681f2b6dcUL,
- 0x7cf7036761e93388UL, 0x25abbbd8a660a4c4UL,
- /* 58 */ 0x91ea12ba14fd5198UL, 0x684950fc4a3cffa9UL,
- 0xf826842130f5ad28UL, 0x3ea988f75301a441UL,
- /* 59 */ 0xc978109a695f8c6fUL, 0x1746eb4a0530c3f3UL,
- 0x444d6d77b4459995UL, 0x75952b8c054e5cc7UL,
- /* 60 */ 0xa3703f7915f4d6aaUL, 0x66c346202f2647d8UL,
- 0xd01469df811d644bUL, 0x77fea47d81a5d71fUL,
- /* 61 */ 0xc5e9529ef57ca381UL, 0x6eeeb4b9ce2f881aUL,
- 0xb6e91a28e8009bd6UL, 0x4b80be3e9afc3fecUL,
- /* 62 */ 0x7e3773c526aed2c5UL, 0x1b4afcb453c9a49dUL,
- 0xa920bdd7baffb24dUL, 0x7c54699f122d400eUL,
- /* 63 */ 0xef46c8e14fa94bc8UL, 0xe0b074ce2952ed5eUL,
- 0xbea450e1dbd885d5UL, 0x61b68649320f712cUL,
- /* 64 */ 0x8a485f7309ccbdd1UL, 0xbd06320d7d4d1a2dUL,
- 0x25232973322dbef4UL, 0x445dc4758c17f770UL,
- /* 65 */ 0xdb0434177cc8933cUL, 0xed6fe82175ea059fUL,
- 0x1efebefdc053db34UL, 0x4adbe867c65daf99UL,
- /* 66 */ 0x3acd71a2a90609dfUL, 0xe5e991856dd04050UL,
- 0x1ec69b688157c23cUL, 0x697427f6885cfe4dUL,
- /* 67 */ 0xd7be7b9b65e1a851UL, 0xa03d28d522c536ddUL,
- 0x28399d658fd2b645UL, 0x49e5b7e17c2641e1UL,
- /* 68 */ 0x6f8c3a98700457a4UL, 0x5078f0a25ebb6778UL,
- 0xd13c3ccbc382960fUL, 0x2e003258a7df84b1UL,
- /* 69 */ 0x8ad1f39be6296a1cUL, 0xc1eeaa652a5fbfb2UL,
- 0x33ee0673fd26f3cbUL, 0x59256173a69d2cccUL,
- /* 70 */ 0x41ea07aa4e18fc41UL, 0xd9fc19527c87a51eUL,
- 0xbdaacb805831ca6fUL, 0x445b652dc916694fUL,
- /* 71 */ 0xce92a3a7f2172315UL, 0x1edc282de11b9964UL,
- 0xa1823aafe04c314aUL, 0x790a2d94437cf586UL,
- /* 72 */ 0x71c447fb93f6e009UL, 0x8922a56722845276UL,
- 0xbf70903b204f5169UL, 0x2f7a89891ba319feUL,
- /* 73 */ 0x02a08eb577e2140cUL, 0xed9a4ed4427bdcf4UL,
- 0x5253ec44e4323cd1UL, 0x3e88363c14e9355bUL,
- /* 74 */ 0xaa66c14277110b8cUL, 0x1ae0391610a23390UL,
- 0x2030bd12c93fc2a2UL, 0x3ee141579555c7abUL,
- /* 75 */ 0x9214de3a6d6e7d41UL, 0x3ccdd88607f17efeUL,
- 0x674f1288f8e11217UL, 0x5682250f329f93d0UL,
- /* 76 */ 0x6cf00b136d2e396eUL, 0x6e4cf86f1014debfUL,
- 0x5930b1b5bfcc4e83UL, 0x047069b48aba16b6UL,
- /* 77 */ 0x0d4ce4ab69b20793UL, 0xb24db91a97d0fb9eUL,
- 0xcdfa50f54e00d01dUL, 0x221b1085368bddb5UL,
- /* 78 */ 0xe7e59468b1e3d8d2UL, 0x53c56563bd122f93UL,
- 0xeee8a903e0663f09UL, 0x61efa662cbbe3d42UL,
- /* 79 */ 0x2cf8ddddde6eab2aUL, 0x9bf80ad51435f231UL,
- 0x5deadacec9f04973UL, 0x29275b5d41d29b27UL,
- /* 80 */ 0xcfde0f0895ebf14fUL, 0xb9aab96b054905a7UL,
- 0xcae80dd9a1c420fdUL, 0x0a63bf2f1673bbc7UL,
- /* 81 */ 0x092f6e11958fbc8cUL, 0x672a81e804822fadUL,
- 0xcac8351560d52517UL, 0x6f3f7722c8f192f8UL,
- /* 82 */ 0xf8ba90ccc2e894b7UL, 0x2c7557a438ff9f0dUL,
- 0x894d1d855ae52359UL, 0x68e122157b743d69UL,
- /* 83 */ 0xd87e5570cfb919f3UL, 0x3f2cdecd95798db9UL,
- 0x2121154710c0a2ceUL, 0x3c66a115246dc5b2UL,
- /* 84 */ 0xcbedc562294ecb72UL, 0xba7143c36a280b16UL,
- 0x9610c2efd4078b67UL, 0x6144735d946a4b1eUL,
- /* 85 */ 0x536f111ed75b3350UL, 0x0211db8c2041d81bUL,
- 0xf93cb1000e10413cUL, 0x149dfd3c039e8876UL,
- /* 86 */ 0xd479dde46b63155bUL, 0xb66e15e93c837976UL,
- 0xdafde43b1f13e038UL, 0x5fafda1a2e4b0b35UL,
- /* 87 */ 0x3600bbdf17197581UL, 0x3972050bbe3cd2c2UL,
- 0x5938906dbdd5be86UL, 0x34fce5e43f9b860fUL,
- /* 88 */ 0x75a8a4cd42d14d02UL, 0x828dabc53441df65UL,
- 0x33dcabedd2e131d3UL, 0x3ebad76fb814d25fUL,
- /* 89 */ 0xd4906f566f70e10fUL, 0x5d12f7aa51690f5aUL,
- 0x45adb16e76cefcf2UL, 0x01f768aead232999UL,
- /* 90 */ 0x2b6cc77b6248febdUL, 0x3cd30628ec3aaffdUL,
- 0xce1c0b80d4ef486aUL, 0x4c3bff2ea6f66c23UL,
- /* 91 */ 0x3f2ec4094aeaeb5fUL, 0x61b19b286e372ca7UL,
- 0x5eefa966de2a701dUL, 0x23b20565de55e3efUL,
- /* 92 */ 0xe301ca5279d58557UL, 0x07b2d4ce27c2874fUL,
- 0xa532cd8a9dcf1d67UL, 0x2a52fee23f2bff56UL,
- /* 93 */ 0x8624efb37cd8663dUL, 0xbbc7ac20ffbd7594UL,
- 0x57b85e9c82d37445UL, 0x7b3052cb86a6ec66UL,
- /* 94 */ 0x3482f0ad2525e91eUL, 0x2cb68043d28edca0UL,
- 0xaf4f6d052e1b003aUL, 0x185f8c2529781b0aUL,
- /* 95 */ 0xaa41de5bd80ce0d6UL, 0x9407b2416853e9d6UL,
- 0x563ec36e357f4c3aUL, 0x4cc4b8dd0e297bceUL,
- /* 96 */ 0xa2fc1a52ffb8730eUL, 0x1811f16e67058e37UL,
- 0x10f9a366cddf4ee1UL, 0x72f4a0c4a0b9f099UL,
- /* 97 */ 0x8c16c06f663f4ea7UL, 0x693b3af74e970fbaUL,
- 0x2102e7f1d69ec345UL, 0x0ba53cbc968a8089UL,
- /* 98 */ 0xca3d9dc7fea15537UL, 0x4c6824bb51536493UL,
- 0xb9886314844006b1UL, 0x40d2a72ab454cc60UL,
- /* 99 */ 0x5936a1b712570975UL, 0x91b9d648debda657UL,
- 0x3344094bb64330eaUL, 0x006ba10d12ee51d0UL,
- /* 100 */ 0x19228468f5de5d58UL, 0x0eb12f4c38cc05b0UL,
- 0xa1039f9dd5601990UL, 0x4502d4ce4fff0e0bUL,
- /* 101 */ 0xeb2054106837c189UL, 0xd0f6544c6dd3b93cUL,
- 0x40727064c416d74fUL, 0x6e15c6114b502ef0UL,
- /* 102 */ 0x4df2a398cfb1a76bUL, 0x11256c7419f2f6b1UL,
- 0x4a497962066e6043UL, 0x705b3aab41355b44UL,
- /* 103 */ 0x365ef536d797b1d8UL, 0x00076bd622ddf0dbUL,
- 0x3bbf33b0e0575a88UL, 0x3777aa05c8e4ca4dUL,
- /* 104 */ 0x392745c85578db5fUL, 0x6fda4149dbae5ae2UL,
- 0xb1f0b00b8adc9867UL, 0x09963437d36f1da3UL,
- /* 105 */ 0x7e824e90a5dc3853UL, 0xccb5f6641f135cbdUL,
- 0x6736d86c87ce8fccUL, 0x625f3ce26604249fUL,
- /* 106 */ 0xaf8ac8059502f63fUL, 0x0c05e70a2e351469UL,
- 0x35292e9c764b6305UL, 0x1a394360c7e23ac3UL,
- /* 107 */ 0xd5c6d53251183264UL, 0x62065abd43c2b74fUL,
- 0xb5fbf5d03b973f9bUL, 0x13a3da3661206e5eUL,
- /* 108 */ 0xc6bd5837725d94e5UL, 0x18e30912205016c5UL,
- 0x2088ce1570033c68UL, 0x7fba1f495c837987UL,
- /* 109 */ 0x5a8c7423f2f9079dUL, 0x1735157b34023fc5UL,
- 0xe4f9b49ad2fab351UL, 0x6691ff72c878e33cUL,
- /* 110 */ 0x122c2adedc5eff3eUL, 0xf8dd4bf1d8956cf4UL,
- 0xeb86205d9e9e5bdaUL, 0x049b92b9d975c743UL,
- /* 111 */ 0xa5379730b0f6c05aUL, 0x72a0ffacc6f3a553UL,
- 0xb0032c34b20dcd6dUL, 0x470e9dbc88d5164aUL,
- /* 112 */ 0xb19cf10ca237c047UL, 0xb65466711f6c81a2UL,
- 0xb3321bd16dd80b43UL, 0x48c14f600c5fbe8eUL,
- /* 113 */ 0x66451c264aa6c803UL, 0xb66e3904a4fa7da6UL,
- 0xd45f19b0b3128395UL, 0x31602627c3c9bc10UL,
- /* 114 */ 0x3120dc4832e4e10dUL, 0xeb20c46756c717f7UL,
- 0x00f52e3f67280294UL, 0x566d4fc14730c509UL,
- /* 115 */ 0x7e3a5d40fd837206UL, 0xc1e926dc7159547aUL,
- 0x216730fba68d6095UL, 0x22e8c3843f69cea7UL,
- /* 116 */ 0x33d074e8930e4b2bUL, 0xb6e4350e84d15816UL,
- 0x5534c26ad6ba2365UL, 0x7773c12f89f1f3f3UL,
- /* 117 */ 0x8cba404da57962aaUL, 0x5b9897a81999ce56UL,
- 0x508e862f121692fcUL, 0x3a81907fa093c291UL,
- /* 118 */ 0x0dded0ff4725a510UL, 0x10d8cc10673fc503UL,
- 0x5b9d151c9f1f4e89UL, 0x32a5c1d5cb09a44cUL,
- /* 119 */ 0x1e0aa442b90541fbUL, 0x5f85eb7cc1b485dbUL,
- 0xbee595ce8a9df2e5UL, 0x25e496c722422236UL,
- /* 120 */ 0x5edf3c46cd0fe5b9UL, 0x34e75a7ed2a43388UL,
- 0xe488de11d761e352UL, 0x0e878a01a085545cUL,
- /* 121 */ 0xba493c77e021bb04UL, 0x2b4d1843c7df899aUL,
- 0x9ea37a487ae80d67UL, 0x67a9958011e41794UL,
- /* 122 */ 0x4b58051a6697b065UL, 0x47e33f7d8d6ba6d4UL,
- 0xbb4da8d483ca46c1UL, 0x68becaa181c2db0dUL,
- /* 123 */ 0x8d8980e90b989aa5UL, 0xf95eb14a2c93c99bUL,
- 0x51c6c7c4796e73a2UL, 0x6e228363b5efb569UL,
- /* 124 */ 0xc6bbc0b02dd624c8UL, 0x777eb47dec8170eeUL,
- 0x3cde15a004cfafa9UL, 0x1dc6bc087160bf9bUL,
- /* 125 */ 0x2e07e043eec34002UL, 0x18e9fc677a68dc7fUL,
- 0xd8da03188bd15b9aUL, 0x48fbc3bb00568253UL,
- /* 126 */ 0x57547d4cfb654ce1UL, 0xd3565b82a058e2adUL,
- 0xf63eaf0bbf154478UL, 0x47531ef114dfbb18UL,
- /* 127 */ 0xe1ec630a4278c587UL, 0x5507d546ca8e83f3UL,
- 0x85e135c63adc0c2bUL, 0x0aa7efa85682844eUL,
- /* 128 */ 0x72691ba8b3e1f615UL, 0x32b4e9701fbe3ffaUL,
- 0x97b6d92e39bb7868UL, 0x2cfe53dea02e39e8UL,
- /* 129 */ 0x687392cd85cd52b0UL, 0x27ff66c910e29831UL,
- 0x97134556a9832d06UL, 0x269bb0360a84f8a0UL,
- /* 130 */ 0x706e55457643f85cUL, 0x3734a48c9b597d1bUL,
- 0x7aee91e8c6efa472UL, 0x5cd6abc198a9d9e0UL,
- /* 131 */ 0x0e04de06cb3ce41aUL, 0xd8c6eb893402e138UL,
- 0x904659bb686e3772UL, 0x7215c371746ba8c8UL,
- /* 132 */ 0xfd12a97eeae4a2d9UL, 0x9514b7516394f2c5UL,
- 0x266fd5809208f294UL, 0x5c847085619a26b9UL,
- /* 133 */ 0x52985410fed694eaUL, 0x3c905b934a2ed254UL,
- 0x10bb47692d3be467UL, 0x063b3d2d69e5e9e1UL,
- /* 134 */ 0x472726eedda57debUL, 0xefb6c4ae10f41891UL,
- 0x2b1641917b307614UL, 0x117c554fc4f45b7cUL,
- /* 135 */ 0xc07cf3118f9d8812UL, 0x01dbd82050017939UL,
- 0xd7e803f4171b2827UL, 0x1015e87487d225eaUL,
- /* 136 */ 0xc58de3fed23acc4dUL, 0x50db91c294a7be2dUL,
- 0x0b94d43d1c9cf457UL, 0x6b1640fa6e37524aUL,
- /* 137 */ 0x692f346c5fda0d09UL, 0x200b1c59fa4d3151UL,
- 0xb8c46f760777a296UL, 0x4b38395f3ffdfbcfUL,
- /* 138 */ 0x18d25e00be54d671UL, 0x60d50582bec8aba6UL,
- 0x87ad8f263b78b982UL, 0x50fdf64e9cda0432UL,
- /* 139 */ 0x90f567aac578dcf0UL, 0xef1e9b0ef2a3133bUL,
- 0x0eebba9242d9de71UL, 0x15473c9bf03101c7UL,
- /* 140 */ 0x7c77e8ae56b78095UL, 0xb678e7666e6f078eUL,
- 0x2da0b9615348ba1fUL, 0x7cf931c1ff733f0bUL,
- /* 141 */ 0x26b357f50a0a366cUL, 0xe9708cf42b87d732UL,
- 0xc13aeea5f91cb2c0UL, 0x35d90c991143bb4cUL,
- /* 142 */ 0x47c1c404a9a0d9dcUL, 0x659e58451972d251UL,
- 0x3875a8c473b38c31UL, 0x1fbd9ed379561f24UL,
- /* 143 */ 0x11fabc6fd41ec28dUL, 0x7ef8dfe3cd2a2dcaUL,
- 0x72e73b5d8c404595UL, 0x6135fa4954b72f27UL,
- /* 144 */ 0xccfc32a2de24b69cUL, 0x3f55698c1f095d88UL,
- 0xbe3350ed5ac3f929UL, 0x5e9bf806ca477eebUL,
- /* 145 */ 0xe9ce8fb63c309f68UL, 0x5376f63565e1f9f4UL,
- 0xd1afcfb35a6393f1UL, 0x6632a1ede5623506UL,
- /* 146 */ 0x0b7d6c390c2ded4cUL, 0x56cb3281df04cb1fUL,
- 0x66305a1249ecc3c7UL, 0x5d588b60a38ca72aUL,
- /* 147 */ 0xa6ecbf78e8e5f42dUL, 0x86eeb44b3c8a3eecUL,
- 0xec219c48fbd21604UL, 0x1aaf1af517c36731UL,
- /* 148 */ 0xc306a2836769bde7UL, 0x208280622b1e2adbUL,
- 0x8027f51ffbff94a6UL, 0x76cfa1ce1124f26bUL,
- /* 149 */ 0x18eb00562422abb6UL, 0xf377c4d58f8c29c3UL,
- 0x4dbbc207f531561aUL, 0x0253b7f082128a27UL,
- /* 150 */ 0x3d1f091cb62c17e0UL, 0x4860e1abd64628a9UL,
- 0x52d17436309d4253UL, 0x356f97e13efae576UL,
- /* 151 */ 0xd351e11aa150535bUL, 0x3e6b45bb1dd878ccUL,
- 0x0c776128bed92c98UL, 0x1d34ae93032885b8UL,
- /* 152 */ 0x4ba0488ca85ba4c3UL, 0x985348c33c9ce6ceUL,
- 0x66124c6f97bda770UL, 0x0f81a0290654124aUL,
- /* 153 */ 0x9ed09ca6569b86fdUL, 0x811009fd18af9a2dUL,
- 0xff08d03f93d8c20aUL, 0x52a148199faef26bUL,
- /* 154 */ 0x3e03f9dc2d8d1b73UL, 0x4205801873961a70UL,
- 0xc0d987f041a35970UL, 0x07aa1f15a1c0d549UL,
- /* 155 */ 0xdfd46ce08cd27224UL, 0x6d0a024f934e4239UL,
- 0x808a7a6399897b59UL, 0x0a4556e9e13d95a2UL,
- /* 156 */ 0xd21a991fe9c13045UL, 0x9b0e8548fe7751b8UL,
- 0x5da643cb4bf30035UL, 0x77db28d63940f721UL,
- /* 157 */ 0xfc5eeb614adc9011UL, 0x5229419ae8c411ebUL,
- 0x9ec3e7787d1dcf74UL, 0x340d053e216e4cb5UL,
- /* 158 */ 0xcac7af39b48df2b4UL, 0xc0faec2871a10a94UL,
- 0x140a69245ca575edUL, 0x0cf1c37134273a4cUL,
- /* 159 */ 0xc8ee306ac224b8a5UL, 0x57eaee7ccb4930b0UL,
- 0xa1e806bdaacbe74fUL, 0x7d9a62742eeb657dUL,
- /* 160 */ 0x9eb6b6ef546c4830UL, 0x885cca1fddb36e2eUL,
- 0xe6b9f383ef0d7105UL, 0x58654fef9d2e0412UL,
- /* 161 */ 0xa905c4ffbe0e8e26UL, 0x942de5df9b31816eUL,
- 0x497d723f802e88e1UL, 0x30684dea602f408dUL,
- /* 162 */ 0x21e5a278a3e6cb34UL, 0xaefb6e6f5b151dc4UL,
- 0xb30b8e049d77ca15UL, 0x28c3c9cf53b98981UL,
- /* 163 */ 0x287fb721556cdd2aUL, 0x0d317ca897022274UL,
- 0x7468c7423a543258UL, 0x4a7f11464eb5642fUL,
- /* 164 */ 0xa237a4774d193aa6UL, 0xd865986ea92129a1UL,
- 0x24c515ecf87c1a88UL, 0x604003575f39f5ebUL,
- /* 165 */ 0x47b9f189570a9b27UL, 0x2b98cede465e4b78UL,
- 0x026df551dbb85c20UL, 0x74fcd91047e21901UL,
- /* 166 */ 0x13e2a90a23c1bfa3UL, 0x0cb0074e478519f6UL,
- 0x5ff1cbbe3af6cf44UL, 0x67fe5438be812dbeUL,
- /* 167 */ 0xd13cf64fa40f05b0UL, 0x054dfb2f32283787UL,
- 0x4173915b7f0d2aeaUL, 0x482f144f1f610d4eUL,
- /* 168 */ 0xf6210201b47f8234UL, 0x5d0ae1929e70b990UL,
- 0xdcd7f455b049567cUL, 0x7e93d0f1f0916f01UL,
- /* 169 */ 0xdd79cbf18a7db4faUL, 0xbe8391bf6f74c62fUL,
- 0x027145d14b8291bdUL, 0x585a73ea2cbf1705UL,
- /* 170 */ 0x485ca03e928a0db2UL, 0x10fc01a5742857e7UL,
- 0x2f482edbd6d551a7UL, 0x0f0433b5048fdb8aUL,
- /* 171 */ 0x60da2e8dd7dc6247UL, 0x88b4c9d38cd4819aUL,
- 0x13033ac001f66697UL, 0x273b24fe3b367d75UL,
- /* 172 */ 0xc6e8f66a31b3b9d4UL, 0x281514a494df49d5UL,
- 0xd1726fdfc8b23da7UL, 0x4b3ae7d103dee548UL,
- /* 173 */ 0xc6256e19ce4b9d7eUL, 0xff5c5cf186e3c61cUL,
- 0xacc63ca34b8ec145UL, 0x74621888fee66574UL,
- /* 174 */ 0x956f409645290a1eUL, 0xef0bf8e3263a962eUL,
- 0xed6a50eb5ec2647bUL, 0x0694283a9dca7502UL,
- /* 175 */ 0x769b963643a2dcd1UL, 0x42b7c8ea09fc5353UL,
- 0x4f002aee13397eabUL, 0x63005e2c19b7d63aUL,
- /* 176 */ 0xca6736da63023beaUL, 0x966c7f6db12a99b7UL,
- 0xace09390c537c5e1UL, 0x0b696063a1aa89eeUL,
- /* 177 */ 0xebb03e97288c56e5UL, 0x432a9f9f938c8be8UL,
- 0xa6a5a93d5b717f71UL, 0x1a5fb4c3e18f9d97UL,
- /* 178 */ 0x1c94e7ad1c60cdceUL, 0xee202a43fc02c4a0UL,
- 0x8dafe4d867c46a20UL, 0x0a10263c8ac27b58UL,
- /* 179 */ 0xd0dea9dfe4432a4aUL, 0x856af87bbe9277c5UL,
- 0xce8472acc212c71aUL, 0x6f151b6d9bbb1e91UL,
- /* 180 */ 0x26776c527ceed56aUL, 0x7d211cb7fbf8faecUL,
- 0x37ae66a6fd4609ccUL, 0x1f81b702d2770c42UL,
- /* 181 */ 0x2fb0b057eac58392UL, 0xe1dd89fe29744e9dUL,
- 0xc964f8eb17beb4f8UL, 0x29571073c9a2d41eUL,
- /* 182 */ 0xa948a18981c0e254UL, 0x2df6369b65b22830UL,
- 0xa33eb2d75fcfd3c6UL, 0x078cd6ec4199a01fUL,
- /* 183 */ 0x4a584a41ad900d2fUL, 0x32142b78e2c74c52UL,
- 0x68c4e8338431c978UL, 0x7f69ea9008689fc2UL,
- /* 184 */ 0x52f2c81e46a38265UL, 0xfd78072d04a832fdUL,
- 0x8cd7d5fa25359e94UL, 0x4de71b7454cc29d2UL,
- /* 185 */ 0x42eb60ad1eda6ac9UL, 0x0aad37dfdbc09c3aUL,
- 0x81004b71e33cc191UL, 0x44e6be345122803cUL,
- /* 186 */ 0x03fe8388ba1920dbUL, 0xf5d57c32150db008UL,
- 0x49c8c4281af60c29UL, 0x21edb518de701aeeUL,
- /* 187 */ 0x7fb63e418f06dc99UL, 0xa4460d99c166d7b8UL,
- 0x24dd5248ce520a83UL, 0x5ec3ad712b928358UL,
- /* 188 */ 0x15022a5fbd17930fUL, 0xa4f64a77d82570e3UL,
- 0x12bc8d6915783712UL, 0x498194c0fc620abbUL,
- /* 189 */ 0x38a2d9d255686c82UL, 0x785c6bd9193e21f0UL,
- 0xe4d5c81ab24a5484UL, 0x56307860b2e20989UL,
- /* 190 */ 0x429d55f78b4d74c4UL, 0x22f1834643350131UL,
- 0x1e60c24598c71fffUL, 0x59f2f014979983efUL,
- /* 191 */ 0x46a47d56eb494a44UL, 0x3e22a854d636a18eUL,
- 0xb346e15274491c3bUL, 0x2ceafd4e5390cde7UL,
- /* 192 */ 0xba8a8538be0d6675UL, 0x4b9074bb50818e23UL,
- 0xcbdab89085d304c3UL, 0x61a24fe0e56192c4UL,
- /* 193 */ 0xcb7615e6db525bcbUL, 0xdd7d8c35a567e4caUL,
- 0xe6b4153acafcdd69UL, 0x2d668e097f3c9766UL,
- /* 194 */ 0xa57e7e265ce55ef0UL, 0x5d9f4e527cd4b967UL,
- 0xfbc83606492fd1e5UL, 0x090d52beb7c3f7aeUL,
- /* 195 */ 0x09b9515a1e7b4d7cUL, 0x1f266a2599da44c0UL,
- 0xa1c49548e2c55504UL, 0x7ef04287126f15ccUL,
- /* 196 */ 0xfed1659dbd30ef15UL, 0x8b4ab9eec4e0277bUL,
- 0x884d6236a5df3291UL, 0x1fd96ea6bf5cf788UL,
- /* 197 */ 0x42a161981f190d9aUL, 0x61d849507e6052c1UL,
- 0x9fe113bf285a2cd5UL, 0x7c22d676dbad85d8UL,
- /* 198 */ 0x82e770ed2bfbd27dUL, 0x4c05b2ece996f5a5UL,
- 0xcd40a9c2b0900150UL, 0x5895319213d9bf64UL,
- /* 199 */ 0xe7cc5d703fea2e08UL, 0xb50c491258e2188cUL,
- 0xcce30baa48205bf0UL, 0x537c659ccfa32d62UL,
- /* 200 */ 0x37b6623a98cfc088UL, 0xfe9bed1fa4d6aca4UL,
- 0x04d29b8e56a8d1b0UL, 0x725f71c40b519575UL,
- /* 201 */ 0x28c7f89cd0339ce6UL, 0x8367b14469ddc18bUL,
- 0x883ada83a6a1652cUL, 0x585f1974034d6c17UL,
- /* 202 */ 0x89cfb266f1b19188UL, 0xe63b4863e7c35217UL,
- 0xd88c9da6b4c0526aUL, 0x3e035c9df0954635UL,
- /* 203 */ 0xdd9d5412fb45de9dUL, 0xdd684532e4cff40dUL,
- 0x4b5c999b151d671cUL, 0x2d8c2cc811e7f690UL,
- /* 204 */ 0x7f54be1d90055d40UL, 0xa464c5df464aaf40UL,
- 0x33979624f0e917beUL, 0x2c018dc527356b30UL,
- /* 205 */ 0xa5415024e330b3d4UL, 0x73ff3d96691652d3UL,
- 0x94ec42c4ef9b59f1UL, 0x0747201618d08e5aUL,
- /* 206 */ 0x4d6ca48aca411c53UL, 0x66415f2fcfa66119UL,
- 0x9c4dd40051e227ffUL, 0x59810bc09a02f7ebUL,
- /* 207 */ 0x2a7eb171b3dc101dUL, 0x441c5ab99ffef68eUL,
- 0x32025c9b93b359eaUL, 0x5e8ce0a71e9d112fUL,
- /* 208 */ 0xbfcccb92429503fdUL, 0xd271ba752f095d55UL,
- 0x345ead5e972d091eUL, 0x18c8df11a83103baUL,
- /* 209 */ 0x90cd949a9aed0f4cUL, 0xc5d1f4cb6660e37eUL,
- 0xb8cac52d56c52e0bUL, 0x6e42e400c5808e0dUL,
- /* 210 */ 0xa3b46966eeaefd23UL, 0x0c4f1f0be39ecdcaUL,
- 0x189dc8c9d683a51dUL, 0x51f27f054c09351bUL,
- /* 211 */ 0x4c487ccd2a320682UL, 0x587ea95bb3df1c96UL,
- 0xc8ccf79e555cb8e8UL, 0x547dc829a206d73dUL,
- /* 212 */ 0xb822a6cd80c39b06UL, 0xe96d54732000d4c6UL,
- 0x28535b6f91463b4dUL, 0x228f4660e2486e1dUL,
- /* 213 */ 0x98799538de8d3abfUL, 0x8cd8330045ebca6eUL,
- 0x79952a008221e738UL, 0x4322e1a7535cd2bbUL,
- /* 214 */ 0xb114c11819d1801cUL, 0x2016e4d84f3f5ec7UL,
- 0xdd0e2df409260f4cUL, 0x5ec362c0ae5f7266UL,
- /* 215 */ 0xc0462b18b8b2b4eeUL, 0x7cc8d950274d1afbUL,
- 0xf25f7105436b02d2UL, 0x43bbf8dcbff9ccd3UL,
- /* 216 */ 0xb6ad1767a039e9dfUL, 0xb0714da8f69d3583UL,
- 0x5e55fa18b42931f5UL, 0x4ed5558f33c60961UL,
- /* 217 */ 0x1fe37901c647a5ddUL, 0x593ddf1f8081d357UL,
- 0x0249a4fd813fd7a6UL, 0x69acca274e9caf61UL,
- /* 218 */ 0x047ba3ea330721c9UL, 0x83423fc20e7e1ea0UL,
- 0x1df4c0af01314a60UL, 0x09a62dab89289527UL,
- /* 219 */ 0xa5b325a49cc6cb00UL, 0xe94b5dc654b56cb6UL,
- 0x3be28779adc994a0UL, 0x4296e8f8ba3a4aadUL,
- /* 220 */ 0x328689761e451eabUL, 0x2e4d598bff59594aUL,
- 0x49b96853d7a7084aUL, 0x4980a319601420a8UL,
- /* 221 */ 0x9565b9e12f552c42UL, 0x8a5318db7100fe96UL,
- 0x05c90b4d43add0d7UL, 0x538b4cd66a5d4edaUL,
- /* 222 */ 0xf4e94fc3e89f039fUL, 0x592c9af26f618045UL,
- 0x08a36eb5fd4b9550UL, 0x25fffaf6c2ed1419UL,
- /* 223 */ 0x34434459cc79d354UL, 0xeeecbfb4b1d5476bUL,
- 0xddeb34a061615d99UL, 0x5129cecceb64b773UL,
- /* 224 */ 0xee43215894993520UL, 0x772f9c7cf14c0b3bUL,
- 0xd2e2fce306bedad5UL, 0x715f42b546f06a97UL,
- /* 225 */ 0x434ecdceda5b5f1aUL, 0x0da17115a49741a9UL,
- 0x680bd77c73edad2eUL, 0x487c02354edd9041UL,
- /* 226 */ 0xb8efeff3a70ed9c4UL, 0x56a32aa3e857e302UL,
- 0xdf3a68bd48a2a5a0UL, 0x07f650b73176c444UL,
- /* 227 */ 0xe38b9b1626e0ccb1UL, 0x79e053c18b09fb36UL,
- 0x56d90319c9f94964UL, 0x1ca941e7ac9ff5c4UL,
- /* 228 */ 0x49c4df29162fa0bbUL, 0x8488cf3282b33305UL,
- 0x95dfda14cabb437dUL, 0x3391f78264d5ad86UL,
- /* 229 */ 0x729ae06ae2b5095dUL, 0xd58a58d73259a946UL,
- 0xe9834262d13921edUL, 0x27fedafaa54bb592UL,
- /* 230 */ 0xa99dc5b829ad48bbUL, 0x5f025742499ee260UL,
- 0x802c8ecd5d7513fdUL, 0x78ceb3ef3f6dd938UL,
- /* 231 */ 0xc342f44f8a135d94UL, 0x7b9edb44828cdda3UL,
- 0x9436d11a0537cfe7UL, 0x5064b164ec1ab4c8UL,
- /* 232 */ 0x7020eccfd37eb2fcUL, 0x1f31ea3ed90d25fcUL,
- 0x1b930d7bdfa1bb34UL, 0x5344467a48113044UL,
- /* 233 */ 0x70073170f25e6dfbUL, 0xe385dc1a50114cc8UL,
- 0x2348698ac8fc4f00UL, 0x2a77a55284dd40d8UL,
- /* 234 */ 0xfe06afe0c98c6ce4UL, 0xc235df96dddfd6e4UL,
- 0x1428d01e33bf1ed3UL, 0x785768ec9300bdafUL,
- /* 235 */ 0x9702e57a91deb63bUL, 0x61bdb8bfe5ce8b80UL,
- 0x645b426f3d1d58acUL, 0x4804a82227a557bcUL,
- /* 236 */ 0x8e57048ab44d2601UL, 0x68d6501a4b3a6935UL,
- 0xc39c9ec3f9e1c293UL, 0x4172f257d4de63e2UL,
- /* 237 */ 0xd368b450330c6401UL, 0x040d3017418f2391UL,
- 0x2c34bb6090b7d90dUL, 0x16f649228fdfd51fUL,
- /* 238 */ 0xbea6818e2b928ef5UL, 0xe28ccf91cdc11e72UL,
- 0x594aaa68e77a36cdUL, 0x313034806c7ffd0fUL,
- /* 239 */ 0x8a9d27ac2249bd65UL, 0x19a3b464018e9512UL,
- 0xc26ccff352b37ec7UL, 0x056f68341d797b21UL,
- /* 240 */ 0x5e79d6757efd2327UL, 0xfabdbcb6553afe15UL,
- 0xd3e7222c6eaf5a60UL, 0x7046c76d4dae743bUL,
- /* 241 */ 0x660be872b18d4a55UL, 0x19992518574e1496UL,
- 0xc103053a302bdcbbUL, 0x3ed8e9800b218e8eUL,
- /* 242 */ 0x7b0b9239fa75e03eUL, 0xefe9fb684633c083UL,
- 0x98a35fbe391a7793UL, 0x6065510fe2d0fe34UL,
- /* 243 */ 0x55cb668548abad0cUL, 0xb4584548da87e527UL,
- 0x2c43ecea0107c1ddUL, 0x526028809372de35UL,
- /* 244 */ 0x3415c56af9213b1fUL, 0x5bee1a4d017e98dbUL,
- 0x13f6b105b5cf709bUL, 0x5ff20e3482b29ab6UL,
- /* 245 */ 0x0aa29c75cc2e6c90UL, 0xfc7d73ca3a70e206UL,
- 0x899fc38fc4b5c515UL, 0x250386b124ffc207UL,
- /* 246 */ 0x54ea28d5ae3d2b56UL, 0x9913149dd6de60ceUL,
- 0x16694fc58f06d6c1UL, 0x46b23975eb018fc7UL,
- /* 247 */ 0x470a6a0fb4b7b4e2UL, 0x5d92475a8f7253deUL,
- 0xabeee5b52fbd3adbUL, 0x7fa20801a0806968UL,
- /* 248 */ 0x76f3faf19f7714d2UL, 0xb3e840c12f4660c3UL,
- 0x0fb4cd8df212744eUL, 0x4b065a251d3a2dd2UL,
- /* 249 */ 0x5cebde383d77cd4aUL, 0x6adf39df882c9cb1UL,
- 0xa2dd242eb09af759UL, 0x3147c0e50e5f6422UL,
- /* 250 */ 0x164ca5101d1350dbUL, 0xf8d13479c33fc962UL,
- 0xe640ce4d13e5da08UL, 0x4bdee0c45061f8baUL,
- /* 251 */ 0xd7c46dc1a4edb1c9UL, 0x5514d7b6437fd98aUL,
- 0x58942f6bb2a1c00bUL, 0x2dffb2ab1d70710eUL,
- /* 252 */ 0xccdfcf2fc18b6d68UL, 0xa8ebcba8b7806167UL,
- 0x980697f95e2937e3UL, 0x02fbba1cd0126e8cUL
-};
-
-#ifdef CONFIG_AS_ADX
-/* c is two 512-bit products: c0[0:7]=a0[0:3]*b0[0:3] and c1[8:15]=a1[4:7]*b1[4:7]
- * a is two 256-bit integers: a0[0:3] and a1[4:7]
- * b is two 256-bit integers: b0[0:3] and b1[4:7]
- */
-static void mul2_256x256_integer_adx(u64 *const c, const u64 *const a,
- const u64 *const b)
+static __always_inline u64 eq_mask(u64 a, u64 b)
{
- asm volatile(
- "xorl %%r14d, %%r14d ;"
- "movq (%1), %%rdx; " /* A[0] */
- "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
- "xorl %%r10d, %%r10d ;"
- "movq %%r8, (%0) ;"
- "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
- "adox %%r10, %%r15 ;"
- "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
- "adox %%r8, %%rax ;"
- "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
- "adox %%r10, %%rbx ;"
- /******************************************/
- "adox %%r14, %%rcx ;"
-
- "movq 8(%1), %%rdx; " /* A[1] */
- "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
- "adox %%r15, %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
- "adox %%r10, %%r9 ;"
- "adcx %%r9, %%rax ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
- "adox %%r8, %%r11 ;"
- "adcx %%r11, %%rbx ;"
- "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
- "adox %%r10, %%r13 ;"
- "adcx %%r13, %%rcx ;"
- /******************************************/
- "adox %%r14, %%r15 ;"
- "adcx %%r14, %%r15 ;"
-
- "movq 16(%1), %%rdx; " /* A[2] */
- "xorl %%r10d, %%r10d ;"
- "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
- "adox %%rax, %%r8 ;"
- "movq %%r8, 16(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
- "adox %%r10, %%r9 ;"
- "adcx %%r9, %%rbx ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
- "adox %%r8, %%r11 ;"
- "adcx %%r11, %%rcx ;"
- "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
- "adox %%r10, %%r13 ;"
- "adcx %%r13, %%r15 ;"
- /******************************************/
- "adox %%r14, %%rax ;"
- "adcx %%r14, %%rax ;"
-
- "movq 24(%1), %%rdx; " /* A[3] */
- "xorl %%r10d, %%r10d ;"
- "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
- "adox %%rbx, %%r8 ;"
- "movq %%r8, 24(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
- "adox %%r10, %%r9 ;"
- "adcx %%r9, %%rcx ;"
- "movq %%rcx, 32(%0) ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
- "adox %%r8, %%r11 ;"
- "adcx %%r11, %%r15 ;"
- "movq %%r15, 40(%0) ;"
- "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
- "adox %%r10, %%r13 ;"
- "adcx %%r13, %%rax ;"
- "movq %%rax, 48(%0) ;"
- /******************************************/
- "adox %%r14, %%rbx ;"
- "adcx %%r14, %%rbx ;"
- "movq %%rbx, 56(%0) ;"
-
- "movq 32(%1), %%rdx; " /* C[0] */
- "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */
- "xorl %%r10d, %%r10d ;"
- "movq %%r8, 64(%0);"
- "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
- "adox %%r10, %%r15 ;"
- "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
- "adox %%r8, %%rax ;"
- "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
- "adox %%r10, %%rbx ;"
- /******************************************/
- "adox %%r14, %%rcx ;"
-
- "movq 40(%1), %%rdx; " /* C[1] */
- "xorl %%r10d, %%r10d ;"
- "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
- "adox %%r15, %%r8 ;"
- "movq %%r8, 72(%0);"
- "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
- "adox %%r10, %%r9 ;"
- "adcx %%r9, %%rax ;"
- "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
- "adox %%r8, %%r11 ;"
- "adcx %%r11, %%rbx ;"
- "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
- "adox %%r10, %%r13 ;"
- "adcx %%r13, %%rcx ;"
- /******************************************/
- "adox %%r14, %%r15 ;"
- "adcx %%r14, %%r15 ;"
-
- "movq 48(%1), %%rdx; " /* C[2] */
- "xorl %%r10d, %%r10d ;"
- "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
- "adox %%rax, %%r8 ;"
- "movq %%r8, 80(%0);"
- "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
- "adox %%r10, %%r9 ;"
- "adcx %%r9, %%rbx ;"
- "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
- "adox %%r8, %%r11 ;"
- "adcx %%r11, %%rcx ;"
- "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
- "adox %%r10, %%r13 ;"
- "adcx %%r13, %%r15 ;"
- /******************************************/
- "adox %%r14, %%rax ;"
- "adcx %%r14, %%rax ;"
-
- "movq 56(%1), %%rdx; " /* C[3] */
- "xorl %%r10d, %%r10d ;"
- "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
- "adox %%rbx, %%r8 ;"
- "movq %%r8, 88(%0);"
- "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
- "adox %%r10, %%r9 ;"
- "adcx %%r9, %%rcx ;"
- "movq %%rcx, 96(%0) ;"
- "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
- "adox %%r8, %%r11 ;"
- "adcx %%r11, %%r15 ;"
- "movq %%r15, 104(%0) ;"
- "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
- "adox %%r10, %%r13 ;"
- "adcx %%r13, %%rax ;"
- "movq %%rax, 112(%0) ;"
- /******************************************/
- "adox %%r14, %%rbx ;"
- "adcx %%r14, %%rbx ;"
- "movq %%rbx, 120(%0) ;"
- :
- : "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
- "%r10", "%r11", "%r13", "%r14", "%r15");
+ u64 x = a ^ b;
+ u64 minus_x = ~x + (u64)1U;
+ u64 x_or_minus_x = x | minus_x;
+ u64 xnx = x_or_minus_x >> (u32)63U;
+ return xnx - (u64)1U;
}
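/* For x != 0, x | -x always has its top bit set, so xnx is 1 when a != b
 * and 0 when a == b; subtracting 1 turns that into an all-zeros or
 * all-ones mask with no branch on secret data. */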
-#endif
-#ifdef CONFIG_AS_BMI2
-static void mul2_256x256_integer_bmi2(u64 *const c, const u64 *const a,
- const u64 *const b)
+static __always_inline u64 gte_mask(u64 a, u64 b)
{
- asm volatile(
- "movq (%1), %%rdx; " /* A[0] */
- "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
- "movq %%r8, (%0) ;"
- "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
- "addq %%r10, %%r15 ;"
- "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
- "adcq %%r8, %%rax ;"
- "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
- "adcq %%r10, %%rbx ;"
- /******************************************/
- "adcq $0, %%rcx ;"
-
- "movq 8(%1), %%rdx; " /* A[1] */
- "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
- "addq %%r15, %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%r15 ;"
-
- "addq %%r9, %%rax ;"
- "adcq %%r11, %%rbx ;"
- "adcq %%r13, %%rcx ;"
- "adcq $0, %%r15 ;"
-
- "movq 16(%1), %%rdx; " /* A[2] */
- "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
- "addq %%rax, %%r8 ;"
- "movq %%r8, 16(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%rax ;"
-
- "addq %%r9, %%rbx ;"
- "adcq %%r11, %%rcx ;"
- "adcq %%r13, %%r15 ;"
- "adcq $0, %%rax ;"
-
- "movq 24(%1), %%rdx; " /* A[3] */
- "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
- "addq %%rbx, %%r8 ;"
- "movq %%r8, 24(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%rbx ;"
-
- "addq %%r9, %%rcx ;"
- "movq %%rcx, 32(%0) ;"
- "adcq %%r11, %%r15 ;"
- "movq %%r15, 40(%0) ;"
- "adcq %%r13, %%rax ;"
- "movq %%rax, 48(%0) ;"
- "adcq $0, %%rbx ;"
- "movq %%rbx, 56(%0) ;"
-
- "movq 32(%1), %%rdx; " /* C[0] */
- "mulx 32(%2), %%r8, %%r15; " /* C[0]*D[0] */
- "movq %%r8, 64(%0) ;"
- "mulx 40(%2), %%r10, %%rax; " /* C[0]*D[1] */
- "addq %%r10, %%r15 ;"
- "mulx 48(%2), %%r8, %%rbx; " /* C[0]*D[2] */
- "adcq %%r8, %%rax ;"
- "mulx 56(%2), %%r10, %%rcx; " /* C[0]*D[3] */
- "adcq %%r10, %%rbx ;"
- /******************************************/
- "adcq $0, %%rcx ;"
-
- "movq 40(%1), %%rdx; " /* C[1] */
- "mulx 32(%2), %%r8, %%r9; " /* C[1]*D[0] */
- "addq %%r15, %%r8 ;"
- "movq %%r8, 72(%0) ;"
- "mulx 40(%2), %%r10, %%r11; " /* C[1]*D[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 48(%2), %%r8, %%r13; " /* C[1]*D[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 56(%2), %%r10, %%r15; " /* C[1]*D[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%r15 ;"
-
- "addq %%r9, %%rax ;"
- "adcq %%r11, %%rbx ;"
- "adcq %%r13, %%rcx ;"
- "adcq $0, %%r15 ;"
-
- "movq 48(%1), %%rdx; " /* C[2] */
- "mulx 32(%2), %%r8, %%r9; " /* C[2]*D[0] */
- "addq %%rax, %%r8 ;"
- "movq %%r8, 80(%0) ;"
- "mulx 40(%2), %%r10, %%r11; " /* C[2]*D[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 48(%2), %%r8, %%r13; " /* C[2]*D[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 56(%2), %%r10, %%rax; " /* C[2]*D[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%rax ;"
-
- "addq %%r9, %%rbx ;"
- "adcq %%r11, %%rcx ;"
- "adcq %%r13, %%r15 ;"
- "adcq $0, %%rax ;"
-
- "movq 56(%1), %%rdx; " /* C[3] */
- "mulx 32(%2), %%r8, %%r9; " /* C[3]*D[0] */
- "addq %%rbx, %%r8 ;"
- "movq %%r8, 88(%0) ;"
- "mulx 40(%2), %%r10, %%r11; " /* C[3]*D[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 48(%2), %%r8, %%r13; " /* C[3]*D[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 56(%2), %%r10, %%rbx; " /* C[3]*D[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%rbx ;"
-
- "addq %%r9, %%rcx ;"
- "movq %%rcx, 96(%0) ;"
- "adcq %%r11, %%r15 ;"
- "movq %%r15, 104(%0) ;"
- "adcq %%r13, %%rax ;"
- "movq %%rax, 112(%0) ;"
- "adcq $0, %%rbx ;"
- "movq %%rbx, 120(%0) ;"
- :
- : "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
- "%r10", "%r11", "%r13", "%r15");
+ u64 x = a;
+ u64 y = b;
+ u64 x_xor_y = x ^ y;
+ u64 x_sub_y = x - y;
+ u64 x_sub_y_xor_y = x_sub_y ^ y;
+ u64 q = x_xor_y | x_sub_y_xor_y;
+ u64 x_xor_q = x ^ q;
+ u64 x_xor_q_ = x_xor_q >> (u32)63U;
+ return x_xor_q_ - (u64)1U;
}
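/* Here q is arranged so that the top bit of x ^ q equals the borrow of
 * x - y: it is set exactly when a < b. The final shift-and-subtract thus
 * returns all ones when a >= b and zero otherwise, again without
 * branching on secret data. */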
-#endif
-#ifdef CONFIG_AS_ADX
-static void sqr2_256x256_integer_adx(u64 *const c, const u64 *const a)
+/* Computes the addition of the four-element (256-bit) f1 with the 64-bit
+ * value f2, and returns the carry (if any) */
+static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
{
- asm volatile(
- "movq (%1), %%rdx ;" /* A[0] */
- "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
- "xorl %%r15d, %%r15d;"
- "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
- "adcx %%r14, %%r9 ;"
- "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
- "adcx %%rax, %%r10 ;"
- "movq 24(%1), %%rdx ;" /* A[3] */
- "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
- "adcx %%rcx, %%r11 ;"
- "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
- "adcx %%rax, %%rbx ;"
- "movq 8(%1), %%rdx ;" /* A[1] */
- "adcx %%r15, %%r13 ;"
- "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
- "movq $0, %%r14 ;"
- /******************************************/
- "adcx %%r15, %%r14 ;"
-
- "xorl %%r15d, %%r15d;"
- "adox %%rax, %%r10 ;"
- "adcx %%r8, %%r8 ;"
- "adox %%rcx, %%r11 ;"
- "adcx %%r9, %%r9 ;"
- "adox %%r15, %%rbx ;"
- "adcx %%r10, %%r10 ;"
- "adox %%r15, %%r13 ;"
- "adcx %%r11, %%r11 ;"
- "adox %%r15, %%r14 ;"
- "adcx %%rbx, %%rbx ;"
- "adcx %%r13, %%r13 ;"
- "adcx %%r14, %%r14 ;"
-
- "movq (%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
- /*******************/
- "movq %%rax, 0(%0) ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "movq 8(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
- "adcq %%rax, %%r9 ;"
- "movq %%r9, 16(%0) ;"
- "adcq %%rcx, %%r10 ;"
- "movq %%r10, 24(%0) ;"
- "movq 16(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
- "adcq %%rax, %%r11 ;"
- "movq %%r11, 32(%0) ;"
- "adcq %%rcx, %%rbx ;"
- "movq %%rbx, 40(%0) ;"
- "movq 24(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
- "adcq %%rax, %%r13 ;"
- "movq %%r13, 48(%0) ;"
- "adcq %%rcx, %%r14 ;"
- "movq %%r14, 56(%0) ;"
-
-
- "movq 32(%1), %%rdx ;" /* B[0] */
- "mulx 40(%1), %%r8, %%r14 ;" /* B[1]*B[0] */
- "xorl %%r15d, %%r15d;"
- "mulx 48(%1), %%r9, %%r10 ;" /* B[2]*B[0] */
- "adcx %%r14, %%r9 ;"
- "mulx 56(%1), %%rax, %%rcx ;" /* B[3]*B[0] */
- "adcx %%rax, %%r10 ;"
- "movq 56(%1), %%rdx ;" /* B[3] */
- "mulx 40(%1), %%r11, %%rbx ;" /* B[1]*B[3] */
- "adcx %%rcx, %%r11 ;"
- "mulx 48(%1), %%rax, %%r13 ;" /* B[2]*B[3] */
- "adcx %%rax, %%rbx ;"
- "movq 40(%1), %%rdx ;" /* B[1] */
- "adcx %%r15, %%r13 ;"
- "mulx 48(%1), %%rax, %%rcx ;" /* B[2]*B[1] */
- "movq $0, %%r14 ;"
- /******************************************/
- "adcx %%r15, %%r14 ;"
-
- "xorl %%r15d, %%r15d;"
- "adox %%rax, %%r10 ;"
- "adcx %%r8, %%r8 ;"
- "adox %%rcx, %%r11 ;"
- "adcx %%r9, %%r9 ;"
- "adox %%r15, %%rbx ;"
- "adcx %%r10, %%r10 ;"
- "adox %%r15, %%r13 ;"
- "adcx %%r11, %%r11 ;"
- "adox %%r15, %%r14 ;"
- "adcx %%rbx, %%rbx ;"
- "adcx %%r13, %%r13 ;"
- "adcx %%r14, %%r14 ;"
-
- "movq 32(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* B[0]^2 */
- /*******************/
- "movq %%rax, 64(%0) ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 72(%0) ;"
- "movq 40(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* B[1]^2 */
- "adcq %%rax, %%r9 ;"
- "movq %%r9, 80(%0) ;"
- "adcq %%rcx, %%r10 ;"
- "movq %%r10, 88(%0) ;"
- "movq 48(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* B[2]^2 */
- "adcq %%rax, %%r11 ;"
- "movq %%r11, 96(%0) ;"
- "adcq %%rcx, %%rbx ;"
- "movq %%rbx, 104(%0) ;"
- "movq 56(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* B[3]^2 */
- "adcq %%rax, %%r13 ;"
- "movq %%r13, 112(%0) ;"
- "adcq %%rcx, %%r14 ;"
- "movq %%r14, 120(%0) ;"
- :
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
- "%r10", "%r11", "%r13", "%r14", "%r15");
-}
-#endif
+ u64 carry_r;
-#ifdef CONFIG_AS_BMI2
-static void sqr2_256x256_integer_bmi2(u64 *const c, const u64 *const a)
-{
asm volatile(
- "movq 8(%1), %%rdx ;" /* A[1] */
- "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
- "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
- "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
-
- "movq 16(%1), %%rdx ;" /* A[2] */
- "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
- "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
-
- "addq %%rax, %%r9 ;"
- "adcq %%rdx, %%r10 ;"
- "adcq %%rcx, %%r11 ;"
- "adcq %%r14, %%r15 ;"
- "adcq $0, %%r13 ;"
- "movq $0, %%r14 ;"
- "adcq $0, %%r14 ;"
-
- "movq (%1), %%rdx ;" /* A[0] */
- "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
-
- "addq %%rax, %%r10 ;"
- "adcq %%rcx, %%r11 ;"
- "adcq $0, %%r15 ;"
- "adcq $0, %%r13 ;"
- "adcq $0, %%r14 ;"
-
- "shldq $1, %%r13, %%r14 ;"
- "shldq $1, %%r15, %%r13 ;"
- "shldq $1, %%r11, %%r15 ;"
- "shldq $1, %%r10, %%r11 ;"
- "shldq $1, %%r9, %%r10 ;"
- "shldq $1, %%r8, %%r9 ;"
- "shlq $1, %%r8 ;"
-
- /*******************/
- "mulx %%rdx, %%rax, %%rcx ; " /* A[0]^2 */
- /*******************/
- "movq %%rax, 0(%0) ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "movq 8(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ; " /* A[1]^2 */
- "adcq %%rax, %%r9 ;"
- "movq %%r9, 16(%0) ;"
- "adcq %%rcx, %%r10 ;"
- "movq %%r10, 24(%0) ;"
- "movq 16(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ; " /* A[2]^2 */
- "adcq %%rax, %%r11 ;"
- "movq %%r11, 32(%0) ;"
- "adcq %%rcx, %%r15 ;"
- "movq %%r15, 40(%0) ;"
- "movq 24(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ; " /* A[3]^2 */
- "adcq %%rax, %%r13 ;"
- "movq %%r13, 48(%0) ;"
- "adcq %%rcx, %%r14 ;"
- "movq %%r14, 56(%0) ;"
-
- "movq 40(%1), %%rdx ;" /* B[1] */
- "mulx 32(%1), %%r8, %%r9 ;" /* B[0]*B[1] */
- "mulx 48(%1), %%r10, %%r11 ;" /* B[2]*B[1] */
- "mulx 56(%1), %%rcx, %%r14 ;" /* B[3]*B[1] */
-
- "movq 48(%1), %%rdx ;" /* B[2] */
- "mulx 56(%1), %%r15, %%r13 ;" /* B[3]*B[2] */
- "mulx 32(%1), %%rax, %%rdx ;" /* B[0]*B[2] */
-
- "addq %%rax, %%r9 ;"
- "adcq %%rdx, %%r10 ;"
- "adcq %%rcx, %%r11 ;"
- "adcq %%r14, %%r15 ;"
- "adcq $0, %%r13 ;"
- "movq $0, %%r14 ;"
- "adcq $0, %%r14 ;"
-
- "movq 32(%1), %%rdx ;" /* B[0] */
- "mulx 56(%1), %%rax, %%rcx ;" /* B[0]*B[3] */
-
- "addq %%rax, %%r10 ;"
- "adcq %%rcx, %%r11 ;"
- "adcq $0, %%r15 ;"
- "adcq $0, %%r13 ;"
- "adcq $0, %%r14 ;"
-
- "shldq $1, %%r13, %%r14 ;"
- "shldq $1, %%r15, %%r13 ;"
- "shldq $1, %%r11, %%r15 ;"
- "shldq $1, %%r10, %%r11 ;"
- "shldq $1, %%r9, %%r10 ;"
- "shldq $1, %%r8, %%r9 ;"
- "shlq $1, %%r8 ;"
-
- /*******************/
- "mulx %%rdx, %%rax, %%rcx ; " /* B[0]^2 */
- /*******************/
- "movq %%rax, 64(%0) ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 72(%0) ;"
- "movq 40(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ; " /* B[1]^2 */
- "adcq %%rax, %%r9 ;"
- "movq %%r9, 80(%0) ;"
- "adcq %%rcx, %%r10 ;"
- "movq %%r10, 88(%0) ;"
- "movq 48(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ; " /* B[2]^2 */
- "adcq %%rax, %%r11 ;"
- "movq %%r11, 96(%0) ;"
- "adcq %%rcx, %%r15 ;"
- "movq %%r15, 104(%0) ;"
- "movq 56(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ; " /* B[3]^2 */
- "adcq %%rax, %%r13 ;"
- "movq %%r13, 112(%0) ;"
- "adcq %%rcx, %%r14 ;"
- "movq %%r14, 120(%0) ;"
- :
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
- "%r11", "%r13", "%r14", "%r15");
-}
-#endif
-
-#ifdef CONFIG_AS_ADX
-static void red_eltfp25519_2w_adx(u64 *const c, const u64 *const a)
-{
- asm volatile(
- "movl $38, %%edx; " /* 2*c = 38 = 2^256 */
- "mulx 32(%1), %%r8, %%r10; " /* c*C[4] */
- "xorl %%ebx, %%ebx ;"
- "adox (%1), %%r8 ;"
- "mulx 40(%1), %%r9, %%r11; " /* c*C[5] */
- "adcx %%r10, %%r9 ;"
- "adox 8(%1), %%r9 ;"
- "mulx 48(%1), %%r10, %%rax; " /* c*C[6] */
- "adcx %%r11, %%r10 ;"
- "adox 16(%1), %%r10 ;"
- "mulx 56(%1), %%r11, %%rcx; " /* c*C[7] */
- "adcx %%rax, %%r11 ;"
- "adox 24(%1), %%r11 ;"
- /***************************************/
- "adcx %%rbx, %%rcx ;"
- "adox %%rbx, %%rcx ;"
- "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
- "adcx %%rcx, %%r8 ;"
- "adcx %%rbx, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "adcx %%rbx, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "adcx %%rbx, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%edx, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, (%0) ;"
-
- "mulx 96(%1), %%r8, %%r10; " /* c*C[4] */
- "xorl %%ebx, %%ebx ;"
- "adox 64(%1), %%r8 ;"
- "mulx 104(%1), %%r9, %%r11; " /* c*C[5] */
- "adcx %%r10, %%r9 ;"
- "adox 72(%1), %%r9 ;"
- "mulx 112(%1), %%r10, %%rax; " /* c*C[6] */
- "adcx %%r11, %%r10 ;"
- "adox 80(%1), %%r10 ;"
- "mulx 120(%1), %%r11, %%rcx; " /* c*C[7] */
- "adcx %%rax, %%r11 ;"
- "adox 88(%1), %%r11 ;"
- /****************************************/
- "adcx %%rbx, %%rcx ;"
- "adox %%rbx, %%rcx ;"
- "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
- "adcx %%rcx, %%r8 ;"
- "adcx %%rbx, %%r9 ;"
- "movq %%r9, 40(%0) ;"
- "adcx %%rbx, %%r10 ;"
- "movq %%r10, 48(%0) ;"
- "adcx %%rbx, %%r11 ;"
- "movq %%r11, 56(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%edx, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 32(%0) ;"
- :
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
- "%r10", "%r11");
-}
-#endif
+ /* Clear registers to propagate the carry bit */
+ " xor %%r8, %%r8;"
+ " xor %%r9, %%r9;"
+ " xor %%r10, %%r10;"
+ " xor %%r11, %%r11;"
+ " xor %1, %1;"
+
+ /* Begin addition chain */
+ " addq 0(%3), %0;"
+ " movq %0, 0(%2);"
+ " adcxq 8(%3), %%r8;"
+ " movq %%r8, 8(%2);"
+ " adcxq 16(%3), %%r9;"
+ " movq %%r9, 16(%2);"
+ " adcxq 24(%3), %%r10;"
+ " movq %%r10, 24(%2);"
+
+ /* Return the carry bit in a register */
+ " adcx %%r11, %1;"
+ : "+&r" (f2), "=&r" (carry_r)
+ : "r" (out), "r" (f1)
+ : "%r8", "%r9", "%r10", "%r11", "memory", "cc"
+ );
-#ifdef CONFIG_AS_BMI2
-static void red_eltfp25519_2w_bmi2(u64 *const c, const u64 *const a)
-{
- asm volatile(
- "movl $38, %%edx ; " /* 2*c = 38 = 2^256 */
- "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
- "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
- "addq %%r10, %%r9 ;"
- "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
- "adcq %%r11, %%r10 ;"
- "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
- "adcq %%rax, %%r11 ;"
- /***************************************/
- "adcq $0, %%rcx ;"
- "addq (%1), %%r8 ;"
- "adcq 8(%1), %%r9 ;"
- "adcq 16(%1), %%r10 ;"
- "adcq 24(%1), %%r11 ;"
- "adcq $0, %%rcx ;"
- "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
- "addq %%rcx, %%r8 ;"
- "adcq $0, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "adcq $0, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "adcq $0, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%edx, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, (%0) ;"
-
- "mulx 96(%1), %%r8, %%r10 ;" /* c*C[4] */
- "mulx 104(%1), %%r9, %%r11 ;" /* c*C[5] */
- "addq %%r10, %%r9 ;"
- "mulx 112(%1), %%r10, %%rax ;" /* c*C[6] */
- "adcq %%r11, %%r10 ;"
- "mulx 120(%1), %%r11, %%rcx ;" /* c*C[7] */
- "adcq %%rax, %%r11 ;"
- /****************************************/
- "adcq $0, %%rcx ;"
- "addq 64(%1), %%r8 ;"
- "adcq 72(%1), %%r9 ;"
- "adcq 80(%1), %%r10 ;"
- "adcq 88(%1), %%r11 ;"
- "adcq $0, %%rcx ;"
- "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
- "addq %%rcx, %%r8 ;"
- "adcq $0, %%r9 ;"
- "movq %%r9, 40(%0) ;"
- "adcq $0, %%r10 ;"
- "movq %%r10, 48(%0) ;"
- "adcq $0, %%r11 ;"
- "movq %%r11, 56(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%edx, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 32(%0) ;"
- :
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
- "%r11");
+ return carry_r;
}
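/* Reference semantics of add_scalar as a portable sketch (illustration
 * only, not code from this patch):
 *
 *	u64 add_scalar_ref(u64 out[4], const u64 f1[4], u64 f2)
 *	{
 *		unsigned __int128 acc = f2;
 *		int i;
 *		for (i = 0; i < 4; ++i) {
 *			acc += f1[i];
 *			out[i] = (u64)acc;
 *			acc >>= 64;
 *		}
 *		return (u64)acc; // final carry out of the 256-bit sum
 *	}
 */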
-#endif
-#ifdef CONFIG_AS_ADX
-static void mul_256x256_integer_adx(u64 *const c, const u64 *const a,
- const u64 *const b)
+/* Computes the field addition of two field elements */
+static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
{
asm volatile(
- "movq (%1), %%rdx; " /* A[0] */
- "mulx (%2), %%r8, %%r9; " /* A[0]*B[0] */
- "xorl %%r10d, %%r10d ;"
- "movq %%r8, (%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[0]*B[1] */
- "adox %%r9, %%r10 ;"
- "movq %%r10, 8(%0) ;"
- "mulx 16(%2), %%r15, %%r13; " /* A[0]*B[2] */
- "adox %%r11, %%r15 ;"
- "mulx 24(%2), %%r14, %%rdx; " /* A[0]*B[3] */
- "adox %%r13, %%r14 ;"
- "movq $0, %%rax ;"
- /******************************************/
- "adox %%rdx, %%rax ;"
-
- "movq 8(%1), %%rdx; " /* A[1] */
- "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
- "xorl %%r10d, %%r10d ;"
- "adcx 8(%0), %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
- "adox %%r9, %%r10 ;"
- "adcx %%r15, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "mulx 16(%2), %%r15, %%r13; " /* A[1]*B[2] */
- "adox %%r11, %%r15 ;"
- "adcx %%r14, %%r15 ;"
- "movq $0, %%r8 ;"
- "mulx 24(%2), %%r14, %%rdx; " /* A[1]*B[3] */
- "adox %%r13, %%r14 ;"
- "adcx %%rax, %%r14 ;"
- "movq $0, %%rax ;"
- /******************************************/
- "adox %%rdx, %%rax ;"
- "adcx %%r8, %%rax ;"
-
- "movq 16(%1), %%rdx; " /* A[2] */
- "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
- "xorl %%r10d, %%r10d ;"
- "adcx 16(%0), %%r8 ;"
- "movq %%r8, 16(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
- "adox %%r9, %%r10 ;"
- "adcx %%r15, %%r10 ;"
- "movq %%r10, 24(%0) ;"
- "mulx 16(%2), %%r15, %%r13; " /* A[2]*B[2] */
- "adox %%r11, %%r15 ;"
- "adcx %%r14, %%r15 ;"
- "movq $0, %%r8 ;"
- "mulx 24(%2), %%r14, %%rdx; " /* A[2]*B[3] */
- "adox %%r13, %%r14 ;"
- "adcx %%rax, %%r14 ;"
- "movq $0, %%rax ;"
- /******************************************/
- "adox %%rdx, %%rax ;"
- "adcx %%r8, %%rax ;"
-
- "movq 24(%1), %%rdx; " /* A[3] */
- "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
- "xorl %%r10d, %%r10d ;"
- "adcx 24(%0), %%r8 ;"
- "movq %%r8, 24(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
- "adox %%r9, %%r10 ;"
- "adcx %%r15, %%r10 ;"
- "movq %%r10, 32(%0) ;"
- "mulx 16(%2), %%r15, %%r13; " /* A[3]*B[2] */
- "adox %%r11, %%r15 ;"
- "adcx %%r14, %%r15 ;"
- "movq %%r15, 40(%0) ;"
- "movq $0, %%r8 ;"
- "mulx 24(%2), %%r14, %%rdx; " /* A[3]*B[3] */
- "adox %%r13, %%r14 ;"
- "adcx %%rax, %%r14 ;"
- "movq %%r14, 48(%0) ;"
- "movq $0, %%rax ;"
- /******************************************/
- "adox %%rdx, %%rax ;"
- "adcx %%r8, %%rax ;"
- "movq %%rax, 56(%0) ;"
- :
- : "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11",
- "%r13", "%r14", "%r15");
+ /* Compute the raw addition of f1 + f2 */
+ " movq 0(%0), %%r8;"
+ " addq 0(%2), %%r8;"
+ " movq 8(%0), %%r9;"
+ " adcxq 8(%2), %%r9;"
+ " movq 16(%0), %%r10;"
+ " adcxq 16(%2), %%r10;"
+ " movq 24(%0), %%r11;"
+ " adcxq 24(%2), %%r11;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute carry*38 */
+ " mov $0, %%rax;"
+ " mov $38, %0;"
+ " cmovc %0, %%rax;"
+
+ /* Step 2: Add carry*38 to the original sum */
+ " xor %%rcx, %%rcx;"
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %0, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%1);"
+ : "+&r" (f2)
+ : "r" (out), "r" (f1)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
+ );
}
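The wrap-around above exploits the identity 2^256 == 38 (mod 2^255 - 19): the carry out of the raw 256-bit sum re-enters the low limb multiplied by 38, and the final single-bit fold cannot carry again, since a carry that propagates all the way up implies the low limb ended up tiny. A minimal portable sketch of the same idea, assuming four little-endian u64 limbs and GCC/Clang's unsigned __int128; the function name is hypothetical, and the patch itself keeps all of this branch-free in inline assembly:

	#include <stdint.h>

	/* Sketch only: addition modulo 2^255 - 19, leaving the result in the
	 * same loosely reduced (< 2^256) form that the assembly works with. */
	static void fadd_sketch(uint64_t out[4],
				const uint64_t f1[4], const uint64_t f2[4])
	{
		unsigned __int128 acc = 0;
		int i;

		/* Raw 256-bit addition, limb by limb. */
		for (i = 0; i < 4; ++i) {
			acc = (acc >> 64) + (unsigned __int128)f1[i] + f2[i];
			out[i] = (uint64_t)acc;
		}

		/* Steps 1 and 2: fold the carry back in as carry * 38. */
		acc = (unsigned __int128)out[0] + (uint64_t)(acc >> 64) * 38;
		out[0] = (uint64_t)acc;
		for (i = 1; i < 4; ++i) {
			acc = (acc >> 64) + out[i];
			out[i] = (uint64_t)acc;
		}

		/* Step 3: one last single-bit fold; cannot carry again. */
		out[0] += (uint64_t)(acc >> 64) * 38;
	}

The same fold appears with the signs flipped in fsub below: a borrow out of the raw subtraction is repaid by subtracting 38 from the low limb.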
-#endif
-#ifdef CONFIG_AS_BMI2
-static void mul_256x256_integer_bmi2(u64 *const c, const u64 *const a,
- const u64 *const b)
+/* Computes the field subtraction of two field elements */
+static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
asm volatile(
- "movq (%1), %%rdx; " /* A[0] */
- "mulx (%2), %%r8, %%r15; " /* A[0]*B[0] */
- "movq %%r8, (%0) ;"
- "mulx 8(%2), %%r10, %%rax; " /* A[0]*B[1] */
- "addq %%r10, %%r15 ;"
- "mulx 16(%2), %%r8, %%rbx; " /* A[0]*B[2] */
- "adcq %%r8, %%rax ;"
- "mulx 24(%2), %%r10, %%rcx; " /* A[0]*B[3] */
- "adcq %%r10, %%rbx ;"
- /******************************************/
- "adcq $0, %%rcx ;"
-
- "movq 8(%1), %%rdx; " /* A[1] */
- "mulx (%2), %%r8, %%r9; " /* A[1]*B[0] */
- "addq %%r15, %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[1]*B[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[1]*B[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 24(%2), %%r10, %%r15; " /* A[1]*B[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%r15 ;"
-
- "addq %%r9, %%rax ;"
- "adcq %%r11, %%rbx ;"
- "adcq %%r13, %%rcx ;"
- "adcq $0, %%r15 ;"
-
- "movq 16(%1), %%rdx; " /* A[2] */
- "mulx (%2), %%r8, %%r9; " /* A[2]*B[0] */
- "addq %%rax, %%r8 ;"
- "movq %%r8, 16(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[2]*B[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[2]*B[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 24(%2), %%r10, %%rax; " /* A[2]*B[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%rax ;"
-
- "addq %%r9, %%rbx ;"
- "adcq %%r11, %%rcx ;"
- "adcq %%r13, %%r15 ;"
- "adcq $0, %%rax ;"
-
- "movq 24(%1), %%rdx; " /* A[3] */
- "mulx (%2), %%r8, %%r9; " /* A[3]*B[0] */
- "addq %%rbx, %%r8 ;"
- "movq %%r8, 24(%0) ;"
- "mulx 8(%2), %%r10, %%r11; " /* A[3]*B[1] */
- "adcq %%r10, %%r9 ;"
- "mulx 16(%2), %%r8, %%r13; " /* A[3]*B[2] */
- "adcq %%r8, %%r11 ;"
- "mulx 24(%2), %%r10, %%rbx; " /* A[3]*B[3] */
- "adcq %%r10, %%r13 ;"
- /******************************************/
- "adcq $0, %%rbx ;"
-
- "addq %%r9, %%rcx ;"
- "movq %%rcx, 32(%0) ;"
- "adcq %%r11, %%r15 ;"
- "movq %%r15, 40(%0) ;"
- "adcq %%r13, %%rax ;"
- "movq %%rax, 48(%0) ;"
- "adcq $0, %%rbx ;"
- "movq %%rbx, 56(%0) ;"
- :
- : "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
- "%r10", "%r11", "%r13", "%r15");
+	/* Compute the raw subtraction of f1-f2 */
+ " movq 0(%1), %%r8;"
+ " subq 0(%2), %%r8;"
+ " movq 8(%1), %%r9;"
+ " sbbq 8(%2), %%r9;"
+ " movq 16(%1), %%r10;"
+ " sbbq 16(%2), %%r10;"
+ " movq 24(%1), %%r11;"
+ " sbbq 24(%2), %%r11;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute carry*38 */
+ " mov $0, %%rax;"
+ " mov $38, %%rcx;"
+ " cmovc %%rcx, %%rax;"
+
+	/* Step 2: Subtract carry*38 from the original difference */
+ " sub %%rax, %%r8;"
+ " sbb $0, %%r9;"
+ " sbb $0, %%r10;"
+ " sbb $0, %%r11;"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rcx, %%rax;"
+ " sub %%rax, %%r8;"
+
+ /* Store the result */
+ " movq %%r8, 0(%0);"
+ " movq %%r9, 8(%0);"
+ " movq %%r10, 16(%0);"
+ " movq %%r11, 24(%0);"
+ :
+ : "r" (out), "r" (f1), "r" (f2)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
+ );
}
-#endif
-#ifdef CONFIG_AS_ADX
-static void sqr_256x256_integer_adx(u64 *const c, const u64 *const a)
+/* Computes a field multiplication: out <- f1 * f2
+ * Uses the 8-element buffer tmp for intermediate results */
+static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
asm volatile(
- "movq (%1), %%rdx ;" /* A[0] */
- "mulx 8(%1), %%r8, %%r14 ;" /* A[1]*A[0] */
- "xorl %%r15d, %%r15d;"
- "mulx 16(%1), %%r9, %%r10 ;" /* A[2]*A[0] */
- "adcx %%r14, %%r9 ;"
- "mulx 24(%1), %%rax, %%rcx ;" /* A[3]*A[0] */
- "adcx %%rax, %%r10 ;"
- "movq 24(%1), %%rdx ;" /* A[3] */
- "mulx 8(%1), %%r11, %%rbx ;" /* A[1]*A[3] */
- "adcx %%rcx, %%r11 ;"
- "mulx 16(%1), %%rax, %%r13 ;" /* A[2]*A[3] */
- "adcx %%rax, %%rbx ;"
- "movq 8(%1), %%rdx ;" /* A[1] */
- "adcx %%r15, %%r13 ;"
- "mulx 16(%1), %%rax, %%rcx ;" /* A[2]*A[1] */
- "movq $0, %%r14 ;"
- /******************************************/
- "adcx %%r15, %%r14 ;"
-
- "xorl %%r15d, %%r15d;"
- "adox %%rax, %%r10 ;"
- "adcx %%r8, %%r8 ;"
- "adox %%rcx, %%r11 ;"
- "adcx %%r9, %%r9 ;"
- "adox %%r15, %%rbx ;"
- "adcx %%r10, %%r10 ;"
- "adox %%r15, %%r13 ;"
- "adcx %%r11, %%r11 ;"
- "adox %%r15, %%r14 ;"
- "adcx %%rbx, %%rbx ;"
- "adcx %%r13, %%r13 ;"
- "adcx %%r14, %%r14 ;"
-
- "movq (%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
- /*******************/
- "movq %%rax, 0(%0) ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "movq 8(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
- "adcq %%rax, %%r9 ;"
- "movq %%r9, 16(%0) ;"
- "adcq %%rcx, %%r10 ;"
- "movq %%r10, 24(%0) ;"
- "movq 16(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
- "adcq %%rax, %%r11 ;"
- "movq %%r11, 32(%0) ;"
- "adcq %%rcx, %%rbx ;"
- "movq %%rbx, 40(%0) ;"
- "movq 24(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
- "adcq %%rax, %%r13 ;"
- "movq %%r13, 48(%0) ;"
- "adcq %%rcx, %%r14 ;"
- "movq %%r14, 56(%0) ;"
- :
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
- "%r10", "%r11", "%r13", "%r14", "%r15");
+ /* Compute the raw multiplication: tmp <- src1 * src2 */
+
+ /* Compute src1[0] * src2 */
+ " movq 0(%1), %%rdx;"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
+ " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
+ " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ /* Compute src1[1] * src2 */
+ " movq 8(%1), %%rdx;"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);"
+ " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
+ " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ /* Compute src1[2] * src2 */
+ " movq 16(%1), %%rdx;"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);"
+ " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
+ " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ /* Compute src1[3] * src2 */
+ " movq 24(%1), %%rdx;"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);"
+ " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 40(%0);" " mov $0, %%r8;"
+ " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
+ " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
+ /* Line up pointers */
+ " mov %0, %1;"
+ " mov %2, %0;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%1), %%r8, %%r13;"
+ " xor %3, %3;"
+ " adoxq 0(%1), %%r8;"
+ " mulxq 40(%1), %%r9, %%r12;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%1), %%r9;"
+ " mulxq 48(%1), %%r10, %%r13;"
+ " adcx %%r12, %%r10;"
+ " adoxq 16(%1), %%r10;"
+ " mulxq 56(%1), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%1), %%r11;"
+ " adcx %3, %%rax;"
+ " adox %3, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %3, %%r9;"
+ " movq %%r9, 8(%0);"
+ " adcx %3, %%r10;"
+ " movq %%r10, 16(%0);"
+ " adcx %3, %%r11;"
+ " movq %%r11, 24(%0);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%0);"
+ : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
+ :
+ : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
+ );
}
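The second half of fmul is the same fold applied to a full 512-bit product: writing the 8-limb result as tmp = tmp_hi * 2^256 + tmp_lo, we have tmp == tmp_lo + 38 * tmp_hi (mod 2^255 - 19), which is exactly what Step 1's mulx-by-38 chain accumulates; the imul then turns the leftover top carry into carry * 38 for Step 2. A hedged portable sketch of just this wrap step (hypothetical name, unsigned __int128 assumed):

	#include <stdint.h>

	/* Sketch only: reduce an 8-limb (512-bit) product to 4 limbs congruent
	 * modulo 2^255 - 19, via dst + 2^256*carry == tmp_lo + 38*tmp_hi. */
	static void wrap_512_sketch(uint64_t out[4], const uint64_t tmp[8])
	{
		unsigned __int128 acc = 0;
		int i;

		/* Step 1: accumulate tmp_lo + 38 * tmp_hi. */
		for (i = 0; i < 4; ++i) {
			acc = (acc >> 64)
			      + (unsigned __int128)tmp[4 + i] * 38 + tmp[i];
			out[i] = (uint64_t)acc;
		}

		/* Step 2: the leftover carry is at most 6 bits wide, so one
		 * more carry * 38 fold brings everything back under 2^256. */
		acc = (unsigned __int128)out[0] + (uint64_t)(acc >> 64) * 38;
		out[0] = (uint64_t)acc;
		for (i = 1; i < 4; ++i) {
			acc = (acc >> 64) + out[i];
			out[i] = (uint64_t)acc;
		}

		/* Step 3: final single-bit fold, guaranteed not to carry. */
		out[0] += (uint64_t)(acc >> 64) * 38;
	}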
-#endif
-#ifdef CONFIG_AS_BMI2
-static void sqr_256x256_integer_bmi2(u64 *const c, const u64 *const a)
+/* Computes two field multiplications:
+ * out[0] <- f1[0] * f2[0]
+ * out[1] <- f1[1] * f2[1]
+ * Uses the 16-element buffer tmp for intermediate results. */
+static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
asm volatile(
- "movq 8(%1), %%rdx ;" /* A[1] */
- "mulx (%1), %%r8, %%r9 ;" /* A[0]*A[1] */
- "mulx 16(%1), %%r10, %%r11 ;" /* A[2]*A[1] */
- "mulx 24(%1), %%rcx, %%r14 ;" /* A[3]*A[1] */
-
- "movq 16(%1), %%rdx ;" /* A[2] */
- "mulx 24(%1), %%r15, %%r13 ;" /* A[3]*A[2] */
- "mulx (%1), %%rax, %%rdx ;" /* A[0]*A[2] */
-
- "addq %%rax, %%r9 ;"
- "adcq %%rdx, %%r10 ;"
- "adcq %%rcx, %%r11 ;"
- "adcq %%r14, %%r15 ;"
- "adcq $0, %%r13 ;"
- "movq $0, %%r14 ;"
- "adcq $0, %%r14 ;"
-
- "movq (%1), %%rdx ;" /* A[0] */
- "mulx 24(%1), %%rax, %%rcx ;" /* A[0]*A[3] */
-
- "addq %%rax, %%r10 ;"
- "adcq %%rcx, %%r11 ;"
- "adcq $0, %%r15 ;"
- "adcq $0, %%r13 ;"
- "adcq $0, %%r14 ;"
-
- "shldq $1, %%r13, %%r14 ;"
- "shldq $1, %%r15, %%r13 ;"
- "shldq $1, %%r11, %%r15 ;"
- "shldq $1, %%r10, %%r11 ;"
- "shldq $1, %%r9, %%r10 ;"
- "shldq $1, %%r8, %%r9 ;"
- "shlq $1, %%r8 ;"
-
- /*******************/
- "mulx %%rdx, %%rax, %%rcx ;" /* A[0]^2 */
- /*******************/
- "movq %%rax, 0(%0) ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, 8(%0) ;"
- "movq 8(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[1]^2 */
- "adcq %%rax, %%r9 ;"
- "movq %%r9, 16(%0) ;"
- "adcq %%rcx, %%r10 ;"
- "movq %%r10, 24(%0) ;"
- "movq 16(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[2]^2 */
- "adcq %%rax, %%r11 ;"
- "movq %%r11, 32(%0) ;"
- "adcq %%rcx, %%r15 ;"
- "movq %%r15, 40(%0) ;"
- "movq 24(%1), %%rdx ;"
- "mulx %%rdx, %%rax, %%rcx ;" /* A[3]^2 */
- "adcq %%rax, %%r13 ;"
- "movq %%r13, 48(%0) ;"
- "adcq %%rcx, %%r14 ;"
- "movq %%r14, 56(%0) ;"
- :
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
- "%r11", "%r13", "%r14", "%r15");
+ /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */
+
+ /* Compute src1[0] * src2 */
+ " movq 0(%1), %%rdx;"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
+ " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
+ " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ /* Compute src1[1] * src2 */
+ " movq 8(%1), %%rdx;"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);"
+ " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
+ " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ /* Compute src1[2] * src2 */
+ " movq 16(%1), %%rdx;"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);"
+ " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
+ " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ /* Compute src1[3] * src2 */
+ " movq 24(%1), %%rdx;"
+ " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
+ " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);"
+ " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 40(%0);" " mov $0, %%r8;"
+ " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
+ " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
+
+ /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */
+
+ /* Compute src1[0] * src2 */
+ " movq 32(%1), %%rdx;"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
+ " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
+ " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ /* Compute src1[1] * src2 */
+ " movq 40(%1), %%rdx;"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 80(%0);"
+ " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
+ " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ /* Compute src1[2] * src2 */
+ " movq 48(%1), %%rdx;"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 88(%0);"
+ " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
+ " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
+ " adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
+ /* Compute src1[3] * src2 */
+ " movq 56(%1), %%rdx;"
+ " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
+ " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 96(%0);"
+ " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 104(%0);" " mov $0, %%r8;"
+ " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
+ " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
+ /* Line up pointers */
+ " mov %0, %1;"
+ " mov %2, %0;"
+
+ /* Wrap the results back into the field */
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%1), %%r8, %%r13;"
+ " xor %3, %3;"
+ " adoxq 0(%1), %%r8;"
+ " mulxq 40(%1), %%r9, %%r12;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%1), %%r9;"
+ " mulxq 48(%1), %%r10, %%r13;"
+ " adcx %%r12, %%r10;"
+ " adoxq 16(%1), %%r10;"
+ " mulxq 56(%1), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%1), %%r11;"
+ " adcx %3, %%rax;"
+ " adox %3, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %3, %%r9;"
+ " movq %%r9, 8(%0);"
+ " adcx %3, %%r10;"
+ " movq %%r10, 16(%0);"
+ " adcx %3, %%r11;"
+ " movq %%r11, 24(%0);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%0);"
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 96(%1), %%r8, %%r13;"
+ " xor %3, %3;"
+ " adoxq 64(%1), %%r8;"
+ " mulxq 104(%1), %%r9, %%r12;"
+ " adcx %%r13, %%r9;"
+ " adoxq 72(%1), %%r9;"
+ " mulxq 112(%1), %%r10, %%r13;"
+ " adcx %%r12, %%r10;"
+ " adoxq 80(%1), %%r10;"
+ " mulxq 120(%1), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 88(%1), %%r11;"
+ " adcx %3, %%rax;"
+ " adox %3, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %3, %%r9;"
+ " movq %%r9, 40(%0);"
+ " adcx %3, %%r10;"
+ " movq %%r10, 48(%0);"
+ " adcx %3, %%r11;"
+ " movq %%r11, 56(%0);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 32(%0);"
+ : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
+ :
+ : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
+ );
}
-#endif
-#ifdef CONFIG_AS_ADX
-static void red_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
+/* Computes the field multiplication of the four-limb element f1 by the scalar value f2 */
+static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
- asm volatile(
- "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
- "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
- "xorl %%ebx, %%ebx ;"
- "adox (%1), %%r8 ;"
- "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
- "adcx %%r10, %%r9 ;"
- "adox 8(%1), %%r9 ;"
- "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
- "adcx %%r11, %%r10 ;"
- "adox 16(%1), %%r10 ;"
- "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
- "adcx %%rax, %%r11 ;"
- "adox 24(%1), %%r11 ;"
- /***************************************/
- "adcx %%rbx, %%rcx ;"
- "adox %%rbx, %%rcx ;"
- "imul %%rdx, %%rcx ;" /* c*C[4], cf=0, of=0 */
- "adcx %%rcx, %%r8 ;"
- "adcx %%rbx, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "adcx %%rbx, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "adcx %%rbx, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%edx, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, (%0) ;"
- :
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9",
- "%r10", "%r11");
-}
-#endif
+ register u64 f2_r asm("rdx") = f2;
-#ifdef CONFIG_AS_BMI2
-static void red_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
-{
asm volatile(
- "movl $38, %%edx ;" /* 2*c = 38 = 2^256 */
- "mulx 32(%1), %%r8, %%r10 ;" /* c*C[4] */
- "mulx 40(%1), %%r9, %%r11 ;" /* c*C[5] */
- "addq %%r10, %%r9 ;"
- "mulx 48(%1), %%r10, %%rax ;" /* c*C[6] */
- "adcq %%r11, %%r10 ;"
- "mulx 56(%1), %%r11, %%rcx ;" /* c*C[7] */
- "adcq %%rax, %%r11 ;"
- /***************************************/
- "adcq $0, %%rcx ;"
- "addq (%1), %%r8 ;"
- "adcq 8(%1), %%r9 ;"
- "adcq 16(%1), %%r10 ;"
- "adcq 24(%1), %%r11 ;"
- "adcq $0, %%rcx ;"
- "imul %%rdx, %%rcx ;" /* c*C[4], cf=0 */
- "addq %%rcx, %%r8 ;"
- "adcq $0, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "adcq $0, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "adcq $0, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%edx, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, (%0) ;"
- :
- : "r"(c), "r"(a)
- : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
- "%r11");
+ /* Compute the raw multiplication of f1*f2 */
+ " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
+ " mulxq 8(%2), %%r9, %%r12;" /* f1[1]*f2 */
+ " add %%rcx, %%r9;"
+ " mov $0, %%rcx;"
+ " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
+ " adcx %%r12, %%r10;"
+ " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
+ " adcx %%r13, %%r11;"
+ " adcx %%rcx, %%rax;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute carry*38 */
+ " mov $38, %%rdx;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%1);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%1);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%1);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%1);"
+ : "+&r" (f2_r)
+ : "r" (out), "r" (f1)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "memory", "cc"
+ );
}
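Within this diff fmul_scalar is only invoked with f2 = 121665, i.e. a24 = (486662 - 2)/4 from the Montgomery ladder, so the overflow limb of the 320-bit product stays small and the asm's 64-bit imul by 38 cannot wrap. A portable sketch under the same small-scalar assumption (hypothetical name):

	#include <stdint.h>

	/* Sketch only: multiply four limbs by a small scalar s (here 121665)
	 * and fold the overflow limb back in as overflow * 38. */
	static void fmul_scalar_sketch(uint64_t out[4],
				       const uint64_t f[4], uint64_t s)
	{
		unsigned __int128 acc = 0;
		int i;

		for (i = 0; i < 4; ++i) {
			acc = (acc >> 64) + (unsigned __int128)f[i] * s;
			out[i] = (uint64_t)acc;
		}

		/* 2^256 == 38 (mod 2^255 - 19), as in fadd/fmul above; the
		 * plain 64-bit multiply is safe only because s is small. */
		acc = (unsigned __int128)out[0] + (uint64_t)(acc >> 64) * 38;
		out[0] = (uint64_t)acc;
		for (i = 1; i < 4; ++i) {
			acc = (acc >> 64) + out[i];
			out[i] = (uint64_t)acc;
		}
		out[0] += (uint64_t)(acc >> 64) * 38;
	}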
-#endif
-#ifdef CONFIG_AS_ADX
-static __always_inline void
-add_eltfp25519_1w_adx(u64 *const c, const u64 *const a, const u64 *const b)
+/* Conditionally swaps p1 and p2 in constant time: when bit is set, the
+ * contents of the two eight-limb buffers are exchanged */
+static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
asm volatile(
- "mov $38, %%eax ;"
- "xorl %%ecx, %%ecx ;"
- "movq (%2), %%r8 ;"
- "adcx (%1), %%r8 ;"
- "movq 8(%2), %%r9 ;"
- "adcx 8(%1), %%r9 ;"
- "movq 16(%2), %%r10 ;"
- "adcx 16(%1), %%r10 ;"
- "movq 24(%2), %%r11 ;"
- "adcx 24(%1), %%r11 ;"
- "cmovc %%eax, %%ecx ;"
- "xorl %%eax, %%eax ;"
- "adcx %%rcx, %%r8 ;"
- "adcx %%rax, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "adcx %%rax, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "adcx %%rax, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $38, %%ecx ;"
- "cmovc %%ecx, %%eax ;"
- "addq %%rax, %%r8 ;"
- "movq %%r8, (%0) ;"
- :
- : "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
+ /* Invert the polarity of bit to match cmov expectations */
+ " add $18446744073709551615, %0;"
+
+ /* cswap p1[0], p2[0] */
+ " movq 0(%1), %%r8;"
+ " movq 0(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 0(%1);"
+ " movq %%r9, 0(%2);"
+
+ /* cswap p1[1], p2[1] */
+ " movq 8(%1), %%r8;"
+ " movq 8(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 8(%1);"
+ " movq %%r9, 8(%2);"
+
+ /* cswap p1[2], p2[2] */
+ " movq 16(%1), %%r8;"
+ " movq 16(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 16(%1);"
+ " movq %%r9, 16(%2);"
+
+ /* cswap p1[3], p2[3] */
+ " movq 24(%1), %%r8;"
+ " movq 24(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 24(%1);"
+ " movq %%r9, 24(%2);"
+
+ /* cswap p1[4], p2[4] */
+ " movq 32(%1), %%r8;"
+ " movq 32(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 32(%1);"
+ " movq %%r9, 32(%2);"
+
+ /* cswap p1[5], p2[5] */
+ " movq 40(%1), %%r8;"
+ " movq 40(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 40(%1);"
+ " movq %%r9, 40(%2);"
+
+ /* cswap p1[6], p2[6] */
+ " movq 48(%1), %%r8;"
+ " movq 48(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 48(%1);"
+ " movq %%r9, 48(%2);"
+
+ /* cswap p1[7], p2[7] */
+ " movq 56(%1), %%r8;"
+ " movq 56(%2), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 56(%1);"
+ " movq %%r9, 56(%2);"
+ : "+&r" (bit)
+ : "r" (p1), "r" (p2)
+ : "%r8", "%r9", "%r10", "memory", "cc"
+ );
}
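The opening add of 2^64 - 1 sets the carry flag exactly when bit is nonzero (bit is expected to be 0 or 1), so every cmovc pair that follows either swaps or preserves a limb with one and the same instruction stream. The same effect is commonly written with an arithmetic mask; a hedged portable sketch (hypothetical name):

	#include <stdint.h>

	/* Sketch only: constant-time conditional swap of two 8-limb buffers. */
	static void cswap2_sketch(uint64_t bit, uint64_t p1[8], uint64_t p2[8])
	{
		uint64_t mask = 0 - bit;	/* 0 or all-ones */
		uint64_t d;
		int i;

		for (i = 0; i < 8; ++i) {
			d = mask & (p1[i] ^ p2[i]);
			p1[i] ^= d;
			p2[i] ^= d;
		}
	}

Either formulation reads and writes the same locations in the same order regardless of bit, which is what keeps the ladder free of secret-dependent branches and addresses.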
-#endif
-#ifdef CONFIG_AS_BMI2
-static __always_inline void
-add_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a, const u64 *const b)
+/* Computes the square of a field element: out <- f * f
+ * Uses the 8-element buffer tmp for intermediate results */
+static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
{
asm volatile(
- "mov $38, %%eax ;"
- "movq (%2), %%r8 ;"
- "addq (%1), %%r8 ;"
- "movq 8(%2), %%r9 ;"
- "adcq 8(%1), %%r9 ;"
- "movq 16(%2), %%r10 ;"
- "adcq 16(%1), %%r10 ;"
- "movq 24(%2), %%r11 ;"
- "adcq 24(%1), %%r11 ;"
- "mov $0, %%ecx ;"
- "cmovc %%eax, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "adcq $0, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "adcq $0, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "adcq $0, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%eax, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, (%0) ;"
- :
- : "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
+ /* Compute the raw multiplication: tmp <- f * f */
+
+ /* Step 1: Compute all partial products */
+ " movq 0(%1), %%rdx;" /* f[0] */
+ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%1), %%rdx;" /* f[3] */
+ " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
+	" movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f[1] */
+ " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
+ " xor %%r15, %%r15;"
+ " adox %%rax, %%r10;"
+ " adcx %%r8, %%r8;"
+ " adox %%rcx, %%r11;"
+ " adcx %%r9, %%r9;"
+ " adox %%r15, %%r12;"
+ " adcx %%r10, %%r10;"
+ " adox %%r15, %%r13;"
+ " adcx %%r11, %%r11;"
+ " adox %%r15, %%r14;"
+ " adcx %%r12, %%r12;"
+ " adcx %%r13, %%r13;"
+ " adcx %%r14, %%r14;"
+
+ /* Step 3: Compute intermediate squares */
+ " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 0(%0);"
+ " add %%rcx, %%r8;" " movq %%r8, 8(%0);"
+ " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
+ " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
+ " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
+ " adcx %%rcx, %%r12;" " movq %%r12, 40(%0);"
+ " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
+ " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
+
+ /* Line up pointers */
+ " mov %0, %1;"
+ " mov %2, %0;"
+
+ /* Wrap the result back into the field */
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%1), %%r8, %%r13;"
+ " xor %%rcx, %%rcx;"
+ " adoxq 0(%1), %%r8;"
+ " mulxq 40(%1), %%r9, %%r12;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%1), %%r9;"
+ " mulxq 48(%1), %%r10, %%r13;"
+ " adcx %%r12, %%r10;"
+ " adoxq 16(%1), %%r10;"
+ " mulxq 56(%1), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%1), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%0);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%0);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%0);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%0);"
+ : "+&r" (tmp), "+&r" (f), "+&r" (out)
+ :
+ : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
+ );
}
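fsqr trades the 16 schoolbook products for 6 cross products plus 4 squares: Step 2's adcx chain doubles the accumulated cross terms while the adox chain folds in the last one, and Step 3 adds the diagonal squares before the usual 38-fold reduction. The underlying identity is ordinary schoolbook algebra, not anything specific to this patch:

	\[
	\Big(\sum_{i=0}^{3} f_i\,2^{64i}\Big)^{2}
	= \sum_{i=0}^{3} f_i^{2}\,2^{128i}
	+ 2 \sum_{0 \le i < j \le 3} f_i f_j\,2^{64(i+j)}.
	\]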
-#endif
-static __always_inline void
-sub_eltfp25519_1w(u64 *const c, const u64 *const a, const u64 *const b)
+/* Computes two field squarings:
+ * out[0] <- f[0] * f[0]
+ * out[1] <- f[1] * f[1]
+ * Uses the 16-element buffer tmp for intermediate results */
+static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
asm volatile(
- "mov $38, %%eax ;"
- "movq (%1), %%r8 ;"
- "subq (%2), %%r8 ;"
- "movq 8(%1), %%r9 ;"
- "sbbq 8(%2), %%r9 ;"
- "movq 16(%1), %%r10 ;"
- "sbbq 16(%2), %%r10 ;"
- "movq 24(%1), %%r11 ;"
- "sbbq 24(%2), %%r11 ;"
- "mov $0, %%ecx ;"
- "cmovc %%eax, %%ecx ;"
- "subq %%rcx, %%r8 ;"
- "sbbq $0, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "sbbq $0, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "sbbq $0, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%eax, %%ecx ;"
- "subq %%rcx, %%r8 ;"
- "movq %%r8, (%0) ;"
- :
- : "r"(c), "r"(a), "r"(b)
- : "memory", "cc", "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11");
+ /* Step 1: Compute all partial products */
+ " movq 0(%1), %%rdx;" /* f[0] */
+ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%1), %%rdx;" /* f[3] */
+ " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
+	" movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f[1] */
+ " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
+ " xor %%r15, %%r15;"
+ " adox %%rax, %%r10;"
+ " adcx %%r8, %%r8;"
+ " adox %%rcx, %%r11;"
+ " adcx %%r9, %%r9;"
+ " adox %%r15, %%r12;"
+ " adcx %%r10, %%r10;"
+ " adox %%r15, %%r13;"
+ " adcx %%r11, %%r11;"
+ " adox %%r15, %%r14;"
+ " adcx %%r12, %%r12;"
+ " adcx %%r13, %%r13;"
+ " adcx %%r14, %%r14;"
+
+ /* Step 3: Compute intermediate squares */
+ " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 0(%0);"
+ " add %%rcx, %%r8;" " movq %%r8, 8(%0);"
+ " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
+ " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
+ " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
+ " adcx %%rcx, %%r12;" " movq %%r12, 40(%0);"
+ " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
+ " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
+
+ /* Step 1: Compute all partial products */
+ " movq 32(%1), %%rdx;" /* f[0] */
+ " mulxq 40(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 56(%1), %%rdx;" /* f[3] */
+ " mulxq 40(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
+	" movq 40(%1), %%rdx;" " adcx %%r15, %%r13;" /* f[1] */
+ " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
+ " xor %%r15, %%r15;"
+ " adox %%rax, %%r10;"
+ " adcx %%r8, %%r8;"
+ " adox %%rcx, %%r11;"
+ " adcx %%r9, %%r9;"
+ " adox %%r15, %%r12;"
+ " adcx %%r10, %%r10;"
+ " adox %%r15, %%r13;"
+ " adcx %%r11, %%r11;"
+ " adox %%r15, %%r14;"
+ " adcx %%r12, %%r12;"
+ " adcx %%r13, %%r13;"
+ " adcx %%r14, %%r14;"
+
+ /* Step 3: Compute intermediate squares */
+ " movq 32(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
+ " movq %%rax, 64(%0);"
+ " add %%rcx, %%r8;" " movq %%r8, 72(%0);"
+ " movq 40(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
+ " adcx %%rax, %%r9;" " movq %%r9, 80(%0);"
+ " adcx %%rcx, %%r10;" " movq %%r10, 88(%0);"
+ " movq 48(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
+ " adcx %%rax, %%r11;" " movq %%r11, 96(%0);"
+ " adcx %%rcx, %%r12;" " movq %%r12, 104(%0);"
+ " movq 56(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
+ " adcx %%rax, %%r13;" " movq %%r13, 112(%0);"
+ " adcx %%rcx, %%r14;" " movq %%r14, 120(%0);"
+
+ /* Line up pointers */
+ " mov %0, %1;"
+ " mov %2, %0;"
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 32(%1), %%r8, %%r13;"
+ " xor %%rcx, %%rcx;"
+ " adoxq 0(%1), %%r8;"
+ " mulxq 40(%1), %%r9, %%r12;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%1), %%r9;"
+ " mulxq 48(%1), %%r10, %%r13;"
+ " adcx %%r12, %%r10;"
+ " adoxq 16(%1), %%r10;"
+ " mulxq 56(%1), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%1), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%0);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%0);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%0);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%0);"
+
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
+ " mov $38, %%rdx;"
+ " mulxq 96(%1), %%r8, %%r13;"
+ " xor %%rcx, %%rcx;"
+ " adoxq 64(%1), %%r8;"
+ " mulxq 104(%1), %%r9, %%r12;"
+ " adcx %%r13, %%r9;"
+ " adoxq 72(%1), %%r9;"
+ " mulxq 112(%1), %%r10, %%r13;"
+ " adcx %%r12, %%r10;"
+ " adoxq 80(%1), %%r10;"
+ " mulxq 120(%1), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 88(%1), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+
+ /* Step 2: Fold the carry back into dst */
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 40(%0);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 48(%0);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 56(%0);"
+
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 32(%0);"
+ : "+&r" (tmp), "+&r" (f), "+&r" (out)
+ :
+ : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
+ );
}
-/* Multiplication by a24 = (A+2)/4 = (486662+2)/4 = 121666 */
-static __always_inline void
-mul_a24_eltfp25519_1w(u64 *const c, const u64 *const a)
+static void point_add_and_double(u64 *q, u64 *p01_tmp1, u64 *tmp2)
{
- const u64 a24 = 121666;
- asm volatile(
- "movq %2, %%rdx ;"
- "mulx (%1), %%r8, %%r10 ;"
- "mulx 8(%1), %%r9, %%r11 ;"
- "addq %%r10, %%r9 ;"
- "mulx 16(%1), %%r10, %%rax ;"
- "adcq %%r11, %%r10 ;"
- "mulx 24(%1), %%r11, %%rcx ;"
- "adcq %%rax, %%r11 ;"
- /**************************/
- "adcq $0, %%rcx ;"
- "movl $38, %%edx ;" /* 2*c = 38 = 2^256 mod 2^255-19*/
- "imul %%rdx, %%rcx ;"
- "addq %%rcx, %%r8 ;"
- "adcq $0, %%r9 ;"
- "movq %%r9, 8(%0) ;"
- "adcq $0, %%r10 ;"
- "movq %%r10, 16(%0) ;"
- "adcq $0, %%r11 ;"
- "movq %%r11, 24(%0) ;"
- "mov $0, %%ecx ;"
- "cmovc %%edx, %%ecx ;"
- "addq %%rcx, %%r8 ;"
- "movq %%r8, (%0) ;"
- :
- : "r"(c), "r"(a), "r"(a24)
- : "memory", "cc", "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10",
- "%r11");
+ u64 *nq = p01_tmp1;
+ u64 *nq_p1 = p01_tmp1 + (u32)8U;
+ u64 *tmp1 = p01_tmp1 + (u32)16U;
+ u64 *x1 = q;
+ u64 *x2 = nq;
+ u64 *z2 = nq + (u32)4U;
+ u64 *z3 = nq_p1 + (u32)4U;
+ u64 *a = tmp1;
+ u64 *b = tmp1 + (u32)4U;
+ u64 *ab = tmp1;
+ u64 *dc = tmp1 + (u32)8U;
+ u64 *x3;
+ u64 *z31;
+ u64 *d0;
+ u64 *c0;
+ u64 *a1;
+ u64 *b1;
+ u64 *d;
+ u64 *c;
+ u64 *ab1;
+ u64 *dc1;
+ fadd(a, x2, z2);
+ fsub(b, x2, z2);
+ x3 = nq_p1;
+ z31 = nq_p1 + (u32)4U;
+ d0 = dc;
+ c0 = dc + (u32)4U;
+ fadd(c0, x3, z31);
+ fsub(d0, x3, z31);
+ fmul2(dc, dc, ab, tmp2);
+ fadd(x3, d0, c0);
+ fsub(z31, d0, c0);
+ a1 = tmp1;
+ b1 = tmp1 + (u32)4U;
+ d = tmp1 + (u32)8U;
+ c = tmp1 + (u32)12U;
+ ab1 = tmp1;
+ dc1 = tmp1 + (u32)8U;
+ fsqr2(dc1, ab1, tmp2);
+ fsqr2(nq_p1, nq_p1, tmp2);
+ a1[0U] = c[0U];
+ a1[1U] = c[1U];
+ a1[2U] = c[2U];
+ a1[3U] = c[3U];
+ fsub(c, d, c);
+ fmul_scalar(b1, c, (u64)121665U);
+ fadd(b1, b1, d);
+ fmul2(nq, dc1, ab1, tmp2);
+ fmul(z3, z3, x1, tmp2);
}
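For readers checking this against the math: point_add_and_double is one fused differential-addition-and-doubling step of the x-only Montgomery ladder. In RFC 7748 notation, with running pair (x2 : z2), (x3 : z3) and base coordinate x1, the step computes

	\[
	\begin{aligned}
	A &= x_2 + z_2, \quad B = x_2 - z_2, \quad
	C = x_3 + z_3, \quad D = x_3 - z_3,\\
	x_3' &= (DA + CB)^2, \qquad z_3' = x_1\,(DA - CB)^2,\\
	x_2' &= A^2 B^2, \qquad z_2' = E\,(A^2 + a_{24}E),
	\qquad E = A^2 - B^2,
	\end{aligned}
	\]

with a24 = 121665; fmul2 and fsqr2 exist so that the two independent multiplications (or squarings) on each line are batched into a single asm block.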
-#ifdef CONFIG_AS_ADX
-static void inv_eltfp25519_1w_adx(u64 *const c, const u64 *const a)
+static void point_double(u64 *nq, u64 *tmp1, u64 *tmp2)
{
- struct {
- eltfp25519_1w_buffer buffer;
- eltfp25519_1w x0, x1, x2;
- } __aligned(32) m;
- u64 *T[4];
-
- T[0] = m.x0;
- T[1] = c; /* x^(-1) */
- T[2] = m.x1;
- T[3] = m.x2;
-
- copy_eltfp25519_1w(T[1], a);
- sqrn_eltfp25519_1w_adx(T[1], 1);
- copy_eltfp25519_1w(T[2], T[1]);
- sqrn_eltfp25519_1w_adx(T[2], 2);
- mul_eltfp25519_1w_adx(T[0], a, T[2]);
- mul_eltfp25519_1w_adx(T[1], T[1], T[0]);
- copy_eltfp25519_1w(T[2], T[1]);
- sqrn_eltfp25519_1w_adx(T[2], 1);
- mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
- copy_eltfp25519_1w(T[2], T[0]);
- sqrn_eltfp25519_1w_adx(T[2], 5);
- mul_eltfp25519_1w_adx(T[0], T[0], T[2]);
- copy_eltfp25519_1w(T[2], T[0]);
- sqrn_eltfp25519_1w_adx(T[2], 10);
- mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
- copy_eltfp25519_1w(T[3], T[2]);
- sqrn_eltfp25519_1w_adx(T[3], 20);
- mul_eltfp25519_1w_adx(T[3], T[3], T[2]);
- sqrn_eltfp25519_1w_adx(T[3], 10);
- mul_eltfp25519_1w_adx(T[3], T[3], T[0]);
- copy_eltfp25519_1w(T[0], T[3]);
- sqrn_eltfp25519_1w_adx(T[0], 50);
- mul_eltfp25519_1w_adx(T[0], T[0], T[3]);
- copy_eltfp25519_1w(T[2], T[0]);
- sqrn_eltfp25519_1w_adx(T[2], 100);
- mul_eltfp25519_1w_adx(T[2], T[2], T[0]);
- sqrn_eltfp25519_1w_adx(T[2], 50);
- mul_eltfp25519_1w_adx(T[2], T[2], T[3]);
- sqrn_eltfp25519_1w_adx(T[2], 5);
- mul_eltfp25519_1w_adx(T[1], T[1], T[2]);
-
- memzero_explicit(&m, sizeof(m));
+ u64 *x2 = nq;
+ u64 *z2 = nq + (u32)4U;
+ u64 *a = tmp1;
+ u64 *b = tmp1 + (u32)4U;
+ u64 *d = tmp1 + (u32)8U;
+ u64 *c = tmp1 + (u32)12U;
+ u64 *ab = tmp1;
+ u64 *dc = tmp1 + (u32)8U;
+ fadd(a, x2, z2);
+ fsub(b, x2, z2);
+ fsqr2(dc, ab, tmp2);
+ a[0U] = c[0U];
+ a[1U] = c[1U];
+ a[2U] = c[2U];
+ a[3U] = c[3U];
+ fsub(c, d, c);
+ fmul_scalar(b, c, (u64)121665U);
+ fadd(b, b, d);
+ fmul2(nq, dc, ab, tmp2);
}
-#endif
-#ifdef CONFIG_AS_BMI2
-static void inv_eltfp25519_1w_bmi2(u64 *const c, const u64 *const a)
+static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
{
- struct {
- eltfp25519_1w_buffer buffer;
- eltfp25519_1w x0, x1, x2;
- } __aligned(32) m;
- u64 *T[5];
-
- T[0] = m.x0;
- T[1] = c; /* x^(-1) */
- T[2] = m.x1;
- T[3] = m.x2;
-
- copy_eltfp25519_1w(T[1], a);
- sqrn_eltfp25519_1w_bmi2(T[1], 1);
- copy_eltfp25519_1w(T[2], T[1]);
- sqrn_eltfp25519_1w_bmi2(T[2], 2);
- mul_eltfp25519_1w_bmi2(T[0], a, T[2]);
- mul_eltfp25519_1w_bmi2(T[1], T[1], T[0]);
- copy_eltfp25519_1w(T[2], T[1]);
- sqrn_eltfp25519_1w_bmi2(T[2], 1);
- mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
- copy_eltfp25519_1w(T[2], T[0]);
- sqrn_eltfp25519_1w_bmi2(T[2], 5);
- mul_eltfp25519_1w_bmi2(T[0], T[0], T[2]);
- copy_eltfp25519_1w(T[2], T[0]);
- sqrn_eltfp25519_1w_bmi2(T[2], 10);
- mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
- copy_eltfp25519_1w(T[3], T[2]);
- sqrn_eltfp25519_1w_bmi2(T[3], 20);
- mul_eltfp25519_1w_bmi2(T[3], T[3], T[2]);
- sqrn_eltfp25519_1w_bmi2(T[3], 10);
- mul_eltfp25519_1w_bmi2(T[3], T[3], T[0]);
- copy_eltfp25519_1w(T[0], T[3]);
- sqrn_eltfp25519_1w_bmi2(T[0], 50);
- mul_eltfp25519_1w_bmi2(T[0], T[0], T[3]);
- copy_eltfp25519_1w(T[2], T[0]);
- sqrn_eltfp25519_1w_bmi2(T[2], 100);
- mul_eltfp25519_1w_bmi2(T[2], T[2], T[0]);
- sqrn_eltfp25519_1w_bmi2(T[2], 50);
- mul_eltfp25519_1w_bmi2(T[2], T[2], T[3]);
- sqrn_eltfp25519_1w_bmi2(T[2], 5);
- mul_eltfp25519_1w_bmi2(T[1], T[1], T[2]);
-
- memzero_explicit(&m, sizeof(m));
+ u64 tmp2[16U] = { 0U };
+ u64 p01_tmp1_swap[33U] = { 0U };
+ u64 *p0 = p01_tmp1_swap;
+ u64 *p01 = p01_tmp1_swap;
+ u64 *p03 = p01;
+ u64 *p11 = p01 + (u32)8U;
+ u64 *x0;
+ u64 *z0;
+ u64 *p01_tmp1;
+ u64 *p01_tmp11;
+ u64 *nq10;
+ u64 *nq_p11;
+ u64 *swap1;
+ u64 sw0;
+ u64 *nq1;
+ u64 *tmp1;
+ memcpy(p11, init1, (u32)8U * sizeof(init1[0U]));
+ x0 = p03;
+ z0 = p03 + (u32)4U;
+ x0[0U] = (u64)1U;
+ x0[1U] = (u64)0U;
+ x0[2U] = (u64)0U;
+ x0[3U] = (u64)0U;
+ z0[0U] = (u64)0U;
+ z0[1U] = (u64)0U;
+ z0[2U] = (u64)0U;
+ z0[3U] = (u64)0U;
+ p01_tmp1 = p01_tmp1_swap;
+ p01_tmp11 = p01_tmp1_swap;
+ nq10 = p01_tmp1_swap;
+ nq_p11 = p01_tmp1_swap + (u32)8U;
+ swap1 = p01_tmp1_swap + (u32)32U;
+ cswap2((u64)1U, nq10, nq_p11);
+ point_add_and_double(init1, p01_tmp11, tmp2);
+ swap1[0U] = (u64)1U;
+ {
+ u32 i;
+ for (i = (u32)0U; i < (u32)251U; i = i + (u32)1U) {
+ u64 *p01_tmp12 = p01_tmp1_swap;
+ u64 *swap2 = p01_tmp1_swap + (u32)32U;
+ u64 *nq2 = p01_tmp12;
+ u64 *nq_p12 = p01_tmp12 + (u32)8U;
+ u64 bit = (u64)(key[((u32)253U - i) / (u32)8U] >> ((u32)253U - i) % (u32)8U & (u8)1U);
+ u64 sw = swap2[0U] ^ bit;
+ cswap2(sw, nq2, nq_p12);
+ point_add_and_double(init1, p01_tmp12, tmp2);
+ swap2[0U] = bit;
+ }
+ }
+ sw0 = swap1[0U];
+ cswap2(sw0, nq10, nq_p11);
+ nq1 = p01_tmp1;
+ tmp1 = p01_tmp1 + (u32)16U;
+ point_double(nq1, tmp1, tmp2);
+ point_double(nq1, tmp1, tmp2);
+ point_double(nq1, tmp1, tmp2);
+ memcpy(out, p0, (u32)8U * sizeof(p0[0U]));
+
+ memzero_explicit(tmp2, sizeof(tmp2));
+ memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
}
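The ladder uses the usual lazy-swap trick: rather than swapping before and after every step, an iteration swaps only when the current scalar bit differs from the previous one (sw = swap2[0] ^ bit), with a single corrective cswap2 afterwards. Clamping forces bits 0-2 of the scalar to zero and bit 254 to one, which is why bit 254 is handled once before the loop and why three plain point_double calls finish the computation. A stripped-down sketch of just the bit scan and swap flag, with hypothetical names and the point arithmetic elided:

	#include <stdint.h>

	/* Sketch only: scan scalar bits 253..3 with a lazily maintained swap
	 * flag; each buffer holds one (x : z) pair of four limbs each. */
	static void ladder_bits_sketch(const uint8_t key[32],
				       uint64_t nq[8], uint64_t nq_p1[8])
	{
		uint64_t swap = 1, bit, mask, d;	/* bit 254 already done */
		uint32_t i, j, pos;

		for (i = 0; i < 251; ++i) {
			pos = 253 - i;
			bit = (key[pos / 8] >> (pos % 8)) & 1;

			mask = 0 - (swap ^ bit);  /* swap iff the bits differ */
			for (j = 0; j < 8; ++j) {
				d = mask & (nq[j] ^ nq_p1[j]);
				nq[j] ^= d;
				nq_p1[j] ^= d;
			}
			/* ... one point_add_and_double() step goes here ... */
			swap = bit;
		}

		mask = 0 - swap;	/* final corrective swap */
		for (j = 0; j < 8; ++j) {
			d = mask & (nq[j] ^ nq_p1[j]);
			nq[j] ^= d;
			nq_p1[j] ^= d;
		}
	}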
-#endif
-/* Given c, a 256-bit number, fred_eltfp25519_1w updates c
- * with a number such that 0 <= C < 2**255-19.
- */
-static __always_inline void fred_eltfp25519_1w(u64 *const c)
+static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
{
- u64 tmp0 = 38, tmp1 = 19;
- asm volatile(
- "btrq $63, %3 ;" /* Put bit 255 in carry flag and clear */
- "cmovncl %k5, %k4 ;" /* c[255] ? 38 : 19 */
-
- /* Add either 19 or 38 to c */
- "addq %4, %0 ;"
- "adcq $0, %1 ;"
- "adcq $0, %2 ;"
- "adcq $0, %3 ;"
-
- /* Test for bit 255 again; only triggered on overflow modulo 2^255-19 */
- "movl $0, %k4 ;"
- "cmovnsl %k5, %k4 ;" /* c[255] ? 0 : 19 */
- "btrq $63, %3 ;" /* Clear bit 255 */
-
- /* Subtract 19 if necessary */
- "subq %4, %0 ;"
- "sbbq $0, %1 ;"
- "sbbq $0, %2 ;"
- "sbbq $0, %3 ;"
-
- : "+r"(c[0]), "+r"(c[1]), "+r"(c[2]), "+r"(c[3]), "+r"(tmp0),
- "+r"(tmp1)
- :
- : "memory", "cc");
+ u32 i;
+ fsqr(o, inp, tmp);
+ for (i = (u32)0U; i < n1 - (u32)1U; i = i + (u32)1U)
+ fsqr(o, o, tmp);
}
-static __always_inline void cswap(u8 bit, u64 *const px, u64 *const py)
+static void finv(u64 *o, const u64 *i, u64 *tmp)
{
- u64 temp;
- asm volatile(
- "test %9, %9 ;"
- "movq %0, %8 ;"
- "cmovnzq %4, %0 ;"
- "cmovnzq %8, %4 ;"
- "movq %1, %8 ;"
- "cmovnzq %5, %1 ;"
- "cmovnzq %8, %5 ;"
- "movq %2, %8 ;"
- "cmovnzq %6, %2 ;"
- "cmovnzq %8, %6 ;"
- "movq %3, %8 ;"
- "cmovnzq %7, %3 ;"
- "cmovnzq %8, %7 ;"
- : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3]),
- "+r"(py[0]), "+r"(py[1]), "+r"(py[2]), "+r"(py[3]),
- "=r"(temp)
- : "r"(bit)
- : "cc"
- );
+ u64 t1[16U] = { 0U };
+ u64 *a0 = t1;
+ u64 *b = t1 + (u32)4U;
+ u64 *c = t1 + (u32)8U;
+ u64 *t00 = t1 + (u32)12U;
+ u64 *tmp1 = tmp;
+ u64 *a;
+ u64 *t0;
+ fsquare_times(a0, i, tmp1, (u32)1U);
+ fsquare_times(t00, a0, tmp1, (u32)2U);
+ fmul(b, t00, i, tmp);
+ fmul(a0, b, a0, tmp);
+ fsquare_times(t00, a0, tmp1, (u32)1U);
+ fmul(b, t00, b, tmp);
+ fsquare_times(t00, b, tmp1, (u32)5U);
+ fmul(b, t00, b, tmp);
+ fsquare_times(t00, b, tmp1, (u32)10U);
+ fmul(c, t00, b, tmp);
+ fsquare_times(t00, c, tmp1, (u32)20U);
+ fmul(t00, t00, c, tmp);
+ fsquare_times(t00, t00, tmp1, (u32)10U);
+ fmul(b, t00, b, tmp);
+ fsquare_times(t00, b, tmp1, (u32)50U);
+ fmul(c, t00, b, tmp);
+ fsquare_times(t00, c, tmp1, (u32)100U);
+ fmul(t00, t00, c, tmp);
+ fsquare_times(t00, t00, tmp1, (u32)50U);
+ fmul(t00, t00, b, tmp);
+ fsquare_times(t00, t00, tmp1, (u32)5U);
+ a = t1;
+ t0 = t1 + (u32)12U;
+ fmul(o, t0, a, tmp);
}
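finv is Fermat inversion: since p = 2^255 - 19 is prime,

	\[
	z^{-1} \equiv z^{\,p-2} = z^{\,2^{255}-21} \pmod{p},
	\]

and the fsquare_times/fmul sequence above is the classic addition chain for that exponent, costing 1+2+1+5+10+20+10+50+100+50+5 = 254 squarings plus 11 multiplications. The inversion runs only once per scalar multiplication (from encode_point below), and computing it as a fixed exponentiation keeps it constant-time, unlike a plain extended-Euclid inverse.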
-static __always_inline void cselect(u8 bit, u64 *const px, const u64 *const py)
+static void store_felem(u64 *b, u64 *f)
{
- asm volatile(
- "test %4, %4 ;"
- "cmovnzq %5, %0 ;"
- "cmovnzq %6, %1 ;"
- "cmovnzq %7, %2 ;"
- "cmovnzq %8, %3 ;"
- : "+r"(px[0]), "+r"(px[1]), "+r"(px[2]), "+r"(px[3])
- : "r"(bit), "rm"(py[0]), "rm"(py[1]), "rm"(py[2]), "rm"(py[3])
- : "cc"
- );
+ u64 f30 = f[3U];
+ u64 top_bit0 = f30 >> (u32)63U;
+ u64 carry0;
+ u64 f31;
+ u64 top_bit;
+ u64 carry;
+ u64 f0;
+ u64 f1;
+ u64 f2;
+ u64 f3;
+ u64 m0;
+ u64 m1;
+ u64 m2;
+ u64 m3;
+ u64 mask;
+ u64 f0_;
+ u64 f1_;
+ u64 f2_;
+ u64 f3_;
+ u64 o0;
+ u64 o1;
+ u64 o2;
+ u64 o3;
+ f[3U] = f30 & (u64)0x7fffffffffffffffU;
+ carry0 = add_scalar(f, f, (u64)19U * top_bit0);
+ f31 = f[3U];
+ top_bit = f31 >> (u32)63U;
+ f[3U] = f31 & (u64)0x7fffffffffffffffU;
+ carry = add_scalar(f, f, (u64)19U * top_bit);
+ f0 = f[0U];
+ f1 = f[1U];
+ f2 = f[2U];
+ f3 = f[3U];
+ m0 = gte_mask(f0, (u64)0xffffffffffffffedU);
+ m1 = eq_mask(f1, (u64)0xffffffffffffffffU);
+ m2 = eq_mask(f2, (u64)0xffffffffffffffffU);
+ m3 = eq_mask(f3, (u64)0x7fffffffffffffffU);
+ mask = ((m0 & m1) & m2) & m3;
+ f0_ = f0 - (mask & (u64)0xffffffffffffffedU);
+ f1_ = f1 - (mask & (u64)0xffffffffffffffffU);
+ f2_ = f2 - (mask & (u64)0xffffffffffffffffU);
+ f3_ = f3 - (mask & (u64)0x7fffffffffffffffU);
+ o0 = f0_;
+ o1 = f1_;
+ o2 = f2_;
+ o3 = f3_;
+ b[0U] = o0;
+ b[1U] = o1;
+ b[2U] = o2;
+ b[3U] = o3;
}
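store_felem canonicalizes the loosely reduced result: folding the top bit twice (each time as 19 * top_bit, via the add_scalar whose tail appears at the top of this hunk) leaves f < 2^255, after which subtracting p = 2^255 - 19 at most once yields the unique representative, selected branch-free through the eq_mask/gte_mask all-ones masks. A portable sketch of that final compare-and-subtract, with the mask helpers replaced by inline arithmetic (hypothetical name):

	#include <stdint.h>

	/* Sketch only: constant-time "subtract p if f >= p" for
	 * p = 2^255 - 19, assuming f < 2^255 on entry. */
	static void canonicalize_sketch(uint64_t f[4])
	{
		static const uint64_t p[4] = {
			0xffffffffffffffedULL, 0xffffffffffffffffULL,
			0xffffffffffffffffULL, 0x7fffffffffffffffULL,
		};
		uint64_t t[4], borrow = 0, mask;
		int i;

		/* Branch-free trial subtraction t = f - p. */
		for (i = 0; i < 4; ++i) {
			unsigned __int128 d =
				(unsigned __int128)f[i] - p[i] - borrow;
			t[i] = (uint64_t)d;
			borrow = (uint64_t)(d >> 64) & 1;
		}

		/* mask is all-ones iff no borrow occurred, i.e. f >= p. */
		mask = borrow - 1;
		for (i = 0; i < 4; ++i)
			f[i] = (t[i] & mask) | (f[i] & ~mask);
	}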
-static void curve25519_adx(u8 shared[CURVE25519_KEY_SIZE],
- const u8 private_key[CURVE25519_KEY_SIZE],
- const u8 session_key[CURVE25519_KEY_SIZE])
+static void encode_point(u8 *o, const u64 *i)
{
-#ifdef CONFIG_AS_ADX
- struct {
- u64 buffer[4 * NUM_WORDS_ELTFP25519];
- u64 coordinates[4 * NUM_WORDS_ELTFP25519];
- u64 workspace[6 * NUM_WORDS_ELTFP25519];
- u8 session[CURVE25519_KEY_SIZE];
- u8 private[CURVE25519_KEY_SIZE];
- } __aligned(32) m;
-
- int i = 0, j = 0;
- u64 prev = 0;
- u64 *const X1 = (u64 *)m.session;
- u64 *const key = (u64 *)m.private;
- u64 *const Px = m.coordinates + 0;
- u64 *const Pz = m.coordinates + 4;
- u64 *const Qx = m.coordinates + 8;
- u64 *const Qz = m.coordinates + 12;
- u64 *const X2 = Qx;
- u64 *const Z2 = Qz;
- u64 *const X3 = Px;
- u64 *const Z3 = Pz;
- u64 *const X2Z2 = Qx;
- u64 *const X3Z3 = Px;
-
- u64 *const A = m.workspace + 0;
- u64 *const B = m.workspace + 4;
- u64 *const D = m.workspace + 8;
- u64 *const C = m.workspace + 12;
- u64 *const DA = m.workspace + 16;
- u64 *const CB = m.workspace + 20;
- u64 *const AB = A;
- u64 *const DC = D;
- u64 *const DACB = DA;
-
- memcpy(m.private, private_key, sizeof(m.private));
- memcpy(m.session, session_key, sizeof(m.session));
-
- curve25519_clamp_secret(m.private);
-
- /* As in the draft:
- * When receiving such an array, implementations of curve25519
- * MUST mask the most-significant bit in the final byte. This
- * is done to preserve compatibility with point formats which
- * reserve the sign bit for use in other protocols and to
- * increase resistance to implementation fingerprinting
- */
- m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
-
- copy_eltfp25519_1w(Px, X1);
- setzero_eltfp25519_1w(Pz);
- setzero_eltfp25519_1w(Qx);
- setzero_eltfp25519_1w(Qz);
-
- Pz[0] = 1;
- Qx[0] = 1;
-
- /* main-loop */
- prev = 0;
- j = 62;
- for (i = 3; i >= 0; --i) {
- while (j >= 0) {
- u64 bit = (key[i] >> j) & 0x1;
- u64 swap = bit ^ prev;
- prev = bit;
-
- add_eltfp25519_1w_adx(A, X2, Z2); /* A = (X2+Z2) */
- sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
- add_eltfp25519_1w_adx(C, X3, Z3); /* C = (X3+Z3) */
- sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
- mul_eltfp25519_2w_adx(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
-
- cselect(swap, A, C);
- cselect(swap, B, D);
-
- sqr_eltfp25519_2w_adx(AB); /* [AA|BB] = [A^2|B^2] */
- add_eltfp25519_1w_adx(X3, DA, CB); /* X3 = (DA+CB) */
- sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
- sqr_eltfp25519_2w_adx(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
-
- copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
- sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
-
- mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
- add_eltfp25519_1w_adx(B, B, X2); /* B = a24*E+B */
- mul_eltfp25519_2w_adx(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
- mul_eltfp25519_1w_adx(Z3, Z3, X1); /* Z3 = Z3*X1 */
- --j;
- }
- j = 63;
- }
-
- inv_eltfp25519_1w_adx(A, Qz);
- mul_eltfp25519_1w_adx((u64 *)shared, Qx, A);
- fred_eltfp25519_1w((u64 *)shared);
-
- memzero_explicit(&m, sizeof(m));
-#endif
+ const u64 *x = i;
+ const u64 *z = i + (u32)4U;
+ u64 tmp[4U] = { 0U };
+ u64 tmp_w[16U] = { 0U };
+ finv(tmp, z, tmp_w);
+ fmul(tmp, tmp, x, tmp_w);
+ store_felem((u64 *)o, tmp);
}
-static void curve25519_adx_base(u8 session_key[CURVE25519_KEY_SIZE],
- const u8 private_key[CURVE25519_KEY_SIZE])
+static void curve25519_ever64(u8 *out, const u8 *priv, const u8 *pub)
{
-#ifdef CONFIG_AS_ADX
- struct {
- u64 buffer[4 * NUM_WORDS_ELTFP25519];
- u64 coordinates[4 * NUM_WORDS_ELTFP25519];
- u64 workspace[4 * NUM_WORDS_ELTFP25519];
- u8 private[CURVE25519_KEY_SIZE];
- } __aligned(32) m;
-
- const int ite[4] = { 64, 64, 64, 63 };
- const int q = 3;
- u64 swap = 1;
-
- int i = 0, j = 0, k = 0;
- u64 *const key = (u64 *)m.private;
- u64 *const Ur1 = m.coordinates + 0;
- u64 *const Zr1 = m.coordinates + 4;
- u64 *const Ur2 = m.coordinates + 8;
- u64 *const Zr2 = m.coordinates + 12;
-
- u64 *const UZr1 = m.coordinates + 0;
- u64 *const ZUr2 = m.coordinates + 8;
-
- u64 *const A = m.workspace + 0;
- u64 *const B = m.workspace + 4;
- u64 *const C = m.workspace + 8;
- u64 *const D = m.workspace + 12;
-
- u64 *const AB = m.workspace + 0;
- u64 *const CD = m.workspace + 8;
-
- const u64 *const P = table_ladder_8k;
-
- memcpy(m.private, private_key, sizeof(m.private));
-
- curve25519_clamp_secret(m.private);
-
- setzero_eltfp25519_1w(Ur1);
- setzero_eltfp25519_1w(Zr1);
- setzero_eltfp25519_1w(Zr2);
- Ur1[0] = 1;
- Zr1[0] = 1;
- Zr2[0] = 1;
-
- /* G-S */
- Ur2[3] = 0x1eaecdeee27cab34UL;
- Ur2[2] = 0xadc7a0b9235d48e2UL;
- Ur2[1] = 0xbbf095ae14b2edf8UL;
- Ur2[0] = 0x7e94e1fec82faabdUL;
-
- /* main-loop */
- j = q;
- for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
- while (j < ite[i]) {
- u64 bit = (key[i] >> j) & 0x1;
- k = (64 * i + j - q);
- swap = swap ^ bit;
- cswap(swap, Ur1, Ur2);
- cswap(swap, Zr1, Zr2);
- swap = bit;
- /* Addition */
- sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
- add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
- mul_eltfp25519_1w_adx(C, &P[4 * k], B); /* C = M0-B */
- sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
- add_eltfp25519_1w_adx(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
- sqr_eltfp25519_2w_adx(AB); /* A = A^2 | B = B^2 */
- mul_eltfp25519_2w_adx(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
- ++j;
+ u64 init1[8U] = { 0U };
+ u64 tmp[4U] = { 0U };
+ u64 tmp3;
+ u64 *x;
+ u64 *z;
+ {
+ u32 i;
+ for (i = (u32)0U; i < (u32)4U; i = i + (u32)1U) {
+ u64 *os = tmp;
+ const u8 *bj = pub + i * (u32)8U;
+ u64 u = *(u64 *)bj;
+ u64 r = u;
+ u64 x0 = r;
+ os[i] = x0;
}
- j = 0;
- }
-
- /* Doubling */
- for (i = 0; i < q; ++i) {
- add_eltfp25519_1w_adx(A, Ur1, Zr1); /* A = Ur1+Zr1 */
- sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
- sqr_eltfp25519_2w_adx(AB); /* A = A**2 B = B**2 */
- copy_eltfp25519_1w(C, B); /* C = B */
- sub_eltfp25519_1w(B, A, B); /* B = A-B */
- mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */
- add_eltfp25519_1w_adx(D, D, C); /* D = D+C */
- mul_eltfp25519_2w_adx(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
}
-
- /* Convert to affine coordinates */
- inv_eltfp25519_1w_adx(A, Zr1);
- mul_eltfp25519_1w_adx((u64 *)session_key, Ur1, A);
- fred_eltfp25519_1w((u64 *)session_key);
-
- memzero_explicit(&m, sizeof(m));
-#endif
+ tmp3 = tmp[3U];
+ tmp[3U] = tmp3 & (u64)0x7fffffffffffffffU;
+ x = init1;
+ z = init1 + (u32)4U;
+ z[0U] = (u64)1U;
+ z[1U] = (u64)0U;
+ z[2U] = (u64)0U;
+ z[3U] = (u64)0U;
+ x[0U] = tmp[0U];
+ x[1U] = tmp[1U];
+ x[2U] = tmp[2U];
+ x[3U] = tmp[3U];
+ montgomery_ladder(init1, priv, init1);
+ encode_point(out, init1);
}
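The decoding loop at the top of curve25519_ever64 reads the public key with four unaligned little-endian u64 loads, which is acceptable here only because this translation unit is x86_64-only, and the tmp[3] mask clears bit 255, the high-bit masking that RFC 7748 mandates for incoming u-coordinates. An endian- and alignment-safe sketch of the same decoding (hypothetical name):

	#include <stdint.h>

	/* Sketch only: decode a 32-byte u-coordinate into four little-endian
	 * limbs and clear bit 255, as RFC 7748 requires. */
	static void decode_u_sketch(uint64_t x[4], const uint8_t pub[32])
	{
		int i, j;

		for (i = 0; i < 4; ++i) {
			x[i] = 0;
			for (j = 7; j >= 0; --j)
				x[i] = (x[i] << 8) | pub[8 * i + j];
		}
		x[3] &= 0x7fffffffffffffffULL;	/* clear bit 255 */
	}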
-static void curve25519_bmi2(u8 shared[CURVE25519_KEY_SIZE],
- const u8 private_key[CURVE25519_KEY_SIZE],
- const u8 session_key[CURVE25519_KEY_SIZE])
-{
-#ifdef CONFIG_AS_BMI2
- struct {
- u64 buffer[4 * NUM_WORDS_ELTFP25519];
- u64 coordinates[4 * NUM_WORDS_ELTFP25519];
- u64 workspace[6 * NUM_WORDS_ELTFP25519];
- u8 session[CURVE25519_KEY_SIZE];
- u8 private[CURVE25519_KEY_SIZE];
- } __aligned(32) m;
-
- int i = 0, j = 0;
- u64 prev = 0;
- u64 *const X1 = (u64 *)m.session;
- u64 *const key = (u64 *)m.private;
- u64 *const Px = m.coordinates + 0;
- u64 *const Pz = m.coordinates + 4;
- u64 *const Qx = m.coordinates + 8;
- u64 *const Qz = m.coordinates + 12;
- u64 *const X2 = Qx;
- u64 *const Z2 = Qz;
- u64 *const X3 = Px;
- u64 *const Z3 = Pz;
- u64 *const X2Z2 = Qx;
- u64 *const X3Z3 = Px;
-
- u64 *const A = m.workspace + 0;
- u64 *const B = m.workspace + 4;
- u64 *const D = m.workspace + 8;
- u64 *const C = m.workspace + 12;
- u64 *const DA = m.workspace + 16;
- u64 *const CB = m.workspace + 20;
- u64 *const AB = A;
- u64 *const DC = D;
- u64 *const DACB = DA;
-
- memcpy(m.private, private_key, sizeof(m.private));
- memcpy(m.session, session_key, sizeof(m.session));
-
- curve25519_clamp_secret(m.private);
-
- /* As in the draft:
- * When receiving such an array, implementations of curve25519
- * MUST mask the most-significant bit in the final byte. This
- * is done to preserve compatibility with point formats which
- * reserve the sign bit for use in other protocols and to
- * increase resistance to implementation fingerprinting
- */
- m.session[CURVE25519_KEY_SIZE - 1] &= (1 << (255 % 8)) - 1;
-
- copy_eltfp25519_1w(Px, X1);
- setzero_eltfp25519_1w(Pz);
- setzero_eltfp25519_1w(Qx);
- setzero_eltfp25519_1w(Qz);
-
- Pz[0] = 1;
- Qx[0] = 1;
-
- /* main-loop */
- prev = 0;
- j = 62;
- for (i = 3; i >= 0; --i) {
- while (j >= 0) {
- u64 bit = (key[i] >> j) & 0x1;
- u64 swap = bit ^ prev;
- prev = bit;
-
- add_eltfp25519_1w_bmi2(A, X2, Z2); /* A = (X2+Z2) */
- sub_eltfp25519_1w(B, X2, Z2); /* B = (X2-Z2) */
- add_eltfp25519_1w_bmi2(C, X3, Z3); /* C = (X3+Z3) */
- sub_eltfp25519_1w(D, X3, Z3); /* D = (X3-Z3) */
- mul_eltfp25519_2w_bmi2(DACB, AB, DC); /* [DA|CB] = [A|B]*[D|C] */
-
- cselect(swap, A, C);
- cselect(swap, B, D);
-
- sqr_eltfp25519_2w_bmi2(AB); /* [AA|BB] = [A^2|B^2] */
- add_eltfp25519_1w_bmi2(X3, DA, CB); /* X3 = (DA+CB) */
- sub_eltfp25519_1w(Z3, DA, CB); /* Z3 = (DA-CB) */
- sqr_eltfp25519_2w_bmi2(X3Z3); /* [X3|Z3] = [(DA+CB)|(DA+CB)]^2 */
-
- copy_eltfp25519_1w(X2, B); /* X2 = B^2 */
- sub_eltfp25519_1w(Z2, A, B); /* Z2 = E = AA-BB */
-
- mul_a24_eltfp25519_1w(B, Z2); /* B = a24*E */
- add_eltfp25519_1w_bmi2(B, B, X2); /* B = a24*E+B */
- mul_eltfp25519_2w_bmi2(X2Z2, X2Z2, AB); /* [X2|Z2] = [B|E]*[A|a24*E+B] */
- mul_eltfp25519_1w_bmi2(Z3, Z3, X1); /* Z3 = Z3*X1 */
- --j;
- }
- j = 63;
- }
-
- inv_eltfp25519_1w_bmi2(A, Qz);
- mul_eltfp25519_1w_bmi2((u64 *)shared, Qx, A);
- fred_eltfp25519_1w((u64 *)shared);
+/* The below constants were generated using this sage script:
+ *
+ * #!/usr/bin/env sage
+ * import sys
+ * from sage.all import *
+ * def limbs(n):
+ * n = int(n)
+ * l = ((n >> 0) % 2^64, (n >> 64) % 2^64, (n >> 128) % 2^64, (n >> 192) % 2^64)
+ * return "0x%016xULL, 0x%016xULL, 0x%016xULL, 0x%016xULL" % l
+ * ec = EllipticCurve(GF(2^255 - 19), [0, 486662, 0, 1, 0])
+ * p_minus_s = (ec.lift_x(9) - ec.lift_x(1))[0]
+ * print("static const u64 p_minus_s[] = { %s };\n" % limbs(p_minus_s))
+ * print("static const u64 table_ladder[] = {")
+ * p = ec.lift_x(9)
+ * for i in range(252):
+ * l = (p[0] + p[2]) / (p[0] - p[2])
+ * print(("\t%s" + ("," if i != 251 else "")) % limbs(l))
+ * p = p * 2
+ * print("};")
+ *
+ */
- memzero_explicit(&m, sizeof(m));
-#endif
-}
+static const u64 p_minus_s[] = { 0x816b1e0137d48290ULL, 0x440f6a51eb4d1207ULL, 0x52385f46dca2b71dULL, 0x215132111d8354cbULL };
+
+static const u64 table_ladder[] = {
+ 0xfffffffffffffff3ULL, 0xffffffffffffffffULL, 0xffffffffffffffffULL, 0x5fffffffffffffffULL,
+ 0x6b8220f416aafe96ULL, 0x82ebeb2b4f566a34ULL, 0xd5a9a5b075a5950fULL, 0x5142b2cf4b2488f4ULL,
+ 0x6aaebc750069680cULL, 0x89cf7820a0f99c41ULL, 0x2a58d9183b56d0f4ULL, 0x4b5aca80e36011a4ULL,
+ 0x329132348c29745dULL, 0xf4a2e616e1642fd7ULL, 0x1e45bb03ff67bc34ULL, 0x306912d0f42a9b4aULL,
+ 0xff886507e6af7154ULL, 0x04f50e13dfeec82fULL, 0xaa512fe82abab5ceULL, 0x174e251a68d5f222ULL,
+ 0xcf96700d82028898ULL, 0x1743e3370a2c02c5ULL, 0x379eec98b4e86eaaULL, 0x0c59888a51e0482eULL,
+ 0xfbcbf1d699b5d189ULL, 0xacaef0d58e9fdc84ULL, 0xc1c20d06231f7614ULL, 0x2938218da274f972ULL,
+ 0xf6af49beff1d7f18ULL, 0xcc541c22387ac9c2ULL, 0x96fcc9ef4015c56bULL, 0x69c1627c690913a9ULL,
+ 0x7a86fd2f4733db0eULL, 0xfdb8c4f29e087de9ULL, 0x095e4b1a8ea2a229ULL, 0x1ad7a7c829b37a79ULL,
+ 0x342d89cad17ea0c0ULL, 0x67bedda6cced2051ULL, 0x19ca31bf2bb42f74ULL, 0x3df7b4c84980acbbULL,
+ 0xa8c6444dc80ad883ULL, 0xb91e440366e3ab85ULL, 0xc215cda00164f6d8ULL, 0x3d867c6ef247e668ULL,
+ 0xc7dd582bcc3e658cULL, 0xfd2c4748ee0e5528ULL, 0xa0fd9b95cc9f4f71ULL, 0x7529d871b0675ddfULL,
+ 0xb8f568b42d3cbd78ULL, 0x1233011b91f3da82ULL, 0x2dce6ccd4a7c3b62ULL, 0x75e7fc8e9e498603ULL,
+ 0x2f4f13f1fcd0b6ecULL, 0xf1a8ca1f29ff7a45ULL, 0xc249c1a72981e29bULL, 0x6ebe0dbb8c83b56aULL,
+ 0x7114fa8d170bb222ULL, 0x65a2dcd5bf93935fULL, 0xbdc41f68b59c979aULL, 0x2f0eef79a2ce9289ULL,
+ 0x42ecbf0c083c37ceULL, 0x2930bc09ec496322ULL, 0xf294b0c19cfeac0dULL, 0x3780aa4bedfabb80ULL,
+ 0x56c17d3e7cead929ULL, 0xe7cb4beb2e5722c5ULL, 0x0ce931732dbfe15aULL, 0x41b883c7621052f8ULL,
+ 0xdbf75ca0c3d25350ULL, 0x2936be086eb1e351ULL, 0xc936e03cb4a9b212ULL, 0x1d45bf82322225aaULL,
+ 0xe81ab1036a024cc5ULL, 0xe212201c304c9a72ULL, 0xc5d73fba6832b1fcULL, 0x20ffdb5a4d839581ULL,
+ 0xa283d367be5d0fadULL, 0x6c2b25ca8b164475ULL, 0x9d4935467caaf22eULL, 0x5166408eee85ff49ULL,
+ 0x3c67baa2fab4e361ULL, 0xb3e433c67ef35cefULL, 0x5259729241159b1cULL, 0x6a621892d5b0ab33ULL,
+ 0x20b74a387555cdcbULL, 0x532aa10e1208923fULL, 0xeaa17b7762281dd1ULL, 0x61ab3443f05c44bfULL,
+ 0x257a6c422324def8ULL, 0x131c6c1017e3cf7fULL, 0x23758739f630a257ULL, 0x295a407a01a78580ULL,
+ 0xf8c443246d5da8d9ULL, 0x19d775450c52fa5dULL, 0x2afcfc92731bf83dULL, 0x7d10c8e81b2b4700ULL,
+ 0xc8e0271f70baa20bULL, 0x993748867ca63957ULL, 0x5412efb3cb7ed4bbULL, 0x3196d36173e62975ULL,
+ 0xde5bcad141c7dffcULL, 0x47cc8cd2b395c848ULL, 0xa34cd942e11af3cbULL, 0x0256dbf2d04ecec2ULL,
+ 0x875ab7e94b0e667fULL, 0xcad4dd83c0850d10ULL, 0x47f12e8f4e72c79fULL, 0x5f1a87bb8c85b19bULL,
+ 0x7ae9d0b6437f51b8ULL, 0x12c7ce5518879065ULL, 0x2ade09fe5cf77aeeULL, 0x23a05a2f7d2c5627ULL,
+ 0x5908e128f17c169aULL, 0xf77498dd8ad0852dULL, 0x74b4c4ceab102f64ULL, 0x183abadd10139845ULL,
+ 0xb165ba8daa92aaacULL, 0xd5c5ef9599386705ULL, 0xbe2f8f0cf8fc40d1ULL, 0x2701e635ee204514ULL,
+ 0x629fa80020156514ULL, 0xf223868764a8c1ceULL, 0x5b894fff0b3f060eULL, 0x60d9944cf708a3faULL,
+ 0xaeea001a1c7a201fULL, 0xebf16a633ee2ce63ULL, 0x6f7709594c7a07e1ULL, 0x79b958150d0208cbULL,
+ 0x24b55e5301d410e7ULL, 0xe3a34edff3fdc84dULL, 0xd88768e4904032d8ULL, 0x131384427b3aaeecULL,
+ 0x8405e51286234f14ULL, 0x14dc4739adb4c529ULL, 0xb8a2b5b250634ffdULL, 0x2fe2a94ad8a7ff93ULL,
+ 0xec5c57efe843faddULL, 0x2843ce40f0bb9918ULL, 0xa4b561d6cf3d6305ULL, 0x743629bde8fb777eULL,
+ 0x343edd46bbaf738fULL, 0xed981828b101a651ULL, 0xa401760b882c797aULL, 0x1fc223e28dc88730ULL,
+ 0x48604e91fc0fba0eULL, 0xb637f78f052c6fa4ULL, 0x91ccac3d09e9239cULL, 0x23f7eed4437a687cULL,
+ 0x5173b1118d9bd800ULL, 0x29d641b63189d4a7ULL, 0xfdbf177988bbc586ULL, 0x2959894fcad81df5ULL,
+ 0xaebc8ef3b4bbc899ULL, 0x4148995ab26992b9ULL, 0x24e20b0134f92cfbULL, 0x40d158894a05dee8ULL,
+ 0x46b00b1185af76f6ULL, 0x26bac77873187a79ULL, 0x3dc0bf95ab8fff5fULL, 0x2a608bd8945524d7ULL,
+ 0x26449588bd446302ULL, 0x7c4bc21c0388439cULL, 0x8e98a4f383bd11b2ULL, 0x26218d7bc9d876b9ULL,
+ 0xe3081542997c178aULL, 0x3c2d29a86fb6606fULL, 0x5c217736fa279374ULL, 0x7dde05734afeb1faULL,
+ 0x3bf10e3906d42babULL, 0xe4f7803e1980649cULL, 0xe6053bf89595bf7aULL, 0x394faf38da245530ULL,
+ 0x7a8efb58896928f4ULL, 0xfbc778e9cc6a113cULL, 0x72670ce330af596fULL, 0x48f222a81d3d6cf7ULL,
+ 0xf01fce410d72caa7ULL, 0x5a20ecc7213b5595ULL, 0x7bc21165c1fa1483ULL, 0x07f89ae31da8a741ULL,
+ 0x05d2c2b4c6830ff9ULL, 0xd43e330fc6316293ULL, 0xa5a5590a96d3a904ULL, 0x705edb91a65333b6ULL,
+ 0x048ee15e0bb9a5f7ULL, 0x3240cfca9e0aaf5dULL, 0x8f4b71ceedc4a40bULL, 0x621c0da3de544a6dULL,
+ 0x92872836a08c4091ULL, 0xce8375b010c91445ULL, 0x8a72eb524f276394ULL, 0x2667fcfa7ec83635ULL,
+ 0x7f4c173345e8752aULL, 0x061b47feee7079a5ULL, 0x25dd9afa9f86ff34ULL, 0x3780cef5425dc89cULL,
+ 0x1a46035a513bb4e9ULL, 0x3e1ef379ac575adaULL, 0xc78c5f1c5fa24b50ULL, 0x321a967634fd9f22ULL,
+ 0x946707b8826e27faULL, 0x3dca84d64c506fd0ULL, 0xc189218075e91436ULL, 0x6d9284169b3b8484ULL,
+ 0x3a67e840383f2ddfULL, 0x33eec9a30c4f9b75ULL, 0x3ec7c86fa783ef47ULL, 0x26ec449fbac9fbc4ULL,
+ 0x5c0f38cba09b9e7dULL, 0x81168cc762a3478cULL, 0x3e23b0d306fc121cULL, 0x5a238aa0a5efdcddULL,
+ 0x1ba26121c4ea43ffULL, 0x36f8c77f7c8832b5ULL, 0x88fbea0b0adcf99aULL, 0x5ca9938ec25bebf9ULL,
+ 0xd5436a5e51fccda0ULL, 0x1dbc4797c2cd893bULL, 0x19346a65d3224a08ULL, 0x0f5034e49b9af466ULL,
+ 0xf23c3967a1e0b96eULL, 0xe58b08fa867a4d88ULL, 0xfb2fabc6a7341679ULL, 0x2a75381eb6026946ULL,
+ 0xc80a3be4c19420acULL, 0x66b1f6c681f2b6dcULL, 0x7cf7036761e93388ULL, 0x25abbbd8a660a4c4ULL,
+ 0x91ea12ba14fd5198ULL, 0x684950fc4a3cffa9ULL, 0xf826842130f5ad28ULL, 0x3ea988f75301a441ULL,
+ 0xc978109a695f8c6fULL, 0x1746eb4a0530c3f3ULL, 0x444d6d77b4459995ULL, 0x75952b8c054e5cc7ULL,
+ 0xa3703f7915f4d6aaULL, 0x66c346202f2647d8ULL, 0xd01469df811d644bULL, 0x77fea47d81a5d71fULL,
+ 0xc5e9529ef57ca381ULL, 0x6eeeb4b9ce2f881aULL, 0xb6e91a28e8009bd6ULL, 0x4b80be3e9afc3fecULL,
+ 0x7e3773c526aed2c5ULL, 0x1b4afcb453c9a49dULL, 0xa920bdd7baffb24dULL, 0x7c54699f122d400eULL,
+ 0xef46c8e14fa94bc8ULL, 0xe0b074ce2952ed5eULL, 0xbea450e1dbd885d5ULL, 0x61b68649320f712cULL,
+ 0x8a485f7309ccbdd1ULL, 0xbd06320d7d4d1a2dULL, 0x25232973322dbef4ULL, 0x445dc4758c17f770ULL,
+ 0xdb0434177cc8933cULL, 0xed6fe82175ea059fULL, 0x1efebefdc053db34ULL, 0x4adbe867c65daf99ULL,
+ 0x3acd71a2a90609dfULL, 0xe5e991856dd04050ULL, 0x1ec69b688157c23cULL, 0x697427f6885cfe4dULL,
+ 0xd7be7b9b65e1a851ULL, 0xa03d28d522c536ddULL, 0x28399d658fd2b645ULL, 0x49e5b7e17c2641e1ULL,
+ 0x6f8c3a98700457a4ULL, 0x5078f0a25ebb6778ULL, 0xd13c3ccbc382960fULL, 0x2e003258a7df84b1ULL,
+ 0x8ad1f39be6296a1cULL, 0xc1eeaa652a5fbfb2ULL, 0x33ee0673fd26f3cbULL, 0x59256173a69d2cccULL,
+ 0x41ea07aa4e18fc41ULL, 0xd9fc19527c87a51eULL, 0xbdaacb805831ca6fULL, 0x445b652dc916694fULL,
+ 0xce92a3a7f2172315ULL, 0x1edc282de11b9964ULL, 0xa1823aafe04c314aULL, 0x790a2d94437cf586ULL,
+ 0x71c447fb93f6e009ULL, 0x8922a56722845276ULL, 0xbf70903b204f5169ULL, 0x2f7a89891ba319feULL,
+ 0x02a08eb577e2140cULL, 0xed9a4ed4427bdcf4ULL, 0x5253ec44e4323cd1ULL, 0x3e88363c14e9355bULL,
+ 0xaa66c14277110b8cULL, 0x1ae0391610a23390ULL, 0x2030bd12c93fc2a2ULL, 0x3ee141579555c7abULL,
+ 0x9214de3a6d6e7d41ULL, 0x3ccdd88607f17efeULL, 0x674f1288f8e11217ULL, 0x5682250f329f93d0ULL,
+ 0x6cf00b136d2e396eULL, 0x6e4cf86f1014debfULL, 0x5930b1b5bfcc4e83ULL, 0x047069b48aba16b6ULL,
+ 0x0d4ce4ab69b20793ULL, 0xb24db91a97d0fb9eULL, 0xcdfa50f54e00d01dULL, 0x221b1085368bddb5ULL,
+ 0xe7e59468b1e3d8d2ULL, 0x53c56563bd122f93ULL, 0xeee8a903e0663f09ULL, 0x61efa662cbbe3d42ULL,
+ 0x2cf8ddddde6eab2aULL, 0x9bf80ad51435f231ULL, 0x5deadacec9f04973ULL, 0x29275b5d41d29b27ULL,
+ 0xcfde0f0895ebf14fULL, 0xb9aab96b054905a7ULL, 0xcae80dd9a1c420fdULL, 0x0a63bf2f1673bbc7ULL,
+ 0x092f6e11958fbc8cULL, 0x672a81e804822fadULL, 0xcac8351560d52517ULL, 0x6f3f7722c8f192f8ULL,
+ 0xf8ba90ccc2e894b7ULL, 0x2c7557a438ff9f0dULL, 0x894d1d855ae52359ULL, 0x68e122157b743d69ULL,
+ 0xd87e5570cfb919f3ULL, 0x3f2cdecd95798db9ULL, 0x2121154710c0a2ceULL, 0x3c66a115246dc5b2ULL,
+ 0xcbedc562294ecb72ULL, 0xba7143c36a280b16ULL, 0x9610c2efd4078b67ULL, 0x6144735d946a4b1eULL,
+ 0x536f111ed75b3350ULL, 0x0211db8c2041d81bULL, 0xf93cb1000e10413cULL, 0x149dfd3c039e8876ULL,
+ 0xd479dde46b63155bULL, 0xb66e15e93c837976ULL, 0xdafde43b1f13e038ULL, 0x5fafda1a2e4b0b35ULL,
+ 0x3600bbdf17197581ULL, 0x3972050bbe3cd2c2ULL, 0x5938906dbdd5be86ULL, 0x34fce5e43f9b860fULL,
+ 0x75a8a4cd42d14d02ULL, 0x828dabc53441df65ULL, 0x33dcabedd2e131d3ULL, 0x3ebad76fb814d25fULL,
+ 0xd4906f566f70e10fULL, 0x5d12f7aa51690f5aULL, 0x45adb16e76cefcf2ULL, 0x01f768aead232999ULL,
+ 0x2b6cc77b6248febdULL, 0x3cd30628ec3aaffdULL, 0xce1c0b80d4ef486aULL, 0x4c3bff2ea6f66c23ULL,
+ 0x3f2ec4094aeaeb5fULL, 0x61b19b286e372ca7ULL, 0x5eefa966de2a701dULL, 0x23b20565de55e3efULL,
+ 0xe301ca5279d58557ULL, 0x07b2d4ce27c2874fULL, 0xa532cd8a9dcf1d67ULL, 0x2a52fee23f2bff56ULL,
+ 0x8624efb37cd8663dULL, 0xbbc7ac20ffbd7594ULL, 0x57b85e9c82d37445ULL, 0x7b3052cb86a6ec66ULL,
+ 0x3482f0ad2525e91eULL, 0x2cb68043d28edca0ULL, 0xaf4f6d052e1b003aULL, 0x185f8c2529781b0aULL,
+ 0xaa41de5bd80ce0d6ULL, 0x9407b2416853e9d6ULL, 0x563ec36e357f4c3aULL, 0x4cc4b8dd0e297bceULL,
+ 0xa2fc1a52ffb8730eULL, 0x1811f16e67058e37ULL, 0x10f9a366cddf4ee1ULL, 0x72f4a0c4a0b9f099ULL,
+ 0x8c16c06f663f4ea7ULL, 0x693b3af74e970fbaULL, 0x2102e7f1d69ec345ULL, 0x0ba53cbc968a8089ULL,
+ 0xca3d9dc7fea15537ULL, 0x4c6824bb51536493ULL, 0xb9886314844006b1ULL, 0x40d2a72ab454cc60ULL,
+ 0x5936a1b712570975ULL, 0x91b9d648debda657ULL, 0x3344094bb64330eaULL, 0x006ba10d12ee51d0ULL,
+ 0x19228468f5de5d58ULL, 0x0eb12f4c38cc05b0ULL, 0xa1039f9dd5601990ULL, 0x4502d4ce4fff0e0bULL,
+ 0xeb2054106837c189ULL, 0xd0f6544c6dd3b93cULL, 0x40727064c416d74fULL, 0x6e15c6114b502ef0ULL,
+ 0x4df2a398cfb1a76bULL, 0x11256c7419f2f6b1ULL, 0x4a497962066e6043ULL, 0x705b3aab41355b44ULL,
+ 0x365ef536d797b1d8ULL, 0x00076bd622ddf0dbULL, 0x3bbf33b0e0575a88ULL, 0x3777aa05c8e4ca4dULL,
+ 0x392745c85578db5fULL, 0x6fda4149dbae5ae2ULL, 0xb1f0b00b8adc9867ULL, 0x09963437d36f1da3ULL,
+ 0x7e824e90a5dc3853ULL, 0xccb5f6641f135cbdULL, 0x6736d86c87ce8fccULL, 0x625f3ce26604249fULL,
+ 0xaf8ac8059502f63fULL, 0x0c05e70a2e351469ULL, 0x35292e9c764b6305ULL, 0x1a394360c7e23ac3ULL,
+ 0xd5c6d53251183264ULL, 0x62065abd43c2b74fULL, 0xb5fbf5d03b973f9bULL, 0x13a3da3661206e5eULL,
+ 0xc6bd5837725d94e5ULL, 0x18e30912205016c5ULL, 0x2088ce1570033c68ULL, 0x7fba1f495c837987ULL,
+ 0x5a8c7423f2f9079dULL, 0x1735157b34023fc5ULL, 0xe4f9b49ad2fab351ULL, 0x6691ff72c878e33cULL,
+ 0x122c2adedc5eff3eULL, 0xf8dd4bf1d8956cf4ULL, 0xeb86205d9e9e5bdaULL, 0x049b92b9d975c743ULL,
+ 0xa5379730b0f6c05aULL, 0x72a0ffacc6f3a553ULL, 0xb0032c34b20dcd6dULL, 0x470e9dbc88d5164aULL,
+ 0xb19cf10ca237c047ULL, 0xb65466711f6c81a2ULL, 0xb3321bd16dd80b43ULL, 0x48c14f600c5fbe8eULL,
+ 0x66451c264aa6c803ULL, 0xb66e3904a4fa7da6ULL, 0xd45f19b0b3128395ULL, 0x31602627c3c9bc10ULL,
+ 0x3120dc4832e4e10dULL, 0xeb20c46756c717f7ULL, 0x00f52e3f67280294ULL, 0x566d4fc14730c509ULL,
+ 0x7e3a5d40fd837206ULL, 0xc1e926dc7159547aULL, 0x216730fba68d6095ULL, 0x22e8c3843f69cea7ULL,
+ 0x33d074e8930e4b2bULL, 0xb6e4350e84d15816ULL, 0x5534c26ad6ba2365ULL, 0x7773c12f89f1f3f3ULL,
+ 0x8cba404da57962aaULL, 0x5b9897a81999ce56ULL, 0x508e862f121692fcULL, 0x3a81907fa093c291ULL,
+ 0x0dded0ff4725a510ULL, 0x10d8cc10673fc503ULL, 0x5b9d151c9f1f4e89ULL, 0x32a5c1d5cb09a44cULL,
+ 0x1e0aa442b90541fbULL, 0x5f85eb7cc1b485dbULL, 0xbee595ce8a9df2e5ULL, 0x25e496c722422236ULL,
+ 0x5edf3c46cd0fe5b9ULL, 0x34e75a7ed2a43388ULL, 0xe488de11d761e352ULL, 0x0e878a01a085545cULL,
+ 0xba493c77e021bb04ULL, 0x2b4d1843c7df899aULL, 0x9ea37a487ae80d67ULL, 0x67a9958011e41794ULL,
+ 0x4b58051a6697b065ULL, 0x47e33f7d8d6ba6d4ULL, 0xbb4da8d483ca46c1ULL, 0x68becaa181c2db0dULL,
+ 0x8d8980e90b989aa5ULL, 0xf95eb14a2c93c99bULL, 0x51c6c7c4796e73a2ULL, 0x6e228363b5efb569ULL,
+ 0xc6bbc0b02dd624c8ULL, 0x777eb47dec8170eeULL, 0x3cde15a004cfafa9ULL, 0x1dc6bc087160bf9bULL,
+ 0x2e07e043eec34002ULL, 0x18e9fc677a68dc7fULL, 0xd8da03188bd15b9aULL, 0x48fbc3bb00568253ULL,
+ 0x57547d4cfb654ce1ULL, 0xd3565b82a058e2adULL, 0xf63eaf0bbf154478ULL, 0x47531ef114dfbb18ULL,
+ 0xe1ec630a4278c587ULL, 0x5507d546ca8e83f3ULL, 0x85e135c63adc0c2bULL, 0x0aa7efa85682844eULL,
+ 0x72691ba8b3e1f615ULL, 0x32b4e9701fbe3ffaULL, 0x97b6d92e39bb7868ULL, 0x2cfe53dea02e39e8ULL,
+ 0x687392cd85cd52b0ULL, 0x27ff66c910e29831ULL, 0x97134556a9832d06ULL, 0x269bb0360a84f8a0ULL,
+ 0x706e55457643f85cULL, 0x3734a48c9b597d1bULL, 0x7aee91e8c6efa472ULL, 0x5cd6abc198a9d9e0ULL,
+ 0x0e04de06cb3ce41aULL, 0xd8c6eb893402e138ULL, 0x904659bb686e3772ULL, 0x7215c371746ba8c8ULL,
+ 0xfd12a97eeae4a2d9ULL, 0x9514b7516394f2c5ULL, 0x266fd5809208f294ULL, 0x5c847085619a26b9ULL,
+ 0x52985410fed694eaULL, 0x3c905b934a2ed254ULL, 0x10bb47692d3be467ULL, 0x063b3d2d69e5e9e1ULL,
+ 0x472726eedda57debULL, 0xefb6c4ae10f41891ULL, 0x2b1641917b307614ULL, 0x117c554fc4f45b7cULL,
+ 0xc07cf3118f9d8812ULL, 0x01dbd82050017939ULL, 0xd7e803f4171b2827ULL, 0x1015e87487d225eaULL,
+ 0xc58de3fed23acc4dULL, 0x50db91c294a7be2dULL, 0x0b94d43d1c9cf457ULL, 0x6b1640fa6e37524aULL,
+ 0x692f346c5fda0d09ULL, 0x200b1c59fa4d3151ULL, 0xb8c46f760777a296ULL, 0x4b38395f3ffdfbcfULL,
+ 0x18d25e00be54d671ULL, 0x60d50582bec8aba6ULL, 0x87ad8f263b78b982ULL, 0x50fdf64e9cda0432ULL,
+ 0x90f567aac578dcf0ULL, 0xef1e9b0ef2a3133bULL, 0x0eebba9242d9de71ULL, 0x15473c9bf03101c7ULL,
+ 0x7c77e8ae56b78095ULL, 0xb678e7666e6f078eULL, 0x2da0b9615348ba1fULL, 0x7cf931c1ff733f0bULL,
+ 0x26b357f50a0a366cULL, 0xe9708cf42b87d732ULL, 0xc13aeea5f91cb2c0ULL, 0x35d90c991143bb4cULL,
+ 0x47c1c404a9a0d9dcULL, 0x659e58451972d251ULL, 0x3875a8c473b38c31ULL, 0x1fbd9ed379561f24ULL,
+ 0x11fabc6fd41ec28dULL, 0x7ef8dfe3cd2a2dcaULL, 0x72e73b5d8c404595ULL, 0x6135fa4954b72f27ULL,
+ 0xccfc32a2de24b69cULL, 0x3f55698c1f095d88ULL, 0xbe3350ed5ac3f929ULL, 0x5e9bf806ca477eebULL,
+ 0xe9ce8fb63c309f68ULL, 0x5376f63565e1f9f4ULL, 0xd1afcfb35a6393f1ULL, 0x6632a1ede5623506ULL,
+ 0x0b7d6c390c2ded4cULL, 0x56cb3281df04cb1fULL, 0x66305a1249ecc3c7ULL, 0x5d588b60a38ca72aULL,
+ 0xa6ecbf78e8e5f42dULL, 0x86eeb44b3c8a3eecULL, 0xec219c48fbd21604ULL, 0x1aaf1af517c36731ULL,
+ 0xc306a2836769bde7ULL, 0x208280622b1e2adbULL, 0x8027f51ffbff94a6ULL, 0x76cfa1ce1124f26bULL,
+ 0x18eb00562422abb6ULL, 0xf377c4d58f8c29c3ULL, 0x4dbbc207f531561aULL, 0x0253b7f082128a27ULL,
+ 0x3d1f091cb62c17e0ULL, 0x4860e1abd64628a9ULL, 0x52d17436309d4253ULL, 0x356f97e13efae576ULL,
+ 0xd351e11aa150535bULL, 0x3e6b45bb1dd878ccULL, 0x0c776128bed92c98ULL, 0x1d34ae93032885b8ULL,
+ 0x4ba0488ca85ba4c3ULL, 0x985348c33c9ce6ceULL, 0x66124c6f97bda770ULL, 0x0f81a0290654124aULL,
+ 0x9ed09ca6569b86fdULL, 0x811009fd18af9a2dULL, 0xff08d03f93d8c20aULL, 0x52a148199faef26bULL,
+ 0x3e03f9dc2d8d1b73ULL, 0x4205801873961a70ULL, 0xc0d987f041a35970ULL, 0x07aa1f15a1c0d549ULL,
+ 0xdfd46ce08cd27224ULL, 0x6d0a024f934e4239ULL, 0x808a7a6399897b59ULL, 0x0a4556e9e13d95a2ULL,
+ 0xd21a991fe9c13045ULL, 0x9b0e8548fe7751b8ULL, 0x5da643cb4bf30035ULL, 0x77db28d63940f721ULL,
+ 0xfc5eeb614adc9011ULL, 0x5229419ae8c411ebULL, 0x9ec3e7787d1dcf74ULL, 0x340d053e216e4cb5ULL,
+ 0xcac7af39b48df2b4ULL, 0xc0faec2871a10a94ULL, 0x140a69245ca575edULL, 0x0cf1c37134273a4cULL,
+ 0xc8ee306ac224b8a5ULL, 0x57eaee7ccb4930b0ULL, 0xa1e806bdaacbe74fULL, 0x7d9a62742eeb657dULL,
+ 0x9eb6b6ef546c4830ULL, 0x885cca1fddb36e2eULL, 0xe6b9f383ef0d7105ULL, 0x58654fef9d2e0412ULL,
+ 0xa905c4ffbe0e8e26ULL, 0x942de5df9b31816eULL, 0x497d723f802e88e1ULL, 0x30684dea602f408dULL,
+ 0x21e5a278a3e6cb34ULL, 0xaefb6e6f5b151dc4ULL, 0xb30b8e049d77ca15ULL, 0x28c3c9cf53b98981ULL,
+ 0x287fb721556cdd2aULL, 0x0d317ca897022274ULL, 0x7468c7423a543258ULL, 0x4a7f11464eb5642fULL,
+ 0xa237a4774d193aa6ULL, 0xd865986ea92129a1ULL, 0x24c515ecf87c1a88ULL, 0x604003575f39f5ebULL,
+ 0x47b9f189570a9b27ULL, 0x2b98cede465e4b78ULL, 0x026df551dbb85c20ULL, 0x74fcd91047e21901ULL,
+ 0x13e2a90a23c1bfa3ULL, 0x0cb0074e478519f6ULL, 0x5ff1cbbe3af6cf44ULL, 0x67fe5438be812dbeULL,
+ 0xd13cf64fa40f05b0ULL, 0x054dfb2f32283787ULL, 0x4173915b7f0d2aeaULL, 0x482f144f1f610d4eULL,
+ 0xf6210201b47f8234ULL, 0x5d0ae1929e70b990ULL, 0xdcd7f455b049567cULL, 0x7e93d0f1f0916f01ULL,
+ 0xdd79cbf18a7db4faULL, 0xbe8391bf6f74c62fULL, 0x027145d14b8291bdULL, 0x585a73ea2cbf1705ULL,
+ 0x485ca03e928a0db2ULL, 0x10fc01a5742857e7ULL, 0x2f482edbd6d551a7ULL, 0x0f0433b5048fdb8aULL,
+ 0x60da2e8dd7dc6247ULL, 0x88b4c9d38cd4819aULL, 0x13033ac001f66697ULL, 0x273b24fe3b367d75ULL,
+ 0xc6e8f66a31b3b9d4ULL, 0x281514a494df49d5ULL, 0xd1726fdfc8b23da7ULL, 0x4b3ae7d103dee548ULL,
+ 0xc6256e19ce4b9d7eULL, 0xff5c5cf186e3c61cULL, 0xacc63ca34b8ec145ULL, 0x74621888fee66574ULL,
+ 0x956f409645290a1eULL, 0xef0bf8e3263a962eULL, 0xed6a50eb5ec2647bULL, 0x0694283a9dca7502ULL,
+ 0x769b963643a2dcd1ULL, 0x42b7c8ea09fc5353ULL, 0x4f002aee13397eabULL, 0x63005e2c19b7d63aULL,
+ 0xca6736da63023beaULL, 0x966c7f6db12a99b7ULL, 0xace09390c537c5e1ULL, 0x0b696063a1aa89eeULL,
+ 0xebb03e97288c56e5ULL, 0x432a9f9f938c8be8ULL, 0xa6a5a93d5b717f71ULL, 0x1a5fb4c3e18f9d97ULL,
+ 0x1c94e7ad1c60cdceULL, 0xee202a43fc02c4a0ULL, 0x8dafe4d867c46a20ULL, 0x0a10263c8ac27b58ULL,
+ 0xd0dea9dfe4432a4aULL, 0x856af87bbe9277c5ULL, 0xce8472acc212c71aULL, 0x6f151b6d9bbb1e91ULL,
+ 0x26776c527ceed56aULL, 0x7d211cb7fbf8faecULL, 0x37ae66a6fd4609ccULL, 0x1f81b702d2770c42ULL,
+ 0x2fb0b057eac58392ULL, 0xe1dd89fe29744e9dULL, 0xc964f8eb17beb4f8ULL, 0x29571073c9a2d41eULL,
+ 0xa948a18981c0e254ULL, 0x2df6369b65b22830ULL, 0xa33eb2d75fcfd3c6ULL, 0x078cd6ec4199a01fULL,
+ 0x4a584a41ad900d2fULL, 0x32142b78e2c74c52ULL, 0x68c4e8338431c978ULL, 0x7f69ea9008689fc2ULL,
+ 0x52f2c81e46a38265ULL, 0xfd78072d04a832fdULL, 0x8cd7d5fa25359e94ULL, 0x4de71b7454cc29d2ULL,
+ 0x42eb60ad1eda6ac9ULL, 0x0aad37dfdbc09c3aULL, 0x81004b71e33cc191ULL, 0x44e6be345122803cULL,
+ 0x03fe8388ba1920dbULL, 0xf5d57c32150db008ULL, 0x49c8c4281af60c29ULL, 0x21edb518de701aeeULL,
+ 0x7fb63e418f06dc99ULL, 0xa4460d99c166d7b8ULL, 0x24dd5248ce520a83ULL, 0x5ec3ad712b928358ULL,
+ 0x15022a5fbd17930fULL, 0xa4f64a77d82570e3ULL, 0x12bc8d6915783712ULL, 0x498194c0fc620abbULL,
+ 0x38a2d9d255686c82ULL, 0x785c6bd9193e21f0ULL, 0xe4d5c81ab24a5484ULL, 0x56307860b2e20989ULL,
+ 0x429d55f78b4d74c4ULL, 0x22f1834643350131ULL, 0x1e60c24598c71fffULL, 0x59f2f014979983efULL,
+ 0x46a47d56eb494a44ULL, 0x3e22a854d636a18eULL, 0xb346e15274491c3bULL, 0x2ceafd4e5390cde7ULL,
+ 0xba8a8538be0d6675ULL, 0x4b9074bb50818e23ULL, 0xcbdab89085d304c3ULL, 0x61a24fe0e56192c4ULL,
+ 0xcb7615e6db525bcbULL, 0xdd7d8c35a567e4caULL, 0xe6b4153acafcdd69ULL, 0x2d668e097f3c9766ULL,
+ 0xa57e7e265ce55ef0ULL, 0x5d9f4e527cd4b967ULL, 0xfbc83606492fd1e5ULL, 0x090d52beb7c3f7aeULL,
+ 0x09b9515a1e7b4d7cULL, 0x1f266a2599da44c0ULL, 0xa1c49548e2c55504ULL, 0x7ef04287126f15ccULL,
+ 0xfed1659dbd30ef15ULL, 0x8b4ab9eec4e0277bULL, 0x884d6236a5df3291ULL, 0x1fd96ea6bf5cf788ULL,
+ 0x42a161981f190d9aULL, 0x61d849507e6052c1ULL, 0x9fe113bf285a2cd5ULL, 0x7c22d676dbad85d8ULL,
+ 0x82e770ed2bfbd27dULL, 0x4c05b2ece996f5a5ULL, 0xcd40a9c2b0900150ULL, 0x5895319213d9bf64ULL,
+ 0xe7cc5d703fea2e08ULL, 0xb50c491258e2188cULL, 0xcce30baa48205bf0ULL, 0x537c659ccfa32d62ULL,
+ 0x37b6623a98cfc088ULL, 0xfe9bed1fa4d6aca4ULL, 0x04d29b8e56a8d1b0ULL, 0x725f71c40b519575ULL,
+ 0x28c7f89cd0339ce6ULL, 0x8367b14469ddc18bULL, 0x883ada83a6a1652cULL, 0x585f1974034d6c17ULL,
+ 0x89cfb266f1b19188ULL, 0xe63b4863e7c35217ULL, 0xd88c9da6b4c0526aULL, 0x3e035c9df0954635ULL,
+ 0xdd9d5412fb45de9dULL, 0xdd684532e4cff40dULL, 0x4b5c999b151d671cULL, 0x2d8c2cc811e7f690ULL,
+ 0x7f54be1d90055d40ULL, 0xa464c5df464aaf40ULL, 0x33979624f0e917beULL, 0x2c018dc527356b30ULL,
+ 0xa5415024e330b3d4ULL, 0x73ff3d96691652d3ULL, 0x94ec42c4ef9b59f1ULL, 0x0747201618d08e5aULL,
+ 0x4d6ca48aca411c53ULL, 0x66415f2fcfa66119ULL, 0x9c4dd40051e227ffULL, 0x59810bc09a02f7ebULL,
+ 0x2a7eb171b3dc101dULL, 0x441c5ab99ffef68eULL, 0x32025c9b93b359eaULL, 0x5e8ce0a71e9d112fULL,
+ 0xbfcccb92429503fdULL, 0xd271ba752f095d55ULL, 0x345ead5e972d091eULL, 0x18c8df11a83103baULL,
+ 0x90cd949a9aed0f4cULL, 0xc5d1f4cb6660e37eULL, 0xb8cac52d56c52e0bULL, 0x6e42e400c5808e0dULL,
+ 0xa3b46966eeaefd23ULL, 0x0c4f1f0be39ecdcaULL, 0x189dc8c9d683a51dULL, 0x51f27f054c09351bULL,
+ 0x4c487ccd2a320682ULL, 0x587ea95bb3df1c96ULL, 0xc8ccf79e555cb8e8ULL, 0x547dc829a206d73dULL,
+ 0xb822a6cd80c39b06ULL, 0xe96d54732000d4c6ULL, 0x28535b6f91463b4dULL, 0x228f4660e2486e1dULL,
+ 0x98799538de8d3abfULL, 0x8cd8330045ebca6eULL, 0x79952a008221e738ULL, 0x4322e1a7535cd2bbULL,
+ 0xb114c11819d1801cULL, 0x2016e4d84f3f5ec7ULL, 0xdd0e2df409260f4cULL, 0x5ec362c0ae5f7266ULL,
+ 0xc0462b18b8b2b4eeULL, 0x7cc8d950274d1afbULL, 0xf25f7105436b02d2ULL, 0x43bbf8dcbff9ccd3ULL,
+ 0xb6ad1767a039e9dfULL, 0xb0714da8f69d3583ULL, 0x5e55fa18b42931f5ULL, 0x4ed5558f33c60961ULL,
+ 0x1fe37901c647a5ddULL, 0x593ddf1f8081d357ULL, 0x0249a4fd813fd7a6ULL, 0x69acca274e9caf61ULL,
+ 0x047ba3ea330721c9ULL, 0x83423fc20e7e1ea0ULL, 0x1df4c0af01314a60ULL, 0x09a62dab89289527ULL,
+ 0xa5b325a49cc6cb00ULL, 0xe94b5dc654b56cb6ULL, 0x3be28779adc994a0ULL, 0x4296e8f8ba3a4aadULL,
+ 0x328689761e451eabULL, 0x2e4d598bff59594aULL, 0x49b96853d7a7084aULL, 0x4980a319601420a8ULL,
+ 0x9565b9e12f552c42ULL, 0x8a5318db7100fe96ULL, 0x05c90b4d43add0d7ULL, 0x538b4cd66a5d4edaULL,
+ 0xf4e94fc3e89f039fULL, 0x592c9af26f618045ULL, 0x08a36eb5fd4b9550ULL, 0x25fffaf6c2ed1419ULL,
+ 0x34434459cc79d354ULL, 0xeeecbfb4b1d5476bULL, 0xddeb34a061615d99ULL, 0x5129cecceb64b773ULL,
+ 0xee43215894993520ULL, 0x772f9c7cf14c0b3bULL, 0xd2e2fce306bedad5ULL, 0x715f42b546f06a97ULL,
+ 0x434ecdceda5b5f1aULL, 0x0da17115a49741a9ULL, 0x680bd77c73edad2eULL, 0x487c02354edd9041ULL,
+ 0xb8efeff3a70ed9c4ULL, 0x56a32aa3e857e302ULL, 0xdf3a68bd48a2a5a0ULL, 0x07f650b73176c444ULL,
+ 0xe38b9b1626e0ccb1ULL, 0x79e053c18b09fb36ULL, 0x56d90319c9f94964ULL, 0x1ca941e7ac9ff5c4ULL,
+ 0x49c4df29162fa0bbULL, 0x8488cf3282b33305ULL, 0x95dfda14cabb437dULL, 0x3391f78264d5ad86ULL,
+ 0x729ae06ae2b5095dULL, 0xd58a58d73259a946ULL, 0xe9834262d13921edULL, 0x27fedafaa54bb592ULL,
+ 0xa99dc5b829ad48bbULL, 0x5f025742499ee260ULL, 0x802c8ecd5d7513fdULL, 0x78ceb3ef3f6dd938ULL,
+ 0xc342f44f8a135d94ULL, 0x7b9edb44828cdda3ULL, 0x9436d11a0537cfe7ULL, 0x5064b164ec1ab4c8ULL,
+ 0x7020eccfd37eb2fcULL, 0x1f31ea3ed90d25fcULL, 0x1b930d7bdfa1bb34ULL, 0x5344467a48113044ULL,
+ 0x70073170f25e6dfbULL, 0xe385dc1a50114cc8ULL, 0x2348698ac8fc4f00ULL, 0x2a77a55284dd40d8ULL,
+ 0xfe06afe0c98c6ce4ULL, 0xc235df96dddfd6e4ULL, 0x1428d01e33bf1ed3ULL, 0x785768ec9300bdafULL,
+ 0x9702e57a91deb63bULL, 0x61bdb8bfe5ce8b80ULL, 0x645b426f3d1d58acULL, 0x4804a82227a557bcULL,
+ 0x8e57048ab44d2601ULL, 0x68d6501a4b3a6935ULL, 0xc39c9ec3f9e1c293ULL, 0x4172f257d4de63e2ULL,
+ 0xd368b450330c6401ULL, 0x040d3017418f2391ULL, 0x2c34bb6090b7d90dULL, 0x16f649228fdfd51fULL,
+ 0xbea6818e2b928ef5ULL, 0xe28ccf91cdc11e72ULL, 0x594aaa68e77a36cdULL, 0x313034806c7ffd0fULL,
+ 0x8a9d27ac2249bd65ULL, 0x19a3b464018e9512ULL, 0xc26ccff352b37ec7ULL, 0x056f68341d797b21ULL,
+ 0x5e79d6757efd2327ULL, 0xfabdbcb6553afe15ULL, 0xd3e7222c6eaf5a60ULL, 0x7046c76d4dae743bULL,
+ 0x660be872b18d4a55ULL, 0x19992518574e1496ULL, 0xc103053a302bdcbbULL, 0x3ed8e9800b218e8eULL,
+ 0x7b0b9239fa75e03eULL, 0xefe9fb684633c083ULL, 0x98a35fbe391a7793ULL, 0x6065510fe2d0fe34ULL,
+ 0x55cb668548abad0cULL, 0xb4584548da87e527ULL, 0x2c43ecea0107c1ddULL, 0x526028809372de35ULL,
+ 0x3415c56af9213b1fULL, 0x5bee1a4d017e98dbULL, 0x13f6b105b5cf709bULL, 0x5ff20e3482b29ab6ULL,
+ 0x0aa29c75cc2e6c90ULL, 0xfc7d73ca3a70e206ULL, 0x899fc38fc4b5c515ULL, 0x250386b124ffc207ULL,
+ 0x54ea28d5ae3d2b56ULL, 0x9913149dd6de60ceULL, 0x16694fc58f06d6c1ULL, 0x46b23975eb018fc7ULL,
+ 0x470a6a0fb4b7b4e2ULL, 0x5d92475a8f7253deULL, 0xabeee5b52fbd3adbULL, 0x7fa20801a0806968ULL,
+ 0x76f3faf19f7714d2ULL, 0xb3e840c12f4660c3ULL, 0x0fb4cd8df212744eULL, 0x4b065a251d3a2dd2ULL,
+ 0x5cebde383d77cd4aULL, 0x6adf39df882c9cb1ULL, 0xa2dd242eb09af759ULL, 0x3147c0e50e5f6422ULL,
+ 0x164ca5101d1350dbULL, 0xf8d13479c33fc962ULL, 0xe640ce4d13e5da08ULL, 0x4bdee0c45061f8baULL,
+ 0xd7c46dc1a4edb1c9ULL, 0x5514d7b6437fd98aULL, 0x58942f6bb2a1c00bULL, 0x2dffb2ab1d70710eULL,
+ 0xccdfcf2fc18b6d68ULL, 0xa8ebcba8b7806167ULL, 0x980697f95e2937e3ULL, 0x02fbba1cd0126e8cULL
+};
-static void curve25519_bmi2_base(u8 session_key[CURVE25519_KEY_SIZE],
- const u8 private_key[CURVE25519_KEY_SIZE])
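+/* Fixed-base scalar multiplication, out = clamp(priv) * G, driven by the
+ * precomputed table_ladder constants above instead of a variable-base
+ * Montgomery ladder. */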
+static void curve25519_ever64_base(u8 *out, const u8 *priv)
{
-#ifdef CONFIG_AS_BMI2
- struct {
- u64 buffer[4 * NUM_WORDS_ELTFP25519];
- u64 coordinates[4 * NUM_WORDS_ELTFP25519];
- u64 workspace[4 * NUM_WORDS_ELTFP25519];
- u8 private[CURVE25519_KEY_SIZE];
- } __aligned(32) m;
-
- const int ite[4] = { 64, 64, 64, 63 };
- const int q = 3;
u64 swap = 1;
-
- int i = 0, j = 0, k = 0;
- u64 *const key = (u64 *)m.private;
- u64 *const Ur1 = m.coordinates + 0;
- u64 *const Zr1 = m.coordinates + 4;
- u64 *const Ur2 = m.coordinates + 8;
- u64 *const Zr2 = m.coordinates + 12;
-
- u64 *const UZr1 = m.coordinates + 0;
- u64 *const ZUr2 = m.coordinates + 8;
-
- u64 *const A = m.workspace + 0;
- u64 *const B = m.workspace + 4;
- u64 *const C = m.workspace + 8;
- u64 *const D = m.workspace + 12;
-
- u64 *const AB = m.workspace + 0;
- u64 *const CD = m.workspace + 8;
-
- const u64 *const P = table_ladder_8k;
-
- memcpy(m.private, private_key, sizeof(m.private));
-
- curve25519_clamp_secret(m.private);
-
- setzero_eltfp25519_1w(Ur1);
- setzero_eltfp25519_1w(Zr1);
- setzero_eltfp25519_1w(Zr2);
- Ur1[0] = 1;
- Zr1[0] = 1;
- Zr2[0] = 1;
-
- /* G-S */
- Ur2[3] = 0x1eaecdeee27cab34UL;
- Ur2[2] = 0xadc7a0b9235d48e2UL;
- Ur2[1] = 0xbbf095ae14b2edf8UL;
- Ur2[0] = 0x7e94e1fec82faabdUL;
-
- /* main-loop */
- j = q;
- for (i = 0; i < NUM_WORDS_ELTFP25519; ++i) {
- while (j < ite[i]) {
- u64 bit = (key[i] >> j) & 0x1;
- k = (64 * i + j - q);
+ int i, j, k;
+ u64 tmp[16 + 32 + 4];
+ u64 *x1 = &tmp[0];
+ u64 *z1 = &tmp[4];
+ u64 *x2 = &tmp[8];
+ u64 *z2 = &tmp[12];
+ u64 *xz1 = &tmp[0];
+ u64 *xz2 = &tmp[8];
+ u64 *a = &tmp[0 + 16];
+ u64 *b = &tmp[4 + 16];
+ u64 *c = &tmp[8 + 16];
+ u64 *ab = &tmp[0 + 16];
+ u64 *abcd = &tmp[0 + 16];
+ u64 *ef = &tmp[16 + 16];
+ u64 *efgh = &tmp[16 + 16];
+ u64 *key = &tmp[0 + 16 + 32];
+
+ memcpy(key, priv, 32);
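+	/* Clamp the scalar as specified by RFC 7748: clear the three low
+	 * bits, clear the top bit, and set bit 254. */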
+ ((u8 *)key)[0] &= 248;
+ ((u8 *)key)[31] = (((u8 *)key)[31] & 127) | 64;
+
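+	/* (x1:z1) starts at the point with affine x-coordinate 1, and
+	 * (x2:z2) at x(9*G - 1*G), i.e. p_minus_s. */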
+ x1[0] = 1, x1[1] = x1[2] = x1[3] = 0;
+ z1[0] = 1, z1[1] = z1[2] = z1[3] = 0;
+ z2[0] = 1, z2[1] = z2[2] = z2[3] = 0;
+ memcpy(x2, p_minus_s, sizeof(p_minus_s));
+
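+	/* Process scalar bits 3 through 254 from low to high, one ladder step
+	 * per bit; the per-limb bounds are 64, 64, 64, 63 because bit 255 is
+	 * cleared by clamping. */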
+ j = 3;
+ for (i = 0; i < 4; ++i) {
+ while (j < (const int[]){ 64, 64, 64, 63 }[i]) {
+ u64 bit = (key[i] >> j) & 1;
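+			/* k indexes the table entry for 2^k * G; the three
+			 * cleared low bits are handled by the doublings at
+			 * the end. */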
+ k = (64 * i + j - 3);
swap = swap ^ bit;
- cswap(swap, Ur1, Ur2);
- cswap(swap, Zr1, Zr2);
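+			/* Swap the two points in constant time whenever the
+			 * current bit differs from the previous one. */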
+ cswap2(swap, xz1, xz2);
swap = bit;
- /* Addition */
- sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
- add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
-			mul_eltfp25519_1w_bmi2(C, &P[4 * k], B); /* C = M0*B */
- sub_eltfp25519_1w(B, A, C); /* B = (Ur1+Zr1) - M*(Ur1-Zr1) */
- add_eltfp25519_1w_bmi2(A, A, C); /* A = (Ur1+Zr1) + M*(Ur1-Zr1) */
- sqr_eltfp25519_2w_bmi2(AB); /* A = A^2 | B = B^2 */
- mul_eltfp25519_2w_bmi2(UZr1, ZUr2, AB); /* Ur1 = Zr2*A | Zr1 = Ur2*B */
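+			/* Differential addition of 2^k * G, using its cached
+			 * ratio (x+z)/(x-z) from table_ladder. */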
+ fsub(b, x1, z1);
+ fadd(a, x1, z1);
+ fmul(c, &table_ladder[4 * k], b, ef);
+ fsub(b, a, c);
+ fadd(a, a, c);
+ fsqr2(ab, ab, efgh);
+ fmul2(xz1, xz2, ab, efgh);
++j;
}
j = 0;
}
- /* Doubling */
- for (i = 0; i < q; ++i) {
- add_eltfp25519_1w_bmi2(A, Ur1, Zr1); /* A = Ur1+Zr1 */
- sub_eltfp25519_1w(B, Ur1, Zr1); /* B = Ur1-Zr1 */
- sqr_eltfp25519_2w_bmi2(AB); /* A = A**2 B = B**2 */
- copy_eltfp25519_1w(C, B); /* C = B */
- sub_eltfp25519_1w(B, A, B); /* B = A-B */
- mul_a24_eltfp25519_1w(D, B); /* D = my_a24*B */
- add_eltfp25519_1w_bmi2(D, D, C); /* D = D+C */
- mul_eltfp25519_2w_bmi2(UZr1, AB, CD); /* Ur1 = A*B Zr1 = Zr1*A */
- }
-
- /* Convert to affine coordinates */
- inv_eltfp25519_1w_bmi2(A, Zr1);
- mul_eltfp25519_1w_bmi2((u64 *)session_key, Ur1, A);
- fred_eltfp25519_1w((u64 *)session_key);
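+	/* The ladder covered bits 3..254 only, so multiply by 8 with three
+	 * doublings; the small-order x = 1 start point drops out here too.
+	 * encode_point converts to affine and stores the result
+	 * little-endian. */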
+ point_double(xz1, abcd, efgh);
+ point_double(xz1, abcd, efgh);
+ point_double(xz1, abcd, efgh);
+ encode_point(out, xz1);
- memzero_explicit(&m, sizeof(m));
-#endif
+ memzero_explicit(tmp, sizeof(tmp));
}