path: root/src/crypto/zinc/curve25519/curve25519-hacl64.c
diff options
authorJason A. Donenfeld <Jason@zx2c4.com>2020-01-21 16:07:52 +0100
committerJason A. Donenfeld <Jason@zx2c4.com>2020-01-21 16:07:52 +0100
commit55df79c6c08e31f24370f3ddc69e37729b4c5676 (patch)
treea1efdab7f7202a86cbda4ece13d9a38e7a722c79 /src/crypto/zinc/curve25519/curve25519-hacl64.c
parentdevice: skb_list_walk_safe moved upstream (diff)
curve25519: x86_64: replace with formally verified implementation
This comes from INRIA's HACL*/Vale. It implements the same algorithm and implementation strategy as the code it replaces, only this code has been formally verified, sans the base point multiplication, which uses code similar to prior, only it uses the formally verified field arithmetic alongside reproducible ladder generation steps. This doesn't have a pure-bmi2 version, which means haswell no longer benefits, but the increased (doubled) code complexity is not worth it for a single generation of chips that's already old. Performance-wise, this is around 1% slower on older microarchitectures, and slightly faster on newer microarchitectures, mainly 10nm ones or backports of 10nm to 14nm. This implementation is "everest" below: Xeon E5-2680 v4 (Broadwell) armfazh: 133340 cycles per call everest: 133436 cycles per call Xeon Gold 5120 (Sky Lake Server) armfazh: 112636 cycles per call everest: 113906 cycles per call Core i5-6300U (Sky Lake Client) armfazh: 116810 cycles per call everest: 117916 cycles per call Core i7-7600U (Kaby Lake) armfazh: 119523 cycles per call everest: 119040 cycles per call Core i7-8750H (Coffee Lake) armfazh: 113914 cycles per call everest: 113650 cycles per call Core i9-9880H (Coffee Lake Refresh) armfazh: 112616 cycles per call everest: 114082 cycles per call Core i3-8121U (Cannon Lake) armfazh: 113202 cycles per call everest: 111382 cycles per call Core i7-8265U (Whiskey Lake) armfazh: 127307 cycles per call everest: 127697 cycles per call Core i7-8550U (Kaby Lake Refresh) armfazh: 127522 cycles per call everest: 127083 cycles per call Xeon Platinum 8275CL (Cascade Lake) armfazh: 114380 cycles per call everest: 114656 cycles per call Achieving these kinds of results with formally verified code is quite remarkable, especially considering that performance is favorable for newer chips. Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Diffstat (limited to 'src/crypto/zinc/curve25519/curve25519-hacl64.c')
1 files changed, 6 insertions, 11 deletions
diff --git a/src/crypto/zinc/curve25519/curve25519-hacl64.c b/src/crypto/zinc/curve25519/curve25519-hacl64.c
index 0f729ec..d6dcd0c 100644
--- a/src/crypto/zinc/curve25519/curve25519-hacl64.c
+++ b/src/crypto/zinc/curve25519/curve25519-hacl64.c
@@ -427,11 +427,6 @@ static __always_inline void fscalar(u64 *output, u64 *b, u64 s)
fproduct_copy_from_wide_(output, tmp);
-static __always_inline void fmul(u64 *output, u64 *a, u64 *b)
- fmul_fmul(output, a, b);
static __always_inline void crecip(u64 *output, u64 *input)
crecip_crecip(output, input);
@@ -498,8 +493,8 @@ static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
memcpy(origxprime0, xprime, 5 * sizeof(*xprime));
fsum(xprime, zprime);
fdifference(zprime, origxprime0);
- fmul(xxprime0, xprime, z);
- fmul(zzprime0, x, zprime);
+ fmul_fmul(xxprime0, xprime, z);
+ fmul_fmul(zzprime0, x, zprime);
origxprime = buf + 5;
u64 *xx0;
@@ -517,7 +512,7 @@ static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
fdifference(zzprime, origxprime);
fsquare_fsquare_times(x3, xxprime, 1);
fsquare_fsquare_times(zzzprime, zzprime, 1);
- fmul(z3, zzzprime, qx);
+ fmul_fmul(z3, zzzprime, qx);
fsquare_fsquare_times(xx0, x, 1);
fsquare_fsquare_times(zz0, z, 1);
@@ -528,12 +523,12 @@ static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
zzz = buf + 10;
xx = buf + 15;
zz = buf + 20;
- fmul(x2, xx, zz);
+ fmul_fmul(x2, xx, zz);
fdifference(zz, xx);
scalar = 121665;
fscalar(zzz, zz, scalar);
fsum(zzz, xx);
- fmul(z2, zzz, zz);
+ fmul_fmul(z2, zzz, zz);
@@ -748,7 +743,7 @@ static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point)
u64 *zmone = buf;
u64 *sc = buf + 5;
crecip(zmone, z);
- fmul(sc, x, zmone);
+ fmul_fmul(sc, x, zmone);
format_fcontract(scalar, sc);