From 2102651f4ee34568a8588fde624ed0bd3b64b1f7 Mon Sep 17 00:00:00 2001 From: karthikbhargavan Date: Mon, 24 Jun 2019 10:07:21 +0200 Subject: curve25519 with new adx code --- Makefile | 2 +- curve25519-evercrypt64.c | 536 +++++++++++++++++++++++ curve25519-hacl51.c | 1052 ++++++++++++++++++++++++++++++++++++++++++++++ curve25519-vale-inline.h | 810 +++++++++++++++++++++++++++++++++++ main.c | 28 +- 5 files changed, 2417 insertions(+), 11 deletions(-) create mode 100644 curve25519-evercrypt64.c create mode 100644 curve25519-hacl51.c create mode 100644 curve25519-vale-inline.h diff --git a/Makefile b/Makefile index f736c12..83fb516 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ ifneq ($(KERNELRELEASE),) -kbench9000-y := main.o curve25519-donna64.o curve25519-hacl64.o curve25519-fiat64.o curve25519-sandy2x.o curve25519-sandy2x-asm.o curve25519-amd64.o curve25519-precomp.o curve25519-amd64-asm.o curve25519-donna32.o curve25519-fiat32.o curve25519-tweetnacl.o +kbench9000-y := main.o curve25519-donna64.o curve25519-evercrypt64.o curve25519-hacl51.o curve25519-fiat64.o curve25519-sandy2x.o curve25519-sandy2x-asm.o curve25519-amd64.o curve25519-precomp.o curve25519-amd64-asm.o curve25519-donna32.o curve25519-fiat32.o curve25519-tweetnacl.o obj-m := kbench9000.o ccflags-y += -O3 ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt' diff --git a/curve25519-evercrypt64.c b/curve25519-evercrypt64.c new file mode 100644 index 0000000..2d45ce6 --- /dev/null +++ b/curve25519-evercrypt64.c @@ -0,0 +1,536 @@ +/* + This file was generated by KreMLin + KreMLin invocation: /home/bhargava/Desktop/repositories/kremlin/krml -fbuiltin-uint128 -fnocompound-literals -fc89-scope -fparentheses -fcurly-braces -funroll-loops 8 -warn-error +9 -add-include "kremlib.h" -add-include "FStar_UInt_8_16_32_64.h" /dist/minimal/testlib.c -skip-compilation -add-include "vale_25519.h" -library C,FStar -bundle Lib.* -bundle Spec.* -bundle C=C.Endianness -bundle Hacl.Curve25519_64=Hacl.Impl.* -drop FStar.UInt128,Hacl.Impl.Curve25519.Field64.Core,LowStar,Spec,Prims,Lib,C.Loops.*,Hacl.Spec.*,Hacl.Curve25519_51 -no-prefix Hacl.Impl.Curve25519.Field64.Core -tmpdir curve25519-c .output/prims.krml .output/FStar_Pervasives_Native.krml .output/FStar_Pervasives.krml .output/FStar_Reflection_Types.krml .output/FStar_Reflection_Data.krml .output/FStar_Order.krml .output/FStar_Reflection_Basic.krml .output/FStar_Mul.krml .output/FStar_Preorder.krml .output/FStar_Calc.krml .output/FStar_Squash.krml .output/FStar_Classical.krml .output/FStar_StrongExcludedMiddle.krml .output/FStar_FunctionalExtensionality.krml .output/FStar_List_Tot_Base.krml .output/FStar_List_Tot_Properties.krml .output/FStar_List_Tot.krml .output/FStar_Seq_Base.krml .output/FStar_Seq_Properties.krml .output/FStar_Seq.krml .output/FStar_Math_Lib.krml .output/FStar_Math_Lemmas.krml .output/FStar_BitVector.krml .output/FStar_UInt.krml .output/FStar_UInt32.krml .output/FStar_Int.krml .output/FStar_Int16.krml .output/FStar_Ghost.krml .output/FStar_ErasedLogic.krml .output/FStar_UInt64.krml .output/FStar_Set.krml .output/FStar_PropositionalExtensionality.krml .output/FStar_PredicateExtensionality.krml .output/FStar_TSet.krml .output/FStar_Monotonic_Heap.krml .output/FStar_Heap.krml .output/FStar_Map.krml .output/FStar_Monotonic_HyperHeap.krml .output/FStar_Monotonic_HyperStack.krml .output/FStar_HyperStack.krml .output/FStar_Monotonic_Witnessed.krml .output/FStar_HyperStack_ST.krml .output/FStar_HyperStack_All.krml .output/FStar_Universe.krml .output/FStar_GSet.krml 
.output/FStar_ModifiesGen.krml .output/FStar_Range.krml .output/FStar_Tactics_Types.krml .output/FStar_Tactics_Result.krml .output/FStar_Tactics_Effect.krml .output/FStar_Tactics_Util.krml .output/FStar_Reflection_Const.krml .output/FStar_Char.krml .output/FStar_Exn.krml .output/FStar_ST.krml .output/FStar_All.krml .output/FStar_List.krml .output/FStar_String.krml .output/FStar_Reflection_Derived.krml .output/FStar_Tactics_Builtins.krml .output/FStar_Reflection_Formula.krml .output/FStar_Reflection_Derived_Lemmas.krml .output/FStar_Reflection.krml .output/FStar_Tactics_Derived.krml .output/FStar_Tactics_Logic.krml .output/FStar_Tactics.krml .output/FStar_BigOps.krml .output/LowStar_Monotonic_Buffer.krml .output/LowStar_Buffer.krml .output/LowStar_BufferOps.krml .output/Spec_Loops.krml .output/C_Loops.krml .output/FStar_Int64.krml .output/FStar_Int63.krml .output/FStar_Int32.krml .output/FStar_Int8.krml .output/FStar_UInt63.krml .output/FStar_UInt16.krml .output/FStar_UInt8.krml .output/FStar_Int_Cast.krml .output/FStar_UInt128.krml .output/FStar_Int_Cast_Full.krml .output/Lib_IntTypes.krml .output/Lib_Loops.krml .output/Lib_LoopCombinators.krml .output/Lib_RawIntTypes.krml .output/Lib_Sequence.krml .output/Lib_ByteSequence.krml .output/Hacl_Impl_Curve25519_Lemmas.krml .output/Spec_Curve25519_Lemmas.krml .output/Spec_Curve25519.krml .output/Hacl_Spec_Curve25519_Field51_Definition.krml .output/FStar_Reflection_Arith.krml .output/FStar_Tactics_Canon.krml .output/Hacl_Spec_Curve25519_Field51_Lemmas.krml .output/Hacl_Spec_Curve25519_Field51.krml .output/LowStar_ImmutableBuffer.krml .output/Lib_Buffer.krml .output/Hacl_Impl_Curve25519_Field51.krml .output/Hacl_Spec_Curve25519_AddAndDouble.krml .output/Hacl_Spec_Curve25519_Field64_Definition.krml .output/Hacl_Spec_Curve25519_Field64_Lemmas.krml .output/Hacl_Spec_Curve25519_Field64_Core.krml .output/Hacl_Spec_Curve25519_Field64.krml .output/Hacl_Impl_Curve25519_Field64_Core.krml .output/Hacl_Impl_Curve25519_Field64.krml .output/Hacl_Impl_Curve25519_Fields.krml .output/FStar_Kremlin_Endianness.krml .output/C_Endianness.krml .output/C.krml .output/Lib_ByteBuffer.krml .output/Hacl_Impl_Curve25519_AddAndDouble.krml .output/Hacl_Spec_Curve25519_Finv.krml .output/Hacl_Impl_Curve25519_Finv.krml .output/Hacl_Impl_Curve25519_Generic.krml .output/Hacl_Curve25519_51.krml .output/Hacl_Curve25519_64.krml + F* version: 36529b85 + KreMLin version: 5acc2a95 + */ + +#include "curve25519-vale-inline.h" +enum { CURVE25519_POINT_SIZE = 32 }; + +#define KRML_CHECK_SIZE(a,b) {} +#define load64_le(b_i) le64_to_cpup((__force __le64 *)b_i) +#define store64_le(b_o,o) *(__force __le64 *)(b_o) = cpu_to_le64(o) + +__always_inline static uint64_t FStar_UInt64_eq_mask(uint64_t a, uint64_t b) +{ + uint64_t x = a ^ b; + uint64_t minus_x = ~x + (uint64_t)1U; + uint64_t x_or_minus_x = x | minus_x; + uint64_t xnx = x_or_minus_x >> (uint32_t)63U; + return xnx - (uint64_t)1U; +} + +__always_inline static uint64_t FStar_UInt64_gte_mask(uint64_t a, uint64_t b) +{ + uint64_t x = a; + uint64_t y = b; + uint64_t x_xor_y = x ^ y; + uint64_t x_sub_y = x - y; + uint64_t x_sub_y_xor_y = x_sub_y ^ y; + uint64_t q = x_xor_y | x_sub_y_xor_y; + uint64_t x_xor_q = x ^ q; + uint64_t x_xor_q_ = x_xor_q >> (uint32_t)63U; + return x_xor_q_ - (uint64_t)1U; +} + +inline static void +Hacl_Impl_Curve25519_AddAndDouble_point_add_and_double_64( + uint64_t *q, + uint64_t *p01_tmp1, + uint64_t *tmp2 +) +{ + uint64_t *nq = p01_tmp1; + uint64_t *nq_p1 = p01_tmp1 + (uint32_t)8U; + uint64_t *tmp1 = p01_tmp1 + 
(uint32_t)16U; + uint64_t *x1 = q; + uint64_t *x2 = nq; + uint64_t *z2 = nq + (uint32_t)4U; + uint64_t *z3 = nq_p1 + (uint32_t)4U; + uint64_t *a = tmp1; + uint64_t *b = tmp1 + (uint32_t)4U; + uint64_t *ab = tmp1; + uint64_t *dc = tmp1 + (uint32_t)8U; + uint64_t *x3; + uint64_t *z31; + uint64_t *d0; + uint64_t *c0; + uint64_t *a1; + uint64_t *b1; + uint64_t *d; + uint64_t *c; + uint64_t *ab1; + uint64_t *dc1; + fadd(a, x2, z2); + fsub(b, x2, z2); + x3 = nq_p1; + z31 = nq_p1 + (uint32_t)4U; + d0 = dc; + c0 = dc + (uint32_t)4U; + fadd(c0, x3, z31); + fsub(d0, x3, z31); + fmul2(dc, dc, ab, tmp2); + fadd(x3, d0, c0); + fsub(z31, d0, c0); + a1 = tmp1; + b1 = tmp1 + (uint32_t)4U; + d = tmp1 + (uint32_t)8U; + c = tmp1 + (uint32_t)12U; + ab1 = tmp1; + dc1 = tmp1 + (uint32_t)8U; + fsqr2(dc1, ab1, tmp2); + fsqr2(nq_p1, nq_p1, tmp2); + a1[0U] = c[0U]; + a1[1U] = c[1U]; + a1[2U] = c[2U]; + a1[3U] = c[3U]; + fsub(c, d, c); + fmul1(b1, c, (uint64_t)121665U); + fadd(b1, b1, d); + fmul2(nq, dc1, ab1, tmp2); + fmul(z3, z3, x1, tmp2); +} + +inline static void +Hacl_Impl_Curve25519_AddAndDouble_point_double_64(uint64_t *nq, uint64_t *tmp1, uint64_t *tmp2) +{ + uint64_t *x2 = nq; + uint64_t *z2 = nq + (uint32_t)4U; + uint64_t *a = tmp1; + uint64_t *b = tmp1 + (uint32_t)4U; + uint64_t *d = tmp1 + (uint32_t)8U; + uint64_t *c = tmp1 + (uint32_t)12U; + uint64_t *ab = tmp1; + uint64_t *dc = tmp1 + (uint32_t)8U; + fadd(a, x2, z2); + fsub(b, x2, z2); + fsqr2(dc, ab, tmp2); + a[0U] = c[0U]; + a[1U] = c[1U]; + a[2U] = c[2U]; + a[3U] = c[3U]; + fsub(c, d, c); + fmul1(b, c, (uint64_t)121665U); + fadd(b, b, d); + fmul2(nq, dc, ab, tmp2); +} + +inline static void +Hacl_Impl_Curve25519_Finv_fsquare_times_64( + uint64_t *o, + uint64_t *i, + uint64_t *tmp, + uint32_t n1 +) +{ + uint32_t i0; + fsqr(o, i, tmp); + for (i0 = (uint32_t)0U; i0 < n1 - (uint32_t)1U; i0 = i0 + (uint32_t)1U) + { + fsqr(o, o, tmp); + } +} + +inline static void Hacl_Impl_Curve25519_Finv_finv_64(uint64_t *o, uint64_t *i, uint64_t *tmp) +{ + uint64_t t1[16U] = { 0U }; + uint64_t *a0 = t1; + uint64_t *b = t1 + (uint32_t)4U; + uint64_t *c = t1 + (uint32_t)8U; + uint64_t *t00 = t1 + (uint32_t)12U; + uint64_t *tmp1 = tmp; + uint64_t *a; + uint64_t *t0; + Hacl_Impl_Curve25519_Finv_fsquare_times_64(a0, i, tmp1, (uint32_t)1U); + Hacl_Impl_Curve25519_Finv_fsquare_times_64(t00, a0, tmp1, (uint32_t)2U); + fmul(b, t00, i, tmp); + fmul(a0, b, a0, tmp); + Hacl_Impl_Curve25519_Finv_fsquare_times_64(t00, a0, tmp1, (uint32_t)1U); + fmul(b, t00, b, tmp); + Hacl_Impl_Curve25519_Finv_fsquare_times_64(t00, b, tmp1, (uint32_t)5U); + fmul(b, t00, b, tmp); + Hacl_Impl_Curve25519_Finv_fsquare_times_64(t00, b, tmp1, (uint32_t)10U); + fmul(c, t00, b, tmp); + Hacl_Impl_Curve25519_Finv_fsquare_times_64(t00, c, tmp1, (uint32_t)20U); + fmul(t00, t00, c, tmp); + Hacl_Impl_Curve25519_Finv_fsquare_times_64(t00, t00, tmp1, (uint32_t)10U); + fmul(b, t00, b, tmp); + Hacl_Impl_Curve25519_Finv_fsquare_times_64(t00, b, tmp1, (uint32_t)50U); + fmul(c, t00, b, tmp); + Hacl_Impl_Curve25519_Finv_fsquare_times_64(t00, c, tmp1, (uint32_t)100U); + fmul(t00, t00, c, tmp); + Hacl_Impl_Curve25519_Finv_fsquare_times_64(t00, t00, tmp1, (uint32_t)50U); + fmul(t00, t00, b, tmp); + Hacl_Impl_Curve25519_Finv_fsquare_times_64(t00, t00, tmp1, (uint32_t)5U); + a = t1; + t0 = t1 + (uint32_t)12U; + fmul(o, t0, a, tmp); +} + +inline static void +Hacl_Impl_Curve25519_Generic_montgomery_ladder_64(uint64_t *out, uint8_t *key, uint64_t *init1) +{ + uint64_t tmp2[16U] = { 0U }; + uint64_t p01_tmp1_swap[33U] = { 0U }; + 
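+  /*
+   * A sketch of the scratch layout, inferred from the offsets used below
+   * (each Field64 element is 4 limbs, so a projective point (X:Z) is 8):
+   *   p01_tmp1_swap[ 0.. 7]  nq     - the accumulator point
+   *   p01_tmp1_swap[ 8..15]  nq_p1  - the point one ladder step ahead
+   *   p01_tmp1_swap[16..31]  tmp1   - ladder-step scratch
+   *   p01_tmp1_swap[32]      swap   - the previous scalar bit, fed to cswap2
+   */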
uint64_t *p0 = p01_tmp1_swap; + uint64_t *p01 = p01_tmp1_swap; + uint64_t *p03 = p01; + uint64_t *p11 = p01 + (uint32_t)8U; + uint64_t *x0; + uint64_t *z0; + uint64_t *p01_tmp1; + uint64_t *p01_tmp11; + uint64_t *nq10; + uint64_t *nq_p11; + uint64_t *swap1; + uint64_t sw0; + uint64_t *nq1; + uint64_t *tmp1; + memcpy(p11, init1, (uint32_t)8U * sizeof init1[0U]); + x0 = p03; + z0 = p03 + (uint32_t)4U; + x0[0U] = (uint64_t)1U; + x0[1U] = (uint64_t)0U; + x0[2U] = (uint64_t)0U; + x0[3U] = (uint64_t)0U; + z0[0U] = (uint64_t)0U; + z0[1U] = (uint64_t)0U; + z0[2U] = (uint64_t)0U; + z0[3U] = (uint64_t)0U; + p01_tmp1 = p01_tmp1_swap; + p01_tmp11 = p01_tmp1_swap; + nq10 = p01_tmp1_swap; + nq_p11 = p01_tmp1_swap + (uint32_t)8U; + swap1 = p01_tmp1_swap + (uint32_t)32U; + cswap2((uint64_t)1U, nq10, nq_p11); + Hacl_Impl_Curve25519_AddAndDouble_point_add_and_double_64(init1, p01_tmp11, tmp2); + swap1[0U] = (uint64_t)1U; + { + uint32_t i; + for (i = (uint32_t)0U; i < (uint32_t)251U; i = i + (uint32_t)1U) + { + uint64_t *p01_tmp12 = p01_tmp1_swap; + uint64_t *swap2 = p01_tmp1_swap + (uint32_t)32U; + uint64_t *nq2 = p01_tmp12; + uint64_t *nq_p12 = p01_tmp12 + (uint32_t)8U; + uint64_t + bit = + (uint64_t)(key[((uint32_t)253U - i) + / (uint32_t)8U] + >> ((uint32_t)253U - i) % (uint32_t)8U + & (uint8_t)1U); + uint64_t sw = swap2[0U] ^ bit; + cswap2(sw, nq2, nq_p12); + Hacl_Impl_Curve25519_AddAndDouble_point_add_and_double_64(init1, p01_tmp12, tmp2); + swap2[0U] = bit; + } + } + sw0 = swap1[0U]; + cswap2(sw0, nq10, nq_p11); + nq1 = p01_tmp1; + tmp1 = p01_tmp1 + (uint32_t)16U; + Hacl_Impl_Curve25519_AddAndDouble_point_double_64(nq1, tmp1, tmp2); + Hacl_Impl_Curve25519_AddAndDouble_point_double_64(nq1, tmp1, tmp2); + Hacl_Impl_Curve25519_AddAndDouble_point_double_64(nq1, tmp1, tmp2); + memcpy(out, p0, (uint32_t)8U * sizeof p0[0U]); +} + +static uint8_t +Hacl_Impl_Curve25519_Generic_g25519[32U] = + { + (uint8_t)9U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, + (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, + (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, + (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, + (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U + }; + +void Hacl_Curve25519_64_secret_to_public(uint8_t *pub, uint8_t *priv) +{ + uint8_t basepoint[32U] = { 0U }; + { + uint32_t i; + for (i = (uint32_t)0U; i < (uint32_t)32U; i = i + (uint32_t)1U) + { + uint8_t *os = basepoint; + uint8_t x = Hacl_Impl_Curve25519_Generic_g25519[i]; + os[i] = x; + } + } + KRML_CHECK_SIZE(sizeof (uint64_t), (uint32_t)2U * (uint32_t)4U); + { + uint64_t init1[(uint32_t)2U * (uint32_t)4U]; + memset(init1, 0U, (uint32_t)2U * (uint32_t)4U * sizeof init1[0U]); + { + uint64_t tmp0[4U] = { 0U }; + uint64_t tmp3; + uint64_t *x; + uint64_t *z0; + { + uint64_t *os = tmp0; + uint8_t *bj = basepoint + (uint32_t)0U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t x = r; + os[0U] = x; + } + { + uint64_t *os = tmp0; + uint8_t *bj = basepoint + (uint32_t)1U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t x = r; + os[1U] = x; + } + { + uint64_t *os = tmp0; + uint8_t *bj = basepoint + (uint32_t)2U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t x = r; + os[2U] = x; + } + { + uint64_t *os = tmp0; + uint8_t *bj = basepoint + (uint32_t)3U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t 
x = r; + os[3U] = x; + } + tmp3 = tmp0[3U]; + tmp0[3U] = tmp3 & (uint64_t)0x7fffffffffffffffU; + x = init1; + z0 = init1 + (uint32_t)4U; + z0[0U] = (uint64_t)1U; + z0[1U] = (uint64_t)0U; + z0[2U] = (uint64_t)0U; + z0[3U] = (uint64_t)0U; + x[0U] = tmp0[0U]; + x[1U] = tmp0[1U]; + x[2U] = tmp0[2U]; + x[3U] = tmp0[3U]; + Hacl_Impl_Curve25519_Generic_montgomery_ladder_64(init1, priv, init1); + { + uint64_t *x0 = init1; + uint64_t *z = init1 + (uint32_t)4U; + uint64_t tmp[4U] = { 0U }; + uint64_t u64s[4U] = { 0U }; + KRML_CHECK_SIZE(sizeof (uint64_t), (uint32_t)2U * (uint32_t)8U); + { + uint64_t tmp_w[(uint32_t)2U * (uint32_t)8U]; + memset(tmp_w, 0U, (uint32_t)2U * (uint32_t)8U * sizeof tmp_w[0U]); + { + uint64_t f30; + uint64_t top_bit0; + uint64_t carry0; + uint64_t f31; + uint64_t top_bit; + uint64_t carry; + uint64_t f0; + uint64_t f1; + uint64_t f2; + uint64_t f3; + uint64_t m0; + uint64_t m1; + uint64_t m2; + uint64_t m3; + uint64_t mask; + uint64_t f0_; + uint64_t f1_; + uint64_t f2_; + uint64_t f3_; + uint64_t o0; + uint64_t o1; + uint64_t o2; + uint64_t o3; + Hacl_Impl_Curve25519_Finv_finv_64(tmp, z, tmp_w); + fmul(tmp, tmp, x0, tmp_w); + f30 = tmp[3U]; + top_bit0 = f30 >> (uint32_t)63U; + tmp[3U] = f30 & (uint64_t)0x7fffffffffffffffU; + carry0 = add1(tmp, tmp, (uint64_t)19U * top_bit0); + f31 = tmp[3U]; + top_bit = f31 >> (uint32_t)63U; + tmp[3U] = f31 & (uint64_t)0x7fffffffffffffffU; + carry = add1(tmp, tmp, (uint64_t)19U * top_bit); + f0 = tmp[0U]; + f1 = tmp[1U]; + f2 = tmp[2U]; + f3 = tmp[3U]; + m0 = FStar_UInt64_gte_mask(f0, (uint64_t)0xffffffffffffffedU); + m1 = FStar_UInt64_eq_mask(f1, (uint64_t)0xffffffffffffffffU); + m2 = FStar_UInt64_eq_mask(f2, (uint64_t)0xffffffffffffffffU); + m3 = FStar_UInt64_eq_mask(f3, (uint64_t)0x7fffffffffffffffU); + mask = ((m0 & m1) & m2) & m3; + f0_ = f0 - (mask & (uint64_t)0xffffffffffffffedU); + f1_ = f1 - (mask & (uint64_t)0xffffffffffffffffU); + f2_ = f2 - (mask & (uint64_t)0xffffffffffffffffU); + f3_ = f3 - (mask & (uint64_t)0x7fffffffffffffffU); + o0 = f0_; + o1 = f1_; + o2 = f2_; + o3 = f3_; + u64s[0U] = o0; + u64s[1U] = o1; + u64s[2U] = o2; + u64s[3U] = o3; + { + store64_le(pub + (uint32_t)0U * (uint32_t)8U, u64s[0U]); + } + { + store64_le(pub + (uint32_t)1U * (uint32_t)8U, u64s[1U]); + } + { + store64_le(pub + (uint32_t)2U * (uint32_t)8U, u64s[2U]); + } + { + store64_le(pub + (uint32_t)3U * (uint32_t)8U, u64s[3U]); + } + } + } + } + } + } +} + +void curve25519_evercrypt64(uint8_t *shared, uint8_t *my_priv, uint8_t *their_pub) +{ + KRML_CHECK_SIZE(sizeof (uint64_t), (uint32_t)2U * (uint32_t)4U); + { + uint64_t init1[(uint32_t)2U * (uint32_t)4U]; + memset(init1, 0U, (uint32_t)2U * (uint32_t)4U * sizeof init1[0U]); + { + uint64_t tmp0[4U] = { 0U }; + uint64_t tmp3; + uint64_t *x; + uint64_t *z0; + { + uint64_t *os = tmp0; + uint8_t *bj = their_pub + (uint32_t)0U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t x = r; + os[0U] = x; + } + { + uint64_t *os = tmp0; + uint8_t *bj = their_pub + (uint32_t)1U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t x = r; + os[1U] = x; + } + { + uint64_t *os = tmp0; + uint8_t *bj = their_pub + (uint32_t)2U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t x = r; + os[2U] = x; + } + { + uint64_t *os = tmp0; + uint8_t *bj = their_pub + (uint32_t)3U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t x = r; + os[3U] = x; + } + tmp3 = tmp0[3U]; + tmp0[3U] = tmp3 & (uint64_t)0x7fffffffffffffffU; + x = init1; + z0 = 
init1 + (uint32_t)4U; + z0[0U] = (uint64_t)1U; + z0[1U] = (uint64_t)0U; + z0[2U] = (uint64_t)0U; + z0[3U] = (uint64_t)0U; + x[0U] = tmp0[0U]; + x[1U] = tmp0[1U]; + x[2U] = tmp0[2U]; + x[3U] = tmp0[3U]; + Hacl_Impl_Curve25519_Generic_montgomery_ladder_64(init1, my_priv, init1); + { + uint64_t *x0 = init1; + uint64_t *z = init1 + (uint32_t)4U; + uint64_t tmp[4U] = { 0U }; + uint64_t u64s[4U] = { 0U }; + KRML_CHECK_SIZE(sizeof (uint64_t), (uint32_t)2U * (uint32_t)8U); + { + uint64_t tmp_w[(uint32_t)2U * (uint32_t)8U]; + memset(tmp_w, 0U, (uint32_t)2U * (uint32_t)8U * sizeof tmp_w[0U]); + { + uint64_t f30; + uint64_t top_bit0; + uint64_t carry0; + uint64_t f31; + uint64_t top_bit; + uint64_t carry; + uint64_t f0; + uint64_t f1; + uint64_t f2; + uint64_t f3; + uint64_t m0; + uint64_t m1; + uint64_t m2; + uint64_t m3; + uint64_t mask; + uint64_t f0_; + uint64_t f1_; + uint64_t f2_; + uint64_t f3_; + uint64_t o0; + uint64_t o1; + uint64_t o2; + uint64_t o3; + Hacl_Impl_Curve25519_Finv_finv_64(tmp, z, tmp_w); + fmul(tmp, tmp, x0, tmp_w); + f30 = tmp[3U]; + top_bit0 = f30 >> (uint32_t)63U; + tmp[3U] = f30 & (uint64_t)0x7fffffffffffffffU; + carry0 = add1(tmp, tmp, (uint64_t)19U * top_bit0); + f31 = tmp[3U]; + top_bit = f31 >> (uint32_t)63U; + tmp[3U] = f31 & (uint64_t)0x7fffffffffffffffU; + carry = add1(tmp, tmp, (uint64_t)19U * top_bit); + f0 = tmp[0U]; + f1 = tmp[1U]; + f2 = tmp[2U]; + f3 = tmp[3U]; + m0 = FStar_UInt64_gte_mask(f0, (uint64_t)0xffffffffffffffedU); + m1 = FStar_UInt64_eq_mask(f1, (uint64_t)0xffffffffffffffffU); + m2 = FStar_UInt64_eq_mask(f2, (uint64_t)0xffffffffffffffffU); + m3 = FStar_UInt64_eq_mask(f3, (uint64_t)0x7fffffffffffffffU); + mask = ((m0 & m1) & m2) & m3; + f0_ = f0 - (mask & (uint64_t)0xffffffffffffffedU); + f1_ = f1 - (mask & (uint64_t)0xffffffffffffffffU); + f2_ = f2 - (mask & (uint64_t)0xffffffffffffffffU); + f3_ = f3 - (mask & (uint64_t)0x7fffffffffffffffU); + o0 = f0_; + o1 = f1_; + o2 = f2_; + o3 = f3_; + u64s[0U] = o0; + u64s[1U] = o1; + u64s[2U] = o2; + u64s[3U] = o3; + { + store64_le(shared + (uint32_t)0U * (uint32_t)8U, u64s[0U]); + } + { + store64_le(shared + (uint32_t)1U * (uint32_t)8U, u64s[1U]); + } + { + store64_le(shared + (uint32_t)2U * (uint32_t)8U, u64s[2U]); + } + { + store64_le(shared + (uint32_t)3U * (uint32_t)8U, u64s[3U]); + } + } + } + } + } + } +} + diff --git a/curve25519-hacl51.c b/curve25519-hacl51.c new file mode 100644 index 0000000..39bc4b9 --- /dev/null +++ b/curve25519-hacl51.c @@ -0,0 +1,1052 @@ +/* + This file was generated by KreMLin + KreMLin invocation: /home/bhargava/Desktop/repositories/kremlin/krml -fbuiltin-uint128 -fnocompound-literals -fc89-scope -fparentheses -fcurly-braces -funroll-loops 8 -warn-error +9 -add-include "kremlib.h" -add-include "FStar_UInt_8_16_32_64.h" /dist/minimal/testlib.c -skip-compilation -library C,FStar -bundle Lib.* -bundle Spec.* -bundle C=C.Endianness -bundle Hacl.Curve25519_51=Hacl.Impl.* -drop FStar.UInt128,LowStar,Spec,Prims,Lib,C.Loops.*,Hacl.Spec.*,Hacl.Curve25519_64 -tmpdir curve25519-c .output/prims.krml .output/FStar_Pervasives_Native.krml .output/FStar_Pervasives.krml .output/FStar_Reflection_Types.krml .output/FStar_Reflection_Data.krml .output/FStar_Order.krml .output/FStar_Reflection_Basic.krml .output/FStar_Mul.krml .output/FStar_Preorder.krml .output/FStar_Calc.krml .output/FStar_Squash.krml .output/FStar_Classical.krml .output/FStar_StrongExcludedMiddle.krml .output/FStar_FunctionalExtensionality.krml .output/FStar_List_Tot_Base.krml .output/FStar_List_Tot_Properties.krml 
.output/FStar_List_Tot.krml .output/FStar_Seq_Base.krml .output/FStar_Seq_Properties.krml .output/FStar_Seq.krml .output/FStar_Math_Lib.krml .output/FStar_Math_Lemmas.krml .output/FStar_BitVector.krml .output/FStar_UInt.krml .output/FStar_UInt32.krml .output/FStar_Int.krml .output/FStar_Int16.krml .output/FStar_Ghost.krml .output/FStar_ErasedLogic.krml .output/FStar_UInt64.krml .output/FStar_Set.krml .output/FStar_PropositionalExtensionality.krml .output/FStar_PredicateExtensionality.krml .output/FStar_TSet.krml .output/FStar_Monotonic_Heap.krml .output/FStar_Heap.krml .output/FStar_Map.krml .output/FStar_Monotonic_HyperHeap.krml .output/FStar_Monotonic_HyperStack.krml .output/FStar_HyperStack.krml .output/FStar_Monotonic_Witnessed.krml .output/FStar_HyperStack_ST.krml .output/FStar_HyperStack_All.krml .output/FStar_Universe.krml .output/FStar_GSet.krml .output/FStar_ModifiesGen.krml .output/FStar_Range.krml .output/FStar_Tactics_Types.krml .output/FStar_Tactics_Result.krml .output/FStar_Tactics_Effect.krml .output/FStar_Tactics_Util.krml .output/FStar_Reflection_Const.krml .output/FStar_Char.krml .output/FStar_Exn.krml .output/FStar_ST.krml .output/FStar_All.krml .output/FStar_List.krml .output/FStar_String.krml .output/FStar_Reflection_Derived.krml .output/FStar_Tactics_Builtins.krml .output/FStar_Reflection_Formula.krml .output/FStar_Reflection_Derived_Lemmas.krml .output/FStar_Reflection.krml .output/FStar_Tactics_Derived.krml .output/FStar_Tactics_Logic.krml .output/FStar_Tactics.krml .output/FStar_BigOps.krml .output/LowStar_Monotonic_Buffer.krml .output/LowStar_Buffer.krml .output/LowStar_BufferOps.krml .output/Spec_Loops.krml .output/C_Loops.krml .output/FStar_Int64.krml .output/FStar_Int63.krml .output/FStar_Int32.krml .output/FStar_Int8.krml .output/FStar_UInt63.krml .output/FStar_UInt16.krml .output/FStar_UInt8.krml .output/FStar_Int_Cast.krml .output/FStar_UInt128.krml .output/FStar_Int_Cast_Full.krml .output/Lib_IntTypes.krml .output/Lib_Loops.krml .output/Lib_LoopCombinators.krml .output/Lib_RawIntTypes.krml .output/Lib_Sequence.krml .output/Lib_ByteSequence.krml .output/Hacl_Impl_Curve25519_Lemmas.krml .output/Spec_Curve25519_Lemmas.krml .output/Spec_Curve25519.krml .output/Hacl_Spec_Curve25519_Field51_Definition.krml .output/FStar_Reflection_Arith.krml .output/FStar_Tactics_Canon.krml .output/Hacl_Spec_Curve25519_Field51_Lemmas.krml .output/Hacl_Spec_Curve25519_Field51.krml .output/LowStar_ImmutableBuffer.krml .output/Lib_Buffer.krml .output/Hacl_Impl_Curve25519_Field51.krml .output/Hacl_Spec_Curve25519_AddAndDouble.krml .output/Hacl_Spec_Curve25519_Field64_Definition.krml .output/Hacl_Spec_Curve25519_Field64_Lemmas.krml .output/Hacl_Spec_Curve25519_Field64_Core.krml .output/Hacl_Spec_Curve25519_Field64.krml .output/Hacl_Impl_Curve25519_Field64_Core.krml .output/Hacl_Impl_Curve25519_Field64.krml .output/Hacl_Impl_Curve25519_Fields.krml .output/FStar_Kremlin_Endianness.krml .output/C_Endianness.krml .output/C.krml .output/Lib_ByteBuffer.krml .output/Hacl_Impl_Curve25519_AddAndDouble.krml .output/Hacl_Spec_Curve25519_Finv.krml .output/Hacl_Impl_Curve25519_Finv.krml .output/Hacl_Impl_Curve25519_Generic.krml .output/Hacl_Curve25519_51.krml .output/Hacl_Curve25519_64.krml + F* version: 36529b85 + KreMLin version: 5acc2a95 + */ + + +#include +#include + +enum { CURVE25519_POINT_SIZE = 32 }; +typedef __uint128_t uint128_t; +typedef u64 uint64_t; +typedef u8 uint8_t; + +#define KRML_CHECK_SIZE(a,b) {} +#define load64_le(b_i) le64_to_cpup((__force __le64 *)b_i) +#define 
store64_le(b_o,o) *(__force __le64 *)(b_o) = cpu_to_le64(o) + +__always_inline static uint64_t FStar_UInt64_eq_mask(uint64_t a, uint64_t b) +{ + uint64_t x = a ^ b; + uint64_t minus_x = ~x + (uint64_t)1U; + uint64_t x_or_minus_x = x | minus_x; + uint64_t xnx = x_or_minus_x >> (uint32_t)63U; + return xnx - (uint64_t)1U; +} + +__always_inline static uint64_t FStar_UInt64_gte_mask(uint64_t a, uint64_t b) +{ + uint64_t x = a; + uint64_t y = b; + uint64_t x_xor_y = x ^ y; + uint64_t x_sub_y = x - y; + uint64_t x_sub_y_xor_y = x_sub_y ^ y; + uint64_t q = x_xor_y | x_sub_y_xor_y; + uint64_t x_xor_q = x ^ q; + uint64_t x_xor_q_ = x_xor_q >> (uint32_t)63U; + return x_xor_q_ - (uint64_t)1U; +} + + +inline static void Hacl_Impl_Curve25519_Field51_fadd(uint64_t *out, uint64_t *f1, uint64_t *f2) +{ + uint64_t f10 = f1[0U]; + uint64_t f20 = f2[0U]; + uint64_t f11 = f1[1U]; + uint64_t f21 = f2[1U]; + uint64_t f12 = f1[2U]; + uint64_t f22 = f2[2U]; + uint64_t f13 = f1[3U]; + uint64_t f23 = f2[3U]; + uint64_t f14 = f1[4U]; + uint64_t f24 = f2[4U]; + out[0U] = f10 + f20; + out[1U] = f11 + f21; + out[2U] = f12 + f22; + out[3U] = f13 + f23; + out[4U] = f14 + f24; +} + +inline static void Hacl_Impl_Curve25519_Field51_fsub(uint64_t *out, uint64_t *f1, uint64_t *f2) +{ + uint64_t f10 = f1[0U]; + uint64_t f20 = f2[0U]; + uint64_t f11 = f1[1U]; + uint64_t f21 = f2[1U]; + uint64_t f12 = f1[2U]; + uint64_t f22 = f2[2U]; + uint64_t f13 = f1[3U]; + uint64_t f23 = f2[3U]; + uint64_t f14 = f1[4U]; + uint64_t f24 = f2[4U]; + out[0U] = f10 + (uint64_t)0x3fffffffffff68U - f20; + out[1U] = f11 + (uint64_t)0x3ffffffffffff8U - f21; + out[2U] = f12 + (uint64_t)0x3ffffffffffff8U - f22; + out[3U] = f13 + (uint64_t)0x3ffffffffffff8U - f23; + out[4U] = f14 + (uint64_t)0x3ffffffffffff8U - f24; +} + +inline static void Hacl_Impl_Curve25519_Field51_fmul(uint64_t *out, uint64_t *f1, uint64_t *f2) +{ + uint64_t f10 = f1[0U]; + uint64_t f11 = f1[1U]; + uint64_t f12 = f1[2U]; + uint64_t f13 = f1[3U]; + uint64_t f14 = f1[4U]; + uint64_t f20 = f2[0U]; + uint64_t f21 = f2[1U]; + uint64_t f22 = f2[2U]; + uint64_t f23 = f2[3U]; + uint64_t f24 = f2[4U]; + uint64_t tmp1 = f21 * (uint64_t)19U; + uint64_t tmp2 = f22 * (uint64_t)19U; + uint64_t tmp3 = f23 * (uint64_t)19U; + uint64_t tmp4 = f24 * (uint64_t)19U; + uint128_t o00 = (uint128_t)f10 * f20; + uint128_t o10 = (uint128_t)f10 * f21; + uint128_t o20 = (uint128_t)f10 * f22; + uint128_t o30 = (uint128_t)f10 * f23; + uint128_t o40 = (uint128_t)f10 * f24; + uint128_t o01 = o00 + (uint128_t)f11 * tmp4; + uint128_t o11 = o10 + (uint128_t)f11 * f20; + uint128_t o21 = o20 + (uint128_t)f11 * f21; + uint128_t o31 = o30 + (uint128_t)f11 * f22; + uint128_t o41 = o40 + (uint128_t)f11 * f23; + uint128_t o02 = o01 + (uint128_t)f12 * tmp3; + uint128_t o12 = o11 + (uint128_t)f12 * tmp4; + uint128_t o22 = o21 + (uint128_t)f12 * f20; + uint128_t o32 = o31 + (uint128_t)f12 * f21; + uint128_t o42 = o41 + (uint128_t)f12 * f22; + uint128_t o03 = o02 + (uint128_t)f13 * tmp2; + uint128_t o13 = o12 + (uint128_t)f13 * tmp3; + uint128_t o23 = o22 + (uint128_t)f13 * tmp4; + uint128_t o33 = o32 + (uint128_t)f13 * f20; + uint128_t o43 = o42 + (uint128_t)f13 * f21; + uint128_t o04 = o03 + (uint128_t)f14 * tmp1; + uint128_t o14 = o13 + (uint128_t)f14 * tmp2; + uint128_t o24 = o23 + (uint128_t)f14 * tmp3; + uint128_t o34 = o33 + (uint128_t)f14 * tmp4; + uint128_t o44 = o43 + (uint128_t)f14 * f20; + uint128_t tmp_w0 = o04; + uint128_t tmp_w1 = o14; + uint128_t tmp_w2 = o24; + uint128_t tmp_w3 = o34; + uint128_t tmp_w4 = o44; + 
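+  /*
+   * A sketch of the reduction that follows: each tmp_w accumulates 51x51-bit
+   * partial products in 128 bits, with limbs that would overflow 2^255
+   * already folded down by the f2[i]*19 precomputation above, since
+   * 2^255 = 19 (mod p).  Each step below keeps the low 51 bits and carries
+   * the rest into the next limb; the carry out of the top limb re-enters
+   * limb 0 multiplied by 19 for the same reason.
+   */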
uint128_t l_ = tmp_w0 + (uint128_t)(uint64_t)0U; + uint64_t tmp01 = (uint64_t)l_ & (uint64_t)0x7ffffffffffffU; + uint64_t c0 = (uint64_t)(l_ >> (uint32_t)51U); + uint128_t l_0 = tmp_w1 + (uint128_t)c0; + uint64_t tmp11 = (uint64_t)l_0 & (uint64_t)0x7ffffffffffffU; + uint64_t c1 = (uint64_t)(l_0 >> (uint32_t)51U); + uint128_t l_1 = tmp_w2 + (uint128_t)c1; + uint64_t tmp21 = (uint64_t)l_1 & (uint64_t)0x7ffffffffffffU; + uint64_t c2 = (uint64_t)(l_1 >> (uint32_t)51U); + uint128_t l_2 = tmp_w3 + (uint128_t)c2; + uint64_t tmp31 = (uint64_t)l_2 & (uint64_t)0x7ffffffffffffU; + uint64_t c3 = (uint64_t)(l_2 >> (uint32_t)51U); + uint128_t l_3 = tmp_w4 + (uint128_t)c3; + uint64_t tmp41 = (uint64_t)l_3 & (uint64_t)0x7ffffffffffffU; + uint64_t c4 = (uint64_t)(l_3 >> (uint32_t)51U); + uint64_t l_4 = tmp01 + c4 * (uint64_t)19U; + uint64_t tmp0_ = l_4 & (uint64_t)0x7ffffffffffffU; + uint64_t c5 = l_4 >> (uint32_t)51U; + uint64_t o0 = tmp0_; + uint64_t o1 = tmp11 + c5; + uint64_t o2 = tmp21; + uint64_t o3 = tmp31; + uint64_t o4 = tmp41; + out[0U] = o0; + out[1U] = o1; + out[2U] = o2; + out[3U] = o3; + out[4U] = o4; +} + +inline static void +Hacl_Impl_Curve25519_Field51_fmul2(uint64_t *out, uint64_t *f1, uint64_t *f2) +{ + uint64_t f10 = f1[0U]; + uint64_t f11 = f1[1U]; + uint64_t f12 = f1[2U]; + uint64_t f13 = f1[3U]; + uint64_t f14 = f1[4U]; + uint64_t f20 = f2[0U]; + uint64_t f21 = f2[1U]; + uint64_t f22 = f2[2U]; + uint64_t f23 = f2[3U]; + uint64_t f24 = f2[4U]; + uint64_t f30 = f1[5U]; + uint64_t f31 = f1[6U]; + uint64_t f32 = f1[7U]; + uint64_t f33 = f1[8U]; + uint64_t f34 = f1[9U]; + uint64_t f40 = f2[5U]; + uint64_t f41 = f2[6U]; + uint64_t f42 = f2[7U]; + uint64_t f43 = f2[8U]; + uint64_t f44 = f2[9U]; + uint64_t tmp11 = f21 * (uint64_t)19U; + uint64_t tmp12 = f22 * (uint64_t)19U; + uint64_t tmp13 = f23 * (uint64_t)19U; + uint64_t tmp14 = f24 * (uint64_t)19U; + uint64_t tmp21 = f41 * (uint64_t)19U; + uint64_t tmp22 = f42 * (uint64_t)19U; + uint64_t tmp23 = f43 * (uint64_t)19U; + uint64_t tmp24 = f44 * (uint64_t)19U; + uint128_t o00 = (uint128_t)f10 * f20; + uint128_t o15 = (uint128_t)f10 * f21; + uint128_t o25 = (uint128_t)f10 * f22; + uint128_t o30 = (uint128_t)f10 * f23; + uint128_t o40 = (uint128_t)f10 * f24; + uint128_t o010 = o00 + (uint128_t)f11 * tmp14; + uint128_t o110 = o15 + (uint128_t)f11 * f20; + uint128_t o210 = o25 + (uint128_t)f11 * f21; + uint128_t o310 = o30 + (uint128_t)f11 * f22; + uint128_t o410 = o40 + (uint128_t)f11 * f23; + uint128_t o020 = o010 + (uint128_t)f12 * tmp13; + uint128_t o120 = o110 + (uint128_t)f12 * tmp14; + uint128_t o220 = o210 + (uint128_t)f12 * f20; + uint128_t o320 = o310 + (uint128_t)f12 * f21; + uint128_t o420 = o410 + (uint128_t)f12 * f22; + uint128_t o030 = o020 + (uint128_t)f13 * tmp12; + uint128_t o130 = o120 + (uint128_t)f13 * tmp13; + uint128_t o230 = o220 + (uint128_t)f13 * tmp14; + uint128_t o330 = o320 + (uint128_t)f13 * f20; + uint128_t o430 = o420 + (uint128_t)f13 * f21; + uint128_t o040 = o030 + (uint128_t)f14 * tmp11; + uint128_t o140 = o130 + (uint128_t)f14 * tmp12; + uint128_t o240 = o230 + (uint128_t)f14 * tmp13; + uint128_t o340 = o330 + (uint128_t)f14 * tmp14; + uint128_t o440 = o430 + (uint128_t)f14 * f20; + uint128_t tmp_w10 = o040; + uint128_t tmp_w11 = o140; + uint128_t tmp_w12 = o240; + uint128_t tmp_w13 = o340; + uint128_t tmp_w14 = o440; + uint128_t o0 = (uint128_t)f30 * f40; + uint128_t o1 = (uint128_t)f30 * f41; + uint128_t o2 = (uint128_t)f30 * f42; + uint128_t o3 = (uint128_t)f30 * f43; + uint128_t o4 = (uint128_t)f30 * f44; + 
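+  /*
+   * fmul2 interleaves two independent multiplications: f1[0..4] by f2[0..4]
+   * above, and f1[5..9] by f2[5..9] in the o0/o1/... accumulators from here
+   * on, so a ladder step can compute a (d*a, c*b) pair with a single call.
+   */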
uint128_t o01 = o0 + (uint128_t)f31 * tmp24; + uint128_t o111 = o1 + (uint128_t)f31 * f40; + uint128_t o211 = o2 + (uint128_t)f31 * f41; + uint128_t o31 = o3 + (uint128_t)f31 * f42; + uint128_t o41 = o4 + (uint128_t)f31 * f43; + uint128_t o02 = o01 + (uint128_t)f32 * tmp23; + uint128_t o121 = o111 + (uint128_t)f32 * tmp24; + uint128_t o221 = o211 + (uint128_t)f32 * f40; + uint128_t o32 = o31 + (uint128_t)f32 * f41; + uint128_t o42 = o41 + (uint128_t)f32 * f42; + uint128_t o03 = o02 + (uint128_t)f33 * tmp22; + uint128_t o131 = o121 + (uint128_t)f33 * tmp23; + uint128_t o231 = o221 + (uint128_t)f33 * tmp24; + uint128_t o33 = o32 + (uint128_t)f33 * f40; + uint128_t o43 = o42 + (uint128_t)f33 * f41; + uint128_t o04 = o03 + (uint128_t)f34 * tmp21; + uint128_t o141 = o131 + (uint128_t)f34 * tmp22; + uint128_t o241 = o231 + (uint128_t)f34 * tmp23; + uint128_t o34 = o33 + (uint128_t)f34 * tmp24; + uint128_t o44 = o43 + (uint128_t)f34 * f40; + uint128_t tmp_w20 = o04; + uint128_t tmp_w21 = o141; + uint128_t tmp_w22 = o241; + uint128_t tmp_w23 = o34; + uint128_t tmp_w24 = o44; + uint128_t l_ = tmp_w10 + (uint128_t)(uint64_t)0U; + uint64_t tmp00 = (uint64_t)l_ & (uint64_t)0x7ffffffffffffU; + uint64_t c00 = (uint64_t)(l_ >> (uint32_t)51U); + uint128_t l_0 = tmp_w11 + (uint128_t)c00; + uint64_t tmp10 = (uint64_t)l_0 & (uint64_t)0x7ffffffffffffU; + uint64_t c10 = (uint64_t)(l_0 >> (uint32_t)51U); + uint128_t l_1 = tmp_w12 + (uint128_t)c10; + uint64_t tmp20 = (uint64_t)l_1 & (uint64_t)0x7ffffffffffffU; + uint64_t c20 = (uint64_t)(l_1 >> (uint32_t)51U); + uint128_t l_2 = tmp_w13 + (uint128_t)c20; + uint64_t tmp30 = (uint64_t)l_2 & (uint64_t)0x7ffffffffffffU; + uint64_t c30 = (uint64_t)(l_2 >> (uint32_t)51U); + uint128_t l_3 = tmp_w14 + (uint128_t)c30; + uint64_t tmp40 = (uint64_t)l_3 & (uint64_t)0x7ffffffffffffU; + uint64_t c40 = (uint64_t)(l_3 >> (uint32_t)51U); + uint64_t l_4 = tmp00 + c40 * (uint64_t)19U; + uint64_t tmp0_ = l_4 & (uint64_t)0x7ffffffffffffU; + uint64_t c50 = l_4 >> (uint32_t)51U; + uint64_t o100 = tmp0_; + uint64_t o112 = tmp10 + c50; + uint64_t o122 = tmp20; + uint64_t o132 = tmp30; + uint64_t o142 = tmp40; + uint128_t l_5 = tmp_w20 + (uint128_t)(uint64_t)0U; + uint64_t tmp0 = (uint64_t)l_5 & (uint64_t)0x7ffffffffffffU; + uint64_t c0 = (uint64_t)(l_5 >> (uint32_t)51U); + uint128_t l_6 = tmp_w21 + (uint128_t)c0; + uint64_t tmp1 = (uint64_t)l_6 & (uint64_t)0x7ffffffffffffU; + uint64_t c1 = (uint64_t)(l_6 >> (uint32_t)51U); + uint128_t l_7 = tmp_w22 + (uint128_t)c1; + uint64_t tmp2 = (uint64_t)l_7 & (uint64_t)0x7ffffffffffffU; + uint64_t c2 = (uint64_t)(l_7 >> (uint32_t)51U); + uint128_t l_8 = tmp_w23 + (uint128_t)c2; + uint64_t tmp3 = (uint64_t)l_8 & (uint64_t)0x7ffffffffffffU; + uint64_t c3 = (uint64_t)(l_8 >> (uint32_t)51U); + uint128_t l_9 = tmp_w24 + (uint128_t)c3; + uint64_t tmp4 = (uint64_t)l_9 & (uint64_t)0x7ffffffffffffU; + uint64_t c4 = (uint64_t)(l_9 >> (uint32_t)51U); + uint64_t l_10 = tmp0 + c4 * (uint64_t)19U; + uint64_t tmp0_0 = l_10 & (uint64_t)0x7ffffffffffffU; + uint64_t c5 = l_10 >> (uint32_t)51U; + uint64_t o200 = tmp0_0; + uint64_t o212 = tmp1 + c5; + uint64_t o222 = tmp2; + uint64_t o232 = tmp3; + uint64_t o242 = tmp4; + uint64_t o10 = o100; + uint64_t o11 = o112; + uint64_t o12 = o122; + uint64_t o13 = o132; + uint64_t o14 = o142; + uint64_t o20 = o200; + uint64_t o21 = o212; + uint64_t o22 = o222; + uint64_t o23 = o232; + uint64_t o24 = o242; + out[0U] = o10; + out[1U] = o11; + out[2U] = o12; + out[3U] = o13; + out[4U] = o14; + out[5U] = o20; + out[6U] = o21; + 
out[7U] = o22; + out[8U] = o23; + out[9U] = o24; +} + +inline static void Hacl_Impl_Curve25519_Field51_fmul1(uint64_t *out, uint64_t *f1, uint64_t f2) +{ + uint64_t f10 = f1[0U]; + uint64_t f11 = f1[1U]; + uint64_t f12 = f1[2U]; + uint64_t f13 = f1[3U]; + uint64_t f14 = f1[4U]; + uint128_t tmp_w0 = (uint128_t)f2 * f10; + uint128_t tmp_w1 = (uint128_t)f2 * f11; + uint128_t tmp_w2 = (uint128_t)f2 * f12; + uint128_t tmp_w3 = (uint128_t)f2 * f13; + uint128_t tmp_w4 = (uint128_t)f2 * f14; + uint128_t l_ = tmp_w0 + (uint128_t)(uint64_t)0U; + uint64_t tmp0 = (uint64_t)l_ & (uint64_t)0x7ffffffffffffU; + uint64_t c0 = (uint64_t)(l_ >> (uint32_t)51U); + uint128_t l_0 = tmp_w1 + (uint128_t)c0; + uint64_t tmp1 = (uint64_t)l_0 & (uint64_t)0x7ffffffffffffU; + uint64_t c1 = (uint64_t)(l_0 >> (uint32_t)51U); + uint128_t l_1 = tmp_w2 + (uint128_t)c1; + uint64_t tmp2 = (uint64_t)l_1 & (uint64_t)0x7ffffffffffffU; + uint64_t c2 = (uint64_t)(l_1 >> (uint32_t)51U); + uint128_t l_2 = tmp_w3 + (uint128_t)c2; + uint64_t tmp3 = (uint64_t)l_2 & (uint64_t)0x7ffffffffffffU; + uint64_t c3 = (uint64_t)(l_2 >> (uint32_t)51U); + uint128_t l_3 = tmp_w4 + (uint128_t)c3; + uint64_t tmp4 = (uint64_t)l_3 & (uint64_t)0x7ffffffffffffU; + uint64_t c4 = (uint64_t)(l_3 >> (uint32_t)51U); + uint64_t l_4 = tmp0 + c4 * (uint64_t)19U; + uint64_t tmp0_ = l_4 & (uint64_t)0x7ffffffffffffU; + uint64_t c5 = l_4 >> (uint32_t)51U; + uint64_t o0 = tmp0_; + uint64_t o1 = tmp1 + c5; + uint64_t o2 = tmp2; + uint64_t o3 = tmp3; + uint64_t o4 = tmp4; + out[0U] = o0; + out[1U] = o1; + out[2U] = o2; + out[3U] = o3; + out[4U] = o4; +} + +inline static void Hacl_Impl_Curve25519_Field51_fsqr(uint64_t *out, uint64_t *f) +{ + uint64_t f0 = f[0U]; + uint64_t f1 = f[1U]; + uint64_t f2 = f[2U]; + uint64_t f3 = f[3U]; + uint64_t f4 = f[4U]; + uint64_t d0 = (uint64_t)2U * f0; + uint64_t d1 = (uint64_t)2U * f1; + uint64_t d2 = (uint64_t)38U * f2; + uint64_t d3 = (uint64_t)19U * f3; + uint64_t d419 = (uint64_t)19U * f4; + uint64_t d4 = (uint64_t)2U * d419; + uint128_t s0 = (uint128_t)f0 * f0 + (uint128_t)d4 * f1 + (uint128_t)d2 * f3; + uint128_t s1 = (uint128_t)d0 * f1 + (uint128_t)d4 * f2 + (uint128_t)d3 * f3; + uint128_t s2 = (uint128_t)d0 * f2 + (uint128_t)f1 * f1 + (uint128_t)d4 * f3; + uint128_t s3 = (uint128_t)d0 * f3 + (uint128_t)d1 * f2 + (uint128_t)f4 * d419; + uint128_t s4 = (uint128_t)d0 * f4 + (uint128_t)d1 * f3 + (uint128_t)f2 * f2; + uint128_t o00 = s0; + uint128_t o10 = s1; + uint128_t o20 = s2; + uint128_t o30 = s3; + uint128_t o40 = s4; + uint128_t l_ = o00 + (uint128_t)(uint64_t)0U; + uint64_t tmp0 = (uint64_t)l_ & (uint64_t)0x7ffffffffffffU; + uint64_t c0 = (uint64_t)(l_ >> (uint32_t)51U); + uint128_t l_0 = o10 + (uint128_t)c0; + uint64_t tmp1 = (uint64_t)l_0 & (uint64_t)0x7ffffffffffffU; + uint64_t c1 = (uint64_t)(l_0 >> (uint32_t)51U); + uint128_t l_1 = o20 + (uint128_t)c1; + uint64_t tmp2 = (uint64_t)l_1 & (uint64_t)0x7ffffffffffffU; + uint64_t c2 = (uint64_t)(l_1 >> (uint32_t)51U); + uint128_t l_2 = o30 + (uint128_t)c2; + uint64_t tmp3 = (uint64_t)l_2 & (uint64_t)0x7ffffffffffffU; + uint64_t c3 = (uint64_t)(l_2 >> (uint32_t)51U); + uint128_t l_3 = o40 + (uint128_t)c3; + uint64_t tmp4 = (uint64_t)l_3 & (uint64_t)0x7ffffffffffffU; + uint64_t c4 = (uint64_t)(l_3 >> (uint32_t)51U); + uint64_t l_4 = tmp0 + c4 * (uint64_t)19U; + uint64_t tmp0_ = l_4 & (uint64_t)0x7ffffffffffffU; + uint64_t c5 = l_4 >> (uint32_t)51U; + uint64_t o0 = tmp0_; + uint64_t o1 = tmp1 + c5; + uint64_t o2 = tmp2; + uint64_t o3 = tmp3; + uint64_t o4 = tmp4; + out[0U] = o0; 
+ out[1U] = o1; + out[2U] = o2; + out[3U] = o3; + out[4U] = o4; +} + +inline static void Hacl_Impl_Curve25519_Field51_fsqr2(uint64_t *out, uint64_t *f) +{ + uint64_t f10 = f[0U]; + uint64_t f11 = f[1U]; + uint64_t f12 = f[2U]; + uint64_t f13 = f[3U]; + uint64_t f14 = f[4U]; + uint64_t f20 = f[5U]; + uint64_t f21 = f[6U]; + uint64_t f22 = f[7U]; + uint64_t f23 = f[8U]; + uint64_t f24 = f[9U]; + uint64_t d00 = (uint64_t)2U * f10; + uint64_t d10 = (uint64_t)2U * f11; + uint64_t d20 = (uint64_t)38U * f12; + uint64_t d30 = (uint64_t)19U * f13; + uint64_t d4190 = (uint64_t)19U * f14; + uint64_t d40 = (uint64_t)2U * d4190; + uint128_t s00 = (uint128_t)f10 * f10 + (uint128_t)d40 * f11 + (uint128_t)d20 * f13; + uint128_t s10 = (uint128_t)d00 * f11 + (uint128_t)d40 * f12 + (uint128_t)d30 * f13; + uint128_t s20 = (uint128_t)d00 * f12 + (uint128_t)f11 * f11 + (uint128_t)d40 * f13; + uint128_t s30 = (uint128_t)d00 * f13 + (uint128_t)d10 * f12 + (uint128_t)f14 * d4190; + uint128_t s40 = (uint128_t)d00 * f14 + (uint128_t)d10 * f13 + (uint128_t)f12 * f12; + uint128_t o100 = s00; + uint128_t o110 = s10; + uint128_t o120 = s20; + uint128_t o130 = s30; + uint128_t o140 = s40; + uint64_t d0 = (uint64_t)2U * f20; + uint64_t d1 = (uint64_t)2U * f21; + uint64_t d2 = (uint64_t)38U * f22; + uint64_t d3 = (uint64_t)19U * f23; + uint64_t d419 = (uint64_t)19U * f24; + uint64_t d4 = (uint64_t)2U * d419; + uint128_t s0 = (uint128_t)f20 * f20 + (uint128_t)d4 * f21 + (uint128_t)d2 * f23; + uint128_t s1 = (uint128_t)d0 * f21 + (uint128_t)d4 * f22 + (uint128_t)d3 * f23; + uint128_t s2 = (uint128_t)d0 * f22 + (uint128_t)f21 * f21 + (uint128_t)d4 * f23; + uint128_t s3 = (uint128_t)d0 * f23 + (uint128_t)d1 * f22 + (uint128_t)f24 * d419; + uint128_t s4 = (uint128_t)d0 * f24 + (uint128_t)d1 * f23 + (uint128_t)f22 * f22; + uint128_t o200 = s0; + uint128_t o210 = s1; + uint128_t o220 = s2; + uint128_t o230 = s3; + uint128_t o240 = s4; + uint128_t l_ = o100 + (uint128_t)(uint64_t)0U; + uint64_t tmp00 = (uint64_t)l_ & (uint64_t)0x7ffffffffffffU; + uint64_t c00 = (uint64_t)(l_ >> (uint32_t)51U); + uint128_t l_0 = o110 + (uint128_t)c00; + uint64_t tmp10 = (uint64_t)l_0 & (uint64_t)0x7ffffffffffffU; + uint64_t c10 = (uint64_t)(l_0 >> (uint32_t)51U); + uint128_t l_1 = o120 + (uint128_t)c10; + uint64_t tmp20 = (uint64_t)l_1 & (uint64_t)0x7ffffffffffffU; + uint64_t c20 = (uint64_t)(l_1 >> (uint32_t)51U); + uint128_t l_2 = o130 + (uint128_t)c20; + uint64_t tmp30 = (uint64_t)l_2 & (uint64_t)0x7ffffffffffffU; + uint64_t c30 = (uint64_t)(l_2 >> (uint32_t)51U); + uint128_t l_3 = o140 + (uint128_t)c30; + uint64_t tmp40 = (uint64_t)l_3 & (uint64_t)0x7ffffffffffffU; + uint64_t c40 = (uint64_t)(l_3 >> (uint32_t)51U); + uint64_t l_4 = tmp00 + c40 * (uint64_t)19U; + uint64_t tmp0_ = l_4 & (uint64_t)0x7ffffffffffffU; + uint64_t c50 = l_4 >> (uint32_t)51U; + uint64_t o101 = tmp0_; + uint64_t o111 = tmp10 + c50; + uint64_t o121 = tmp20; + uint64_t o131 = tmp30; + uint64_t o141 = tmp40; + uint128_t l_5 = o200 + (uint128_t)(uint64_t)0U; + uint64_t tmp0 = (uint64_t)l_5 & (uint64_t)0x7ffffffffffffU; + uint64_t c0 = (uint64_t)(l_5 >> (uint32_t)51U); + uint128_t l_6 = o210 + (uint128_t)c0; + uint64_t tmp1 = (uint64_t)l_6 & (uint64_t)0x7ffffffffffffU; + uint64_t c1 = (uint64_t)(l_6 >> (uint32_t)51U); + uint128_t l_7 = o220 + (uint128_t)c1; + uint64_t tmp2 = (uint64_t)l_7 & (uint64_t)0x7ffffffffffffU; + uint64_t c2 = (uint64_t)(l_7 >> (uint32_t)51U); + uint128_t l_8 = o230 + (uint128_t)c2; + uint64_t tmp3 = (uint64_t)l_8 & (uint64_t)0x7ffffffffffffU; + 
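+  /*
+   * This second carry chain (l_5 .. l_10) normalizes the squaring of the
+   * element held in f[5..9]; fsqr2 squares two field elements per call so
+   * the ladder can handle both coordinates of a point at once.
+   */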
uint64_t c3 = (uint64_t)(l_8 >> (uint32_t)51U); + uint128_t l_9 = o240 + (uint128_t)c3; + uint64_t tmp4 = (uint64_t)l_9 & (uint64_t)0x7ffffffffffffU; + uint64_t c4 = (uint64_t)(l_9 >> (uint32_t)51U); + uint64_t l_10 = tmp0 + c4 * (uint64_t)19U; + uint64_t tmp0_0 = l_10 & (uint64_t)0x7ffffffffffffU; + uint64_t c5 = l_10 >> (uint32_t)51U; + uint64_t o201 = tmp0_0; + uint64_t o211 = tmp1 + c5; + uint64_t o221 = tmp2; + uint64_t o231 = tmp3; + uint64_t o241 = tmp4; + uint64_t o10 = o101; + uint64_t o11 = o111; + uint64_t o12 = o121; + uint64_t o13 = o131; + uint64_t o14 = o141; + uint64_t o20 = o201; + uint64_t o21 = o211; + uint64_t o22 = o221; + uint64_t o23 = o231; + uint64_t o24 = o241; + out[0U] = o10; + out[1U] = o11; + out[2U] = o12; + out[3U] = o13; + out[4U] = o14; + out[5U] = o20; + out[6U] = o21; + out[7U] = o22; + out[8U] = o23; + out[9U] = o24; +} + +inline static void Hacl_Impl_Curve25519_Field51_store_felem(uint64_t *u64s, uint64_t *f) +{ + uint64_t f0 = f[0U]; + uint64_t f1 = f[1U]; + uint64_t f2 = f[2U]; + uint64_t f3 = f[3U]; + uint64_t f4 = f[4U]; + uint64_t l_ = f0 + (uint64_t)0U; + uint64_t tmp0 = l_ & (uint64_t)0x7ffffffffffffU; + uint64_t c0 = l_ >> (uint32_t)51U; + uint64_t l_0 = f1 + c0; + uint64_t tmp1 = l_0 & (uint64_t)0x7ffffffffffffU; + uint64_t c1 = l_0 >> (uint32_t)51U; + uint64_t l_1 = f2 + c1; + uint64_t tmp2 = l_1 & (uint64_t)0x7ffffffffffffU; + uint64_t c2 = l_1 >> (uint32_t)51U; + uint64_t l_2 = f3 + c2; + uint64_t tmp3 = l_2 & (uint64_t)0x7ffffffffffffU; + uint64_t c3 = l_2 >> (uint32_t)51U; + uint64_t l_3 = f4 + c3; + uint64_t tmp4 = l_3 & (uint64_t)0x7ffffffffffffU; + uint64_t c4 = l_3 >> (uint32_t)51U; + uint64_t l_4 = tmp0 + c4 * (uint64_t)19U; + uint64_t tmp0_ = l_4 & (uint64_t)0x7ffffffffffffU; + uint64_t c5 = l_4 >> (uint32_t)51U; + uint64_t f01 = tmp0_; + uint64_t f11 = tmp1 + c5; + uint64_t f21 = tmp2; + uint64_t f31 = tmp3; + uint64_t f41 = tmp4; + uint64_t m0 = FStar_UInt64_gte_mask(f01, (uint64_t)0x7ffffffffffedU); + uint64_t m1 = FStar_UInt64_eq_mask(f11, (uint64_t)0x7ffffffffffffU); + uint64_t m2 = FStar_UInt64_eq_mask(f21, (uint64_t)0x7ffffffffffffU); + uint64_t m3 = FStar_UInt64_eq_mask(f31, (uint64_t)0x7ffffffffffffU); + uint64_t m4 = FStar_UInt64_eq_mask(f41, (uint64_t)0x7ffffffffffffU); + uint64_t mask = (((m0 & m1) & m2) & m3) & m4; + uint64_t f0_ = f01 - (mask & (uint64_t)0x7ffffffffffedU); + uint64_t f1_ = f11 - (mask & (uint64_t)0x7ffffffffffffU); + uint64_t f2_ = f21 - (mask & (uint64_t)0x7ffffffffffffU); + uint64_t f3_ = f31 - (mask & (uint64_t)0x7ffffffffffffU); + uint64_t f4_ = f41 - (mask & (uint64_t)0x7ffffffffffffU); + uint64_t f02 = f0_; + uint64_t f12 = f1_; + uint64_t f22 = f2_; + uint64_t f32 = f3_; + uint64_t f42 = f4_; + uint64_t o00 = f02 | f12 << (uint32_t)51U; + uint64_t o10 = f12 >> (uint32_t)13U | f22 << (uint32_t)38U; + uint64_t o20 = f22 >> (uint32_t)26U | f32 << (uint32_t)25U; + uint64_t o30 = f32 >> (uint32_t)39U | f42 << (uint32_t)12U; + uint64_t o0 = o00; + uint64_t o1 = o10; + uint64_t o2 = o20; + uint64_t o3 = o30; + u64s[0U] = o0; + u64s[1U] = o1; + u64s[2U] = o2; + u64s[3U] = o3; +} + +inline static void +Hacl_Impl_Curve25519_Field51_cswap2(uint64_t bit, uint64_t *p1, uint64_t *p2) +{ + uint64_t mask = (uint64_t)0U - bit; + uint32_t i; + for (i = (uint32_t)0U; i < (uint32_t)10U; i = i + (uint32_t)1U) + { + uint64_t dummy = mask & (p1[i] ^ p2[i]); + p1[i] = p1[i] ^ dummy; + p2[i] = p2[i] ^ dummy; + } +} + +inline static void +Hacl_Impl_Curve25519_AddAndDouble_point_add_and_double_51( + uint64_t *q, + 
uint64_t *p01_tmp1, + uint128_t *tmp2 +) +{ + uint64_t *nq = p01_tmp1; + uint64_t *nq_p1 = p01_tmp1 + (uint32_t)10U; + uint64_t *tmp1 = p01_tmp1 + (uint32_t)20U; + uint64_t *x1 = q; + uint64_t *x2 = nq; + uint64_t *z2 = nq + (uint32_t)5U; + uint64_t *z3 = nq_p1 + (uint32_t)5U; + uint64_t *a = tmp1; + uint64_t *b = tmp1 + (uint32_t)5U; + uint64_t *ab = tmp1; + uint64_t *dc = tmp1 + (uint32_t)10U; + uint64_t *x3; + uint64_t *z31; + uint64_t *d0; + uint64_t *c0; + uint64_t *a1; + uint64_t *b1; + uint64_t *d; + uint64_t *c; + uint64_t *ab1; + uint64_t *dc1; + Hacl_Impl_Curve25519_Field51_fadd(a, x2, z2); + Hacl_Impl_Curve25519_Field51_fsub(b, x2, z2); + x3 = nq_p1; + z31 = nq_p1 + (uint32_t)5U; + d0 = dc; + c0 = dc + (uint32_t)5U; + Hacl_Impl_Curve25519_Field51_fadd(c0, x3, z31); + Hacl_Impl_Curve25519_Field51_fsub(d0, x3, z31); + Hacl_Impl_Curve25519_Field51_fmul2(dc, dc, ab); + Hacl_Impl_Curve25519_Field51_fadd(x3, d0, c0); + Hacl_Impl_Curve25519_Field51_fsub(z31, d0, c0); + a1 = tmp1; + b1 = tmp1 + (uint32_t)5U; + d = tmp1 + (uint32_t)10U; + c = tmp1 + (uint32_t)15U; + ab1 = tmp1; + dc1 = tmp1 + (uint32_t)10U; + Hacl_Impl_Curve25519_Field51_fsqr2(dc1, ab1); + Hacl_Impl_Curve25519_Field51_fsqr2(nq_p1, nq_p1); + a1[0U] = c[0U]; + a1[1U] = c[1U]; + a1[2U] = c[2U]; + a1[3U] = c[3U]; + a1[4U] = c[4U]; + Hacl_Impl_Curve25519_Field51_fsub(c, d, c); + Hacl_Impl_Curve25519_Field51_fmul1(b1, c, (uint64_t)121665U); + Hacl_Impl_Curve25519_Field51_fadd(b1, b1, d); + Hacl_Impl_Curve25519_Field51_fmul2(nq, dc1, ab1); + Hacl_Impl_Curve25519_Field51_fmul(z3, z3, x1); +} + +inline static void +Hacl_Impl_Curve25519_AddAndDouble_point_double_51( + uint64_t *nq, + uint64_t *tmp1, + uint128_t *tmp2 +) +{ + uint64_t *x2 = nq; + uint64_t *z2 = nq + (uint32_t)5U; + uint64_t *a = tmp1; + uint64_t *b = tmp1 + (uint32_t)5U; + uint64_t *d = tmp1 + (uint32_t)10U; + uint64_t *c = tmp1 + (uint32_t)15U; + uint64_t *ab = tmp1; + uint64_t *dc = tmp1 + (uint32_t)10U; + Hacl_Impl_Curve25519_Field51_fadd(a, x2, z2); + Hacl_Impl_Curve25519_Field51_fsub(b, x2, z2); + Hacl_Impl_Curve25519_Field51_fsqr2(dc, ab); + a[0U] = c[0U]; + a[1U] = c[1U]; + a[2U] = c[2U]; + a[3U] = c[3U]; + a[4U] = c[4U]; + Hacl_Impl_Curve25519_Field51_fsub(c, d, c); + Hacl_Impl_Curve25519_Field51_fmul1(b, c, (uint64_t)121665U); + Hacl_Impl_Curve25519_Field51_fadd(b, b, d); + Hacl_Impl_Curve25519_Field51_fmul2(nq, dc, ab); +} + +inline static void +Hacl_Impl_Curve25519_Finv_fsquare_times_51( + uint64_t *o, + uint64_t *i, + uint128_t *tmp, + uint32_t n1 +) +{ + uint32_t i0; + Hacl_Impl_Curve25519_Field51_fsqr(o, i); + for (i0 = (uint32_t)0U; i0 < n1 - (uint32_t)1U; i0 = i0 + (uint32_t)1U) + { + Hacl_Impl_Curve25519_Field51_fsqr(o, o); + } +} + +inline static void Hacl_Impl_Curve25519_Finv_finv_51(uint64_t *o, uint64_t *i, uint128_t *tmp) +{ + uint64_t t1[20U] = { 0U }; + uint64_t *a0 = t1; + uint64_t *b = t1 + (uint32_t)5U; + uint64_t *c = t1 + (uint32_t)10U; + uint64_t *t00 = t1 + (uint32_t)15U; + uint128_t *tmp1 = tmp; + uint64_t *a; + uint64_t *t0; + Hacl_Impl_Curve25519_Finv_fsquare_times_51(a0, i, tmp1, (uint32_t)1U); + Hacl_Impl_Curve25519_Finv_fsquare_times_51(t00, a0, tmp1, (uint32_t)2U); + Hacl_Impl_Curve25519_Field51_fmul(b, t00, i); + Hacl_Impl_Curve25519_Field51_fmul(a0, b, a0); + Hacl_Impl_Curve25519_Finv_fsquare_times_51(t00, a0, tmp1, (uint32_t)1U); + Hacl_Impl_Curve25519_Field51_fmul(b, t00, b); + Hacl_Impl_Curve25519_Finv_fsquare_times_51(t00, b, tmp1, (uint32_t)5U); + Hacl_Impl_Curve25519_Field51_fmul(b, t00, b); + 
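+  /*
+   * At this point b = i^(2^10 - 1).  The steps that follow complete the
+   * usual Curve25519 addition chain for i^(2^255 - 21) = i^(p - 2), the
+   * inverse modulo p = 2^255 - 19 by Fermat's little theorem: 254 squarings
+   * and 11 multiplications in total, with no secret-dependent branches.
+   */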
Hacl_Impl_Curve25519_Finv_fsquare_times_51(t00, b, tmp1, (uint32_t)10U); + Hacl_Impl_Curve25519_Field51_fmul(c, t00, b); + Hacl_Impl_Curve25519_Finv_fsquare_times_51(t00, c, tmp1, (uint32_t)20U); + Hacl_Impl_Curve25519_Field51_fmul(t00, t00, c); + Hacl_Impl_Curve25519_Finv_fsquare_times_51(t00, t00, tmp1, (uint32_t)10U); + Hacl_Impl_Curve25519_Field51_fmul(b, t00, b); + Hacl_Impl_Curve25519_Finv_fsquare_times_51(t00, b, tmp1, (uint32_t)50U); + Hacl_Impl_Curve25519_Field51_fmul(c, t00, b); + Hacl_Impl_Curve25519_Finv_fsquare_times_51(t00, c, tmp1, (uint32_t)100U); + Hacl_Impl_Curve25519_Field51_fmul(t00, t00, c); + Hacl_Impl_Curve25519_Finv_fsquare_times_51(t00, t00, tmp1, (uint32_t)50U); + Hacl_Impl_Curve25519_Field51_fmul(t00, t00, b); + Hacl_Impl_Curve25519_Finv_fsquare_times_51(t00, t00, tmp1, (uint32_t)5U); + a = t1; + t0 = t1 + (uint32_t)15U; + Hacl_Impl_Curve25519_Field51_fmul(o, t0, a); +} + +inline static void +Hacl_Impl_Curve25519_Generic_montgomery_ladder_51(uint64_t *out, uint8_t *key, uint64_t *init1) +{ + KRML_CHECK_SIZE(sizeof (uint128_t), (uint32_t)10U); + { + uint128_t tmp2[10U]; + { + uint32_t _i; + for (_i = 0U; _i < (uint32_t)10U; ++_i) + tmp2[_i] = (uint128_t)(uint64_t)0U; + } + { + uint64_t p01_tmp1_swap[41U] = { 0U }; + uint64_t *p0 = p01_tmp1_swap; + uint64_t *p01 = p01_tmp1_swap; + uint64_t *p03 = p01; + uint64_t *p11 = p01 + (uint32_t)10U; + uint64_t *x0; + uint64_t *z0; + uint64_t *p01_tmp1; + uint64_t *p01_tmp11; + uint64_t *nq10; + uint64_t *nq_p11; + uint64_t *swap1; + uint64_t sw0; + uint64_t *nq1; + uint64_t *tmp1; + memcpy(p11, init1, (uint32_t)10U * sizeof init1[0U]); + x0 = p03; + z0 = p03 + (uint32_t)5U; + x0[0U] = (uint64_t)1U; + x0[1U] = (uint64_t)0U; + x0[2U] = (uint64_t)0U; + x0[3U] = (uint64_t)0U; + x0[4U] = (uint64_t)0U; + z0[0U] = (uint64_t)0U; + z0[1U] = (uint64_t)0U; + z0[2U] = (uint64_t)0U; + z0[3U] = (uint64_t)0U; + z0[4U] = (uint64_t)0U; + p01_tmp1 = p01_tmp1_swap; + p01_tmp11 = p01_tmp1_swap; + nq10 = p01_tmp1_swap; + nq_p11 = p01_tmp1_swap + (uint32_t)10U; + swap1 = p01_tmp1_swap + (uint32_t)40U; + Hacl_Impl_Curve25519_Field51_cswap2((uint64_t)1U, nq10, nq_p11); + Hacl_Impl_Curve25519_AddAndDouble_point_add_and_double_51(init1, p01_tmp11, tmp2); + swap1[0U] = (uint64_t)1U; + { + uint32_t i; + for (i = (uint32_t)0U; i < (uint32_t)251U; i = i + (uint32_t)1U) + { + uint64_t *p01_tmp12 = p01_tmp1_swap; + uint64_t *swap2 = p01_tmp1_swap + (uint32_t)40U; + uint64_t *nq2 = p01_tmp12; + uint64_t *nq_p12 = p01_tmp12 + (uint32_t)10U; + uint64_t + bit = + (uint64_t)(key[((uint32_t)253U - i) + / (uint32_t)8U] + >> ((uint32_t)253U - i) % (uint32_t)8U + & (uint8_t)1U); + uint64_t sw = swap2[0U] ^ bit; + Hacl_Impl_Curve25519_Field51_cswap2(sw, nq2, nq_p12); + Hacl_Impl_Curve25519_AddAndDouble_point_add_and_double_51(init1, p01_tmp12, tmp2); + swap2[0U] = bit; + } + } + sw0 = swap1[0U]; + Hacl_Impl_Curve25519_Field51_cswap2(sw0, nq10, nq_p11); + nq1 = p01_tmp1; + tmp1 = p01_tmp1 + (uint32_t)20U; + Hacl_Impl_Curve25519_AddAndDouble_point_double_51(nq1, tmp1, tmp2); + Hacl_Impl_Curve25519_AddAndDouble_point_double_51(nq1, tmp1, tmp2); + Hacl_Impl_Curve25519_AddAndDouble_point_double_51(nq1, tmp1, tmp2); + memcpy(out, p0, (uint32_t)10U * sizeof p0[0U]); + } + } +} + +static uint8_t +Hacl_Impl_Curve25519_Generic_g25519[32U] = + { + (uint8_t)9U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, + (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, + (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, 
(uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, + (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, + (uint8_t)0U, (uint8_t)0U, (uint8_t)0U, (uint8_t)0U + }; + +void Hacl_Curve25519_51_secret_to_public(uint8_t *pub, uint8_t *priv) +{ + uint8_t basepoint[32U] = { 0U }; + { + uint32_t i; + for (i = (uint32_t)0U; i < (uint32_t)32U; i = i + (uint32_t)1U) + { + uint8_t *os = basepoint; + uint8_t x = Hacl_Impl_Curve25519_Generic_g25519[i]; + os[i] = x; + } + } + KRML_CHECK_SIZE(sizeof (uint64_t), (uint32_t)2U * (uint32_t)5U); + { + uint64_t init1[(uint32_t)2U * (uint32_t)5U]; + memset(init1, 0U, (uint32_t)2U * (uint32_t)5U * sizeof init1[0U]); + { + uint64_t tmp0[4U] = { 0U }; + uint64_t tmp3; + uint64_t *x; + uint64_t *z0; + uint64_t f0l; + uint64_t f0h; + uint64_t f1l; + uint64_t f1h; + uint64_t f2l; + uint64_t f2h; + uint64_t f3l; + uint64_t f3h; + { + uint64_t *os = tmp0; + uint8_t *bj = basepoint + (uint32_t)0U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t x = r; + os[0U] = x; + } + { + uint64_t *os = tmp0; + uint8_t *bj = basepoint + (uint32_t)1U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t x = r; + os[1U] = x; + } + { + uint64_t *os = tmp0; + uint8_t *bj = basepoint + (uint32_t)2U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t x = r; + os[2U] = x; + } + { + uint64_t *os = tmp0; + uint8_t *bj = basepoint + (uint32_t)3U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t x = r; + os[3U] = x; + } + tmp3 = tmp0[3U]; + tmp0[3U] = tmp3 & (uint64_t)0x7fffffffffffffffU; + x = init1; + z0 = init1 + (uint32_t)5U; + z0[0U] = (uint64_t)1U; + z0[1U] = (uint64_t)0U; + z0[2U] = (uint64_t)0U; + z0[3U] = (uint64_t)0U; + z0[4U] = (uint64_t)0U; + f0l = tmp0[0U] & (uint64_t)0x7ffffffffffffU; + f0h = tmp0[0U] >> (uint32_t)51U; + f1l = (tmp0[1U] & (uint64_t)0x3fffffffffU) << (uint32_t)13U; + f1h = tmp0[1U] >> (uint32_t)38U; + f2l = (tmp0[2U] & (uint64_t)0x1ffffffU) << (uint32_t)26U; + f2h = tmp0[2U] >> (uint32_t)25U; + f3l = (tmp0[3U] & (uint64_t)0xfffU) << (uint32_t)39U; + f3h = tmp0[3U] >> (uint32_t)12U; + x[0U] = f0l; + x[1U] = f0h | f1l; + x[2U] = f1h | f2l; + x[3U] = f2h | f3l; + x[4U] = f3h; + Hacl_Impl_Curve25519_Generic_montgomery_ladder_51(init1, priv, init1); + { + uint64_t *x0 = init1; + uint64_t *z = init1 + (uint32_t)5U; + uint64_t tmp[5U] = { 0U }; + uint64_t u64s[4U] = { 0U }; + KRML_CHECK_SIZE(sizeof (uint128_t), (uint32_t)2U * (uint32_t)5U); + { + uint128_t tmp_w[(uint32_t)2U * (uint32_t)5U]; + { + uint32_t _i; + for (_i = 0U; _i < (uint32_t)2U * (uint32_t)5U; ++_i) + tmp_w[_i] = (uint128_t)(uint64_t)0U; + } + Hacl_Impl_Curve25519_Finv_finv_51(tmp, z, tmp_w); + Hacl_Impl_Curve25519_Field51_fmul(tmp, tmp, x0); + Hacl_Impl_Curve25519_Field51_store_felem(u64s, tmp); + { + store64_le(pub + (uint32_t)0U * (uint32_t)8U, u64s[0U]); + } + { + store64_le(pub + (uint32_t)1U * (uint32_t)8U, u64s[1U]); + } + { + store64_le(pub + (uint32_t)2U * (uint32_t)8U, u64s[2U]); + } + { + store64_le(pub + (uint32_t)3U * (uint32_t)8U, u64s[3U]); + } + } + } + } + } +} + +void curve25519_hacl51(uint8_t *shared, uint8_t *my_priv, uint8_t *their_pub) +{ + KRML_CHECK_SIZE(sizeof (uint64_t), (uint32_t)2U * (uint32_t)5U); + { + uint64_t init1[(uint32_t)2U * (uint32_t)5U]; + memset(init1, 0U, (uint32_t)2U * (uint32_t)5U * sizeof init1[0U]); + { + uint64_t tmp0[4U] = { 0U }; + uint64_t tmp3; + uint64_t *x; + uint64_t *z0; + uint64_t f0l; + uint64_t f0h; + uint64_t f1l; + 
uint64_t f1h; + uint64_t f2l; + uint64_t f2h; + uint64_t f3l; + uint64_t f3h; + { + uint64_t *os = tmp0; + uint8_t *bj = their_pub + (uint32_t)0U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t x = r; + os[0U] = x; + } + { + uint64_t *os = tmp0; + uint8_t *bj = their_pub + (uint32_t)1U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t x = r; + os[1U] = x; + } + { + uint64_t *os = tmp0; + uint8_t *bj = their_pub + (uint32_t)2U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t x = r; + os[2U] = x; + } + { + uint64_t *os = tmp0; + uint8_t *bj = their_pub + (uint32_t)3U * (uint32_t)8U; + uint64_t u = load64_le(bj); + uint64_t r = u; + uint64_t x = r; + os[3U] = x; + } + tmp3 = tmp0[3U]; + tmp0[3U] = tmp3 & (uint64_t)0x7fffffffffffffffU; + x = init1; + z0 = init1 + (uint32_t)5U; + z0[0U] = (uint64_t)1U; + z0[1U] = (uint64_t)0U; + z0[2U] = (uint64_t)0U; + z0[3U] = (uint64_t)0U; + z0[4U] = (uint64_t)0U; + f0l = tmp0[0U] & (uint64_t)0x7ffffffffffffU; + f0h = tmp0[0U] >> (uint32_t)51U; + f1l = (tmp0[1U] & (uint64_t)0x3fffffffffU) << (uint32_t)13U; + f1h = tmp0[1U] >> (uint32_t)38U; + f2l = (tmp0[2U] & (uint64_t)0x1ffffffU) << (uint32_t)26U; + f2h = tmp0[2U] >> (uint32_t)25U; + f3l = (tmp0[3U] & (uint64_t)0xfffU) << (uint32_t)39U; + f3h = tmp0[3U] >> (uint32_t)12U; + x[0U] = f0l; + x[1U] = f0h | f1l; + x[2U] = f1h | f2l; + x[3U] = f2h | f3l; + x[4U] = f3h; + Hacl_Impl_Curve25519_Generic_montgomery_ladder_51(init1, my_priv, init1); + { + uint64_t *x0 = init1; + uint64_t *z = init1 + (uint32_t)5U; + uint64_t tmp[5U] = { 0U }; + uint64_t u64s[4U] = { 0U }; + KRML_CHECK_SIZE(sizeof (uint128_t), (uint32_t)2U * (uint32_t)5U); + { + uint128_t tmp_w[(uint32_t)2U * (uint32_t)5U]; + { + uint32_t _i; + for (_i = 0U; _i < (uint32_t)2U * (uint32_t)5U; ++_i) + tmp_w[_i] = (uint128_t)(uint64_t)0U; + } + Hacl_Impl_Curve25519_Finv_finv_51(tmp, z, tmp_w); + Hacl_Impl_Curve25519_Field51_fmul(tmp, tmp, x0); + Hacl_Impl_Curve25519_Field51_store_felem(u64s, tmp); + { + store64_le(shared + (uint32_t)0U * (uint32_t)8U, u64s[0U]); + } + { + store64_le(shared + (uint32_t)1U * (uint32_t)8U, u64s[1U]); + } + { + store64_le(shared + (uint32_t)2U * (uint32_t)8U, u64s[2U]); + } + { + store64_le(shared + (uint32_t)3U * (uint32_t)8U, u64s[3U]); + } + } + } + } + } +} + diff --git a/curve25519-vale-inline.h b/curve25519-vale-inline.h new file mode 100644 index 0000000..9817fab --- /dev/null +++ b/curve25519-vale-inline.h @@ -0,0 +1,810 @@ +#ifdef __GNUC__ +#pragma once +#include +#include +typedef __uint128_t uint128_t; +typedef u64 uint64_t; +typedef u8 uint8_t; + + +static inline uint64_t add1 (uint64_t* arg0, uint64_t* arg1, uint64_t arg2) { + register uint64_t* arg0_r asm("rdi") = arg0; + register uint64_t* arg1_r asm("rsi") = arg1; + register uint64_t arg2_r asm("rdx") = arg2; + register uint64_t carry_r asm("rax"); + + __asm__ __volatile__( + " xor %%r8, %%r8;" + " xor %%r9, %%r9;" + " xor %%r10, %%r10;" + " xor %%r11, %%r11;" + " xor %%rax, %%rax;" + " addq 0(%%rsi), %%rdx;" + " movq %%rdx, 0(%%rdi);" + " adcxq 8(%%rsi), %%r8;" + " movq %%r8, 8(%%rdi);" + " adcxq 16(%%rsi), %%r9;" + " movq %%r9, 16(%%rdi);" + " adcxq 24(%%rsi), %%r10;" + " movq %%r10, 24(%%rdi);" + " adcx %%r11, %%rax;" + : "+r" (arg2_r), "=r" (carry_r) + : "r" (arg0_r), "r" (arg1_r) + : "%r8", "%r9", "%r10", "%r11", "memory", "cc" + ); + +return carry_r; +} + +static inline void fadd (uint64_t* arg0, uint64_t* arg1, uint64_t* arg2) { + register uint64_t* arg0_r asm("rdi") = arg0; + 
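+  /*
+   * As in add1 above, the operands are pinned to the registers the
+   * Vale-generated body expects, so it can address them as rdi/rsi/rdx
+   * directly.  The additions below run through ADX (adcx) carry chains, and
+   * a wrap past 2^256 is folded back in with 38 = 2*19, since
+   * 2^256 = 38 (mod 2^255 - 19).
+   */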
register uint64_t* arg1_r asm("rsi") = arg1; + register uint64_t* arg2_r asm("rdx") = arg2; + + __asm__ __volatile__( + " movq 0(%%rdx), %%r8;" + " addq 0(%%rsi), %%r8;" + " movq 8(%%rdx), %%r9;" + " adcxq 8(%%rsi), %%r9;" + " movq 16(%%rdx), %%r10;" + " adcxq 16(%%rsi), %%r10;" + " movq 24(%%rdx), %%r11;" + " adcxq 24(%%rsi), %%r11;" + " mov $0, %%rax;" + " mov $38, %%rdx;" + " cmovc %%rdx, %%rax;" + " xor %%rcx, %%rcx;" + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 8(%%rdi);" + " adcx %%rcx, %%r10;" + " movq %%r10, 16(%%rdi);" + " adcx %%rcx, %%r11;" + " movq %%r11, 24(%%rdi);" + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%%rdi);" + : "+r" (arg2_r) + : "r" (arg0_r), "r" (arg1_r) + : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc" + ); +} + +static inline void fsub (uint64_t* arg0, uint64_t* arg1, uint64_t* arg2) { + register uint64_t* arg0_r asm("rdi") = arg0; + register uint64_t* arg1_r asm("rsi") = arg1; + register uint64_t* arg2_r asm("rdx") = arg2; + + __asm__ __volatile__( + " movq 0(%%rsi), %%r8;" + " subq 0(%%rdx), %%r8;" + " movq 8(%%rsi), %%r9;" + " sbbq 8(%%rdx), %%r9;" + " movq 16(%%rsi), %%r10;" + " sbbq 16(%%rdx), %%r10;" + " movq 24(%%rsi), %%r11;" + " sbbq 24(%%rdx), %%r11;" + " mov $0, %%rax;" + " mov $38, %%rcx;" + " cmovc %%rcx, %%rax;" + " sub %%rax, %%r8;" + " sbb $0, %%r9;" + " sbb $0, %%r10;" + " sbb $0, %%r11;" + " mov $0, %%rax;" + " cmovc %%rcx, %%rax;" + " sub %%rax, %%r8;" + " movq %%r8, 0(%%rdi);" + " movq %%r9, 8(%%rdi);" + " movq %%r10, 16(%%rdi);" + " movq %%r11, 24(%%rdi);" + : + : "r" (arg0_r), "r" (arg1_r), "r" (arg2_r) + : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc" + ); +} + +static inline void fmul (uint64_t* arg2, uint64_t* arg1, uint64_t* arg3, uint64_t* arg0) { + register uint64_t* arg0_r asm("rdi") = arg0; + register uint64_t* arg1_r asm("rsi") = arg1; + register uint64_t* arg2_r asm("rdx") = arg2; + register uint64_t* arg3_r asm("rcx") = arg3; + + __asm__ __volatile__( + " mov %%rdx, %%r15;" + " movq 0(%%rsi), %%rdx;" + " mulxq 0(%%rcx), %%r8, %%r9;" + " xor %%r10, %%r10;" + " movq %%r8, 0(%%rdi);" + " mulxq 8(%%rcx), %%r10, %%r11;" + " adox %%r9, %%r10;" + " movq %%r10, 8(%%rdi);" + " mulxq 16(%%rcx), %%r12, %%r13;" + " adox %%r11, %%r12;" + " mulxq 24(%%rcx), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " movq 8(%%rsi), %%rdx;" + " mulxq 0(%%rcx), %%r8, %%r9;" + " xor %%r10, %%r10;" + " adcxq 8(%%rdi), %%r8;" + " movq %%r8, 8(%%rdi);" + " mulxq 8(%%rcx), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%r12, %%r10;" + " movq %%r10, 16(%%rdi);" + " mulxq 16(%%rcx), %%r12, %%r13;" + " adox %%r11, %%r12;" + " adcx %%r14, %%r12;" + " mov $0, %%r8;" + " mulxq 24(%%rcx), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq 16(%%rsi), %%rdx;" + " mulxq 0(%%rcx), %%r8, %%r9;" + " xor %%r10, %%r10;" + " adcxq 16(%%rdi), %%r8;" + " movq %%r8, 16(%%rdi);" + " mulxq 8(%%rcx), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%r12, %%r10;" + " movq %%r10, 24(%%rdi);" + " mulxq 16(%%rcx), %%r12, %%r13;" + " adox %%r11, %%r12;" + " adcx %%r14, %%r12;" + " mov $0, %%r8;" + " mulxq 24(%%rcx), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq 24(%%rsi), %%rdx;" + " mulxq 0(%%rcx), %%r8, %%r9;" + " xor %%r10, %%r10;" + " adcxq 24(%%rdi), %%r8;" + " movq %%r8, 24(%%rdi);" + 
" mulxq 8(%%rcx), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%r12, %%r10;" + " movq %%r10, 32(%%rdi);" + " mulxq 16(%%rcx), %%r12, %%r13;" + " adox %%r11, %%r12;" + " adcx %%r14, %%r12;" + " movq %%r12, 40(%%rdi);" + " mov $0, %%r8;" + " mulxq 24(%%rcx), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " movq %%r14, 48(%%rdi);" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq %%rax, 56(%%rdi);" + " mov %%rdi, %%rsi;" + " mov %%r15, %%rdi;" + " mov $38, %%rdx;" + " mulxq 32(%%rsi), %%r8, %%r13;" + " xor %%rcx, %%rcx;" + " adoxq 0(%%rsi), %%r8;" + " mulxq 40(%%rsi), %%r9, %%r12;" + " adcx %%r13, %%r9;" + " adoxq 8(%%rsi), %%r9;" + " mulxq 48(%%rsi), %%r10, %%r13;" + " adcx %%r12, %%r10;" + " adoxq 16(%%rsi), %%r10;" + " mulxq 56(%%rsi), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 24(%%rsi), %%r11;" + " adcx %%rcx, %%rax;" + " adox %%rcx, %%rax;" + " imul %%rdx, %%rax;" + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 8(%%rdi);" + " adcx %%rcx, %%r10;" + " movq %%r10, 16(%%rdi);" + " adcx %%rcx, %%r11;" + " movq %%r11, 24(%%rdi);" + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%%rdi);" + : "+r" (arg0_r), "+r" (arg1_r), "+r" (arg2_r), "+r" (arg3_r) + : + : "%rax", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc" + ); +} + +static inline void fmul2 (uint64_t* arg2, uint64_t* arg1, uint64_t* arg3, uint64_t* arg0) { + register uint64_t* arg0_r asm("rdi") = arg0; + register uint64_t* arg1_r asm("rsi") = arg1; + register uint64_t* arg2_r asm("rdx") = arg2; + register uint64_t* arg3_r asm("rcx") = arg3; + + __asm__ __volatile__( + " mov %%rdx, %%r15;" + " movq 0(%%rsi), %%rdx;" + " mulxq 0(%%rcx), %%r8, %%r9;" + " xor %%r10, %%r10;" + " movq %%r8, 0(%%rdi);" + " mulxq 8(%%rcx), %%r10, %%r11;" + " adox %%r9, %%r10;" + " movq %%r10, 8(%%rdi);" + " mulxq 16(%%rcx), %%r12, %%r13;" + " adox %%r11, %%r12;" + " mulxq 24(%%rcx), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " movq 8(%%rsi), %%rdx;" + " mulxq 0(%%rcx), %%r8, %%r9;" + " xor %%r10, %%r10;" + " adcxq 8(%%rdi), %%r8;" + " movq %%r8, 8(%%rdi);" + " mulxq 8(%%rcx), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%r12, %%r10;" + " movq %%r10, 16(%%rdi);" + " mulxq 16(%%rcx), %%r12, %%r13;" + " adox %%r11, %%r12;" + " adcx %%r14, %%r12;" + " mov $0, %%r8;" + " mulxq 24(%%rcx), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq 16(%%rsi), %%rdx;" + " mulxq 0(%%rcx), %%r8, %%r9;" + " xor %%r10, %%r10;" + " adcxq 16(%%rdi), %%r8;" + " movq %%r8, 16(%%rdi);" + " mulxq 8(%%rcx), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%r12, %%r10;" + " movq %%r10, 24(%%rdi);" + " mulxq 16(%%rcx), %%r12, %%r13;" + " adox %%r11, %%r12;" + " adcx %%r14, %%r12;" + " mov $0, %%r8;" + " mulxq 24(%%rcx), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq 24(%%rsi), %%rdx;" + " mulxq 0(%%rcx), %%r8, %%r9;" + " xor %%r10, %%r10;" + " adcxq 24(%%rdi), %%r8;" + " movq %%r8, 24(%%rdi);" + " mulxq 8(%%rcx), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%r12, %%r10;" + " movq %%r10, 32(%%rdi);" + " mulxq 16(%%rcx), %%r12, %%r13;" + " adox %%r11, %%r12;" + " adcx %%r14, %%r12;" + " movq %%r12, 40(%%rdi);" + " mov $0, %%r8;" + " mulxq 24(%%rcx), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " movq %%r14, 48(%%rdi);" + " 
mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq %%rax, 56(%%rdi);" + " movq 32(%%rsi), %%rdx;" + " mulxq 32(%%rcx), %%r8, %%r9;" + " xor %%r10, %%r10;" + " movq %%r8, 64(%%rdi);" + " mulxq 40(%%rcx), %%r10, %%r11;" + " adox %%r9, %%r10;" + " movq %%r10, 72(%%rdi);" + " mulxq 48(%%rcx), %%r12, %%r13;" + " adox %%r11, %%r12;" + " mulxq 56(%%rcx), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " movq 40(%%rsi), %%rdx;" + " mulxq 32(%%rcx), %%r8, %%r9;" + " xor %%r10, %%r10;" + " adcxq 72(%%rdi), %%r8;" + " movq %%r8, 72(%%rdi);" + " mulxq 40(%%rcx), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%r12, %%r10;" + " movq %%r10, 80(%%rdi);" + " mulxq 48(%%rcx), %%r12, %%r13;" + " adox %%r11, %%r12;" + " adcx %%r14, %%r12;" + " mov $0, %%r8;" + " mulxq 56(%%rcx), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq 48(%%rsi), %%rdx;" + " mulxq 32(%%rcx), %%r8, %%r9;" + " xor %%r10, %%r10;" + " adcxq 80(%%rdi), %%r8;" + " movq %%r8, 80(%%rdi);" + " mulxq 40(%%rcx), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%r12, %%r10;" + " movq %%r10, 88(%%rdi);" + " mulxq 48(%%rcx), %%r12, %%r13;" + " adox %%r11, %%r12;" + " adcx %%r14, %%r12;" + " mov $0, %%r8;" + " mulxq 56(%%rcx), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq 56(%%rsi), %%rdx;" + " mulxq 32(%%rcx), %%r8, %%r9;" + " xor %%r10, %%r10;" + " adcxq 88(%%rdi), %%r8;" + " movq %%r8, 88(%%rdi);" + " mulxq 40(%%rcx), %%r10, %%r11;" + " adox %%r9, %%r10;" + " adcx %%r12, %%r10;" + " movq %%r10, 96(%%rdi);" + " mulxq 48(%%rcx), %%r12, %%r13;" + " adox %%r11, %%r12;" + " adcx %%r14, %%r12;" + " movq %%r12, 104(%%rdi);" + " mov $0, %%r8;" + " mulxq 56(%%rcx), %%r14, %%rdx;" + " adox %%r13, %%r14;" + " adcx %%rax, %%r14;" + " movq %%r14, 112(%%rdi);" + " mov $0, %%rax;" + " adox %%rdx, %%rax;" + " adcx %%r8, %%rax;" + " movq %%rax, 120(%%rdi);" + " mov %%rdi, %%rsi;" + " mov %%r15, %%rdi;" + " mov $38, %%rdx;" + " mulxq 32(%%rsi), %%r8, %%r13;" + " xor %%rcx, %%rcx;" + " adoxq 0(%%rsi), %%r8;" + " mulxq 40(%%rsi), %%r9, %%r12;" + " adcx %%r13, %%r9;" + " adoxq 8(%%rsi), %%r9;" + " mulxq 48(%%rsi), %%r10, %%r13;" + " adcx %%r12, %%r10;" + " adoxq 16(%%rsi), %%r10;" + " mulxq 56(%%rsi), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 24(%%rsi), %%r11;" + " adcx %%rcx, %%rax;" + " adox %%rcx, %%rax;" + " imul %%rdx, %%rax;" + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 8(%%rdi);" + " adcx %%rcx, %%r10;" + " movq %%r10, 16(%%rdi);" + " adcx %%rcx, %%r11;" + " movq %%r11, 24(%%rdi);" + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%%rdi);" + " mov $38, %%rdx;" + " mulxq 96(%%rsi), %%r8, %%r13;" + " xor %%rcx, %%rcx;" + " adoxq 64(%%rsi), %%r8;" + " mulxq 104(%%rsi), %%r9, %%r12;" + " adcx %%r13, %%r9;" + " adoxq 72(%%rsi), %%r9;" + " mulxq 112(%%rsi), %%r10, %%r13;" + " adcx %%r12, %%r10;" + " adoxq 80(%%rsi), %%r10;" + " mulxq 120(%%rsi), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 88(%%rsi), %%r11;" + " adcx %%rcx, %%rax;" + " adox %%rcx, %%rax;" + " imul %%rdx, %%rax;" + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 40(%%rdi);" + " adcx %%rcx, %%r10;" + " movq %%r10, 48(%%rdi);" + " adcx %%rcx, %%r11;" + " movq %%r11, 56(%%rdi);" + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 32(%%rdi);" + : "+r" (arg0_r), "+r" 
(arg1_r), "+r" (arg2_r), "+r" (arg3_r) + : + : "%rax", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc" + ); +} + +static inline void fmul1 (uint64_t* arg0, uint64_t* arg1, uint64_t arg2) { + register uint64_t* arg0_r asm("rdi") = arg0; + register uint64_t* arg1_r asm("rsi") = arg1; + register uint64_t arg2_r asm("rdx") = arg2; + + __asm__ __volatile__( + " mulxq 0(%%rsi), %%r8, %%rcx;" + " mulxq 8(%%rsi), %%r9, %%r12;" + " add %%rcx, %%r9;" + " mov $0, %%rcx;" + " mulxq 16(%%rsi), %%r10, %%r13;" + " adcx %%r12, %%r10;" + " mulxq 24(%%rsi), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adcx %%rcx, %%rax;" + " mov $38, %%rdx;" + " imul %%rdx, %%rax;" + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 8(%%rdi);" + " adcx %%rcx, %%r10;" + " movq %%r10, 16(%%rdi);" + " adcx %%rcx, %%r11;" + " movq %%r11, 24(%%rdi);" + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%%rdi);" + : "+r" (arg2_r) + : "r" (arg0_r), "r" (arg1_r) + : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "memory", "cc" + ); +} + +static inline void cswap2 (uint64_t arg2,uint64_t* arg0, uint64_t* arg1) { + register uint64_t* arg0_r asm("rdi") = arg0; + register uint64_t* arg1_r asm("rsi") = arg1; + register uint64_t arg2_r asm("rdx") = arg2; + + __asm__ __volatile__( + " add $18446744073709551615, %%rdx;" + " movq 0(%%rdi), %%r8;" + " movq 0(%%rsi), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 0(%%rdi);" + " movq %%r9, 0(%%rsi);" + " movq 8(%%rdi), %%r8;" + " movq 8(%%rsi), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 8(%%rdi);" + " movq %%r9, 8(%%rsi);" + " movq 16(%%rdi), %%r8;" + " movq 16(%%rsi), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 16(%%rdi);" + " movq %%r9, 16(%%rsi);" + " movq 24(%%rdi), %%r8;" + " movq 24(%%rsi), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 24(%%rdi);" + " movq %%r9, 24(%%rsi);" + " movq 32(%%rdi), %%r8;" + " movq 32(%%rsi), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 32(%%rdi);" + " movq %%r9, 32(%%rsi);" + " movq 40(%%rdi), %%r8;" + " movq 40(%%rsi), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 40(%%rdi);" + " movq %%r9, 40(%%rsi);" + " movq 48(%%rdi), %%r8;" + " movq 48(%%rsi), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 48(%%rdi);" + " movq %%r9, 48(%%rsi);" + " movq 56(%%rdi), %%r8;" + " movq 56(%%rsi), %%r9;" + " mov %%r8, %%r10;" + " cmovc %%r9, %%r8;" + " cmovc %%r10, %%r9;" + " movq %%r8, 56(%%rdi);" + " movq %%r9, 56(%%rsi);" + : "+r" (arg2_r) + : "r" (arg0_r), "r" (arg1_r) + : "%r8", "%r9", "%r10", "memory", "cc" + ); +} + +static inline void fsqr (uint64_t* arg2, uint64_t* arg1, uint64_t* arg0) { + register uint64_t* arg0_r asm("rdi") = arg0; + register uint64_t* arg1_r asm("rsi") = arg1; + register uint64_t* arg2_r asm("rdx") = arg2; + + __asm__ __volatile__( + " mov %%rdx, %%rbx;" + " movq 0(%%rsi), %%rdx;" + " mulxq 8(%%rsi), %%r8, %%r14;" + " xor %%r15, %%r15;" + " mulxq 16(%%rsi), %%r9, %%r10;" + " adcx %%r14, %%r9;" + " mulxq 24(%%rsi), %%rax, %%rcx;" + " adcx %%rax, %%r10;" + " movq 24(%%rsi), %%rdx;" + " mulxq 8(%%rsi), %%r11, %%r12;" + " adcx %%rcx, %%r11;" + " mulxq 16(%%rsi), %%rax, %%r13;" + " adcx %%rax, %%r12;" + " movq 8(%%rsi), %%rdx;" + " adcx %%r15, %%r13;" + " 
mulxq 16(%%rsi), %%rax, %%rcx;" + " mov $0, %%r14;" + " xor %%r15, %%r15;" + " adox %%rax, %%r10;" + " adcx %%r8, %%r8;" + " adox %%rcx, %%r11;" + " adcx %%r9, %%r9;" + " adox %%r15, %%r12;" + " adcx %%r10, %%r10;" + " adox %%r15, %%r13;" + " adcx %%r11, %%r11;" + " adox %%r15, %%r14;" + " adcx %%r12, %%r12;" + " adcx %%r13, %%r13;" + " adcx %%r14, %%r14;" + " movq 0(%%rsi), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" + " movq %%rax, 0(%%rdi);" + " add %%rcx, %%r8;" + " movq %%r8, 8(%%rdi);" + " movq 8(%%rsi), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" + " adcx %%rax, %%r9;" + " movq %%r9, 16(%%rdi);" + " adcx %%rcx, %%r10;" + " movq %%r10, 24(%%rdi);" + " movq 16(%%rsi), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" + " adcx %%rax, %%r11;" + " movq %%r11, 32(%%rdi);" + " adcx %%rcx, %%r12;" + " movq %%r12, 40(%%rdi);" + " movq 24(%%rsi), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" + " adcx %%rax, %%r13;" + " movq %%r13, 48(%%rdi);" + " adcx %%rcx, %%r14;" + " movq %%r14, 56(%%rdi);" + " mov %%rdi, %%rsi;" + " mov %%rbx, %%rdi;" + " mov $38, %%rdx;" + " mulxq 32(%%rsi), %%r8, %%r13;" + " xor %%rcx, %%rcx;" + " adoxq 0(%%rsi), %%r8;" + " mulxq 40(%%rsi), %%r9, %%r12;" + " adcx %%r13, %%r9;" + " adoxq 8(%%rsi), %%r9;" + " mulxq 48(%%rsi), %%r10, %%r13;" + " adcx %%r12, %%r10;" + " adoxq 16(%%rsi), %%r10;" + " mulxq 56(%%rsi), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 24(%%rsi), %%r11;" + " adcx %%rcx, %%rax;" + " adox %%rcx, %%rax;" + " imul %%rdx, %%rax;" + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 8(%%rdi);" + " adcx %%rcx, %%r10;" + " movq %%r10, 16(%%rdi);" + " adcx %%rcx, %%r11;" + " movq %%r11, 24(%%rdi);" + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%%rdi);" + : "+r" (arg0_r), "+r" (arg1_r), "+r" (arg2_r) + : + : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc" + ); +} + +static inline void fsqr2 (uint64_t* arg2, uint64_t* arg1, uint64_t* arg0) { + register uint64_t* arg0_r asm("rdi") = arg0; + register uint64_t* arg1_r asm("rsi") = arg1; + register uint64_t* arg2_r asm("rdx") = arg2; + + __asm__ __volatile__( + " mov %%rdx, %%rbx;" + " movq 0(%%rsi), %%rdx;" + " mulxq 8(%%rsi), %%r8, %%r14;" + " xor %%r15, %%r15;" + " mulxq 16(%%rsi), %%r9, %%r10;" + " adcx %%r14, %%r9;" + " mulxq 24(%%rsi), %%rax, %%rcx;" + " adcx %%rax, %%r10;" + " movq 24(%%rsi), %%rdx;" + " mulxq 8(%%rsi), %%r11, %%r12;" + " adcx %%rcx, %%r11;" + " mulxq 16(%%rsi), %%rax, %%r13;" + " adcx %%rax, %%r12;" + " movq 8(%%rsi), %%rdx;" + " adcx %%r15, %%r13;" + " mulxq 16(%%rsi), %%rax, %%rcx;" + " mov $0, %%r14;" + " xor %%r15, %%r15;" + " adox %%rax, %%r10;" + " adcx %%r8, %%r8;" + " adox %%rcx, %%r11;" + " adcx %%r9, %%r9;" + " adox %%r15, %%r12;" + " adcx %%r10, %%r10;" + " adox %%r15, %%r13;" + " adcx %%r11, %%r11;" + " adox %%r15, %%r14;" + " adcx %%r12, %%r12;" + " adcx %%r13, %%r13;" + " adcx %%r14, %%r14;" + " movq 0(%%rsi), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" + " movq %%rax, 0(%%rdi);" + " add %%rcx, %%r8;" + " movq %%r8, 8(%%rdi);" + " movq 8(%%rsi), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" + " adcx %%rax, %%r9;" + " movq %%r9, 16(%%rdi);" + " adcx %%rcx, %%r10;" + " movq %%r10, 24(%%rdi);" + " movq 16(%%rsi), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" + " adcx %%rax, %%r11;" + " movq %%r11, 32(%%rdi);" + " adcx %%rcx, %%r12;" + " movq %%r12, 40(%%rdi);" + " movq 24(%%rsi), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" + " adcx %%rax, %%r13;" + " movq %%r13, 48(%%rdi);" + " adcx %%rcx, %%r14;" + " movq %%r14, 56(%%rdi);" + " 
movq 32(%%rsi), %%rdx;" + " mulxq 40(%%rsi), %%r8, %%r14;" + " xor %%r15, %%r15;" + " mulxq 48(%%rsi), %%r9, %%r10;" + " adcx %%r14, %%r9;" + " mulxq 56(%%rsi), %%rax, %%rcx;" + " adcx %%rax, %%r10;" + " movq 56(%%rsi), %%rdx;" + " mulxq 40(%%rsi), %%r11, %%r12;" + " adcx %%rcx, %%r11;" + " mulxq 48(%%rsi), %%rax, %%r13;" + " adcx %%rax, %%r12;" + " movq 40(%%rsi), %%rdx;" + " adcx %%r15, %%r13;" + " mulxq 48(%%rsi), %%rax, %%rcx;" + " mov $0, %%r14;" + " xor %%r15, %%r15;" + " adox %%rax, %%r10;" + " adcx %%r8, %%r8;" + " adox %%rcx, %%r11;" + " adcx %%r9, %%r9;" + " adox %%r15, %%r12;" + " adcx %%r10, %%r10;" + " adox %%r15, %%r13;" + " adcx %%r11, %%r11;" + " adox %%r15, %%r14;" + " adcx %%r12, %%r12;" + " adcx %%r13, %%r13;" + " adcx %%r14, %%r14;" + " movq 32(%%rsi), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" + " movq %%rax, 64(%%rdi);" + " add %%rcx, %%r8;" + " movq %%r8, 72(%%rdi);" + " movq 40(%%rsi), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" + " adcx %%rax, %%r9;" + " movq %%r9, 80(%%rdi);" + " adcx %%rcx, %%r10;" + " movq %%r10, 88(%%rdi);" + " movq 48(%%rsi), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" + " adcx %%rax, %%r11;" + " movq %%r11, 96(%%rdi);" + " adcx %%rcx, %%r12;" + " movq %%r12, 104(%%rdi);" + " movq 56(%%rsi), %%rdx;" + " mulx %%rdx, %%rax, %%rcx;" + " adcx %%rax, %%r13;" + " movq %%r13, 112(%%rdi);" + " adcx %%rcx, %%r14;" + " movq %%r14, 120(%%rdi);" + " mov %%rdi, %%rsi;" + " mov %%rbx, %%rdi;" + " mov $38, %%rdx;" + " mulxq 32(%%rsi), %%r8, %%r13;" + " xor %%rcx, %%rcx;" + " adoxq 0(%%rsi), %%r8;" + " mulxq 40(%%rsi), %%r9, %%r12;" + " adcx %%r13, %%r9;" + " adoxq 8(%%rsi), %%r9;" + " mulxq 48(%%rsi), %%r10, %%r13;" + " adcx %%r12, %%r10;" + " adoxq 16(%%rsi), %%r10;" + " mulxq 56(%%rsi), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 24(%%rsi), %%r11;" + " adcx %%rcx, %%rax;" + " adox %%rcx, %%rax;" + " imul %%rdx, %%rax;" + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 8(%%rdi);" + " adcx %%rcx, %%r10;" + " movq %%r10, 16(%%rdi);" + " adcx %%rcx, %%r11;" + " movq %%r11, 24(%%rdi);" + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 0(%%rdi);" + " mov $38, %%rdx;" + " mulxq 96(%%rsi), %%r8, %%r13;" + " xor %%rcx, %%rcx;" + " adoxq 64(%%rsi), %%r8;" + " mulxq 104(%%rsi), %%r9, %%r12;" + " adcx %%r13, %%r9;" + " adoxq 72(%%rsi), %%r9;" + " mulxq 112(%%rsi), %%r10, %%r13;" + " adcx %%r12, %%r10;" + " adoxq 80(%%rsi), %%r10;" + " mulxq 120(%%rsi), %%r11, %%rax;" + " adcx %%r13, %%r11;" + " adoxq 88(%%rsi), %%r11;" + " adcx %%rcx, %%rax;" + " adox %%rcx, %%rax;" + " imul %%rdx, %%rax;" + " add %%rax, %%r8;" + " adcx %%rcx, %%r9;" + " movq %%r9, 40(%%rdi);" + " adcx %%rcx, %%r10;" + " movq %%r10, 48(%%rdi);" + " adcx %%rcx, %%r11;" + " movq %%r11, 56(%%rdi);" + " mov $0, %%rax;" + " cmovc %%rdx, %%rax;" + " add %%rax, %%r8;" + " movq %%r8, 32(%%rdi);" + : "+r" (arg0_r), "+r" (arg1_r), "+r" (arg2_r) + : + : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc" + ); +} + +#endif diff --git a/main.c b/main.c index cbeae08..9ada3d3 100644 --- a/main.c +++ b/main.c @@ -63,7 +63,8 @@ static __always_inline int name(void) \ declare_it(donna64) -declare_it(hacl64) +declare_it(evercrypt64) +declare_it(hacl51) declare_it(fiat64) declare_it(sandy2x) declare_it(amd64) @@ -86,7 +87,10 @@ static bool verify(void) for (i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) { test_it(donna64, {}, {}); - test_it(hacl64, {}, {}); + test_it(hacl51, {}, {}); + if (boot_cpu_has(X86_FEATURE_BMI2) && 
+#endif
diff --git a/main.c b/main.c
index cbeae08..9ada3d3 100644
--- a/main.c
+++ b/main.c
@@ -63,7 +63,8 @@ static __always_inline int name(void) \
 }
 
 declare_it(donna64)
-declare_it(hacl64)
+declare_it(evercrypt64)
+declare_it(hacl51)
 declare_it(fiat64)
 declare_it(sandy2x)
 declare_it(amd64)
@@ -86,7 +87,10 @@ static bool verify(void)
 
     for (i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) {
         test_it(donna64, {}, {});
-        test_it(hacl64, {}, {});
+        test_it(hacl51, {}, {});
+        if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
+            test_it(evercrypt64, {}, {});
+
         test_it(fiat64, {}, {});
         if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
             test_it(sandy2x, kernel_fpu_begin(), kernel_fpu_end());
@@ -109,7 +113,8 @@ static int __init mod_init(void)
     int ret = 0, i;
     cycles_t *trial_times;
     cycles_t median_donna64 = 0;
-    cycles_t median_hacl64 = 0;
+    cycles_t median_evercrypt64 = 0;
+    cycles_t median_hacl51 = 0;
     cycles_t median_fiat64 = 0;
     cycles_t median_sandy2x = 0;
     cycles_t median_amd64 = 0;
@@ -133,7 +138,8 @@ static int __init mod_init(void)
        spin_lock_irqsave(&lock, flags);
 
        do_it(donna64);
-       do_it(hacl64);
+       do_it(evercrypt64);
+       do_it(hacl51);
        do_it(fiat64);
        if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
            kernel_fpu_begin();
@@ -152,20 +158,22 @@ static int __init mod_init(void)
 
     spin_unlock_irqrestore(&lock, flags);
 
+    report_it(tweetnacl);
+    report_it(donna32);
+    report_it(fiat32);
     report_it(donna64);
-    report_it(hacl64);
     report_it(fiat64);
+    if (dangerous)
+        report_it(amd64);
     if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
        report_it(sandy2x);
+    report_it(hacl51);
     if (boot_cpu_has(X86_FEATURE_BMI2))
        report_it(precomp_bmi2);
     if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
        report_it(precomp_adx);
-    if (dangerous)
-        report_it(amd64);
-    report_it(fiat32);
-    report_it(donna32);
-    report_it(tweetnacl);
+    if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX))
+        report_it(evercrypt64);
 
     /* Don't let compiler be too clever. */
     dummy = ret;
-- 
cgit v1.2.3-59-g8ed1b