From 82357073da69113f86ad5fedfc61ba141918749a Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld" <Jason@zx2c4.com>
Date: Thu, 1 Feb 2018 19:14:08 +0100
Subject: hacl64: optimizations

Unroll the fixed-count loops in fproduct_sum_scalar_multiplication_,
fproduct_carry_wide_, and fsum, and replace point_swap_conditional_'s
counted loop with an unrolled point_swap_conditional5. Initialize
fmul_fmul's wide temporary with { 0 } instead of an explicit loop, and
stop zeroing the temporary in the fsquare_fsquare_times helpers
entirely, since fsquare_fsquare_ fully writes it before it is read.
Also rename fsquare_fsquare_times_'s first parameter to output, since
it is written to.
---
 curve25519-hacl64.c | 92 ++++++++++++++++++++++++++++++-----------------------
 1 file changed, 52 insertions(+), 40 deletions(-)

diff --git a/curve25519-hacl64.c b/curve25519-hacl64.c
index af2460b..2f1ba14 100644
--- a/curve25519-hacl64.c
+++ b/curve25519-hacl64.c
@@ -76,19 +76,45 @@ static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
 static __always_inline void fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s)
 {
-	u32 i;
-	for (i = 0; i < 5; ++i) {
-		u128 xi = output[i];
-		u64 yi = input[i];
-		output[i] = ((xi) + (((u128)(yi) * (s))));
-	}
+	output[0] += (u128)input[0] * s;
+	output[1] += (u128)input[1] * s;
+	output[2] += (u128)input[2] * s;
+	output[3] += (u128)input[3] * s;
+	output[4] += (u128)input[4] * s;
 }
 
 static __always_inline void fproduct_carry_wide_(u128 *tmp)
 {
-	u32 i;
-	for (i = 0; i < 4; ++i) {
-		u32 ctr = i;
+	{
+		u32 ctr = 0;
+		u128 tctr = tmp[ctr];
+		u128 tctrp1 = tmp[ctr + 1];
+		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
+		u128 c = ((tctr) >> (51));
+		tmp[ctr] = ((u128)(r0));
+		tmp[ctr + 1] = ((tctrp1) + (c));
+	}
+	{
+		u32 ctr = 1;
+		u128 tctr = tmp[ctr];
+		u128 tctrp1 = tmp[ctr + 1];
+		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
+		u128 c = ((tctr) >> (51));
+		tmp[ctr] = ((u128)(r0));
+		tmp[ctr + 1] = ((tctrp1) + (c));
+	}
+
+	{
+		u32 ctr = 2;
+		u128 tctr = tmp[ctr];
+		u128 tctrp1 = tmp[ctr + 1];
+		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
+		u128 c = ((tctr) >> (51));
+		tmp[ctr] = ((u128)(r0));
+		tmp[ctr + 1] = ((tctrp1) + (c));
+	}
+	{
+		u32 ctr = 3;
 		u128 tctr = tmp[ctr];
 		u128 tctrp1 = tmp[ctr + 1];
 		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
 		u128 c = ((tctr) >> (51));
@@ -169,12 +195,7 @@ static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21)
 	u64 i1;
 	u64 i0_;
 	u64 i1_;
-	u128 t[5];
-	{
-		u32 _i;
-		for (_i = 0; _i < 5; ++_i)
-			t[_i] = ((u128)(0));
-	}
+	u128 t[5] = { 0 };
 	fmul_mul_shift_reduce_(t, tmp, input21);
 	fproduct_carry_wide_(t);
 	b4 = t[4];
@@ -244,22 +265,17 @@ static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output)
 	output[1] = i1_;
 }
 
-static __always_inline void fsquare_fsquare_times_(u64 *input, u128 *tmp, u32 count1)
+static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp, u32 count1)
 {
 	u32 i;
-	fsquare_fsquare_(tmp, input);
+	fsquare_fsquare_(tmp, output);
 	for (i = 1; i < count1; ++i)
-		fsquare_fsquare_(tmp, input);
+		fsquare_fsquare_(tmp, output);
 }
 
 static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input, u32 count1)
 {
 	u128 t[5];
-	{
-		u32 _i;
-		for (_i = 0; _i < 5; ++_i)
-			t[_i] = ((u128)(0));
-	}
 	memcpy(output, input, 5 * sizeof(*input));
 	fsquare_fsquare_times_(output, t, count1);
 }
@@ -267,11 +283,6 @@ static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input, u32 c
 static __always_inline void fsquare_fsquare_times_inplace(u64 *output, u32 count1)
 {
 	u128 t[5];
-	{
-		u32 _i;
-		for (_i = 0; _i < 5; ++_i)
-			t[_i] = ((u128)(0));
-	}
 	fsquare_fsquare_times_(output, t, count1);
 }
 
@@ -321,12 +332,11 @@ static __always_inline void crecip_crecip(u64 *out, u64 *z)
 
 static __always_inline void fsum(u64 *a, u64 *b)
 {
-	u32 i;
-	for (i = 0; i < 5; ++i) {
-		u64 xi = a[i];
-		u64 yi = b[i];
-		a[i] = xi + yi;
-	}
+	a[0] += b[0];
+	a[1] += b[1];
+	a[2] += b[2];
+	a[3] += b[3];
+	a[4] += b[4];
 }
 
 static __always_inline void fdifference(u64 *a, u64 *b)
@@ -434,18 +444,20 @@ static __always_inline void point_swap_conditional_step(u64 *a, u64 *b, u64 swap
 	b[i] = bi1;
 }
 
-static __always_inline void point_swap_conditional_(u64 *a, u64 *b, u64 swap1, u32 ctr)
+static __always_inline void point_swap_conditional5(u64 *a, u64 *b, u64 swap1)
 {
-	u32 i;
-	for (i = ctr; i > 0; --i)
-		point_swap_conditional_step(a, b, swap1, i);
+	point_swap_conditional_step(a, b, swap1, 5);
+	point_swap_conditional_step(a, b, swap1, 4);
+	point_swap_conditional_step(a, b, swap1, 3);
+	point_swap_conditional_step(a, b, swap1, 2);
+	point_swap_conditional_step(a, b, swap1, 1);
 }
 
 static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap)
 {
 	u64 swap1 = 0 - iswap;
-	point_swap_conditional_(a, b, swap1, 5);
-	point_swap_conditional_(a + 5, b + 5, swap1, 5);
+	point_swap_conditional5(a, b, swap1);
+	point_swap_conditional5(a + 5, b + 5, swap1);
 }
 
 static __always_inline void point_copy(u64 *output, u64 *input)
--
cgit v1.2.3-59-g8ed1b