author	Jason A. Donenfeld <Jason@zx2c4.com>	2018-02-01 19:14:08 +0100
committer	Jason A. Donenfeld <Jason@zx2c4.com>	2018-02-01 19:14:08 +0100
commit	82357073da69113f86ad5fedfc61ba141918749a (patch)
tree	c59792e02d66487015dddaf17364da074c131f2e
parent	Do reduction in assembly with cmov (diff)
hacl64: optimizations
-rw-r--r--	curve25519-hacl64.c	92
1 file changed, 52 insertions(+), 40 deletions(-)
diff --git a/curve25519-hacl64.c b/curve25519-hacl64.c
index af2460b..2f1ba14 100644
--- a/curve25519-hacl64.c
+++ b/curve25519-hacl64.c
@@ -76,19 +76,45 @@ static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
static __always_inline void fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s)
{
- u32 i;
- for (i = 0; i < 5; ++i) {
- u128 xi = output[i];
- u64 yi = input[i];
- output[i] = ((xi) + (((u128)(yi) * (s))));
- }
+ output[0] += (u128)input[0] * s;
+ output[1] += (u128)input[1] * s;
+ output[2] += (u128)input[2] * s;
+ output[3] += (u128)input[3] * s;
+ output[4] += (u128)input[4] * s;
}
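
The change above unrolls a fixed five-iteration loop into straight-line code. A minimal standalone sketch of the same multiply-accumulate, with the kernel's u64/u128 types spelled out so it compiles on its own (muladd5 is an illustrative name, not from the patch; unsigned __int128 is the GCC/Clang extension the kernel type wraps):

#include <stdint.h>

typedef uint64_t u64;
typedef unsigned __int128 u128;

/* out[i] += in[i] * s over five limbs, widened to 128 bits so the
 * per-limb products cannot overflow. With the trip count fixed at
 * five, straight-line code lets the compiler keep all limbs in
 * registers instead of materializing loop control. */
static void muladd5(u128 out[5], const u64 in[5], u64 s)
{
	out[0] += (u128)in[0] * s;
	out[1] += (u128)in[1] * s;
	out[2] += (u128)in[2] * s;
	out[3] += (u128)in[3] * s;
	out[4] += (u128)in[4] * s;
}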
static __always_inline void fproduct_carry_wide_(u128 *tmp)
{
- u32 i;
- for (i = 0; i < 4; ++i) {
- u32 ctr = i;
+ {
+ u32 ctr = 0;
+ u128 tctr = tmp[ctr];
+ u128 tctrp1 = tmp[ctr + 1];
+ u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
+ u128 c = ((tctr) >> (51));
+ tmp[ctr] = ((u128)(r0));
+ tmp[ctr + 1] = ((tctrp1) + (c));
+ }
+ {
+ u32 ctr = 1;
+ u128 tctr = tmp[ctr];
+ u128 tctrp1 = tmp[ctr + 1];
+ u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
+ u128 c = ((tctr) >> (51));
+ tmp[ctr] = ((u128)(r0));
+ tmp[ctr + 1] = ((tctrp1) + (c));
+ }
+
+ {
+ u32 ctr = 2;
+ u128 tctr = tmp[ctr];
+ u128 tctrp1 = tmp[ctr + 1];
+ u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
+ u128 c = ((tctr) >> (51));
+ tmp[ctr] = ((u128)(r0));
+ tmp[ctr + 1] = ((tctrp1) + (c));
+ }
+ {
+ u32 ctr = 3;
u128 tctr = tmp[ctr];
u128 tctrp1 = tmp[ctr + 1];
u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
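
Each unrolled block is one radix-2^51 carry step: the mask 0x7ffffffffffff is 2^51 - 1, so the low 51 bits stay in the current limb and everything above them moves into the next. A hedged standalone sketch of a single step, reusing the typedefs from the sketch above (carry_step51 is an illustrative name):

/* Keep the low 51 bits of limb i and add the overflow into limb
 * i + 1; after steps 0..3 every limb but the last is reduced. */
static void carry_step51(u128 tmp[5], unsigned int i)
{
	u128 t = tmp[i];

	tmp[i] = (u64)t & 0x7ffffffffffffULL;	/* low 51 bits */
	tmp[i + 1] += t >> 51;			/* carry out */
}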
@@ -169,12 +195,7 @@ static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21)
u64 i1;
u64 i0_;
u64 i1_;
- u128 t[5];
- {
- u32 _i;
- for (_i = 0; _i < 5; ++_i)
- t[_i] = ((u128)(0));
- }
+ u128 t[5] = { 0 };
fmul_mul_shift_reduce_(t, tmp, input21);
fproduct_carry_wide_(t);
b4 = t[4];
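
The replacement initializer relies on a standard C rule: array elements beyond the explicit initializers are implicitly zero-initialized, so { 0 } zeroes all five limbs and the removed helper loop did nothing extra. Illustratively:

u128 t[5] = { 0 };	/* t[0] from the list; t[1..4] implicitly zero */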
@@ -244,22 +265,17 @@ static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output)
output[1] = i1_;
}
-static __always_inline void fsquare_fsquare_times_(u64 *input, u128 *tmp, u32 count1)
+static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp, u32 count1)
{
u32 i;
- fsquare_fsquare_(tmp, input);
+ fsquare_fsquare_(tmp, output);
for (i = 1; i < count1; ++i)
- fsquare_fsquare_(tmp, input);
+ fsquare_fsquare_(tmp, output);
}
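
The rename from input to output reflects what the function actually does: it squares the field element in place, count1 times, and the caller reads the result back from the same buffer. n squarings compute z^(2^n) in GF(2^255 - 19), the building block of the Fermat inversion ladder in crecip_crecip. A hedged usage sketch, assuming the surrounding file's definitions (the wrapper name here is illustrative):

/* out = z^(2^2) = z^4: two in-place squarings via the wrapper
 * below, which first copies z into out. */
static void square_twice_example(u64 out[5], u64 z[5])
{
	fsquare_fsquare_times(out, z, 2);
}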
static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input, u32 count1)
{
u128 t[5];
- {
- u32 _i;
- for (_i = 0; _i < 5; ++_i)
- t[_i] = ((u128)(0));
- }
memcpy(output, input, 5 * sizeof(*input));
fsquare_fsquare_times_(output, t, count1);
}
@@ -267,11 +283,6 @@ static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input, u32 c
static __always_inline void fsquare_fsquare_times_inplace(u64 *output, u32 count1)
{
u128 t[5];
- {
- u32 _i;
- for (_i = 0; _i < 5; ++_i)
- t[_i] = ((u128)(0));
- }
fsquare_fsquare_times_(output, t, count1);
}
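
This deletion and the matching one in fsquare_fsquare_times above remove the same dead store: assuming fsquare_fsquare_ writes every element of tmp before reading any of it (the property that lets the commit drop the zero-fill), the scratch array needs no preparation. The surviving pattern, sketched with the same assumed types:

/* Scratch that the callee wholly produces needs no pre-zeroing;
 * t is deliberately left uninitialized. (Assumes fsquare_fsquare_
 * fully overwrites tmp before reading it.) */
static void inplace_example(u64 x[5], u32 n)
{
	u128 t[5];

	fsquare_fsquare_times_(x, t, n);
}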
@@ -321,12 +332,11 @@ static __always_inline void crecip_crecip(u64 *out, u64 *z)
static __always_inline void fsum(u64 *a, u64 *b)
{
- u32 i;
- for (i = 0; i < 5; ++i) {
- u64 xi = a[i];
- u64 yi = b[i];
- a[i] = xi + yi;
- }
+ a[0] += b[0];
+ a[1] += b[1];
+ a[2] += b[2];
+ a[3] += b[3];
+ a[4] += b[4];
}
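
fsum adds limb for limb with no carry handling: in the radix-2^51 representation the limbs sit well below 2^64, so the per-limb sums cannot wrap before a later carry pass (a property of the representation, assumed here rather than stated in the patch). The unrolled form as a standalone sketch:

static void fsum5(u64 a[5], const u64 b[5])
{
	/* Limb-wise addition; carry propagation is deferred to a
	 * separate reduction pass, relying on u64 headroom. */
	a[0] += b[0];
	a[1] += b[1];
	a[2] += b[2];
	a[3] += b[3];
	a[4] += b[4];
}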
static __always_inline void fdifference(u64 *a, u64 *b)
@@ -434,18 +444,20 @@ static __always_inline void point_swap_conditional_step(u64 *a, u64 *b, u64 swap
b[i] = bi1;
}
-static __always_inline void point_swap_conditional_(u64 *a, u64 *b, u64 swap1, u32 ctr)
+static __always_inline void point_swap_conditional5(u64 *a, u64 *b, u64 swap1)
{
- u32 i;
- for (i = ctr; i > 0; --i)
- point_swap_conditional_step(a, b, swap1, i);
+ point_swap_conditional_step(a, b, swap1, 5);
+ point_swap_conditional_step(a, b, swap1, 4);
+ point_swap_conditional_step(a, b, swap1, 3);
+ point_swap_conditional_step(a, b, swap1, 2);
+ point_swap_conditional_step(a, b, swap1, 1);
}
static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap)
{
u64 swap1 = 0 - iswap;
- point_swap_conditional_(a, b, swap1, 5);
- point_swap_conditional_(a + 5, b + 5, swap1, 5);
+ point_swap_conditional5(a, b, swap1);
+ point_swap_conditional5(a + 5, b + 5, swap1);
}
static __always_inline void point_copy(u64 *output, u64 *input)
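
point_swap_conditional stretches iswap (0 or 1) into the mask swap1 = 0 - iswap, i.e. all-zero or all-one bits, and each point_swap_conditional_step then exchanges one limb pair without branching on secret data, which keeps the Montgomery ladder constant-time. A hedged sketch of the masking idiom one step boils down to (cswap_limb is an illustrative name, not the patch's function):

static void cswap_limb(u64 *a, u64 *b, u64 mask)
{
	/* x is zero when mask is zero, so the pair is swapped only
	 * when mask is all ones; no secret-dependent branch. */
	u64 x = mask & (*a ^ *b);

	*a ^= x;
	*b ^= x;
}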