author    Jason A. Donenfeld <Jason@zx2c4.com>  2020-01-19 01:08:31 +0100
committer Jason A. Donenfeld <Jason@zx2c4.com>  2020-01-19 01:08:31 +0100
commit    c55f2aab716fe1cb25350ae2dfcf2bbca7895ada (patch)
tree      82bff51c4ae8d398a0f2e2221ad75fe1fce63e87
parent    Remove ones we don't care about (diff)
download  kbench9000-jd/curve-comparison-showdown.tar.xz
          kbench9000-jd/curve-comparison-showdown.zip
-rw-r--r--  curve25519-ever64.c | 315
1 file changed, 148 insertions, 167 deletions
diff --git a/curve25519-ever64.c b/curve25519-ever64.c
index c6f4043..aacfa80 100644
--- a/curve25519-ever64.c
+++ b/curve25519-ever64.c
@@ -7,8 +7,6 @@
#include <linux/kernel.h>
#include <linux/string.h>
-enum { CURVE25519_POINT_SIZE = 32 };
-
static __always_inline u64 eq_mask(u64 a, u64 b)
{
u64 x = a ^ b;
@@ -31,21 +29,21 @@ static __always_inline u64 gte_mask(u64 a, u64 b)
return x_xor_q_ - (u64)1U;
}
-// Computes the addition of four-element f1 with value in f2
-// and returns the carry (if any)
+/* Computes the addition of the four-limb f1 with the 64-bit value f2
+ * and returns the carry (if any) */
static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
{
u64 carry_r;
asm volatile(
- // Clear registers to propagate the carry bit
+ /* Clear registers to propagate the carry bit */
" xor %%r8, %%r8;"
" xor %%r9, %%r9;"
" xor %%r10, %%r10;"
" xor %%r11, %%r11;"
" xor %1, %1;"
- // Begin addition chain
+ /* Begin addition chain */
" addq 0(%3), %0;"
" movq %0, 0(%2);"
" adcxq 8(%3), %%r8;"
@@ -55,7 +53,7 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
" adcxq 24(%3), %%r10;"
" movq %%r10, 24(%2);"
- // Return the carry bit in a register
+ /* Return the carry bit in a register */
" adcx %%r11, %1;"
: "+&r" (f2), "=&r" (carry_r)
: "r" (out), "r" (f1)
@@ -65,11 +63,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
return carry_r;
}
-// Computes the field addition of two field elements
+/* Computes the field addition of two field elements */
static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
{
asm volatile(
- // Compute the raw addition of f1 + f2
+ /* Compute the raw addition of f1 + f2 */
" movq 0(%0), %%r8;"
" addq 0(%2), %%r8;"
" movq 8(%0), %%r9;"
@@ -79,14 +77,14 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
" movq 24(%0), %%r11;"
" adcxq 24(%2), %%r11;"
- /////// Wrap the result back into the field //////
+ /* Wrap the result back into the field */
- // Step 1: Compute carry*38
+ /* Step 1: Compute carry*38 */
" mov $0, %%rax;"
" mov $38, %0;"
" cmovc %0, %%rax;"
- // Step 2: Add carry*38 to the original sum
+ /* Step 2: Add carry*38 to the original sum */
" xor %%rcx, %%rcx;"
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
@@ -96,7 +94,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
" adcx %%rcx, %%r11;"
" movq %%r11, 24(%1);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %0, %%rax;"
" add %%rax, %%r8;"
@@ -107,11 +105,11 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
);
}
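
The reduction here relies on 2^255 ≡ 19, hence 2^256 ≡ 38 (mod 2^255 - 19): a carry out of the raw 256-bit sum is worth 38 in the field. A portable sketch of the same three steps (an assumption for illustration, not part of this patch):

static void fadd_ref(u64 *out, const u64 *f1, const u64 *f2)
{
	unsigned __int128 acc = 0;
	int i;

	/* Raw 256-bit addition */
	for (i = 0; i < 4; ++i) {
		acc += (unsigned __int128)f1[i] + f2[i];
		out[i] = (u64)acc;
		acc >>= 64;
	}
	/* Steps 1-2: fold carry*38 back into the limbs */
	acc = (u64)acc * 38;
	for (i = 0; i < 4; ++i) {
		acc += out[i];
		out[i] = (u64)acc;
		acc >>= 64;
	}
	/* Step 3: fold the final carry bit; cannot carry again */
	out[0] += (u64)acc * 38;
}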
-// Computes the field substraction of two field elements
+/* Computes the field subtraction of two field elements */
static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
asm volatile(
- // Compute the raw substraction of f1-f2
+ /* Compute the raw subtraction of f1-f2 */
" movq 0(%1), %%r8;"
" subq 0(%2), %%r8;"
" movq 8(%1), %%r9;"
@@ -121,25 +119,25 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
" movq 24(%1), %%r11;"
" sbbq 24(%2), %%r11;"
- /////// Wrap the result back into the field //////
+ /* Wrap the result back into the field */
- // Step 1: Compute carry*38
+ /* Step 1: Compute carry*38 */
" mov $0, %%rax;"
" mov $38, %%rcx;"
" cmovc %%rcx, %%rax;"
- // Step 2: Substract carry*38 from the original difference
+ /* Step 2: Subtract carry*38 from the original difference */
" sub %%rax, %%r8;"
" sbb $0, %%r9;"
" sbb $0, %%r10;"
" sbb $0, %%r11;"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rcx, %%rax;"
" sub %%rax, %%r8;"
- // Store the result
+ /* Store the result */
" movq %%r8, 0(%0);"
" movq %%r9, 8(%0);"
" movq %%r10, 16(%0);"
@@ -150,54 +148,48 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
);
}
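
Subtraction mirrors the addition: a borrow out of the 256-bit difference is worth -2^256 ≡ -38 (mod p), so it is folded back by subtracting borrow*38. A portable sketch (an assumption for illustration, not part of this patch):

static void fsub_ref(u64 *out, const u64 *f1, const u64 *f2)
{
	u64 borrow = 0, c, t;
	int i;

	/* Raw 256-bit subtraction, tracking the borrow */
	for (i = 0; i < 4; ++i) {
		t = f1[i] - f2[i];
		c = (f1[i] < f2[i]) | (t < borrow);
		out[i] = t - borrow;
		borrow = c;
	}
	/* Steps 1-2: subtract borrow*38 from the difference */
	c = borrow * 38;
	for (i = 0; i < 4; ++i) {
		t = out[i] - c;
		c = out[i] < c;
		out[i] = t;
	}
	/* Step 3: fold the final borrow bit; cannot borrow again */
	out[0] -= c * 38;
}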
-// Computes a field multiplication: out <- f1 * f2
-// Uses the 8-element buffer tmp for intermediate results
+/* Computes a field multiplication: out <- f1 * f2
+ * Uses the 8-element buffer tmp for intermediate results */
static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
asm volatile(
- /////// Compute the raw multiplication: tmp <- src1 * src2 //////
+ /* Compute the raw multiplication: tmp <- src1 * src2 */
- // Compute src1[0] * src2
+ /* Compute src1[0] * src2 */
" movq 0(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
" mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
-
- // Compute src1[1] * src2
+ /* Compute src1[1] * src2 */
" movq 8(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);"
" mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
-
-
- // Compute src1[2] * src2
+ /* Compute src1[2] * src2 */
" movq 16(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);"
" mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
-
-
- // Compute src1[3] * src2
+ /* Compute src1[3] * src2 */
" movq 24(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);"
" mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 40(%0);" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
-
- // Line up pointers
+ /* Line up pointers */
" mov %0, %1;"
" mov %2, %0;"
- /////// Wrap the result back into the field //////
+ /* Wrap the result back into the field */
- // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
" xor %3, %3;"
@@ -215,7 +207,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
" adox %3, %%rax;"
" imul %%rdx, %%rax;"
- // Step 2: Fold the carry back into dst
+ /* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %3, %%r9;"
" movq %%r9, 8(%0);"
@@ -224,7 +216,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
" adcx %3, %%r11;"
" movq %%r11, 24(%0);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
@@ -235,42 +227,37 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
);
}
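
The multiply applies the same idea at 512 bits: a schoolbook 4x4 limb product lands in tmp, and the high 256 bits are folded in as tmp_hi * 38. A portable sketch (an assumption for illustration, not part of this patch):

static void fmul_ref(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	unsigned __int128 acc;
	u64 carry;
	int i, j;

	/* Schoolbook multiply: tmp <- f1 * f2 (512 bits) */
	memset(tmp, 0, 8 * sizeof(*tmp));
	for (i = 0; i < 4; ++i) {
		carry = 0;
		for (j = 0; j < 4; ++j) {
			acc = (unsigned __int128)f1[i] * f2[j]
			      + tmp[i + j] + carry;
			tmp[i + j] = (u64)acc;
			carry = (u64)(acc >> 64);
		}
		tmp[i + 4] = carry;
	}
	/* Step 1: out + carry == tmp_hi * 38 + tmp_lo */
	acc = 0;
	for (i = 0; i < 4; ++i) {
		acc += (unsigned __int128)tmp[i + 4] * 38 + tmp[i];
		out[i] = (u64)acc;
		acc >>= 64;
	}
	/* Steps 2-3: fold the carry word, then the final carry bit */
	carry = (u64)acc * 38;
	for (i = 0; i < 4; ++i) {
		acc = (unsigned __int128)out[i] + carry;
		out[i] = (u64)acc;
		carry = (u64)(acc >> 64);
	}
	out[0] += carry * 38;
}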
-// Computes two field multiplications:
-// out[0] <- f1[0] * f2[0]
-// out[1] <- f1[1] * f2[1]
-// Uses the 16-element buffer tmp for intermediate results:
+/* Computes two field multiplications:
+ * out[0] <- f1[0] * f2[0]
+ * out[1] <- f1[1] * f2[1]
+ * Uses the 16-element buffer tmp for intermediate results. */
static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
asm volatile(
- /////// Compute the raw multiplication tmp[0] <- f1[0] * f2[0] //////
+ /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */
- // Compute src1[0] * src2
+ /* Compute src1[0] * src2 */
" movq 0(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
" mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
-
- // Compute src1[1] * src2
+ /* Compute src1[1] * src2 */
" movq 8(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);"
" mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
-
-
- // Compute src1[2] * src2
+ /* Compute src1[2] * src2 */
" movq 16(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);"
" mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
-
-
- // Compute src1[3] * src2
+ /* Compute src1[3] * src2 */
" movq 24(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);"
@@ -278,49 +265,43 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
- /////// Compute the raw multiplication tmp[1] <- f1[1] * f2[1] //////
+ /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */
- // Compute src1[0] * src2
+ /* Compute src1[0] * src2 */
" movq 32(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);"
" mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
" mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
-
- // Compute src1[1] * src2
+ /* Compute src1[1] * src2 */
" movq 40(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
" mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 80(%0);"
" mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
-
-
- // Compute src1[2] * src2
+ /* Compute src1[2] * src2 */
" movq 48(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
" mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 88(%0);"
" mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
-
-
- // Compute src1[3] * src2
+ /* Compute src1[3] * src2 */
" movq 56(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
" mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 96(%0);"
" mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 104(%0);" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
-
- // Line up pointers
+ /* Line up pointers */
" mov %0, %1;"
" mov %2, %0;"
- /////// Wrap the results back into the field //////
+ /* Wrap the results back into the field */
- // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
" xor %3, %3;"
@@ -338,7 +319,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
" adox %3, %%rax;"
" imul %%rdx, %%rax;"
- // Step 2: Fold the carry back into dst
+ /* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %3, %%r9;"
" movq %%r9, 8(%0);"
@@ -347,13 +328,13 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
" adcx %3, %%r11;"
" movq %%r11, 24(%0);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
" movq %%r8, 0(%0);"
- // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 96(%1), %%r8, %%r13;"
" xor %3, %3;"
@@ -371,7 +352,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
" adox %3, %%rax;"
" imul %%rdx, %%rax;"
- // Step 2: Fold the carry back into dst
+ /* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %3, %%r9;"
" movq %%r9, 40(%0);"
@@ -380,7 +361,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
" adcx %3, %%r11;"
" movq %%r11, 56(%0);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
@@ -391,30 +372,30 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
);
}
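
fmul2 batches two independent multiplications and their reductions in a single asm block so the carry chains can be scheduled together; its contract is equivalent to two fmul calls (a sketch, assuming the fmul_ref helper above):

static void fmul2_ref(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
	fmul_ref(out, f1, f2, tmp);                 /* out[0] <- f1[0] * f2[0] */
	fmul_ref(out + 4, f1 + 4, f2 + 4, tmp + 8); /* out[1] <- f1[1] * f2[1] */
}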
-// Computes the field multiplication of four-element f1 with value in f2
+/* Computes the field multiplication of the four-limb f1 with the 64-bit value f2 */
static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
register u64 f2_r asm("rdx") = f2;
asm volatile(
- // Compute the raw multiplication of f1*f2
- " mulxq 0(%2), %%r8, %%rcx;" // f1[0]*f2
- " mulxq 8(%2), %%r9, %%r12;" // f1[1]*f2
+ /* Compute the raw multiplication of f1*f2 */
+ " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
+ " mulxq 8(%2), %%r9, %%r12;" /* f1[1]*f2 */
" add %%rcx, %%r9;"
" mov $0, %%rcx;"
- " mulxq 16(%2), %%r10, %%r13;" // f1[2]*f2
+ " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
" adcx %%r12, %%r10;"
- " mulxq 24(%2), %%r11, %%rax;" // f1[3]*f2
+ " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
" adcx %%r13, %%r11;"
" adcx %%rcx, %%rax;"
- /////// Wrap the result back into the field //////
+ /* Wrap the result back into the field */
- // Step 1: Compute carry*38
+ /* Step 1: Compute carry*38 */
" mov $38, %%rdx;"
" imul %%rdx, %%rax;"
- // Step 2: Fold the carry back into dst
+ /* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
" movq %%r9, 8(%1);"
@@ -423,7 +404,7 @@ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
" adcx %%rcx, %%r11;"
" movq %%r11, 24(%1);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
@@ -434,14 +415,14 @@ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
);
}
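
A portable sketch of fmul_scalar (an assumption for illustration, not part of this patch); the top word of the 320-bit product is folded in as carry * 38:

static void fmul_scalar_ref(u64 *out, const u64 *f1, u64 f2)
{
	unsigned __int128 acc = 0;
	u64 carry;
	int i;

	/* Raw multiply: 256-bit f1 times 64-bit f2 */
	for (i = 0; i < 4; ++i) {
		acc += (unsigned __int128)f1[i] * f2;
		out[i] = (u64)acc;
		acc >>= 64;
	}
	/* Fold the top word in as carry * 38; like the asm's truncating
	 * imul, this presumes f2 is small (the curve code uses 121665) */
	carry = (u64)acc * 38;
	for (i = 0; i < 4; ++i) {
		acc = (unsigned __int128)out[i] + carry;
		out[i] = (u64)acc;
		carry = (u64)(acc >> 64);
	}
	out[0] += carry * 38; /* cannot carry again */
}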
-// Computes p1 <- bit ? p2 : p1 in constant time
+/* Conditionally swaps p1 and p2 in constant time: p1 <- bit ? p2 : p1, p2 <- bit ? p1 : p2 */
static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
asm volatile(
- // Invert the polarity of bit to match cmov expectations
+ /* Invert the polarity of bit to match cmov expectations */
" add $18446744073709551615, %0;"
- // cswap p1[0], p2[0]
+ /* cswap p1[0], p2[0] */
" movq 0(%1), %%r8;"
" movq 0(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -450,7 +431,7 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" movq %%r8, 0(%1);"
" movq %%r9, 0(%2);"
- // cswap p1[1], p2[1]
+ /* cswap p1[1], p2[1] */
" movq 8(%1), %%r8;"
" movq 8(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -459,7 +440,7 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" movq %%r8, 8(%1);"
" movq %%r9, 8(%2);"
- // cswap p1[2], p2[2]
+ /* cswap p1[2], p2[2] */
" movq 16(%1), %%r8;"
" movq 16(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -468,7 +449,7 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" movq %%r8, 16(%1);"
" movq %%r9, 16(%2);"
- // cswap p1[3], p2[3]
+ /* cswap p1[3], p2[3] */
" movq 24(%1), %%r8;"
" movq 24(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -477,7 +458,7 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" movq %%r8, 24(%1);"
" movq %%r9, 24(%2);"
- // cswap p1[4], p2[4]
+ /* cswap p1[4], p2[4] */
" movq 32(%1), %%r8;"
" movq 32(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -486,7 +467,7 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" movq %%r8, 32(%1);"
" movq %%r9, 32(%2);"
- // cswap p1[5], p2[5]
+ /* cswap p1[5], p2[5] */
" movq 40(%1), %%r8;"
" movq 40(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -495,7 +476,7 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" movq %%r8, 40(%1);"
" movq %%r9, 40(%2);"
- // cswap p1[6], p2[6]
+ /* cswap p1[6], p2[6] */
" movq 48(%1), %%r8;"
" movq 48(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -504,7 +485,7 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" movq %%r8, 48(%1);"
" movq %%r9, 48(%2);"
- // cswap p1[7], p2[7]
+ /* cswap p1[7], p2[7] */
" movq 56(%1), %%r8;"
" movq 56(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -518,25 +499,25 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
);
}
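
The asm turns bit into the carry flag and selects with cmovc; an equivalent portable approach widens the bit into an all-ones/all-zeroes mask and XOR-swaps under it, so neither timing nor memory traffic depends on the secret bit (a sketch, an assumption for illustration):

static void cswap2_ref(u64 bit, u64 *p1, u64 *p2)
{
	u64 mask = 0U - bit; /* 0 if bit == 0, all-ones if bit == 1 */
	u64 d;
	int i;

	for (i = 0; i < 8; ++i) {
		d = (p1[i] ^ p2[i]) & mask; /* 0 or the limb difference */
		p1[i] ^= d;
		p2[i] ^= d;
	}
}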
-// Computes the square of a field element: out <- f * f
-// Uses the 8-element buffer tmp for intermediate results
+/* Computes the square of a field element: out <- f * f
+ * Uses the 8-element buffer tmp for intermediate results */
static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
{
asm volatile(
- /////// Compute the raw multiplication: tmp <- f * f //////
-
- // Step 1: Compute all partial products
- " movq 0(%1), %%rdx;" // f[0]
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" // f[1]*f[0]
- " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" // f[2]*f[0]
- " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" // f[3]*f[0]
- " movq 24(%1), %%rdx;" // f[3]
- " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" // f[1]*f[3]
- " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" // f[2]*f[3]
- " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" // f1
- " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" // f[2]*f[1]
-
- // Step 2: Compute two parallel carry chains
+ /* Compute the raw multiplication: tmp <- f * f */
+
+ /* Step 1: Compute all partial products */
+ " movq 0(%1), %%rdx;" /* f[0] */
+ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%1), %%rdx;" /* f[3] */
+ " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
+ " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
" xor %%r15, %%r15;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
@@ -551,27 +532,27 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
" adcx %%r13, %%r13;"
" adcx %%r14, %%r14;"
- // Step 3: Compute intermediate squares
- " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[0]^2
+ /* Step 3: Compute intermediate squares */
+ " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
" movq %%rax, 0(%0);"
" add %%rcx, %%r8;" " movq %%r8, 8(%0);"
- " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[1]^2
+ " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
" adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
" adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
- " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[2]^2
+ " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
" adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
" adcx %%rcx, %%r12;" " movq %%r12, 40(%0);"
- " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[3]^2
+ " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
" adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
" adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
- // Line up pointers
+ /* Line up pointers */
" mov %0, %1;"
" mov %2, %0;"
- /////// Wrap the result back into the field //////
+ /* Wrap the result back into the field */
- // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
" xor %%rcx, %%rcx;"
@@ -589,7 +570,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
- // Step 2: Fold the carry back into dst
+ /* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
" movq %%r9, 8(%0);"
@@ -598,7 +579,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
" adcx %%rcx, %%r11;"
" movq %%r11, 24(%0);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
@@ -609,25 +590,25 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
);
}
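
Squaring is cheaper than a general multiply because each cross product f[i]*f[j] with i < j appears twice in the square; the asm computes each once, doubles the whole chain, and adds the diagonal squares. A portable sketch of that raw-square structure (an assumption for illustration; the reduction that follows is the same hi * 38 fold as in fmul_ref above):

static void sqr_wide_ref(u64 *tmp, const u64 *f)
{
	unsigned __int128 acc;
	u64 carry;
	int i, j;

	/* Step 1: each cross product f[i]*f[j], i < j, exactly once */
	memset(tmp, 0, 8 * sizeof(*tmp));
	for (i = 0; i < 4; ++i) {
		carry = 0;
		for (j = i + 1; j < 4; ++j) {
			acc = (unsigned __int128)f[i] * f[j]
			      + tmp[i + j] + carry;
			tmp[i + j] = (u64)acc;
			carry = (u64)(acc >> 64);
		}
		tmp[i + 4] = carry;
	}
	/* Step 2: double the whole 512-bit chain */
	carry = 0;
	for (i = 0; i < 8; ++i) {
		acc = ((unsigned __int128)tmp[i] << 1) + carry;
		tmp[i] = (u64)acc;
		carry = (u64)(acc >> 64);
	}
	/* Step 3: add the diagonal squares f[i]^2 */
	carry = 0;
	for (i = 0; i < 4; ++i) {
		acc = (unsigned __int128)f[i] * f[i] + tmp[2 * i] + carry;
		tmp[2 * i] = (u64)acc;
		acc = (acc >> 64) + tmp[2 * i + 1];
		tmp[2 * i + 1] = (u64)acc;
		carry = (u64)(acc >> 64);
	}
}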
-// Computes two field squarings:
-// out[0] <- f[0] * f[0]
-// out[1] <- f[1] * f[1]
-// Uses the 16-element buffer tmp for intermediate results
+/* Computes two field squarings:
+ * out[0] <- f[0] * f[0]
+ * out[1] <- f[1] * f[1]
+ * Uses the 16-element buffer tmp for intermediate results */
static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
asm volatile(
- // Step 1: Compute all partial products
- " movq 0(%1), %%rdx;" // f[0]
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" // f[1]*f[0]
- " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" // f[2]*f[0]
- " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" // f[3]*f[0]
- " movq 24(%1), %%rdx;" // f[3]
- " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" // f[1]*f[3]
- " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" // f[2]*f[3]
- " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" // f1
- " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" // f[2]*f[1]
-
- // Step 2: Compute two parallel carry chains
+ /* Step 1: Compute all partial products */
+ " movq 0(%1), %%rdx;" /* f[0] */
+ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%1), %%rdx;" /* f[3] */
+ " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
+ " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
" xor %%r15, %%r15;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
@@ -642,32 +623,32 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adcx %%r13, %%r13;"
" adcx %%r14, %%r14;"
- // Step 3: Compute intermediate squares
- " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[0]^2
+ /* Step 3: Compute intermediate squares */
+ " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
" movq %%rax, 0(%0);"
" add %%rcx, %%r8;" " movq %%r8, 8(%0);"
- " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[1]^2
+ " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
" adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
" adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
- " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[2]^2
+ " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
" adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
" adcx %%rcx, %%r12;" " movq %%r12, 40(%0);"
- " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[3]^2
+ " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
" adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
" adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
- // Step 1: Compute all partial products
- " movq 32(%1), %%rdx;" // f[0]
- " mulxq 40(%1), %%r8, %%r14;" " xor %%r15, %%r15;" // f[1]*f[0]
- " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" // f[2]*f[0]
- " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" // f[3]*f[0]
- " movq 56(%1), %%rdx;" // f[3]
- " mulxq 40(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" // f[1]*f[3]
- " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" // f[2]*f[3]
- " movq 40(%1), %%rdx;" " adcx %%r15, %%r13;" // f1
- " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" // f[2]*f[1]
-
- // Step 2: Compute two parallel carry chains
+ /* Step 1: Compute all partial products */
+ " movq 32(%1), %%rdx;" /* f[0] */
+ " mulxq 40(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 56(%1), %%rdx;" /* f[3] */
+ " mulxq 40(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
+ " movq 40(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
" xor %%r15, %%r15;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
@@ -682,25 +663,25 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adcx %%r13, %%r13;"
" adcx %%r14, %%r14;"
- // Step 3: Compute intermediate squares
- " movq 32(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[0]^2
+ /* Step 3: Compute intermediate squares */
+ " movq 32(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
" movq %%rax, 64(%0);"
" add %%rcx, %%r8;" " movq %%r8, 72(%0);"
- " movq 40(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[1]^2
+ " movq 40(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
" adcx %%rax, %%r9;" " movq %%r9, 80(%0);"
" adcx %%rcx, %%r10;" " movq %%r10, 88(%0);"
- " movq 48(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[2]^2
+ " movq 48(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
" adcx %%rax, %%r11;" " movq %%r11, 96(%0);"
" adcx %%rcx, %%r12;" " movq %%r12, 104(%0);"
- " movq 56(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[3]^2
+ " movq 56(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
" adcx %%rax, %%r13;" " movq %%r13, 112(%0);"
" adcx %%rcx, %%r14;" " movq %%r14, 120(%0);"
- // Line up pointers
+ /* Line up pointers */
" mov %0, %1;"
" mov %2, %0;"
- // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
" xor %%rcx, %%rcx;"
@@ -718,7 +699,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
- // Step 2: Fold the carry back into dst
+ /* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
" movq %%r9, 8(%0);"
@@ -727,13 +708,13 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adcx %%rcx, %%r11;"
" movq %%r11, 24(%0);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
" movq %%r8, 0(%0);"
- // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 96(%1), %%r8, %%r13;"
" xor %%rcx, %%rcx;"
@@ -751,7 +732,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
- // Step 2: Fold the carry back into dst
+ /* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
" movq %%r9, 40(%0);"
@@ -760,7 +741,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adcx %%rcx, %%r11;"
" movq %%r11, 56(%0);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
@@ -908,7 +889,7 @@ static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
memcpy(out, p0, (u32)8U * sizeof(p0[0U]));
}
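
For context, the ladder walks the clamped scalar one bit at a time and uses cswap2 so point selection is constant-time; a sketch of that loop shape (an assumption: the add-and-double step itself is not shown in this hunk and is elided below):

static void ladder_select_sketch(u64 *nq, u64 *nq_p1, const u8 *key)
{
	u64 swap = 0, bit;
	int t;

	for (t = 254; t >= 0; --t) {
		bit = (key[t / 8] >> (t % 8)) & 1;
		swap ^= bit;             /* swap only when the bit changes */
		cswap2(swap, nq, nq_p1); /* constant-time point select */
		swap = bit;
		/* one differential add-and-double step goes here */
	}
	cswap2(swap, nq, nq_p1);         /* undo the final selection */
}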
-static void fsquare_times(u64 *o, u64 *inp, u64 *tmp, u32 n1)
+static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
{
u32 i;
fsqr(o, inp, tmp);
@@ -916,7 +897,7 @@ static void fsquare_times(u64 *o, u64 *inp, u64 *tmp, u32 n1)
fsqr(o, o, tmp);
}
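
fsquare_times computes o = inp^(2^n1) by repeated squaring; a sketch of the full routine consistent with the lines visible in this hunk (the middle is elided by the diff):

static void fsquare_times_ref(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
{
	u32 i;

	fsqr(o, inp, tmp);       /* first squaring: o = inp^2 */
	for (i = 1; i < n1; ++i)
		fsqr(o, o, tmp); /* square n1 - 1 more times */
}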
-static void finv(u64 *o, u64 *i, u64 *tmp)
+static void finv(u64 *o, const u64 *i, u64 *tmp)
{
u64 t1[16U] = { 0U };
u64 *a0 = t1;
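
finv computes i^(p-2) mod p with p = 2^255 - 19 (Fermat inversion); the body, mostly elided here, builds that exponentiation from fsquare_times/fmul chains. A naive square-and-multiply sketch of the same exponentiation (an illustration, not the routine's actual addition chain):

static void finv_naive(u64 *o, const u64 *i, u64 *tmp)
{
	u64 acc[4];
	int k, bit;

	/* p - 2 = 2^255 - 21: bits 254..5 are set, bits 4..0 are 01011 */
	memcpy(acc, i, sizeof(acc));
	for (k = 253; k >= 0; --k) {
		fsqr(acc, acc, tmp);
		bit = (k >= 5) ? 1 : (0x0b >> k) & 1;
		if (bit) /* the exponent is public, so branching is fine */
			fmul(acc, acc, i, tmp);
	}
	memcpy(o, acc, sizeof(acc));
}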
@@ -1006,10 +987,10 @@ static void store_felem(u64 *b, u64 *f)
b[3U] = o3;
}
-static void encode_point(u8 *o, u64 *i)
+static void encode_point(u8 *o, const u64 *i)
{
- u64 *x = i;
- u64 *z = i + (u32)4U;
+ const u64 *x = i;
+ const u64 *z = i + (u32)4U;
u64 tmp[4U] = { 0U };
u64 tmp_w[16U] = { 0U };
finv(tmp, z, tmp_w);
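
The hunk ends at the inversion; for context, a sketch of how encode_point plausibly finishes (an assumption: multiply by the inverse, fully reduce with store_felem, then serialize little-endian):

static void encode_point_ref(u8 *o, const u64 *i)
{
	const u64 *x = i;
	const u64 *z = i + 4;
	u64 tmp[4] = { 0 }, res[4];
	u64 tmp_w[16] = { 0 };

	finv(tmp, z, tmp_w);      /* tmp = 1/z */
	fmul(tmp, tmp, x, tmp_w); /* tmp = x * z^-1 */
	store_felem(res, tmp);    /* fully reduce mod 2^255 - 19 */
	memcpy(o, res, 32);       /* serialize (x86 is little-endian) */
}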