From 23bbc5804a414530f2e5ce62b09aa7d13631deec Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld"
Date: Tue, 1 Sep 2020 20:49:41 +0200
Subject: Sync ever64 with upstream kernel code

---
 curve25519-ever64.c | 322 +++++++++++++++++++++++++---------------------
 1 file changed, 153 insertions(+), 169 deletions(-)

diff --git a/curve25519-ever64.c b/curve25519-ever64.c
index 14bdd7d..0c75c0b 100644
--- a/curve25519-ever64.c
+++ b/curve25519-ever64.c
@@ -31,21 +31,21 @@ static __always_inline u64 gte_mask(u64 a, u64 b)
 	return x_xor_q_ - (u64)1U;
 }
 
-// Computes the addition of four-element f1 with value in f2
-// and returns the carry (if any)
+/* Computes the addition of four-element f1 with value in f2
+ * and returns the carry (if any) */
 static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
 {
 	u64 carry_r;
 
 	asm volatile(
-		// Clear registers to propagate the carry bit
+		/* Clear registers to propagate the carry bit */
 		" xor %%r8, %%r8;"
 		" xor %%r9, %%r9;"
 		" xor %%r10, %%r10;"
 		" xor %%r11, %%r11;"
 		" xor %1, %1;"
 
-		// Begin addition chain
+		/* Begin addition chain */
 		" addq 0(%3), %0;"
 		" movq %0, 0(%2);"
 		" adcxq 8(%3), %%r8;"
@@ -55,7 +55,7 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
 		" adcxq 24(%3), %%r10;"
 		" movq %%r10, 24(%2);"
 
-		// Return the carry bit in a register
+		/* Return the carry bit in a register */
 		" adcx %%r11, %1;"
 		: "+&r" (f2), "=&r" (carry_r)
 		: "r" (out), "r" (f1)
@@ -65,11 +65,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
 	return carry_r;
 }
 
-// Computes the field addition of two field elements
+/* Computes the field addition of two field elements */
 static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
 {
 	asm volatile(
-		// Compute the raw addition of f1 + f2
+		/* Compute the raw addition of f1 + f2 */
 		" movq 0(%0), %%r8;"
 		" addq 0(%2), %%r8;"
 		" movq 8(%0), %%r9;"
@@ -79,14 +79,14 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
 		" movq 24(%0), %%r11;"
 		" adcxq 24(%2), %%r11;"
 
-		/////// Wrap the result back into the field //////
+		/* Wrap the result back into the field */
 
-		// Step 1: Compute carry*38
+		/* Step 1: Compute carry*38 */
 		" mov $0, %%rax;"
 		" mov $38, %0;"
 		" cmovc %0, %%rax;"
 
-		// Step 2: Add carry*38 to the original sum
+		/* Step 2: Add carry*38 to the original sum */
 		" xor %%rcx, %%rcx;"
 		" add %%rax, %%r8;"
 		" adcx %%rcx, %%r9;"
@@ -96,7 +96,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
 		" adcx %%rcx, %%r11;"
 		" movq %%r11, 24(%1);"
 
-		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %0, %%rax;"
 		" add %%rax, %%r8;"
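Throughout this file, the wrap back into the field relies on the identity 2^256 == 38 (mod 2^255 - 19): a carry out of the fourth limb is folded back in by adding carry*38 at the bottom, and a possible second carry is folded once more, where it provably stops. As a reading aid, here is a portable C sketch of fadd's three steps. The helper name fadd_sketch and the reliance on GCC/Clang's unsigned __int128 are illustrative assumptions, not code from this patch; fsub below is the mirror image, subtracting borrow*38 instead.

#include <stdint.h>

typedef uint64_t u64;

/* Sketch only: wrap a 256-bit sum into the 2^255-19 representation,
 * mirroring fadd's three asm steps. Result is < 2^256, not canonical. */
static void fadd_sketch(u64 out[4], const u64 f1[4], const u64 f2[4])
{
	unsigned __int128 acc = 0;
	int i;

	/* Raw addition of f1 + f2, keeping the carry out of limb 3 */
	for (i = 0; i < 4; i++) {
		acc += (unsigned __int128)f1[i] + f2[i];
		out[i] = (u64)acc;
		acc >>= 64;
	}

	/* Steps 1-2: 2^256 == 38 (mod p), so fold carry*38 back in */
	acc = (unsigned __int128)out[0] + (u64)acc * 38;
	out[0] = (u64)acc;
	for (i = 1; i < 4; i++) {
		acc = (acc >> 64) + out[i];
		out[i] = (u64)acc;
	}

	/* Step 3: a second fold can no longer ripple past limb 0 */
	out[0] += (u64)(acc >> 64) * 38;
}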
@@ -107,11 +107,11 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
 	);
 }
 
-// Computes the field substraction of two field elements
+/* Computes the field substraction of two field elements */
 static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
 {
 	asm volatile(
-		// Compute the raw substraction of f1-f2
+		/* Compute the raw substraction of f1-f2 */
 		" movq 0(%1), %%r8;"
 		" subq 0(%2), %%r8;"
 		" movq 8(%1), %%r9;"
@@ -121,25 +121,25 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
 		" movq 24(%1), %%r11;"
 		" sbbq 24(%2), %%r11;"
 
-		/////// Wrap the result back into the field //////
+		/* Wrap the result back into the field */
 
-		// Step 1: Compute carry*38
+		/* Step 1: Compute carry*38 */
 		" mov $0, %%rax;"
 		" mov $38, %%rcx;"
 		" cmovc %%rcx, %%rax;"
 
-		// Step 2: Substract carry*38 from the original difference
+		/* Step 2: Substract carry*38 from the original difference */
 		" sub %%rax, %%r8;"
 		" sbb $0, %%r9;"
 		" sbb $0, %%r10;"
 		" sbb $0, %%r11;"
 
-		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rcx, %%rax;"
 		" sub %%rax, %%r8;"
 
-		// Store the result
+		/* Store the result */
 		" movq %%r8, 0(%0);"
 		" movq %%r9, 8(%0);"
 		" movq %%r10, 16(%0);"
@@ -150,54 +150,48 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
 	);
 }
 
-// Computes a field multiplication: out <- f1 * f2
-// Uses the 8-element buffer tmp for intermediate results
+/* Computes a field multiplication: out <- f1 * f2
+ * Uses the 8-element buffer tmp for intermediate results */
static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 {
 	asm volatile(
-		/////// Compute the raw multiplication: tmp <- src1 * src2 //////
+		/* Compute the raw multiplication: tmp <- src1 * src2 */
 
-		// Compute src1[0] * src2
+		/* Compute src1[0] * src2 */
 		" movq 0(%1), %%rdx;"
 		" mulxq 0(%3), %%r8, %%r9;"  " xor %%r10, %%r10;"  " movq %%r8, 0(%0);"
 		" mulxq 8(%3), %%r10, %%r11;"  " adox %%r9, %%r10;"  " movq %%r10, 8(%0);"
 		" mulxq 16(%3), %%rbx, %%r13;"  " adox %%r11, %%rbx;"
 		" mulxq 24(%3), %%r14, %%rdx;"  " adox %%r13, %%r14;"  " mov $0, %%rax;"
 		" adox %%rdx, %%rax;"
 
-
-		// Compute src1[1] * src2
+		/* Compute src1[1] * src2 */
 		" movq 8(%1), %%rdx;"
 		" mulxq 0(%3), %%r8, %%r9;"  " xor %%r10, %%r10;"  " adcxq 8(%0), %%r8;"  " movq %%r8, 8(%0);"
 		" mulxq 8(%3), %%r10, %%r11;"  " adox %%r9, %%r10;"  " adcx %%rbx, %%r10;"  " movq %%r10, 16(%0);"
 		" mulxq 16(%3), %%rbx, %%r13;"  " adox %%r11, %%rbx;"  " adcx %%r14, %%rbx;"  " mov $0, %%r8;"
 		" mulxq 24(%3), %%r14, %%rdx;"  " adox %%r13, %%r14;"  " adcx %%rax, %%r14;"  " mov $0, %%rax;"
 		" adox %%rdx, %%rax;"  " adcx %%r8, %%rax;"
 
-
-
-		// Compute src1[2] * src2
+		/* Compute src1[2] * src2 */
 		" movq 16(%1), %%rdx;"
 		" mulxq 0(%3), %%r8, %%r9;"  " xor %%r10, %%r10;"  " adcxq 16(%0), %%r8;"  " movq %%r8, 16(%0);"
 		" mulxq 8(%3), %%r10, %%r11;"  " adox %%r9, %%r10;"  " adcx %%rbx, %%r10;"  " movq %%r10, 24(%0);"
 		" mulxq 16(%3), %%rbx, %%r13;"  " adox %%r11, %%rbx;"  " adcx %%r14, %%rbx;"  " mov $0, %%r8;"
 		" mulxq 24(%3), %%r14, %%rdx;"  " adox %%r13, %%r14;"  " adcx %%rax, %%r14;"  " mov $0, %%rax;"
 		" adox %%rdx, %%rax;"  " adcx %%r8, %%rax;"
 
-
-
-		// Compute src1[3] * src2
+		/* Compute src1[3] * src2 */
 		" movq 24(%1), %%rdx;"
 		" mulxq 0(%3), %%r8, %%r9;"  " xor %%r10, %%r10;"  " adcxq 24(%0), %%r8;"  " movq %%r8, 24(%0);"
 		" mulxq 8(%3), %%r10, %%r11;"  " adox %%r9, %%r10;"  " adcx %%rbx, %%r10;"  " movq %%r10, 32(%0);"
 		" mulxq 16(%3), %%rbx, %%r13;"  " adox %%r11, %%rbx;"  " adcx %%r14, %%rbx;"  " movq %%rbx, 40(%0);"
 		" mov $0, %%r8;"
 		" mulxq 24(%3), %%r14, %%rdx;"  " adox %%r13, %%r14;"  " adcx %%rax, %%r14;"  " movq %%r14, 48(%0);"
 		" mov $0, %%rax;"  " adox %%rdx, %%rax;"  " adcx %%r8, %%rax;"
 		" movq %%rax, 56(%0);"
 
-
-		// Line up pointers
+		/* Line up pointers */
 		" mov %0, %1;"
 		" mov %2, %0;"
 
-		/////// Wrap the result back into the field //////
+		/* Wrap the result back into the field */
 
-		// Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		" mov $38, %%rdx;"
 		" mulxq 32(%1), %%r8, %%r13;"
 		" xor %3, %3;"
@@ -215,7 +209,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 		" adox %3, %%rax;"
 		" imul %%rdx, %%rax;"
 
-		// Step 2: Fold the carry back into dst
+		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
 		" adcx %3, %%r9;"
 		" movq %%r9, 8(%0);"
@@ -224,7 +218,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 		" adcx %3, %%r11;"
 		" movq %%r11, 24(%0);"
 
-		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rdx, %%rax;"
 		" add %%rax, %%r8;"
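fmul's wrap (Steps 1-3 above) applies the same identity to the 8-limb schoolbook product in tmp: the high half is worth 2^256, so dst = tmp_lo + 38 * tmp_hi, followed by two small folds of the leftover carry. A portable sketch of just that fold; fold_mul_sketch is a hypothetical name and unsigned __int128 a compiler assumption, not code from this patch:

#include <stdint.h>

typedef uint64_t u64;

/* Sketch only: reduce the 8-limb product t to 4 limbs using
 * 2^256 == 38 (mod 2^255 - 19). */
static void fold_mul_sketch(u64 out[4], const u64 t[8])
{
	unsigned __int128 acc = 0;
	int i;

	/* Step 1: out + carry == t_lo + t_hi * 38 */
	for (i = 0; i < 4; i++) {
		acc += (unsigned __int128)t[i] + (unsigned __int128)t[i + 4] * 38;
		out[i] = (u64)acc;
		acc >>= 64;
	}

	/* Step 2: the leftover carry is tiny (< 39), so carry*38 fits in
	 * one limb; add it at the bottom and ripple upward */
	acc = (unsigned __int128)out[0] + (u64)acc * 38;
	out[0] = (u64)acc;
	for (i = 1; i < 4; i++) {
		acc = (acc >> 64) + out[i];
		out[i] = (u64)acc;
	}

	/* Step 3: final one-bit fold; cannot carry again */
	out[0] += (u64)(acc >> 64) * 38;
}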
" movq 40(%1), %%rdx;" " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);" " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);" " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" - - - // Compute src1[2] * src2 + /* Compute src1[2] * src2 */ " movq 48(%1), %%rdx;" " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);" " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);" " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" - - - // Compute src1[3] * src2 + /* Compute src1[3] * src2 */ " movq 56(%1), %%rdx;" " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);" " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);" " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;" " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);" - - // Line up pointers + /* Line up pointers */ " mov %0, %1;" " mov %2, %0;" - /////// Wrap the results back into the field ////// + /* Wrap the results back into the field */ - // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%1), %%r8, %%r13;" " xor %3, %3;" @@ -338,7 +321,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) " adox %3, %%rax;" " imul %%rdx, %%rax;" - // Step 2: Fold the carry back into dst + /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %3, %%r9;" " movq %%r9, 8(%0);" @@ -347,13 +330,13 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) " adcx %3, %%r11;" " movq %%r11, 24(%0);" - // Step 3: Fold the carry bit back in; guaranteed not to carry at this point + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%0);" - // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 96(%1), %%r8, %%r13;" " xor %3, %3;" @@ -371,7 +354,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) " adox %3, %%rax;" " imul %%rdx, %%rax;" - // Step 2: Fold the carry back into dst + /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %3, %%r9;" " movq %%r9, 40(%0);" @@ -380,7 +363,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) " adcx %3, %%r11;" " movq %%r11, 56(%0);" - // Step 3: Fold the carry bit back in; guaranteed not to carry at this point + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" @@ -391,30 +374,30 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) ); } -// Computes the field multiplication of four-element f1 with value in f2 +/* 
@@ -391,30 +374,30 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 	);
 }
 
-// Computes the field multiplication of four-element f1 with value in f2
+/* Computes the field multiplication of four-element f1 with value in f2 */
 static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
 {
 	register u64 f2_r asm("rdx") = f2;
 
 	asm volatile(
-		// Compute the raw multiplication of f1*f2
-		" mulxq 0(%2), %%r8, %%rcx;"  // f1[0]*f2
-		" mulxq 8(%2), %%r9, %%rbx;"  // f1[1]*f2
+		/* Compute the raw multiplication of f1*f2 */
+		" mulxq 0(%2), %%r8, %%rcx;"  /* f1[0]*f2 */
+		" mulxq 8(%2), %%r9, %%rbx;"  /* f1[1]*f2 */
 		" add %%rcx, %%r9;"
 		" mov $0, %%rcx;"
-		" mulxq 16(%2), %%r10, %%r13;"  // f1[2]*f2
+		" mulxq 16(%2), %%r10, %%r13;"  /* f1[2]*f2 */
 		" adcx %%rbx, %%r10;"
-		" mulxq 24(%2), %%r11, %%rax;"  // f1[3]*f2
+		" mulxq 24(%2), %%r11, %%rax;"  /* f1[3]*f2 */
 		" adcx %%r13, %%r11;"
 		" adcx %%rcx, %%rax;"
 
-		/////// Wrap the result back into the field //////
+		/* Wrap the result back into the field */
 
-		// Step 1: Compute carry*38
+		/* Step 1: Compute carry*38 */
 		" mov $38, %%rdx;"
 		" imul %%rdx, %%rax;"
 
-		// Step 2: Fold the carry back into dst
+		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
 		" adcx %%rcx, %%r9;"
 		" movq %%r9, 8(%1);"
@@ -423,7 +406,7 @@ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
 		" adcx %%rcx, %%r11;"
 		" movq %%r11, 24(%1);"
 
-		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rdx, %%rax;"
 		" add %%rax, %%r8;"
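Unlike fadd and fsub, fmul_scalar folds its overflow limb with a plain imul rather than a conditional: that is only safe when carry*38 cannot itself overflow 64 bits, i.e. when f2 is small (the Montgomery ladder in this file only calls it with f2 = 121665). A portable sketch under that assumption; fmul_scalar_sketch is a hypothetical name, not code from this patch:

#include <stdint.h>

typedef uint64_t u64;

/* Sketch only: multiply a 4-limb element by a small scalar and fold the
 * overflow limb with an unconditional multiply by 38. The carry is
 * strictly less than f2, so carry*38 fits in one limb. */
static void fmul_scalar_sketch(u64 out[4], const u64 f1[4], u64 f2)
{
	unsigned __int128 acc = 0;
	int i;

	for (i = 0; i < 4; i++) {
		acc += (unsigned __int128)f1[i] * f2;
		out[i] = (u64)acc;
		acc >>= 64;
	}

	/* Steps 1-2: carry*38, added back at the bottom */
	acc = (unsigned __int128)out[0] + (u64)acc * 38;
	out[0] = (u64)acc;
	for (i = 1; i < 4; i++) {
		acc = (acc >> 64) + out[i];
		out[i] = (u64)acc;
	}

	/* Step 3: final fold of the last carry bit */
	out[0] += (u64)(acc >> 64) * 38;
}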
48(%1);" " movq %%r9, 48(%2);" - // cswap p1[7], p2[7] + /* cswap p1[7], p2[7] */ " movq 56(%1), %%r8;" " movq 56(%2), %%r9;" " mov %%r8, %%r10;" @@ -518,25 +501,25 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2) ); } -// Computes the square of a field element: out <- f * f -// Uses the 8-element buffer tmp for intermediate results +/* Computes the square of a field element: out <- f * f + * Uses the 8-element buffer tmp for intermediate results */ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) { asm volatile( - /////// Compute the raw multiplication: tmp <- f * f ////// - - // Step 1: Compute all partial products - " movq 0(%1), %%rdx;" // f[0] - " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" // f[1]*f[0] - " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" // f[2]*f[0] - " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" // f[3]*f[0] - " movq 24(%1), %%rdx;" // f[3] - " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" // f[1]*f[3] - " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" // f[2]*f[3] - " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" // f1 - " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" // f[2]*f[1] - - // Step 2: Compute two parallel carry chains + /* Compute the raw multiplication: tmp <- f * f */ + + /* Step 1: Compute all partial products */ + " movq 0(%1), %%rdx;" /* f[0] */ + " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */ + " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */ + " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */ + " movq 24(%1), %%rdx;" /* f[3] */ + " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */ + " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */ + " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */ + " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */ + + /* Step 2: Compute two parallel carry chains */ " xor %%r15, %%r15;" " adox %%rax, %%r10;" " adcx %%r8, %%r8;" @@ -551,27 +534,27 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) " adcx %%r13, %%r13;" " adcx %%r14, %%r14;" - // Step 3: Compute intermediate squares - " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[0]^2 + /* Step 3: Compute intermediate squares */ + " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ " movq %%rax, 0(%0);" " add %%rcx, %%r8;" " movq %%r8, 8(%0);" - " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[1]^2 + " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ " adcx %%rax, %%r9;" " movq %%r9, 16(%0);" " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);" - " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[2]^2 + " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ " adcx %%rax, %%r11;" " movq %%r11, 32(%0);" " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);" - " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[3]^2 + " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ " adcx %%rax, %%r13;" " movq %%r13, 48(%0);" " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);" - // Line up pointers + /* Line up pointers */ " mov %0, %1;" " mov %2, %0;" - /////// Wrap the result back into the field ////// + /* Wrap the result back into the field */ - // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%1), %%r8, %%r13;" " xor %%rcx, %%rcx;" @@ -589,7 +572,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) " adox %%rcx, %%rax;" " imul %%rdx, %%rax;" - // Step 2: Fold the carry back 
@@ -518,25 +501,25 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
 	);
 }
 
-// Computes the square of a field element: out <- f * f
-// Uses the 8-element buffer tmp for intermediate results
+/* Computes the square of a field element: out <- f * f
+ * Uses the 8-element buffer tmp for intermediate results */
 static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 {
 	asm volatile(
-		/////// Compute the raw multiplication: tmp <- f * f //////
-
-		// Step 1: Compute all partial products
-		" movq 0(%1), %%rdx;"  // f[0]
-		" mulxq 8(%1), %%r8, %%r14;"  " xor %%r15, %%r15;"  // f[1]*f[0]
-		" mulxq 16(%1), %%r9, %%r10;"  " adcx %%r14, %%r9;"  // f[2]*f[0]
-		" mulxq 24(%1), %%rax, %%rcx;"  " adcx %%rax, %%r10;"  // f[3]*f[0]
-		" movq 24(%1), %%rdx;"  // f[3]
-		" mulxq 8(%1), %%r11, %%rbx;"  " adcx %%rcx, %%r11;"  // f[1]*f[3]
-		" mulxq 16(%1), %%rax, %%r13;"  " adcx %%rax, %%rbx;"  // f[2]*f[3]
-		" movq 8(%1), %%rdx;"  " adcx %%r15, %%r13;"  // f1
-		" mulxq 16(%1), %%rax, %%rcx;"  " mov $0, %%r14;"  // f[2]*f[1]
-
-		// Step 2: Compute two parallel carry chains
+		/* Compute the raw multiplication: tmp <- f * f */
+
+		/* Step 1: Compute all partial products */
+		" movq 0(%1), %%rdx;"  /* f[0] */
+		" mulxq 8(%1), %%r8, %%r14;"  " xor %%r15, %%r15;"  /* f[1]*f[0] */
+		" mulxq 16(%1), %%r9, %%r10;"  " adcx %%r14, %%r9;"  /* f[2]*f[0] */
+		" mulxq 24(%1), %%rax, %%rcx;"  " adcx %%rax, %%r10;"  /* f[3]*f[0] */
+		" movq 24(%1), %%rdx;"  /* f[3] */
+		" mulxq 8(%1), %%r11, %%rbx;"  " adcx %%rcx, %%r11;"  /* f[1]*f[3] */
+		" mulxq 16(%1), %%rax, %%r13;"  " adcx %%rax, %%rbx;"  /* f[2]*f[3] */
+		" movq 8(%1), %%rdx;"  " adcx %%r15, %%r13;"  /* f1 */
+		" mulxq 16(%1), %%rax, %%rcx;"  " mov $0, %%r14;"  /* f[2]*f[1] */
+
+		/* Step 2: Compute two parallel carry chains */
 		" xor %%r15, %%r15;"
 		" adox %%rax, %%r10;"
 		" adcx %%r8, %%r8;"
@@ -551,27 +534,27 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 		" adcx %%r13, %%r13;"
 		" adcx %%r14, %%r14;"
 
-		// Step 3: Compute intermediate squares
-		" movq 0(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[0]^2
+		/* Step 3: Compute intermediate squares */
+		" movq 0(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[0]^2 */
 		" movq %%rax, 0(%0);"
 		" add %%rcx, %%r8;"
 		" movq %%r8, 8(%0);"
-		" movq 8(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[1]^2
+		" movq 8(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[1]^2 */
 		" adcx %%rax, %%r9;"
 		" movq %%r9, 16(%0);"
 		" adcx %%rcx, %%r10;"
 		" movq %%r10, 24(%0);"
-		" movq 16(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[2]^2
+		" movq 16(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[2]^2 */
 		" adcx %%rax, %%r11;"
 		" movq %%r11, 32(%0);"
 		" adcx %%rcx, %%rbx;"
 		" movq %%rbx, 40(%0);"
-		" movq 24(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[3]^2
+		" movq 24(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[3]^2 */
 		" adcx %%rax, %%r13;"
 		" movq %%r13, 48(%0);"
 		" adcx %%rcx, %%r14;"
 		" movq %%r14, 56(%0);"
 
-		// Line up pointers
+		/* Line up pointers */
 		" mov %0, %1;"
 		" mov %2, %0;"
 
-		/////// Wrap the result back into the field //////
+		/* Wrap the result back into the field */
 
-		// Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		" mov $38, %%rdx;"
 		" mulxq 32(%1), %%r8, %%r13;"
 		" xor %%rcx, %%rcx;"
@@ -589,7 +572,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 		" adox %%rcx, %%rax;"
 		" imul %%rdx, %%rax;"
 
-		// Step 2: Fold the carry back into dst
+		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
 		" adcx %%rcx, %%r9;"
 		" movq %%r9, 8(%0);"
@@ -598,7 +581,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 		" adcx %%rcx, %%r11;"
 		" movq %%r11, 24(%0);"
 
-		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rdx, %%rax;"
 		" add %%rax, %%r8;"
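fsqr exploits the symmetry of squaring: only the six distinct cross products f[i]*f[j] with i < j are computed (Step 1), doubled (Step 2), and the four diagonal squares added in (Step 3), roughly halving the multiplies of a full fmul; the 8-limb result is then wrapped exactly like fmul's. The asm runs two carry chains in parallel via adcx/adox; this portable sketch (hypothetical helper name, sequential carries, __int128 as a compiler assumption) shows the same three-step structure:

#include <stdint.h>

typedef uint64_t u64;

/* Sketch only: the 8-limb square of f, before the field wrap. */
static void fsqr_sketch(u64 t[8], const u64 f[4])
{
	unsigned __int128 acc;
	u64 c;
	int i, j, k;

	for (k = 0; k < 8; k++)
		t[k] = 0;

	/* Step 1: off-diagonal partial products f[i]*f[j], i < j */
	for (i = 0; i < 4; i++) {
		c = 0;
		for (j = i + 1; j < 4; j++) {
			acc = (unsigned __int128)f[i] * f[j] + t[i + j] + c;
			t[i + j] = (u64)acc;
			c = (u64)(acc >> 64);
		}
		t[i + 4] += c;
	}

	/* Step 2: double them; each product occurs twice in f*f */
	c = 0;
	for (k = 0; k < 8; k++) {
		acc = ((unsigned __int128)t[k] << 1) + c;
		t[k] = (u64)acc;
		c = (u64)(acc >> 64);
	}

	/* Step 3: add the diagonal squares f[i]^2 at limb 2*i */
	c = 0;
	for (i = 0; i < 4; i++) {
		acc = (unsigned __int128)f[i] * f[i] + t[2 * i] + c;
		t[2 * i] = (u64)acc;
		acc = (acc >> 64) + t[2 * i + 1];
		t[2 * i + 1] = (u64)acc;
		c = (u64)(acc >> 64);
	}
}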
@@ -609,25 +592,25 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 	);
 }
 
-// Computes two field squarings:
-// out[0] <- f[0] * f[0]
-// out[1] <- f[1] * f[1]
-// Uses the 16-element buffer tmp for intermediate results
+/* Computes two field squarings:
+ * out[0] <- f[0] * f[0]
+ * out[1] <- f[1] * f[1]
+ * Uses the 16-element buffer tmp for intermediate results */
 static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 {
 	asm volatile(
-		// Step 1: Compute all partial products
-		" movq 0(%1), %%rdx;"  // f[0]
-		" mulxq 8(%1), %%r8, %%r14;"  " xor %%r15, %%r15;"  // f[1]*f[0]
-		" mulxq 16(%1), %%r9, %%r10;"  " adcx %%r14, %%r9;"  // f[2]*f[0]
-		" mulxq 24(%1), %%rax, %%rcx;"  " adcx %%rax, %%r10;"  // f[3]*f[0]
-		" movq 24(%1), %%rdx;"  // f[3]
-		" mulxq 8(%1), %%r11, %%rbx;"  " adcx %%rcx, %%r11;"  // f[1]*f[3]
-		" mulxq 16(%1), %%rax, %%r13;"  " adcx %%rax, %%rbx;"  // f[2]*f[3]
-		" movq 8(%1), %%rdx;"  " adcx %%r15, %%r13;"  // f1
-		" mulxq 16(%1), %%rax, %%rcx;"  " mov $0, %%r14;"  // f[2]*f[1]
-
-		// Step 2: Compute two parallel carry chains
+		/* Step 1: Compute all partial products */
+		" movq 0(%1), %%rdx;"  /* f[0] */
+		" mulxq 8(%1), %%r8, %%r14;"  " xor %%r15, %%r15;"  /* f[1]*f[0] */
+		" mulxq 16(%1), %%r9, %%r10;"  " adcx %%r14, %%r9;"  /* f[2]*f[0] */
+		" mulxq 24(%1), %%rax, %%rcx;"  " adcx %%rax, %%r10;"  /* f[3]*f[0] */
+		" movq 24(%1), %%rdx;"  /* f[3] */
+		" mulxq 8(%1), %%r11, %%rbx;"  " adcx %%rcx, %%r11;"  /* f[1]*f[3] */
+		" mulxq 16(%1), %%rax, %%r13;"  " adcx %%rax, %%rbx;"  /* f[2]*f[3] */
+		" movq 8(%1), %%rdx;"  " adcx %%r15, %%r13;"  /* f1 */
+		" mulxq 16(%1), %%rax, %%rcx;"  " mov $0, %%r14;"  /* f[2]*f[1] */
+
+		/* Step 2: Compute two parallel carry chains */
 		" xor %%r15, %%r15;"
 		" adox %%rax, %%r10;"
 		" adcx %%r8, %%r8;"
@@ -642,32 +625,32 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		" adcx %%r13, %%r13;"
 		" adcx %%r14, %%r14;"
 
-		// Step 3: Compute intermediate squares
-		" movq 0(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[0]^2
+		/* Step 3: Compute intermediate squares */
+		" movq 0(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[0]^2 */
 		" movq %%rax, 0(%0);"
 		" add %%rcx, %%r8;"
 		" movq %%r8, 8(%0);"
-		" movq 8(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[1]^2
+		" movq 8(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[1]^2 */
 		" adcx %%rax, %%r9;"
 		" movq %%r9, 16(%0);"
 		" adcx %%rcx, %%r10;"
 		" movq %%r10, 24(%0);"
-		" movq 16(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[2]^2
+		" movq 16(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[2]^2 */
 		" adcx %%rax, %%r11;"
 		" movq %%r11, 32(%0);"
 		" adcx %%rcx, %%rbx;"
 		" movq %%rbx, 40(%0);"
-		" movq 24(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[3]^2
+		" movq 24(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[3]^2 */
 		" adcx %%rax, %%r13;"
 		" movq %%r13, 48(%0);"
 		" adcx %%rcx, %%r14;"
 		" movq %%r14, 56(%0);"
 
-		// Step 1: Compute all partial products
-		" movq 32(%1), %%rdx;"  // f[0]
-		" mulxq 40(%1), %%r8, %%r14;"  " xor %%r15, %%r15;"  // f[1]*f[0]
-		" mulxq 48(%1), %%r9, %%r10;"  " adcx %%r14, %%r9;"  // f[2]*f[0]
-		" mulxq 56(%1), %%rax, %%rcx;"  " adcx %%rax, %%r10;"  // f[3]*f[0]
-		" movq 56(%1), %%rdx;"  // f[3]
-		" mulxq 40(%1), %%r11, %%rbx;"  " adcx %%rcx, %%r11;"  // f[1]*f[3]
-		" mulxq 48(%1), %%rax, %%r13;"  " adcx %%rax, %%rbx;"  // f[2]*f[3]
-		" movq 40(%1), %%rdx;"  " adcx %%r15, %%r13;"  // f1
-		" mulxq 48(%1), %%rax, %%rcx;"  " mov $0, %%r14;"  // f[2]*f[1]
-
-		// Step 2: Compute two parallel carry chains
+		/* Step 1: Compute all partial products */
+		" movq 32(%1), %%rdx;"  /* f[0] */
+		" mulxq 40(%1), %%r8, %%r14;"  " xor %%r15, %%r15;"  /* f[1]*f[0] */
+		" mulxq 48(%1), %%r9, %%r10;"  " adcx %%r14, %%r9;"  /* f[2]*f[0] */
+		" mulxq 56(%1), %%rax, %%rcx;"  " adcx %%rax, %%r10;"  /* f[3]*f[0] */
+		" movq 56(%1), %%rdx;"  /* f[3] */
+		" mulxq 40(%1), %%r11, %%rbx;"  " adcx %%rcx, %%r11;"  /* f[1]*f[3] */
+		" mulxq 48(%1), %%rax, %%r13;"  " adcx %%rax, %%rbx;"  /* f[2]*f[3] */
+		" movq 40(%1), %%rdx;"  " adcx %%r15, %%r13;"  /* f1 */
+		" mulxq 48(%1), %%rax, %%rcx;"  " mov $0, %%r14;"  /* f[2]*f[1] */
+
+		/* Step 2: Compute two parallel carry chains */
 		" xor %%r15, %%r15;"
 		" adox %%rax, %%r10;"
 		" adcx %%r8, %%r8;"
@@ -682,25 +665,25 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		" adcx %%r13, %%r13;"
 		" adcx %%r14, %%r14;"
 
-		// Step 3: Compute intermediate squares
-		" movq 32(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[0]^2
+		/* Step 3: Compute intermediate squares */
+		" movq 32(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[0]^2 */
 		" movq %%rax, 64(%0);"
 		" add %%rcx, %%r8;"
 		" movq %%r8, 72(%0);"
-		" movq 40(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[1]^2
+		" movq 40(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[1]^2 */
 		" adcx %%rax, %%r9;"
 		" movq %%r9, 80(%0);"
 		" adcx %%rcx, %%r10;"
 		" movq %%r10, 88(%0);"
-		" movq 48(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[2]^2
+		" movq 48(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[2]^2 */
 		" adcx %%rax, %%r11;"
 		" movq %%r11, 96(%0);"
 		" adcx %%rcx, %%rbx;"
 		" movq %%rbx, 104(%0);"
-		" movq 56(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[3]^2
+		" movq 56(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[3]^2 */
 		" adcx %%rax, %%r13;"
 		" movq %%r13, 112(%0);"
 		" adcx %%rcx, %%r14;"
 		" movq %%r14, 120(%0);"
 
-		// Line up pointers
+		/* Line up pointers */
 		" mov %0, %1;"
 		" mov %2, %0;"
 
-		// Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		" mov $38, %%rdx;"
 		" mulxq 32(%1), %%r8, %%r13;"
 		" xor %%rcx, %%rcx;"
@@ -718,7 +701,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		" adox %%rcx, %%rax;"
 		" imul %%rdx, %%rax;"
 
-		// Step 2: Fold the carry back into dst
+		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
 		" adcx %%rcx, %%r9;"
 		" movq %%r9, 8(%0);"
@@ -727,13 +710,13 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		" adcx %%rcx, %%r11;"
 		" movq %%r11, 24(%0);"
 
-		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rdx, %%rax;"
 		" add %%rax, %%r8;"
 		" movq %%r8, 0(%0);"
 
-		// Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		" mov $38, %%rdx;"
 		" mulxq 96(%1), %%r8, %%r13;"
 		" xor %%rcx, %%rcx;"
@@ -751,7 +734,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		" adox %%rcx, %%rax;"
 		" imul %%rdx, %%rax;"
 
-		// Step 2: Fold the carry back into dst
+		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
 		" adcx %%rcx, %%r9;"
 		" movq %%r9, 40(%0);"
@@ -760,7 +743,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 	" adcx %%rcx, %%r11;"
 	" movq %%r11, 56(%0);"
 
-	// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+	/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 	" mov $0, %%rax;"
 	" cmovc %%rdx, %%rax;"
 	" add %%rax, %%r8;"
@@ -906,9 +889,12 @@ static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
 	point_double(nq1, tmp1, tmp2);
 	point_double(nq1, tmp1, tmp2);
 	memcpy(out, p0, (u32)8U * sizeof(p0[0U]));
+
+	memzero_explicit(tmp2, sizeof(tmp2));
+	memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
 }
 
-static void fsquare_times(u64 *o, u64 *inp, u64 *tmp, u32 n1)
+static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
 {
 	u32 i;
 	fsqr(o, inp, tmp);
@@ -916,7 +902,7 @@ static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
 		fsqr(o, o, tmp);
 }
 
-static void finv(u64 *o, u64 *i, u64 *tmp)
+static void finv(u64 *o, const u64 *i, u64 *tmp)
 {
 	u64 t1[16U] = { 0U };
 	u64 *a0 = t1;
@@ -956,10 +942,8 @@ static void store_felem(u64 *b, u64 *f)
 {
 	u64 f30 = f[3U];
 	u64 top_bit0 = f30 >> (u32)63U;
-	u64 carry0;
 	u64 f31;
 	u64 top_bit;
-	u64 carry;
 	u64 f0;
 	u64 f1;
 	u64 f2;
@@ -978,11 +962,11 @@ static void store_felem(u64 *b, u64 *f)
 	u64 o2;
 	u64 o3;
 	f[3U] = f30 & (u64)0x7fffffffffffffffU;
-	carry0 = add_scalar(f, f, (u64)19U * top_bit0);
+	add_scalar(f, f, (u64)19U * top_bit0);
 	f31 = f[3U];
 	top_bit = f31 >> (u32)63U;
 	f[3U] = f31 & (u64)0x7fffffffffffffffU;
-	carry = add_scalar(f, f, (u64)19U * top_bit);
+	add_scalar(f, f, (u64)19U * top_bit);
 	f0 = f[0U];
 	f1 = f[1U];
 	f2 = f[2U];
@@ -1006,10 +990,10 @@ static void store_felem(u64 *b, u64 *f)
 	b[3U] = o3;
 }
 
-static void encode_point(u8 *o, u64 *i)
+static void encode_point(u8 *o, const u64 *i)
 {
-	u64 *x = i;
-	u64 *z = i + (u32)4U;
+	const u64 *x = i;
+	const u64 *z = i + (u32)4U;
 	u64 tmp[4U] = { 0U };
 	u64 tmp_w[16U] = { 0U };
 	finv(tmp, z, tmp_w);
--
cgit v1.2.3-59-g8ed1b
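Two notes on the final hunks. The new memzero_explicit() calls wipe the ladder's temporaries, which hold secret intermediate state, before the stack frame is reused; dropping the unused carry0/carry variables is safe because store_felem's two top-bit folds provably cannot carry out. store_felem is also the only place a fully canonical value below p = 2^255 - 19 is produced: it folds bit 255 twice via 2^255 == 19 (mod p), then conditionally subtracts p in constant time. A portable sketch of that logic follows; store_felem_sketch is a hypothetical name, and the real code compares against p with gte_mask-style masks rather than the borrow chain used here:

#include <stdint.h>

typedef uint64_t u64;

/* Sketch only: canonicalize f (below 2^256) to b < p = 2^255 - 19. */
static void store_felem_sketch(u64 b[4], u64 f[4])
{
	static const u64 p[4] = { 0xffffffffffffffedULL, 0xffffffffffffffffULL,
				  0xffffffffffffffffULL, 0x7fffffffffffffffULL };
	unsigned __int128 acc;
	u64 r[4], borrow, mask;
	int i, k;

	/* Fold bit 255 back in twice; after this, f < 2^255 */
	for (k = 0; k < 2; k++) {
		u64 top = f[3] >> 63;

		f[3] &= 0x7fffffffffffffffULL;
		acc = (unsigned __int128)f[0] + (u64)19 * top;
		f[0] = (u64)acc;
		for (i = 1; i < 4; i++) {
			acc = (acc >> 64) + f[i];
			f[i] = (u64)acc;
		}
		/* the carry out is zero here, hence the dropped return value */
	}

	/* Subtract p once if f >= p, without branching on secret data */
	borrow = 0;
	for (i = 0; i < 4; i++) {
		acc = (unsigned __int128)f[i] - p[i] - borrow;
		r[i] = (u64)acc;
		borrow = (u64)(acc >> 64) & 1;
	}
	mask = borrow - 1;	/* all-ones iff no borrow, i.e. f >= p */
	for (i = 0; i < 4; i++)
		b[i] = (r[i] & mask) | (f[i] & ~mask);
}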