 curve25519-ever64.c | 315
 1 file changed, 148 insertions(+), 167 deletions(-)
diff --git a/curve25519-ever64.c b/curve25519-ever64.c
index c6f4043..aacfa80 100644
--- a/curve25519-ever64.c
+++ b/curve25519-ever64.c
@@ -7,8 +7,6 @@
#include <linux/kernel.h>
#include <linux/string.h>
-enum { CURVE25519_POINT_SIZE = 32 };
-
static __always_inline u64 eq_mask(u64 a, u64 b)
{
u64 x = a ^ b;
@@ -31,21 +29,21 @@ static __always_inline u64 gte_mask(u64 a, u64 b)
return x_xor_q_ - (u64)1U;
}
-// Computes the addition of four-element f1 with value in f2
-// and returns the carry (if any)
+/* Computes the addition of four-element f1 with value in f2
+ * and returns the carry (if any) */
static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
{
u64 carry_r;
asm volatile(
- // Clear registers to propagate the carry bit
+ /* Clear registers to propagate the carry bit */
" xor %%r8, %%r8;"
" xor %%r9, %%r9;"
" xor %%r10, %%r10;"
" xor %%r11, %%r11;"
" xor %1, %1;"
- // Begin addition chain
+ /* Begin addition chain */
" addq 0(%3), %0;"
" movq %0, 0(%2);"
" adcxq 8(%3), %%r8;"
@@ -55,7 +53,7 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
" adcxq 24(%3), %%r10;"
" movq %%r10, 24(%2);"
- // Return the carry bit in a register
+ /* Return the carry bit in a register */
" adcx %%r11, %1;"
: "+&r" (f2), "=&r" (carry_r)
: "r" (out), "r" (f1)
@@ -65,11 +63,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
return carry_r;
}
-// Computes the field addition of two field elements
+/* Computes the field addition of two field elements */
static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
{
asm volatile(
- // Compute the raw addition of f1 + f2
+ /* Compute the raw addition of f1 + f2 */
" movq 0(%0), %%r8;"
" addq 0(%2), %%r8;"
" movq 8(%0), %%r9;"
@@ -79,14 +77,14 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
" movq 24(%0), %%r11;"
" adcxq 24(%2), %%r11;"
- /////// Wrap the result back into the field //////
+ /* Wrap the result back into the field */
- // Step 1: Compute carry*38
+ /* Step 1: Compute carry*38 */
" mov $0, %%rax;"
" mov $38, %0;"
" cmovc %0, %%rax;"
- // Step 2: Add carry*38 to the original sum
+ /* Step 2: Add carry*38 to the original sum */
" xor %%rcx, %%rcx;"
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
@@ -96,7 +94,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
" adcx %%rcx, %%r11;"
" movq %%r11, 24(%1);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %0, %%rax;"
" add %%rax, %%r8;"
@@ -107,11 +105,11 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
);
}
-// Computes the field substraction of two field elements
+/* Computes the field subtraction of two field elements */
static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
{
asm volatile(
- // Compute the raw substraction of f1-f2
+ /* Compute the raw subtraction of f1-f2 */
" movq 0(%1), %%r8;"
" subq 0(%2), %%r8;"
" movq 8(%1), %%r9;"
@@ -121,25 +119,25 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
" movq 24(%1), %%r11;"
" sbbq 24(%2), %%r11;"
- /////// Wrap the result back into the field //////
+ /* Wrap the result back into the field */
- // Step 1: Compute carry*38
+ /* Step 1: Compute carry*38 */
" mov $0, %%rax;"
" mov $38, %%rcx;"
" cmovc %%rcx, %%rax;"
- // Step 2: Substract carry*38 from the original difference
+ /* Step 2: Subtract carry*38 from the original difference */
" sub %%rax, %%r8;"
" sbb $0, %%r9;"
" sbb $0, %%r10;"
" sbb $0, %%r11;"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rcx, %%rax;"
" sub %%rax, %%r8;"
- // Store the result
+ /* Store the result */
" movq %%r8, 0(%0);"
" movq %%r9, 8(%0);"
" movq %%r10, 16(%0);"
@@ -150,54 +148,48 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
);
}
-// Computes a field multiplication: out <- f1 * f2
-// Uses the 8-element buffer tmp for intermediate results
+/* Computes a field multiplication: out <- f1 * f2
+ * Uses the 8-element buffer tmp for intermediate results */
static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
asm volatile(
- /////// Compute the raw multiplication: tmp <- src1 * src2 //////
+ /* Compute the raw multiplication: tmp <- src1 * src2 */
- // Compute src1[0] * src2
+ /* Compute src1[0] * src2 */
" movq 0(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
" mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
-
- // Compute src1[1] * src2
+ /* Compute src1[1] * src2 */
" movq 8(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);"
" mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
-
-
- // Compute src1[2] * src2
+ /* Compute src1[2] * src2 */
" movq 16(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);"
" mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
-
-
- // Compute src1[3] * src2
+ /* Compute src1[3] * src2 */
" movq 24(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);"
" mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 40(%0);" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
-
- // Line up pointers
+ /* Line up pointers */
" mov %0, %1;"
" mov %2, %0;"
- /////// Wrap the result back into the field //////
+ /* Wrap the result back into the field */
- // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
" xor %3, %3;"
@@ -215,7 +207,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
" adox %3, %%rax;"
" imul %%rdx, %%rax;"
- // Step 2: Fold the carry back into dst
+ /* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %3, %%r9;"
" movq %%r9, 8(%0);"
@@ -224,7 +216,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
" adcx %3, %%r11;"
" movq %%r11, 24(%0);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
@@ -235,42 +227,37 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
);
}
-// Computes two field multiplications:
-// out[0] <- f1[0] * f2[0]
-// out[1] <- f1[1] * f2[1]
-// Uses the 16-element buffer tmp for intermediate results:
+/* Computes two field multiplications:
+ * out[0] <- f1[0] * f2[0]
+ * out[1] <- f1[1] * f2[1]
+ * Uses the 16-element buffer tmp for intermediate results. */
static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
{
asm volatile(
- /////// Compute the raw multiplication tmp[0] <- f1[0] * f2[0] //////
+ /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */
- // Compute src1[0] * src2
+ /* Compute src1[0] * src2 */
" movq 0(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);"
" mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
-
- // Compute src1[1] * src2
+ /* Compute src1[1] * src2 */
" movq 8(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);"
" mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
-
-
- // Compute src1[2] * src2
+ /* Compute src1[2] * src2 */
" movq 16(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);"
" mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
-
-
- // Compute src1[3] * src2
+ /* Compute src1[3] * src2 */
" movq 24(%1), %%rdx;"
" mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);"
" mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);"
@@ -278,49 +265,43 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
" mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);"
- /////// Compute the raw multiplication tmp[1] <- f1[1] * f2[1] //////
+ /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */
- // Compute src1[0] * src2
+ /* Compute src1[0] * src2 */
" movq 32(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);"
" mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);"
" mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;"
-
- // Compute src1[1] * src2
+ /* Compute src1[1] * src2 */
" movq 40(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);"
" mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 80(%0);"
" mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
-
-
- // Compute src1[2] * src2
+ /* Compute src1[2] * src2 */
" movq 48(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);"
" mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 88(%0);"
" mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;"
-
-
- // Compute src1[3] * src2
+ /* Compute src1[3] * src2 */
" movq 56(%1), %%rdx;"
" mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);"
" mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 96(%0);"
" mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 104(%0);" " mov $0, %%r8;"
" mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;"
" adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);"
-
- // Line up pointers
+ /* Line up pointers */
" mov %0, %1;"
" mov %2, %0;"
- /////// Wrap the results back into the field //////
+ /* Wrap the results back into the field */
- // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
" xor %3, %3;"
@@ -338,7 +319,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
" adox %3, %%rax;"
" imul %%rdx, %%rax;"
- // Step 2: Fold the carry back into dst
+ /* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %3, %%r9;"
" movq %%r9, 8(%0);"
@@ -347,13 +328,13 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
" adcx %3, %%r11;"
" movq %%r11, 24(%0);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
" movq %%r8, 0(%0);"
- // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 96(%1), %%r8, %%r13;"
" xor %3, %3;"
@@ -371,7 +352,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
" adox %3, %%rax;"
" imul %%rdx, %%rax;"
- // Step 2: Fold the carry back into dst
+ /* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %3, %%r9;"
" movq %%r9, 40(%0);"
@@ -380,7 +361,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
" adcx %3, %%r11;"
" movq %%r11, 56(%0);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
@@ -391,30 +372,30 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
);
}
-// Computes the field multiplication of four-element f1 with value in f2
+/* Computes the field multiplication of four-element f1 with value in f2 */
static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
{
register u64 f2_r asm("rdx") = f2;
asm volatile(
- // Compute the raw multiplication of f1*f2
- " mulxq 0(%2), %%r8, %%rcx;" // f1[0]*f2
- " mulxq 8(%2), %%r9, %%r12;" // f1[1]*f2
+ /* Compute the raw multiplication of f1*f2 */
+ " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */
+ " mulxq 8(%2), %%r9, %%r12;" /* f1[1]*f2 */
" add %%rcx, %%r9;"
" mov $0, %%rcx;"
- " mulxq 16(%2), %%r10, %%r13;" // f1[2]*f2
+ " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */
" adcx %%r12, %%r10;"
- " mulxq 24(%2), %%r11, %%rax;" // f1[3]*f2
+ " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */
" adcx %%r13, %%r11;"
" adcx %%rcx, %%rax;"
- /////// Wrap the result back into the field //////
+ /* Wrap the result back into the field */
- // Step 1: Compute carry*38
+ /* Step 1: Compute carry*38 */
" mov $38, %%rdx;"
" imul %%rdx, %%rax;"
- // Step 2: Fold the carry back into dst
+ /* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
" movq %%r9, 8(%1);"
@@ -423,7 +404,7 @@ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
" adcx %%rcx, %%r11;"
" movq %%r11, 24(%1);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
@@ -434,14 +415,14 @@ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
);
}
-// Computes p1 <- bit ? p2 : p1 in constant time
+/* Computes p1 <- bit ? p2 : p1 in constant time */
static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
{
asm volatile(
- // Invert the polarity of bit to match cmov expectations
+ /* Invert the polarity of bit to match cmov expectations */
" add $18446744073709551615, %0;"
- // cswap p1[0], p2[0]
+ /* cswap p1[0], p2[0] */
" movq 0(%1), %%r8;"
" movq 0(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -450,7 +431,7 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" movq %%r8, 0(%1);"
" movq %%r9, 0(%2);"
- // cswap p1[1], p2[1]
+ /* cswap p1[1], p2[1] */
" movq 8(%1), %%r8;"
" movq 8(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -459,7 +440,7 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" movq %%r8, 8(%1);"
" movq %%r9, 8(%2);"
- // cswap p1[2], p2[2]
+ /* cswap p1[2], p2[2] */
" movq 16(%1), %%r8;"
" movq 16(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -468,7 +449,7 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" movq %%r8, 16(%1);"
" movq %%r9, 16(%2);"
- // cswap p1[3], p2[3]
+ /* cswap p1[3], p2[3] */
" movq 24(%1), %%r8;"
" movq 24(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -477,7 +458,7 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" movq %%r8, 24(%1);"
" movq %%r9, 24(%2);"
- // cswap p1[4], p2[4]
+ /* cswap p1[4], p2[4] */
" movq 32(%1), %%r8;"
" movq 32(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -486,7 +467,7 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" movq %%r8, 32(%1);"
" movq %%r9, 32(%2);"
- // cswap p1[5], p2[5]
+ /* cswap p1[5], p2[5] */
" movq 40(%1), %%r8;"
" movq 40(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -495,7 +476,7 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" movq %%r8, 40(%1);"
" movq %%r9, 40(%2);"
- // cswap p1[6], p2[6]
+ /* cswap p1[6], p2[6] */
" movq 48(%1), %%r8;"
" movq 48(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -504,7 +485,7 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
" movq %%r8, 48(%1);"
" movq %%r9, 48(%2);"
- // cswap p1[7], p2[7]
+ /* cswap p1[7], p2[7] */
" movq 56(%1), %%r8;"
" movq 56(%2), %%r9;"
" mov %%r8, %%r10;"
@@ -518,25 +499,25 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
);
}
-// Computes the square of a field element: out <- f * f
-// Uses the 8-element buffer tmp for intermediate results
+/* Computes the square of a field element: out <- f * f
+ * Uses the 8-element buffer tmp for intermediate results */
static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
{
asm volatile(
- /////// Compute the raw multiplication: tmp <- f * f //////
-
- // Step 1: Compute all partial products
- " movq 0(%1), %%rdx;" // f[0]
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" // f[1]*f[0]
- " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" // f[2]*f[0]
- " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" // f[3]*f[0]
- " movq 24(%1), %%rdx;" // f[3]
- " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" // f[1]*f[3]
- " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" // f[2]*f[3]
- " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" // f1
- " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" // f[2]*f[1]
-
- // Step 2: Compute two parallel carry chains
+ /* Compute the raw multiplication: tmp <- f * f */
+
+ /* Step 1: Compute all partial products */
+ " movq 0(%1), %%rdx;" /* f[0] */
+ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%1), %%rdx;" /* f[3] */
+ " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
+ " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
" xor %%r15, %%r15;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
@@ -551,27 +532,27 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
" adcx %%r13, %%r13;"
" adcx %%r14, %%r14;"
- // Step 3: Compute intermediate squares
- " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[0]^2
+ /* Step 3: Compute intermediate squares */
+ " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
" movq %%rax, 0(%0);"
" add %%rcx, %%r8;" " movq %%r8, 8(%0);"
- " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[1]^2
+ " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
" adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
" adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
- " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[2]^2
+ " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
" adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
" adcx %%rcx, %%r12;" " movq %%r12, 40(%0);"
- " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[3]^2
+ " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
" adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
" adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
- // Line up pointers
+ /* Line up pointers */
" mov %0, %1;"
" mov %2, %0;"
- /////// Wrap the result back into the field //////
+ /* Wrap the result back into the field */
- // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
" xor %%rcx, %%rcx;"
@@ -589,7 +570,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
- // Step 2: Fold the carry back into dst
+ /* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
" movq %%r9, 8(%0);"
@@ -598,7 +579,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
" adcx %%rcx, %%r11;"
" movq %%r11, 24(%0);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
@@ -609,25 +590,25 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
);
}
-// Computes two field squarings:
-// out[0] <- f[0] * f[0]
-// out[1] <- f[1] * f[1]
-// Uses the 16-element buffer tmp for intermediate results
+/* Computes two field squarings:
+ * out[0] <- f[0] * f[0]
+ * out[1] <- f[1] * f[1]
+ * Uses the 16-element buffer tmp for intermediate results */
static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
{
asm volatile(
- // Step 1: Compute all partial products
- " movq 0(%1), %%rdx;" // f[0]
- " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" // f[1]*f[0]
- " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" // f[2]*f[0]
- " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" // f[3]*f[0]
- " movq 24(%1), %%rdx;" // f[3]
- " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" // f[1]*f[3]
- " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" // f[2]*f[3]
- " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" // f1
- " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" // f[2]*f[1]
-
- // Step 2: Compute two parallel carry chains
+ /* Step 1: Compute all partial products */
+ " movq 0(%1), %%rdx;" /* f[0] */
+ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 24(%1), %%rdx;" /* f[3] */
+ " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
+ " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
" xor %%r15, %%r15;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
@@ -642,32 +623,32 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adcx %%r13, %%r13;"
" adcx %%r14, %%r14;"
- // Step 3: Compute intermediate squares
- " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[0]^2
+ /* Step 3: Compute intermediate squares */
+ " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
" movq %%rax, 0(%0);"
" add %%rcx, %%r8;" " movq %%r8, 8(%0);"
- " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[1]^2
+ " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
" adcx %%rax, %%r9;" " movq %%r9, 16(%0);"
" adcx %%rcx, %%r10;" " movq %%r10, 24(%0);"
- " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[2]^2
+ " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
" adcx %%rax, %%r11;" " movq %%r11, 32(%0);"
" adcx %%rcx, %%r12;" " movq %%r12, 40(%0);"
- " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[3]^2
+ " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
" adcx %%rax, %%r13;" " movq %%r13, 48(%0);"
" adcx %%rcx, %%r14;" " movq %%r14, 56(%0);"
- // Step 1: Compute all partial products
- " movq 32(%1), %%rdx;" // f[0]
- " mulxq 40(%1), %%r8, %%r14;" " xor %%r15, %%r15;" // f[1]*f[0]
- " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" // f[2]*f[0]
- " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" // f[3]*f[0]
- " movq 56(%1), %%rdx;" // f[3]
- " mulxq 40(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" // f[1]*f[3]
- " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" // f[2]*f[3]
- " movq 40(%1), %%rdx;" " adcx %%r15, %%r13;" // f1
- " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" // f[2]*f[1]
-
- // Step 2: Compute two parallel carry chains
+ /* Step 1: Compute all partial products */
+ " movq 32(%1), %%rdx;" /* f[0] */
+ " mulxq 40(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */
+ " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */
+ " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */
+ " movq 56(%1), %%rdx;" /* f[3] */
+ " mulxq 40(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */
+ " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */
+ " movq 40(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */
+ " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */
+
+ /* Step 2: Compute two parallel carry chains */
" xor %%r15, %%r15;"
" adox %%rax, %%r10;"
" adcx %%r8, %%r8;"
@@ -682,25 +663,25 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adcx %%r13, %%r13;"
" adcx %%r14, %%r14;"
- // Step 3: Compute intermediate squares
- " movq 32(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[0]^2
+ /* Step 3: Compute intermediate squares */
+ " movq 32(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */
" movq %%rax, 64(%0);"
" add %%rcx, %%r8;" " movq %%r8, 72(%0);"
- " movq 40(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[1]^2
+ " movq 40(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */
" adcx %%rax, %%r9;" " movq %%r9, 80(%0);"
" adcx %%rcx, %%r10;" " movq %%r10, 88(%0);"
- " movq 48(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[2]^2
+ " movq 48(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */
" adcx %%rax, %%r11;" " movq %%r11, 96(%0);"
" adcx %%rcx, %%r12;" " movq %%r12, 104(%0);"
- " movq 56(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[3]^2
+ " movq 56(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */
" adcx %%rax, %%r13;" " movq %%r13, 112(%0);"
" adcx %%rcx, %%r14;" " movq %%r14, 120(%0);"
- // Line up pointers
+ /* Line up pointers */
" mov %0, %1;"
" mov %2, %0;"
- // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 32(%1), %%r8, %%r13;"
" xor %%rcx, %%rcx;"
@@ -718,7 +699,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
- // Step 2: Fold the carry back into dst
+ /* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
" movq %%r9, 8(%0);"
@@ -727,13 +708,13 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adcx %%rcx, %%r11;"
" movq %%r11, 24(%0);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
" movq %%r8, 0(%0);"
- // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
" mov $38, %%rdx;"
" mulxq 96(%1), %%r8, %%r13;"
" xor %%rcx, %%rcx;"
@@ -751,7 +732,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adox %%rcx, %%rax;"
" imul %%rdx, %%rax;"
- // Step 2: Fold the carry back into dst
+ /* Step 2: Fold the carry back into dst */
" add %%rax, %%r8;"
" adcx %%rcx, %%r9;"
" movq %%r9, 40(%0);"
@@ -760,7 +741,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
" adcx %%rcx, %%r11;"
" movq %%r11, 56(%0);"
- // Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+ /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
" mov $0, %%rax;"
" cmovc %%rdx, %%rax;"
" add %%rax, %%r8;"
@@ -908,7 +889,7 @@ static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
memcpy(out, p0, (u32)8U * sizeof(p0[0U]));
}
-static void fsquare_times(u64 *o, u64 *inp, u64 *tmp, u32 n1)
+static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
{
u32 i;
fsqr(o, inp, tmp);
@@ -916,7 +897,7 @@ static void fsquare_times(u64 *o, u64 *inp, u64 *tmp, u32 n1)
fsqr(o, o, tmp);
}
-static void finv(u64 *o, u64 *i, u64 *tmp)
+static void finv(u64 *o, const u64 *i, u64 *tmp)
{
u64 t1[16U] = { 0U };
u64 *a0 = t1;
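finv() is modular inversion by Fermat's little theorem: for nonzero i in GF(p) with p = 2^255 - 19, i^(p-2) ≡ i^(-1) (mod p), and the driver appears to build the fixed exponent p - 2 = 2^255 - 21 from fsquare_times() and fmul() calls (mostly elided in this hunk). A hedged, much slower square-and-multiply sketch of the same exponentiation, a hypothetical finv_sketch() that assumes tmp is the usual 8-limb scratch buffer and that fsqr()/fmul() tolerate aliased arguments, as fsquare_times()'s in-place use of fsqr() suggests:

static void finv_sketch(u64 *o, const u64 *i, u64 *tmp)
{
        u64 acc[4], pow[4];
        int k;

        /* p - 2 = 2^255 - 21: bits 0, 1, 3 and 5..254 are set. */
        memcpy(pow, i, sizeof(pow));    /* pow = i^(2^0) */
        memcpy(acc, i, sizeof(acc));    /* exponent bit 0 is set */
        for (k = 1; k < 255; ++k) {
                fsqr(pow, pow, tmp);    /* pow = i^(2^k) */
                if (k == 1 || k == 3 || k >= 5)
                        fmul(acc, acc, pow, tmp);
        }
        memcpy(o, acc, sizeof(acc));
}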
@@ -1006,10 +987,10 @@ static void store_felem(u64 *b, u64 *f)
b[3U] = o3;
}
-static void encode_point(u8 *o, u64 *i)
+static void encode_point(u8 *o, const u64 *i)
{
- u64 *x = i;
- u64 *z = i + (u32)4U;
+ const u64 *x = i;
+ const u64 *z = i + (u32)4U;
u64 tmp[4U] = { 0U };
u64 tmp_w[16U] = { 0U };
finv(tmp, z, tmp_w);