From 23bbc5804a414530f2e5ce62b09aa7d13631deec Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld"
Date: Tue, 1 Sep 2020 20:49:41 +0200
Subject: Sync ever64 with upstream kernel code

---
 curve25519-ever64.c | 322 +++++++++++++++++++++++++---------------------
 1 file changed, 153 insertions(+), 169 deletions(-)

diff --git a/curve25519-ever64.c b/curve25519-ever64.c
index 14bdd7d..0c75c0b 100644
--- a/curve25519-ever64.c
+++ b/curve25519-ever64.c
@@ -31,21 +31,21 @@ static __always_inline u64 gte_mask(u64 a, u64 b)
 	return x_xor_q_ - (u64)1U;
 }
 
-// Computes the addition of four-element f1 with value in f2
-// and returns the carry (if any)
+/* Computes the addition of four-element f1 with value in f2
+ * and returns the carry (if any) */
 static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
 {
 	u64 carry_r;
 
 	asm volatile(
-		// Clear registers to propagate the carry bit
+		/* Clear registers to propagate the carry bit */
 		" xor %%r8, %%r8;"
 		" xor %%r9, %%r9;"
 		" xor %%r10, %%r10;"
 		" xor %%r11, %%r11;"
 		" xor %1, %1;"
 
-		// Begin addition chain
+		/* Begin addition chain */
 		" addq 0(%3), %0;"
 		" movq %0, 0(%2);"
 		" adcxq 8(%3), %%r8;"
@@ -55,7 +55,7 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
 		" adcxq 24(%3), %%r10;"
 		" movq %%r10, 24(%2);"
 
-		// Return the carry bit in a register
+		/* Return the carry bit in a register */
 		" adcx %%r11, %1;"
 		: "+&r" (f2), "=&r" (carry_r)
 		: "r" (out), "r" (f1)
@@ -65,11 +65,11 @@ static inline u64 add_scalar(u64 *out, const u64 *f1, u64 f2)
 	return carry_r;
 }
 
-// Computes the field addition of two field elements
+/* Computes the field addition of two field elements */
 static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
 {
 	asm volatile(
-		// Compute the raw addition of f1 + f2
+		/* Compute the raw addition of f1 + f2 */
 		" movq 0(%0), %%r8;"
 		" addq 0(%2), %%r8;"
 		" movq 8(%0), %%r9;"
@@ -79,14 +79,14 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
 		" movq 24(%0), %%r11;"
 		" adcxq 24(%2), %%r11;"
 
-		/////// Wrap the result back into the field //////
+		/* Wrap the result back into the field */
 
-		// Step 1: Compute carry*38
+		/* Step 1: Compute carry*38 */
 		" mov $0, %%rax;"
 		" mov $38, %0;"
 		" cmovc %0, %%rax;"
 
-		// Step 2: Add carry*38 to the original sum
+		/* Step 2: Add carry*38 to the original sum */
 		" xor %%rcx, %%rcx;"
 		" add %%rax, %%r8;"
 		" adcx %%rcx, %%r9;"
@@ -96,7 +96,7 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
 		" adcx %%rcx, %%r11;"
 		" movq %%r11, 24(%1);"
 
-		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %0, %%rax;"
 		" add %%rax, %%r8;"
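Throughout this file, the wrap back into the field relies on the identity 2^256 == 38 (mod 2^255 - 19): a carry out of the fourth limb is folded back in by adding carry*38 at the bottom, and a possible second carry is folded once more, where it provably stops. As a reading aid, here is a portable C sketch of fadd's three steps. The helper name fadd_sketch and the reliance on GCC/Clang's unsigned __int128 are illustrative assumptions, not code from this patch; fsub below is the mirror image, subtracting borrow*38 instead.

#include <stdint.h>

typedef uint64_t u64;

/* Sketch only: wrap a 256-bit sum into the 2^255-19 representation,
 * mirroring fadd's three asm steps. Result is < 2^256, not canonical. */
static void fadd_sketch(u64 out[4], const u64 f1[4], const u64 f2[4])
{
	unsigned __int128 acc = 0;
	int i;

	/* Raw addition of f1 + f2, keeping the carry out of limb 3 */
	for (i = 0; i < 4; i++) {
		acc += (unsigned __int128)f1[i] + f2[i];
		out[i] = (u64)acc;
		acc >>= 64;
	}

	/* Steps 1-2: 2^256 == 38 (mod p), so fold carry*38 back in */
	acc = (unsigned __int128)out[0] + (u64)acc * 38;
	out[0] = (u64)acc;
	for (i = 1; i < 4; i++) {
		acc = (acc >> 64) + out[i];
		out[i] = (u64)acc;
	}

	/* Step 3: a second fold can no longer ripple past limb 0 */
	out[0] += (u64)(acc >> 64) * 38;
}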
@@ -107,11 +107,11 @@ static inline void fadd(u64 *out, const u64 *f1, const u64 *f2)
 	);
 }
 
-// Computes the field substraction of two field elements
+/* Computes the field substraction of two field elements */
 static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
 {
 	asm volatile(
-		// Compute the raw substraction of f1-f2
+		/* Compute the raw substraction of f1-f2 */
 		" movq 0(%1), %%r8;"
 		" subq 0(%2), %%r8;"
 		" movq 8(%1), %%r9;"
@@ -121,25 +121,25 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
 		" movq 24(%1), %%r11;"
 		" sbbq 24(%2), %%r11;"
 
-		/////// Wrap the result back into the field //////
+		/* Wrap the result back into the field */
 
-		// Step 1: Compute carry*38
+		/* Step 1: Compute carry*38 */
 		" mov $0, %%rax;"
 		" mov $38, %%rcx;"
 		" cmovc %%rcx, %%rax;"
 
-		// Step 2: Substract carry*38 from the original difference
+		/* Step 2: Substract carry*38 from the original difference */
 		" sub %%rax, %%r8;"
 		" sbb $0, %%r9;"
 		" sbb $0, %%r10;"
 		" sbb $0, %%r11;"
 
-		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rcx, %%rax;"
 		" sub %%rax, %%r8;"
 
-		// Store the result
+		/* Store the result */
 		" movq %%r8, 0(%0);"
 		" movq %%r9, 8(%0);"
 		" movq %%r10, 16(%0);"
@@ -150,54 +150,48 @@ static inline void fsub(u64 *out, const u64 *f1, const u64 *f2)
 	);
 }
 
-// Computes a field multiplication: out <- f1 * f2
-// Uses the 8-element buffer tmp for intermediate results
+/* Computes a field multiplication: out <- f1 * f2
+ * Uses the 8-element buffer tmp for intermediate results */
static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 {
 	asm volatile(
-		/////// Compute the raw multiplication: tmp <- src1 * src2 //////
+		/* Compute the raw multiplication: tmp <- src1 * src2 */
 
-		// Compute src1[0] * src2
+		/* Compute src1[0] * src2 */
 		" movq 0(%1), %%rdx;"
 		" mulxq 0(%3), %%r8, %%r9;"  " xor %%r10, %%r10;"  " movq %%r8, 0(%0);"
 		" mulxq 8(%3), %%r10, %%r11;"  " adox %%r9, %%r10;"  " movq %%r10, 8(%0);"
 		" mulxq 16(%3), %%rbx, %%r13;"  " adox %%r11, %%rbx;"
 		" mulxq 24(%3), %%r14, %%rdx;"  " adox %%r13, %%r14;"  " mov $0, %%rax;"
 		" adox %%rdx, %%rax;"
 
-
-		// Compute src1[1] * src2
+		/* Compute src1[1] * src2 */
 		" movq 8(%1), %%rdx;"
 		" mulxq 0(%3), %%r8, %%r9;"  " xor %%r10, %%r10;"  " adcxq 8(%0), %%r8;"  " movq %%r8, 8(%0);"
 		" mulxq 8(%3), %%r10, %%r11;"  " adox %%r9, %%r10;"  " adcx %%rbx, %%r10;"  " movq %%r10, 16(%0);"
 		" mulxq 16(%3), %%rbx, %%r13;"  " adox %%r11, %%rbx;"  " adcx %%r14, %%rbx;"  " mov $0, %%r8;"
 		" mulxq 24(%3), %%r14, %%rdx;"  " adox %%r13, %%r14;"  " adcx %%rax, %%r14;"  " mov $0, %%rax;"
 		" adox %%rdx, %%rax;"  " adcx %%r8, %%rax;"
 
-
-
-		// Compute src1[2] * src2
+		/* Compute src1[2] * src2 */
 		" movq 16(%1), %%rdx;"
 		" mulxq 0(%3), %%r8, %%r9;"  " xor %%r10, %%r10;"  " adcxq 16(%0), %%r8;"  " movq %%r8, 16(%0);"
 		" mulxq 8(%3), %%r10, %%r11;"  " adox %%r9, %%r10;"  " adcx %%rbx, %%r10;"  " movq %%r10, 24(%0);"
 		" mulxq 16(%3), %%rbx, %%r13;"  " adox %%r11, %%rbx;"  " adcx %%r14, %%rbx;"  " mov $0, %%r8;"
 		" mulxq 24(%3), %%r14, %%rdx;"  " adox %%r13, %%r14;"  " adcx %%rax, %%r14;"  " mov $0, %%rax;"
 		" adox %%rdx, %%rax;"  " adcx %%r8, %%rax;"
 
-
-
-		// Compute src1[3] * src2
+		/* Compute src1[3] * src2 */
 		" movq 24(%1), %%rdx;"
 		" mulxq 0(%3), %%r8, %%r9;"  " xor %%r10, %%r10;"  " adcxq 24(%0), %%r8;"  " movq %%r8, 24(%0);"
 		" mulxq 8(%3), %%r10, %%r11;"  " adox %%r9, %%r10;"  " adcx %%rbx, %%r10;"  " movq %%r10, 32(%0);"
 		" mulxq 16(%3), %%rbx, %%r13;"  " adox %%r11, %%rbx;"  " adcx %%r14, %%rbx;"  " movq %%rbx, 40(%0);"
 		" mov $0, %%r8;"
 		" mulxq 24(%3), %%r14, %%rdx;"  " adox %%r13, %%r14;"  " adcx %%rax, %%r14;"  " movq %%r14, 48(%0);"
 		" mov $0, %%rax;"  " adox %%rdx, %%rax;"  " adcx %%r8, %%rax;"
 		" movq %%rax, 56(%0);"
 
-
-		// Line up pointers
+		/* Line up pointers */
 		" mov %0, %1;"
 		" mov %2, %0;"
 
-		/////// Wrap the result back into the field //////
+		/* Wrap the result back into the field */
 
-		// Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		" mov $38, %%rdx;"
 		" mulxq 32(%1), %%r8, %%r13;"
 		" xor %3, %3;"
@@ -215,7 +209,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 		" adox %3, %%rax;"
 		" imul %%rdx, %%rax;"
 
-		// Step 2: Fold the carry back into dst
+		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
 		" adcx %3, %%r9;"
 		" movq %%r9, 8(%0);"
@@ -224,7 +218,7 @@ static inline void fmul(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 		" adcx %3, %%r11;"
 		" movq %%r11, 24(%0);"
 
-		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rdx, %%rax;"
 		" add %%rax, %%r8;"
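fmul's wrap (Steps 1-3 above) applies the same identity to the 8-limb schoolbook product in tmp: the high half is worth 2^256, so dst = tmp_lo + 38 * tmp_hi, followed by two small folds of the leftover carry. A portable sketch of just that fold; fold_mul_sketch is a hypothetical name and unsigned __int128 a compiler assumption, not code from this patch:

#include <stdint.h>

typedef uint64_t u64;

/* Sketch only: reduce the 8-limb product t to 4 limbs using
 * 2^256 == 38 (mod 2^255 - 19). */
static void fold_mul_sketch(u64 out[4], const u64 t[8])
{
	unsigned __int128 acc = 0;
	int i;

	/* Step 1: out + carry == t_lo + t_hi * 38 */
	for (i = 0; i < 4; i++) {
		acc += (unsigned __int128)t[i] + (unsigned __int128)t[i + 4] * 38;
		out[i] = (u64)acc;
		acc >>= 64;
	}

	/* Step 2: the leftover carry is tiny (< 39), so carry*38 fits in
	 * one limb; add it at the bottom and ripple upward */
	acc = (unsigned __int128)out[0] + (u64)acc * 38;
	out[0] = (u64)acc;
	for (i = 1; i < 4; i++) {
		acc = (acc >> 64) + out[i];
		out[i] = (u64)acc;
	}

	/* Step 3: final one-bit fold; cannot carry again */
	out[0] += (u64)(acc >> 64) * 38;
}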
" movq 40(%1), %%rdx;" " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);" " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 80(%0);" " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" - - - // Compute src1[2] * src2 + /* Compute src1[2] * src2 */ " movq 48(%1), %%rdx;" " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);" " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 88(%0);" " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " mov $0, %%r8;" " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" - - - // Compute src1[3] * src2 + /* Compute src1[3] * src2 */ " movq 56(%1), %%rdx;" " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);" " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%rbx, %%r10;" " movq %%r10, 96(%0);" " mulxq 48(%3), %%rbx, %%r13;" " adox %%r11, %%rbx;" " adcx %%r14, %%rbx;" " movq %%rbx, 104(%0);" " mov $0, %%r8;" " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);" - - // Line up pointers + /* Line up pointers */ " mov %0, %1;" " mov %2, %0;" - /////// Wrap the results back into the field ////// + /* Wrap the results back into the field */ - // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%1), %%r8, %%r13;" " xor %3, %3;" @@ -338,7 +321,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) " adox %3, %%rax;" " imul %%rdx, %%rax;" - // Step 2: Fold the carry back into dst + /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %3, %%r9;" " movq %%r9, 8(%0);" @@ -347,13 +330,13 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) " adcx %3, %%r11;" " movq %%r11, 24(%0);" - // Step 3: Fold the carry bit back in; guaranteed not to carry at this point + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%0);" - // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 96(%1), %%r8, %%r13;" " xor %3, %3;" @@ -371,7 +354,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) " adox %3, %%rax;" " imul %%rdx, %%rax;" - // Step 2: Fold the carry back into dst + /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %3, %%r9;" " movq %%r9, 40(%0);" @@ -380,7 +363,7 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) " adcx %3, %%r11;" " movq %%r11, 56(%0);" - // Step 3: Fold the carry bit back in; guaranteed not to carry at this point + /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" @@ -391,30 +374,30 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp) ); } -// Computes the field multiplication of four-element f1 with value in f2 +/* 
@@ -391,30 +374,30 @@ static inline void fmul2(u64 *out, const u64 *f1, const u64 *f2, u64 *tmp)
 	);
 }
 
-// Computes the field multiplication of four-element f1 with value in f2
+/* Computes the field multiplication of four-element f1 with value in f2 */
 static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
 {
 	register u64 f2_r asm("rdx") = f2;
 
 	asm volatile(
-		// Compute the raw multiplication of f1*f2
-		" mulxq 0(%2), %%r8, %%rcx;"  // f1[0]*f2
-		" mulxq 8(%2), %%r9, %%rbx;"  // f1[1]*f2
+		/* Compute the raw multiplication of f1*f2 */
+		" mulxq 0(%2), %%r8, %%rcx;"  /* f1[0]*f2 */
+		" mulxq 8(%2), %%r9, %%rbx;"  /* f1[1]*f2 */
 		" add %%rcx, %%r9;"
 		" mov $0, %%rcx;"
-		" mulxq 16(%2), %%r10, %%r13;"  // f1[2]*f2
+		" mulxq 16(%2), %%r10, %%r13;"  /* f1[2]*f2 */
 		" adcx %%rbx, %%r10;"
-		" mulxq 24(%2), %%r11, %%rax;"  // f1[3]*f2
+		" mulxq 24(%2), %%r11, %%rax;"  /* f1[3]*f2 */
 		" adcx %%r13, %%r11;"
 		" adcx %%rcx, %%rax;"
 
-		/////// Wrap the result back into the field //////
+		/* Wrap the result back into the field */
 
-		// Step 1: Compute carry*38
+		/* Step 1: Compute carry*38 */
 		" mov $38, %%rdx;"
 		" imul %%rdx, %%rax;"
 
-		// Step 2: Fold the carry back into dst
+		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
 		" adcx %%rcx, %%r9;"
 		" movq %%r9, 8(%1);"
@@ -423,7 +406,7 @@ static inline void fmul_scalar(u64 *out, const u64 *f1, u64 f2)
 		" adcx %%rcx, %%r11;"
 		" movq %%r11, 24(%1);"
 
-		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rdx, %%rax;"
 		" add %%rax, %%r8;"
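Unlike fadd and fsub, fmul_scalar folds its overflow limb with a plain imul rather than a conditional: that is only safe when carry*38 cannot itself overflow 64 bits, i.e. when f2 is small (the Montgomery ladder in this file only calls it with f2 = 121665). A portable sketch under that assumption; fmul_scalar_sketch is a hypothetical name, not code from this patch:

#include <stdint.h>

typedef uint64_t u64;

/* Sketch only: multiply a 4-limb element by a small scalar and fold the
 * overflow limb with an unconditional multiply by 38. The carry is
 * strictly less than f2, so carry*38 fits in one limb. */
static void fmul_scalar_sketch(u64 out[4], const u64 f1[4], u64 f2)
{
	unsigned __int128 acc = 0;
	int i;

	for (i = 0; i < 4; i++) {
		acc += (unsigned __int128)f1[i] * f2;
		out[i] = (u64)acc;
		acc >>= 64;
	}

	/* Steps 1-2: carry*38, added back at the bottom */
	acc = (unsigned __int128)out[0] + (u64)acc * 38;
	out[0] = (u64)acc;
	for (i = 1; i < 4; i++) {
		acc = (acc >> 64) + out[i];
		out[i] = (u64)acc;
	}

	/* Step 3: final fold of the last carry bit */
	out[0] += (u64)(acc >> 64) * 38;
}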
48(%1);" " movq %%r9, 48(%2);" - // cswap p1[7], p2[7] + /* cswap p1[7], p2[7] */ " movq 56(%1), %%r8;" " movq 56(%2), %%r9;" " mov %%r8, %%r10;" @@ -518,25 +501,25 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2) ); } -// Computes the square of a field element: out <- f * f -// Uses the 8-element buffer tmp for intermediate results +/* Computes the square of a field element: out <- f * f + * Uses the 8-element buffer tmp for intermediate results */ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) { asm volatile( - /////// Compute the raw multiplication: tmp <- f * f ////// - - // Step 1: Compute all partial products - " movq 0(%1), %%rdx;" // f[0] - " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" // f[1]*f[0] - " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" // f[2]*f[0] - " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" // f[3]*f[0] - " movq 24(%1), %%rdx;" // f[3] - " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" // f[1]*f[3] - " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" // f[2]*f[3] - " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" // f1 - " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" // f[2]*f[1] - - // Step 2: Compute two parallel carry chains + /* Compute the raw multiplication: tmp <- f * f */ + + /* Step 1: Compute all partial products */ + " movq 0(%1), %%rdx;" /* f[0] */ + " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */ + " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */ + " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */ + " movq 24(%1), %%rdx;" /* f[3] */ + " mulxq 8(%1), %%r11, %%rbx;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */ + " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%rbx;" /* f[2]*f[3] */ + " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */ + " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */ + + /* Step 2: Compute two parallel carry chains */ " xor %%r15, %%r15;" " adox %%rax, %%r10;" " adcx %%r8, %%r8;" @@ -551,27 +534,27 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) " adcx %%r13, %%r13;" " adcx %%r14, %%r14;" - // Step 3: Compute intermediate squares - " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[0]^2 + /* Step 3: Compute intermediate squares */ + " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ " movq %%rax, 0(%0);" " add %%rcx, %%r8;" " movq %%r8, 8(%0);" - " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[1]^2 + " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ " adcx %%rax, %%r9;" " movq %%r9, 16(%0);" " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);" - " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[2]^2 + " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ " adcx %%rax, %%r11;" " movq %%r11, 32(%0);" " adcx %%rcx, %%rbx;" " movq %%rbx, 40(%0);" - " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" // f[3]^2 + " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ " adcx %%rax, %%r13;" " movq %%r13, 48(%0);" " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);" - // Line up pointers + /* Line up pointers */ " mov %0, %1;" " mov %2, %0;" - /////// Wrap the result back into the field ////// + /* Wrap the result back into the field */ - // Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo + /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%1), %%r8, %%r13;" " xor %%rcx, %%rcx;" @@ -589,7 +572,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp) " adox %%rcx, %%rax;" " imul %%rdx, %%rax;" - // Step 2: Fold the carry back 
@@ -518,25 +501,25 @@ static inline void cswap2(u64 bit, const u64 *p1, const u64 *p2)
 	);
 }
 
-// Computes the square of a field element: out <- f * f
-// Uses the 8-element buffer tmp for intermediate results
+/* Computes the square of a field element: out <- f * f
+ * Uses the 8-element buffer tmp for intermediate results */
 static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 {
 	asm volatile(
-		/////// Compute the raw multiplication: tmp <- f * f //////
-
-		// Step 1: Compute all partial products
-		" movq 0(%1), %%rdx;"  // f[0]
-		" mulxq 8(%1), %%r8, %%r14;"  " xor %%r15, %%r15;"  // f[1]*f[0]
-		" mulxq 16(%1), %%r9, %%r10;"  " adcx %%r14, %%r9;"  // f[2]*f[0]
-		" mulxq 24(%1), %%rax, %%rcx;"  " adcx %%rax, %%r10;"  // f[3]*f[0]
-		" movq 24(%1), %%rdx;"  // f[3]
-		" mulxq 8(%1), %%r11, %%rbx;"  " adcx %%rcx, %%r11;"  // f[1]*f[3]
-		" mulxq 16(%1), %%rax, %%r13;"  " adcx %%rax, %%rbx;"  // f[2]*f[3]
-		" movq 8(%1), %%rdx;"  " adcx %%r15, %%r13;"  // f1
-		" mulxq 16(%1), %%rax, %%rcx;"  " mov $0, %%r14;"  // f[2]*f[1]
-
-		// Step 2: Compute two parallel carry chains
+		/* Compute the raw multiplication: tmp <- f * f */
+
+		/* Step 1: Compute all partial products */
+		" movq 0(%1), %%rdx;"  /* f[0] */
+		" mulxq 8(%1), %%r8, %%r14;"  " xor %%r15, %%r15;"  /* f[1]*f[0] */
+		" mulxq 16(%1), %%r9, %%r10;"  " adcx %%r14, %%r9;"  /* f[2]*f[0] */
+		" mulxq 24(%1), %%rax, %%rcx;"  " adcx %%rax, %%r10;"  /* f[3]*f[0] */
+		" movq 24(%1), %%rdx;"  /* f[3] */
+		" mulxq 8(%1), %%r11, %%rbx;"  " adcx %%rcx, %%r11;"  /* f[1]*f[3] */
+		" mulxq 16(%1), %%rax, %%r13;"  " adcx %%rax, %%rbx;"  /* f[2]*f[3] */
+		" movq 8(%1), %%rdx;"  " adcx %%r15, %%r13;"  /* f1 */
+		" mulxq 16(%1), %%rax, %%rcx;"  " mov $0, %%r14;"  /* f[2]*f[1] */
+
+		/* Step 2: Compute two parallel carry chains */
 		" xor %%r15, %%r15;"
 		" adox %%rax, %%r10;"
 		" adcx %%r8, %%r8;"
@@ -551,27 +534,27 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 		" adcx %%r13, %%r13;"
 		" adcx %%r14, %%r14;"
 
-		// Step 3: Compute intermediate squares
-		" movq 0(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[0]^2
+		/* Step 3: Compute intermediate squares */
+		" movq 0(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[0]^2 */
 		" movq %%rax, 0(%0);"
 		" add %%rcx, %%r8;"
 		" movq %%r8, 8(%0);"
-		" movq 8(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[1]^2
+		" movq 8(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[1]^2 */
 		" adcx %%rax, %%r9;"
 		" movq %%r9, 16(%0);"
 		" adcx %%rcx, %%r10;"
 		" movq %%r10, 24(%0);"
-		" movq 16(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[2]^2
+		" movq 16(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[2]^2 */
 		" adcx %%rax, %%r11;"
 		" movq %%r11, 32(%0);"
 		" adcx %%rcx, %%rbx;"
 		" movq %%rbx, 40(%0);"
-		" movq 24(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[3]^2
+		" movq 24(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[3]^2 */
 		" adcx %%rax, %%r13;"
 		" movq %%r13, 48(%0);"
 		" adcx %%rcx, %%r14;"
 		" movq %%r14, 56(%0);"
 
-		// Line up pointers
+		/* Line up pointers */
 		" mov %0, %1;"
 		" mov %2, %0;"
 
-		/////// Wrap the result back into the field //////
+		/* Wrap the result back into the field */
 
-		// Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		" mov $38, %%rdx;"
 		" mulxq 32(%1), %%r8, %%r13;"
 		" xor %%rcx, %%rcx;"
@@ -589,7 +572,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 		" adox %%rcx, %%rax;"
 		" imul %%rdx, %%rax;"
 
-		// Step 2: Fold the carry back into dst
+		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
 		" adcx %%rcx, %%r9;"
 		" movq %%r9, 8(%0);"
@@ -598,7 +581,7 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 		" adcx %%rcx, %%r11;"
 		" movq %%r11, 24(%0);"
 
-		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rdx, %%rax;"
 		" add %%rax, %%r8;"
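fsqr exploits the symmetry of squaring: only the six distinct cross products f[i]*f[j] with i < j are computed (Step 1), doubled (Step 2), and the four diagonal squares added in (Step 3), roughly halving the multiplies of a full fmul; the 8-limb result is then wrapped exactly like fmul's. The asm runs two carry chains in parallel via adcx/adox; this portable sketch (hypothetical helper name, sequential carries, __int128 as a compiler assumption) shows the same three-step structure:

#include <stdint.h>

typedef uint64_t u64;

/* Sketch only: the 8-limb square of f, before the field wrap. */
static void fsqr_sketch(u64 t[8], const u64 f[4])
{
	unsigned __int128 acc;
	u64 c;
	int i, j, k;

	for (k = 0; k < 8; k++)
		t[k] = 0;

	/* Step 1: off-diagonal partial products f[i]*f[j], i < j */
	for (i = 0; i < 4; i++) {
		c = 0;
		for (j = i + 1; j < 4; j++) {
			acc = (unsigned __int128)f[i] * f[j] + t[i + j] + c;
			t[i + j] = (u64)acc;
			c = (u64)(acc >> 64);
		}
		t[i + 4] += c;
	}

	/* Step 2: double them; each product occurs twice in f*f */
	c = 0;
	for (k = 0; k < 8; k++) {
		acc = ((unsigned __int128)t[k] << 1) + c;
		t[k] = (u64)acc;
		c = (u64)(acc >> 64);
	}

	/* Step 3: add the diagonal squares f[i]^2 at limb 2*i */
	c = 0;
	for (i = 0; i < 4; i++) {
		acc = (unsigned __int128)f[i] * f[i] + t[2 * i] + c;
		t[2 * i] = (u64)acc;
		acc = (acc >> 64) + t[2 * i + 1];
		t[2 * i + 1] = (u64)acc;
		c = (u64)(acc >> 64);
	}
}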
@@ -609,25 +592,25 @@ static inline void fsqr(u64 *out, const u64 *f, u64 *tmp)
 	);
 }
 
-// Computes two field squarings:
-// out[0] <- f[0] * f[0]
-// out[1] <- f[1] * f[1]
-// Uses the 16-element buffer tmp for intermediate results
+/* Computes two field squarings:
+ * out[0] <- f[0] * f[0]
+ * out[1] <- f[1] * f[1]
+ * Uses the 16-element buffer tmp for intermediate results */
 static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 {
 	asm volatile(
-		// Step 1: Compute all partial products
-		" movq 0(%1), %%rdx;"  // f[0]
-		" mulxq 8(%1), %%r8, %%r14;"  " xor %%r15, %%r15;"  // f[1]*f[0]
-		" mulxq 16(%1), %%r9, %%r10;"  " adcx %%r14, %%r9;"  // f[2]*f[0]
-		" mulxq 24(%1), %%rax, %%rcx;"  " adcx %%rax, %%r10;"  // f[3]*f[0]
-		" movq 24(%1), %%rdx;"  // f[3]
-		" mulxq 8(%1), %%r11, %%rbx;"  " adcx %%rcx, %%r11;"  // f[1]*f[3]
-		" mulxq 16(%1), %%rax, %%r13;"  " adcx %%rax, %%rbx;"  // f[2]*f[3]
-		" movq 8(%1), %%rdx;"  " adcx %%r15, %%r13;"  // f1
-		" mulxq 16(%1), %%rax, %%rcx;"  " mov $0, %%r14;"  // f[2]*f[1]
-
-		// Step 2: Compute two parallel carry chains
+		/* Step 1: Compute all partial products */
+		" movq 0(%1), %%rdx;"  /* f[0] */
+		" mulxq 8(%1), %%r8, %%r14;"  " xor %%r15, %%r15;"  /* f[1]*f[0] */
+		" mulxq 16(%1), %%r9, %%r10;"  " adcx %%r14, %%r9;"  /* f[2]*f[0] */
+		" mulxq 24(%1), %%rax, %%rcx;"  " adcx %%rax, %%r10;"  /* f[3]*f[0] */
+		" movq 24(%1), %%rdx;"  /* f[3] */
+		" mulxq 8(%1), %%r11, %%rbx;"  " adcx %%rcx, %%r11;"  /* f[1]*f[3] */
+		" mulxq 16(%1), %%rax, %%r13;"  " adcx %%rax, %%rbx;"  /* f[2]*f[3] */
+		" movq 8(%1), %%rdx;"  " adcx %%r15, %%r13;"  /* f1 */
+		" mulxq 16(%1), %%rax, %%rcx;"  " mov $0, %%r14;"  /* f[2]*f[1] */
+
+		/* Step 2: Compute two parallel carry chains */
 		" xor %%r15, %%r15;"
 		" adox %%rax, %%r10;"
 		" adcx %%r8, %%r8;"
@@ -642,32 +625,32 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		" adcx %%r13, %%r13;"
 		" adcx %%r14, %%r14;"
 
-		// Step 3: Compute intermediate squares
-		" movq 0(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[0]^2
+		/* Step 3: Compute intermediate squares */
+		" movq 0(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[0]^2 */
 		" movq %%rax, 0(%0);"
 		" add %%rcx, %%r8;"
 		" movq %%r8, 8(%0);"
-		" movq 8(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[1]^2
+		" movq 8(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[1]^2 */
 		" adcx %%rax, %%r9;"
 		" movq %%r9, 16(%0);"
 		" adcx %%rcx, %%r10;"
 		" movq %%r10, 24(%0);"
-		" movq 16(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[2]^2
+		" movq 16(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[2]^2 */
 		" adcx %%rax, %%r11;"
 		" movq %%r11, 32(%0);"
 		" adcx %%rcx, %%rbx;"
 		" movq %%rbx, 40(%0);"
-		" movq 24(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[3]^2
+		" movq 24(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[3]^2 */
 		" adcx %%rax, %%r13;"
 		" movq %%r13, 48(%0);"
 		" adcx %%rcx, %%r14;"
 		" movq %%r14, 56(%0);"
 
-		// Step 1: Compute all partial products
-		" movq 32(%1), %%rdx;"  // f[0]
-		" mulxq 40(%1), %%r8, %%r14;"  " xor %%r15, %%r15;"  // f[1]*f[0]
-		" mulxq 48(%1), %%r9, %%r10;"  " adcx %%r14, %%r9;"  // f[2]*f[0]
-		" mulxq 56(%1), %%rax, %%rcx;"  " adcx %%rax, %%r10;"  // f[3]*f[0]
-		" movq 56(%1), %%rdx;"  // f[3]
-		" mulxq 40(%1), %%r11, %%rbx;"  " adcx %%rcx, %%r11;"  // f[1]*f[3]
-		" mulxq 48(%1), %%rax, %%r13;"  " adcx %%rax, %%rbx;"  // f[2]*f[3]
-		" movq 40(%1), %%rdx;"  " adcx %%r15, %%r13;"  // f1
-		" mulxq 48(%1), %%rax, %%rcx;"  " mov $0, %%r14;"  // f[2]*f[1]
-
-		// Step 2: Compute two parallel carry chains
+		/* Step 1: Compute all partial products */
+		" movq 32(%1), %%rdx;"  /* f[0] */
+		" mulxq 40(%1), %%r8, %%r14;"  " xor %%r15, %%r15;"  /* f[1]*f[0] */
+		" mulxq 48(%1), %%r9, %%r10;"  " adcx %%r14, %%r9;"  /* f[2]*f[0] */
+		" mulxq 56(%1), %%rax, %%rcx;"  " adcx %%rax, %%r10;"  /* f[3]*f[0] */
+		" movq 56(%1), %%rdx;"  /* f[3] */
+		" mulxq 40(%1), %%r11, %%rbx;"  " adcx %%rcx, %%r11;"  /* f[1]*f[3] */
+		" mulxq 48(%1), %%rax, %%r13;"  " adcx %%rax, %%rbx;"  /* f[2]*f[3] */
+		" movq 40(%1), %%rdx;"  " adcx %%r15, %%r13;"  /* f1 */
+		" mulxq 48(%1), %%rax, %%rcx;"  " mov $0, %%r14;"  /* f[2]*f[1] */
+
+		/* Step 2: Compute two parallel carry chains */
 		" xor %%r15, %%r15;"
 		" adox %%rax, %%r10;"
 		" adcx %%r8, %%r8;"
@@ -682,25 +665,25 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		" adcx %%r13, %%r13;"
 		" adcx %%r14, %%r14;"
 
-		// Step 3: Compute intermediate squares
-		" movq 32(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[0]^2
+		/* Step 3: Compute intermediate squares */
+		" movq 32(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[0]^2 */
 		" movq %%rax, 64(%0);"
 		" add %%rcx, %%r8;"
 		" movq %%r8, 72(%0);"
-		" movq 40(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[1]^2
+		" movq 40(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[1]^2 */
 		" adcx %%rax, %%r9;"
 		" movq %%r9, 80(%0);"
 		" adcx %%rcx, %%r10;"
 		" movq %%r10, 88(%0);"
-		" movq 48(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[2]^2
+		" movq 48(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[2]^2 */
 		" adcx %%rax, %%r11;"
 		" movq %%r11, 96(%0);"
 		" adcx %%rcx, %%rbx;"
 		" movq %%rbx, 104(%0);"
-		" movq 56(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  // f[3]^2
+		" movq 56(%1), %%rdx;"  " mulx %%rdx, %%rax, %%rcx;"  /* f[3]^2 */
 		" adcx %%rax, %%r13;"
 		" movq %%r13, 112(%0);"
 		" adcx %%rcx, %%r14;"
 		" movq %%r14, 120(%0);"
 
-		// Line up pointers
+		/* Line up pointers */
 		" mov %0, %1;"
 		" mov %2, %0;"
 
-		// Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		" mov $38, %%rdx;"
 		" mulxq 32(%1), %%r8, %%r13;"
 		" xor %%rcx, %%rcx;"
@@ -718,7 +701,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		" adox %%rcx, %%rax;"
 		" imul %%rdx, %%rax;"
 
-		// Step 2: Fold the carry back into dst
+		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
 		" adcx %%rcx, %%r9;"
 		" movq %%r9, 8(%0);"
@@ -727,13 +710,13 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		" adcx %%rcx, %%r11;"
 		" movq %%r11, 24(%0);"
 
-		// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 		" mov $0, %%rax;"
 		" cmovc %%rdx, %%rax;"
 		" add %%rax, %%r8;"
 		" movq %%r8, 0(%0);"
 
-		// Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo
+		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
 		" mov $38, %%rdx;"
 		" mulxq 96(%1), %%r8, %%r13;"
 		" xor %%rcx, %%rcx;"
@@ -751,7 +734,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 		" adox %%rcx, %%rax;"
 		" imul %%rdx, %%rax;"
 
-		// Step 2: Fold the carry back into dst
+		/* Step 2: Fold the carry back into dst */
 		" add %%rax, %%r8;"
 		" adcx %%rcx, %%r9;"
 		" movq %%r9, 40(%0);"
@@ -760,7 +743,7 @@ static inline void fsqr2(u64 *out, const u64 *f, u64 *tmp)
 	" adcx %%rcx, %%r11;"
 	" movq %%r11, 56(%0);"
 
-	// Step 3: Fold the carry bit back in; guaranteed not to carry at this point
+	/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
 	" mov $0, %%rax;"
 	" cmovc %%rdx, %%rax;"
 	" add %%rax, %%r8;"
@@ -906,9 +889,12 @@ static void montgomery_ladder(u64 *out, const u8 *key, u64 *init1)
 	point_double(nq1, tmp1, tmp2);
 	point_double(nq1, tmp1, tmp2);
 	memcpy(out, p0, (u32)8U * sizeof(p0[0U]));
+
+	memzero_explicit(tmp2, sizeof(tmp2));
+	memzero_explicit(p01_tmp1_swap, sizeof(p01_tmp1_swap));
 }
 
-static void fsquare_times(u64 *o, u64 *inp, u64 *tmp, u32 n1)
+static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
 {
 	u32 i;
 	fsqr(o, inp, tmp);
@@ -916,7 +902,7 @@ static void fsquare_times(u64 *o, const u64 *inp, u64 *tmp, u32 n1)
 		fsqr(o, o, tmp);
 }
 
-static void finv(u64 *o, u64 *i, u64 *tmp)
+static void finv(u64 *o, const u64 *i, u64 *tmp)
 {
 	u64 t1[16U] = { 0U };
 	u64 *a0 = t1;
@@ -956,10 +942,8 @@ static void store_felem(u64 *b, u64 *f)
 {
 	u64 f30 = f[3U];
 	u64 top_bit0 = f30 >> (u32)63U;
-	u64 carry0;
 	u64 f31;
 	u64 top_bit;
-	u64 carry;
 	u64 f0;
 	u64 f1;
 	u64 f2;
@@ -978,11 +962,11 @@ static void store_felem(u64 *b, u64 *f)
 	u64 o2;
 	u64 o3;
 	f[3U] = f30 & (u64)0x7fffffffffffffffU;
-	carry0 = add_scalar(f, f, (u64)19U * top_bit0);
+	add_scalar(f, f, (u64)19U * top_bit0);
 	f31 = f[3U];
 	top_bit = f31 >> (u32)63U;
 	f[3U] = f31 & (u64)0x7fffffffffffffffU;
-	carry = add_scalar(f, f, (u64)19U * top_bit);
+	add_scalar(f, f, (u64)19U * top_bit);
 	f0 = f[0U];
 	f1 = f[1U];
 	f2 = f[2U];
@@ -1006,10 +990,10 @@ static void store_felem(u64 *b, u64 *f)
 	b[3U] = o3;
 }
 
-static void encode_point(u8 *o, u64 *i)
+static void encode_point(u8 *o, const u64 *i)
 {
-	u64 *x = i;
-	u64 *z = i + (u32)4U;
+	const u64 *x = i;
+	const u64 *z = i + (u32)4U;
 	u64 tmp[4U] = { 0U };
 	u64 tmp_w[16U] = { 0U };
 	finv(tmp, z, tmp_w);
--
cgit v1.2.3-59-g8ed1b
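Two notes on the final hunks. The new memzero_explicit() calls wipe the ladder's temporaries, which hold secret intermediate state, before the stack frame is reused; dropping the unused carry0/carry variables is safe because store_felem's two top-bit folds provably cannot carry out. store_felem is also the only place a fully canonical value below p = 2^255 - 19 is produced: it folds bit 255 twice via 2^255 == 19 (mod p), then conditionally subtracts p in constant time. A portable sketch of that logic follows; store_felem_sketch is a hypothetical name, and the real code compares against p with gte_mask-style masks rather than the borrow chain used here:

#include <stdint.h>

typedef uint64_t u64;

/* Sketch only: canonicalize f (below 2^256) to b < p = 2^255 - 19. */
static void store_felem_sketch(u64 b[4], u64 f[4])
{
	static const u64 p[4] = { 0xffffffffffffffedULL, 0xffffffffffffffffULL,
				  0xffffffffffffffffULL, 0x7fffffffffffffffULL };
	unsigned __int128 acc;
	u64 r[4], borrow, mask;
	int i, k;

	/* Fold bit 255 back in twice; after this, f < 2^255 */
	for (k = 0; k < 2; k++) {
		u64 top = f[3] >> 63;

		f[3] &= 0x7fffffffffffffffULL;
		acc = (unsigned __int128)f[0] + (u64)19 * top;
		f[0] = (u64)acc;
		for (i = 1; i < 4; i++) {
			acc = (acc >> 64) + f[i];
			f[i] = (u64)acc;
		}
		/* the carry out is zero here, hence the dropped return value */
	}

	/* Subtract p once if f >= p, without branching on secret data */
	borrow = 0;
	for (i = 0; i < 4; i++) {
		acc = (unsigned __int128)f[i] - p[i] - borrow;
		r[i] = (u64)acc;
		borrow = (u64)(acc >> 64) & 1;
	}
	mask = borrow - 1;	/* all-ones iff no borrow, i.e. f >= p */
	for (i = 0; i < 4; i++)
		b[i] = (r[i] & mask) | (f[i] & ~mask);
}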