#include /* Computes the addition of four-element f1 with value in f2 * and returns the carry (if any) */ static uint64_t add_scalar(uint64_t *out, const uint64_t *f1, uint64_t f2) { uint64_t carry_r; asm volatile( /* Clear registers to propagate the carry bit */ " xor %%r8, %%r8;" " xor %%r9, %%r9;" " xor %%r10, %%r10;" " xor %%r11, %%r11;" " xor %1, %1;" /* Begin addition chain */ " addq 0(%3), %0;" " movq %0, 0(%2);" " adcxq 8(%3), %%r8;" " movq %%r8, 8(%2);" " adcxq 16(%3), %%r9;" " movq %%r9, 16(%2);" " adcxq 24(%3), %%r10;" " movq %%r10, 24(%2);" /* Return the carry bit in a register */ " adcx %%r11, %1;" : "+&r" (f2), "=&r" (carry_r) : "r" (out), "r" (f1) : "%r8", "%r9", "%r10", "%r11", "memory", "cc" ); return carry_r; } /* Computes the field addition of two field elements */ static void fadd(uint64_t *out, const uint64_t *f1, const uint64_t *f2) { asm volatile( /* Compute the raw addition of f1 + f2 */ " movq 0(%0), %%r8;" " addq 0(%2), %%r8;" " movq 8(%0), %%r9;" " adcxq 8(%2), %%r9;" " movq 16(%0), %%r10;" " adcxq 16(%2), %%r10;" " movq 24(%0), %%r11;" " adcxq 24(%2), %%r11;" /* Wrap the result back into the field */ /* Step 1: Compute carry*38 */ " mov $0, %%rax;" " mov $38, %0;" " cmovc %0, %%rax;" /* Step 2: Add carry*38 to the original sum */ " xor %%rcx, %%rcx;" " add %%rax, %%r8;" " adcx %%rcx, %%r9;" " movq %%r9, 8(%1);" " adcx %%rcx, %%r10;" " movq %%r10, 16(%1);" " adcx %%rcx, %%r11;" " movq %%r11, 24(%1);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %0, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%1);" : "+&r" (f2) : "r" (out), "r" (f1) : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc" ); } /* Computes the field substraction of two field elements */ static void fsub(uint64_t *out, const uint64_t *f1, const uint64_t *f2) { asm volatile( /* Compute the raw substraction of f1-f2 */ " movq 0(%1), %%r8;" " subq 0(%2), %%r8;" " movq 8(%1), %%r9;" " sbbq 8(%2), %%r9;" " movq 16(%1), %%r10;" " sbbq 16(%2), %%r10;" " movq 24(%1), %%r11;" " sbbq 24(%2), %%r11;" /* Wrap the result back into the field */ /* Step 1: Compute carry*38 */ " mov $0, %%rax;" " mov $38, %%rcx;" " cmovc %%rcx, %%rax;" /* Step 2: Substract carry*38 from the original difference */ " sub %%rax, %%r8;" " sbb $0, %%r9;" " sbb $0, %%r10;" " sbb $0, %%r11;" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rcx, %%rax;" " sub %%rax, %%r8;" /* Store the result */ " movq %%r8, 0(%0);" " movq %%r9, 8(%0);" " movq %%r10, 16(%0);" " movq %%r11, 24(%0);" : : "r" (out), "r" (f1), "r" (f2) : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc" ); } /* Computes a field multiplication: out <- f1 * f2 * Uses the 8-element buffer tmp for intermediate results */ static void fmul(uint64_t *out, const uint64_t *f1, const uint64_t *f2, uint64_t *tmp) { asm volatile( /* Compute the raw multiplication: tmp <- src1 * src2 */ /* Compute src1[0] * src2 */ " movq 0(%1), %%rdx;" " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);" " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" /* Compute src1[1] * src2 */ " movq 8(%1), %%rdx;" " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);" " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[2] * src2 */ " movq 16(%1), %%rdx;" " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);" " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[3] * src2 */ " movq 24(%1), %%rdx;" " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);" " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 40(%0);" " mov $0, %%r8;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);" /* Line up pointers */ " mov %0, %1;" " mov %2, %0;" /* Wrap the result back into the field */ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%1), %%r8, %%r13;" " xor %3, %3;" " adoxq 0(%1), %%r8;" " mulxq 40(%1), %%r9, %%r12;" " adcx %%r13, %%r9;" " adoxq 8(%1), %%r9;" " mulxq 48(%1), %%r10, %%r13;" " adcx %%r12, %%r10;" " adoxq 16(%1), %%r10;" " mulxq 56(%1), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 24(%1), %%r11;" " adcx %3, %%rax;" " adox %3, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %3, %%r9;" " movq %%r9, 8(%0);" " adcx %3, %%r10;" " movq %%r10, 16(%0);" " adcx %3, %%r11;" " movq %%r11, 24(%0);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%0);" : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2) : : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc" ); } /* Computes two field multiplications: * out[0] <- f1[0] * f2[0] * out[1] <- f1[1] * f2[1] * Uses the 16-element buffer tmp for intermediate results. */ static void fmul2(uint64_t *out, const uint64_t *f1, const uint64_t *f2, uint64_t *tmp) { asm volatile( /* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */ /* Compute src1[0] * src2 */ " movq 0(%1), %%rdx;" " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 0(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 8(%0);" " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" /* Compute src1[1] * src2 */ " movq 8(%1), %%rdx;" " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 8(%0), %%r8;" " movq %%r8, 8(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 16(%0);" " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[2] * src2 */ " movq 16(%1), %%rdx;" " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 16(%0), %%r8;" " movq %%r8, 16(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 24(%0);" " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[3] * src2 */ " movq 24(%1), %%rdx;" " mulxq 0(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 24(%0), %%r8;" " movq %%r8, 24(%0);" " mulxq 8(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 32(%0);" " mulxq 16(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 40(%0);" " mov $0, %%r8;" " mulxq 24(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 48(%0);" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 56(%0);" /* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */ /* Compute src1[0] * src2 */ " movq 32(%1), %%rdx;" " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " movq %%r8, 64(%0);" " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " movq %%r10, 72(%0);" " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" /* Compute src1[1] * src2 */ " movq 40(%1), %%rdx;" " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 72(%0), %%r8;" " movq %%r8, 72(%0);" " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 80(%0);" " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;" " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[2] * src2 */ " movq 48(%1), %%rdx;" " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 80(%0), %%r8;" " movq %%r8, 80(%0);" " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 88(%0);" " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " mov $0, %%r8;" " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" /* Compute src1[3] * src2 */ " movq 56(%1), %%rdx;" " mulxq 32(%3), %%r8, %%r9;" " xor %%r10, %%r10;" " adcxq 88(%0), %%r8;" " movq %%r8, 88(%0);" " mulxq 40(%3), %%r10, %%r11;" " adox %%r9, %%r10;" " adcx %%r12, %%r10;" " movq %%r10, 96(%0);" " mulxq 48(%3), %%r12, %%r13;" " adox %%r11, %%r12;" " adcx %%r14, %%r12;" " movq %%r12, 104(%0);" " mov $0, %%r8;" " mulxq 56(%3), %%r14, %%rdx;" " adox %%r13, %%r14;" " adcx %%rax, %%r14;" " movq %%r14, 112(%0);" " mov $0, %%rax;" " adox %%rdx, %%rax;" " adcx %%r8, %%rax;" " movq %%rax, 120(%0);" /* Line up pointers */ " mov %0, %1;" " mov %2, %0;" /* Wrap the results back into the field */ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%1), %%r8, %%r13;" " xor %3, %3;" " adoxq 0(%1), %%r8;" " mulxq 40(%1), %%r9, %%r12;" " adcx %%r13, %%r9;" " adoxq 8(%1), %%r9;" " mulxq 48(%1), %%r10, %%r13;" " adcx %%r12, %%r10;" " adoxq 16(%1), %%r10;" " mulxq 56(%1), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 24(%1), %%r11;" " adcx %3, %%rax;" " adox %3, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %3, %%r9;" " movq %%r9, 8(%0);" " adcx %3, %%r10;" " movq %%r10, 16(%0);" " adcx %3, %%r11;" " movq %%r11, 24(%0);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%0);" /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 96(%1), %%r8, %%r13;" " xor %3, %3;" " adoxq 64(%1), %%r8;" " mulxq 104(%1), %%r9, %%r12;" " adcx %%r13, %%r9;" " adoxq 72(%1), %%r9;" " mulxq 112(%1), %%r10, %%r13;" " adcx %%r12, %%r10;" " adoxq 80(%1), %%r10;" " mulxq 120(%1), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 88(%1), %%r11;" " adcx %3, %%rax;" " adox %3, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %3, %%r9;" " movq %%r9, 40(%0);" " adcx %3, %%r10;" " movq %%r10, 48(%0);" " adcx %3, %%r11;" " movq %%r11, 56(%0);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 32(%0);" : "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2) : : "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc" ); } /* Computes the field multiplication of four-element f1 with value in f2 */ static void fmul_scalar(uint64_t *out, const uint64_t *f1, uint64_t f2) { register uint64_t f2_r asm("rdx") = f2; asm volatile( /* Compute the raw multiplication of f1*f2 */ " mulxq 0(%2), %%r8, %%rcx;" /* f1[0]*f2 */ " mulxq 8(%2), %%r9, %%r12;" /* f1[1]*f2 */ " add %%rcx, %%r9;" " mov $0, %%rcx;" " mulxq 16(%2), %%r10, %%r13;" /* f1[2]*f2 */ " adcx %%r12, %%r10;" " mulxq 24(%2), %%r11, %%rax;" /* f1[3]*f2 */ " adcx %%r13, %%r11;" " adcx %%rcx, %%rax;" /* Wrap the result back into the field */ /* Step 1: Compute carry*38 */ " mov $38, %%rdx;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %%rcx, %%r9;" " movq %%r9, 8(%1);" " adcx %%rcx, %%r10;" " movq %%r10, 16(%1);" " adcx %%rcx, %%r11;" " movq %%r11, 24(%1);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%1);" : "+&r" (f2_r) : "r" (out), "r" (f1) : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "memory", "cc" ); } /* Computes p1 <- bit ? p2 : p1 in constant time */ static void cswap2(uint64_t bit, const uint64_t *p1, const uint64_t *p2) { asm volatile( /* Invert the polarity of bit to match cmov expectations */ " add $18446744073709551615, %0;" /* cswap p1[0], p2[0] */ " movq 0(%1), %%r8;" " movq 0(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 0(%1);" " movq %%r9, 0(%2);" /* cswap p1[1], p2[1] */ " movq 8(%1), %%r8;" " movq 8(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 8(%1);" " movq %%r9, 8(%2);" /* cswap p1[2], p2[2] */ " movq 16(%1), %%r8;" " movq 16(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 16(%1);" " movq %%r9, 16(%2);" /* cswap p1[3], p2[3] */ " movq 24(%1), %%r8;" " movq 24(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 24(%1);" " movq %%r9, 24(%2);" /* cswap p1[4], p2[4] */ " movq 32(%1), %%r8;" " movq 32(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 32(%1);" " movq %%r9, 32(%2);" /* cswap p1[5], p2[5] */ " movq 40(%1), %%r8;" " movq 40(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 40(%1);" " movq %%r9, 40(%2);" /* cswap p1[6], p2[6] */ " movq 48(%1), %%r8;" " movq 48(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 48(%1);" " movq %%r9, 48(%2);" /* cswap p1[7], p2[7] */ " movq 56(%1), %%r8;" " movq 56(%2), %%r9;" " mov %%r8, %%r10;" " cmovc %%r9, %%r8;" " cmovc %%r10, %%r9;" " movq %%r8, 56(%1);" " movq %%r9, 56(%2);" : "+&r" (bit) : "r" (p1), "r" (p2) : "%r8", "%r9", "%r10", "memory", "cc" ); } /* Computes the square of a field element: out <- f * f * Uses the 8-element buffer tmp for intermediate results */ static void fsqr(uint64_t *out, const uint64_t *f, uint64_t *tmp) { asm volatile( /* Compute the raw multiplication: tmp <- f * f */ /* Step 1: Compute all partial products */ " movq 0(%1), %%rdx;" /* f[0] */ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */ " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */ " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */ " movq 24(%1), %%rdx;" /* f[3] */ " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */ " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */ " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */ " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */ /* Step 2: Compute two parallel carry chains */ " xor %%r15, %%r15;" " adox %%rax, %%r10;" " adcx %%r8, %%r8;" " adox %%rcx, %%r11;" " adcx %%r9, %%r9;" " adox %%r15, %%r12;" " adcx %%r10, %%r10;" " adox %%r15, %%r13;" " adcx %%r11, %%r11;" " adox %%r15, %%r14;" " adcx %%r12, %%r12;" " adcx %%r13, %%r13;" " adcx %%r14, %%r14;" /* Step 3: Compute intermediate squares */ " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ " movq %%rax, 0(%0);" " add %%rcx, %%r8;" " movq %%r8, 8(%0);" " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ " adcx %%rax, %%r9;" " movq %%r9, 16(%0);" " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);" " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ " adcx %%rax, %%r11;" " movq %%r11, 32(%0);" " adcx %%rcx, %%r12;" " movq %%r12, 40(%0);" " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ " adcx %%rax, %%r13;" " movq %%r13, 48(%0);" " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);" /* Line up pointers */ " mov %0, %1;" " mov %2, %0;" /* Wrap the result back into the field */ /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%1), %%r8, %%r13;" " xor %%rcx, %%rcx;" " adoxq 0(%1), %%r8;" " mulxq 40(%1), %%r9, %%r12;" " adcx %%r13, %%r9;" " adoxq 8(%1), %%r9;" " mulxq 48(%1), %%r10, %%r13;" " adcx %%r12, %%r10;" " adoxq 16(%1), %%r10;" " mulxq 56(%1), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 24(%1), %%r11;" " adcx %%rcx, %%rax;" " adox %%rcx, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %%rcx, %%r9;" " movq %%r9, 8(%0);" " adcx %%rcx, %%r10;" " movq %%r10, 16(%0);" " adcx %%rcx, %%r11;" " movq %%r11, 24(%0);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%0);" : "+&r" (tmp), "+&r" (f), "+&r" (out) : : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc" ); } /* Computes two field squarings: * out[0] <- f[0] * f[0] * out[1] <- f[1] * f[1] * Uses the 16-element buffer tmp for intermediate results */ static void fsqr2(uint64_t *out, const uint64_t *f, uint64_t *tmp) { asm volatile( /* Step 1: Compute all partial products */ " movq 0(%1), %%rdx;" /* f[0] */ " mulxq 8(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */ " mulxq 16(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */ " mulxq 24(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */ " movq 24(%1), %%rdx;" /* f[3] */ " mulxq 8(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */ " mulxq 16(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */ " movq 8(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */ " mulxq 16(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */ /* Step 2: Compute two parallel carry chains */ " xor %%r15, %%r15;" " adox %%rax, %%r10;" " adcx %%r8, %%r8;" " adox %%rcx, %%r11;" " adcx %%r9, %%r9;" " adox %%r15, %%r12;" " adcx %%r10, %%r10;" " adox %%r15, %%r13;" " adcx %%r11, %%r11;" " adox %%r15, %%r14;" " adcx %%r12, %%r12;" " adcx %%r13, %%r13;" " adcx %%r14, %%r14;" /* Step 3: Compute intermediate squares */ " movq 0(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ " movq %%rax, 0(%0);" " add %%rcx, %%r8;" " movq %%r8, 8(%0);" " movq 8(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ " adcx %%rax, %%r9;" " movq %%r9, 16(%0);" " adcx %%rcx, %%r10;" " movq %%r10, 24(%0);" " movq 16(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ " adcx %%rax, %%r11;" " movq %%r11, 32(%0);" " adcx %%rcx, %%r12;" " movq %%r12, 40(%0);" " movq 24(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ " adcx %%rax, %%r13;" " movq %%r13, 48(%0);" " adcx %%rcx, %%r14;" " movq %%r14, 56(%0);" /* Step 1: Compute all partial products */ " movq 32(%1), %%rdx;" /* f[0] */ " mulxq 40(%1), %%r8, %%r14;" " xor %%r15, %%r15;" /* f[1]*f[0] */ " mulxq 48(%1), %%r9, %%r10;" " adcx %%r14, %%r9;" /* f[2]*f[0] */ " mulxq 56(%1), %%rax, %%rcx;" " adcx %%rax, %%r10;" /* f[3]*f[0] */ " movq 56(%1), %%rdx;" /* f[3] */ " mulxq 40(%1), %%r11, %%r12;" " adcx %%rcx, %%r11;" /* f[1]*f[3] */ " mulxq 48(%1), %%rax, %%r13;" " adcx %%rax, %%r12;" /* f[2]*f[3] */ " movq 40(%1), %%rdx;" " adcx %%r15, %%r13;" /* f1 */ " mulxq 48(%1), %%rax, %%rcx;" " mov $0, %%r14;" /* f[2]*f[1] */ /* Step 2: Compute two parallel carry chains */ " xor %%r15, %%r15;" " adox %%rax, %%r10;" " adcx %%r8, %%r8;" " adox %%rcx, %%r11;" " adcx %%r9, %%r9;" " adox %%r15, %%r12;" " adcx %%r10, %%r10;" " adox %%r15, %%r13;" " adcx %%r11, %%r11;" " adox %%r15, %%r14;" " adcx %%r12, %%r12;" " adcx %%r13, %%r13;" " adcx %%r14, %%r14;" /* Step 3: Compute intermediate squares */ " movq 32(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[0]^2 */ " movq %%rax, 64(%0);" " add %%rcx, %%r8;" " movq %%r8, 72(%0);" " movq 40(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[1]^2 */ " adcx %%rax, %%r9;" " movq %%r9, 80(%0);" " adcx %%rcx, %%r10;" " movq %%r10, 88(%0);" " movq 48(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[2]^2 */ " adcx %%rax, %%r11;" " movq %%r11, 96(%0);" " adcx %%rcx, %%r12;" " movq %%r12, 104(%0);" " movq 56(%1), %%rdx;" " mulx %%rdx, %%rax, %%rcx;" /* f[3]^2 */ " adcx %%rax, %%r13;" " movq %%r13, 112(%0);" " adcx %%rcx, %%r14;" " movq %%r14, 120(%0);" /* Line up pointers */ " mov %0, %1;" " mov %2, %0;" /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 32(%1), %%r8, %%r13;" " xor %%rcx, %%rcx;" " adoxq 0(%1), %%r8;" " mulxq 40(%1), %%r9, %%r12;" " adcx %%r13, %%r9;" " adoxq 8(%1), %%r9;" " mulxq 48(%1), %%r10, %%r13;" " adcx %%r12, %%r10;" " adoxq 16(%1), %%r10;" " mulxq 56(%1), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 24(%1), %%r11;" " adcx %%rcx, %%rax;" " adox %%rcx, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %%rcx, %%r9;" " movq %%r9, 8(%0);" " adcx %%rcx, %%r10;" " movq %%r10, 16(%0);" " adcx %%rcx, %%r11;" " movq %%r11, 24(%0);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 0(%0);" /* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */ " mov $38, %%rdx;" " mulxq 96(%1), %%r8, %%r13;" " xor %%rcx, %%rcx;" " adoxq 64(%1), %%r8;" " mulxq 104(%1), %%r9, %%r12;" " adcx %%r13, %%r9;" " adoxq 72(%1), %%r9;" " mulxq 112(%1), %%r10, %%r13;" " adcx %%r12, %%r10;" " adoxq 80(%1), %%r10;" " mulxq 120(%1), %%r11, %%rax;" " adcx %%r13, %%r11;" " adoxq 88(%1), %%r11;" " adcx %%rcx, %%rax;" " adox %%rcx, %%rax;" " imul %%rdx, %%rax;" /* Step 2: Fold the carry back into dst */ " add %%rax, %%r8;" " adcx %%rcx, %%r9;" " movq %%r9, 40(%0);" " adcx %%rcx, %%r10;" " movq %%r10, 48(%0);" " adcx %%rcx, %%r11;" " movq %%r11, 56(%0);" /* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */ " mov $0, %%rax;" " cmovc %%rdx, %%rax;" " add %%rax, %%r8;" " movq %%r8, 32(%0);" : "+&r" (tmp), "+&r" (f), "+&r" (out) : : "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc" ); } static __always_inline uint64_t eq_mask(uint64_t a, uint64_t b) { uint64_t x = a ^ b; uint64_t minus_x = ~x + (uint64_t)1U; uint64_t x_or_minus_x = x | minus_x; uint64_t xnx = x_or_minus_x >> (uint32_t)63U; return xnx - (uint64_t)1U; } static __always_inline uint64_t gte_mask(uint64_t a, uint64_t b) { uint64_t x = a; uint64_t y = b; uint64_t x_xor_y = x ^ y; uint64_t x_sub_y = x - y; uint64_t x_sub_y_xor_y = x_sub_y ^ y; uint64_t q = x_xor_y | x_sub_y_xor_y; uint64_t x_xor_q = x ^ q; uint64_t x_xor_q_ = x_xor_q >> (uint32_t)63U; return x_xor_q_ - (uint64_t)1U; } static void store_felem(uint64_t *b, uint64_t *f) { uint64_t f30 = f[3U]; uint64_t top_bit0 = f30 >> (uint32_t)63U; uint64_t carry0; uint64_t f31; uint64_t top_bit; uint64_t carry; uint64_t f0; uint64_t f1; uint64_t f2; uint64_t f3; uint64_t m0; uint64_t m1; uint64_t m2; uint64_t m3; uint64_t mask; uint64_t f0_; uint64_t f1_; uint64_t f2_; uint64_t f3_; uint64_t o0; uint64_t o1; uint64_t o2; uint64_t o3; f[3U] = f30 & (uint64_t)0x7fffffffffffffffU; carry0 = add_scalar(f, f, (uint64_t)19U * top_bit0); f31 = f[3U]; top_bit = f31 >> (uint32_t)63U; f[3U] = f31 & (uint64_t)0x7fffffffffffffffU; carry = add_scalar(f, f, (uint64_t)19U * top_bit); f0 = f[0U]; f1 = f[1U]; f2 = f[2U]; f3 = f[3U]; m0 = gte_mask(f0, (uint64_t)0xffffffffffffffedU); m1 = eq_mask(f1, (uint64_t)0xffffffffffffffffU); m2 = eq_mask(f2, (uint64_t)0xffffffffffffffffU); m3 = eq_mask(f3, (uint64_t)0x7fffffffffffffffU); mask = ((m0 & m1) & m2) & m3; f0_ = f0 - (mask & (uint64_t)0xffffffffffffffedU); f1_ = f1 - (mask & (uint64_t)0xffffffffffffffffU); f2_ = f2 - (mask & (uint64_t)0xffffffffffffffffU); f3_ = f3 - (mask & (uint64_t)0x7fffffffffffffffU); o0 = f0_; o1 = f1_; o2 = f2_; o3 = f3_; b[0U] = o0; b[1U] = o1; b[2U] = o2; b[3U] = o3; } enum { QWORDS_PER_FIELD_ELEMENT = 4 }; uint64_t out_fadd[QWORDS_PER_FIELD_ELEMENT]; uint64_t f1_fadd[QWORDS_PER_FIELD_ELEMENT]; uint64_t f2_fadd[QWORDS_PER_FIELD_ELEMENT]; uint64_t out_fsub[QWORDS_PER_FIELD_ELEMENT]; uint64_t f1_fsub[QWORDS_PER_FIELD_ELEMENT]; uint64_t f2_fsub[QWORDS_PER_FIELD_ELEMENT]; uint64_t out_fmul[QWORDS_PER_FIELD_ELEMENT]; uint64_t f1_fmul[QWORDS_PER_FIELD_ELEMENT]; uint64_t f2_fmul[QWORDS_PER_FIELD_ELEMENT]; uint64_t out_fmul2[QWORDS_PER_FIELD_ELEMENT * 2]; uint64_t f1_fmul2[QWORDS_PER_FIELD_ELEMENT * 2]; uint64_t f2_fmul2[QWORDS_PER_FIELD_ELEMENT * 2]; uint64_t out_fmul_scalar[QWORDS_PER_FIELD_ELEMENT]; uint64_t f1_fmul_scalar[QWORDS_PER_FIELD_ELEMENT]; uint64_t f2_fmul_scalar; uint64_t out_fsqr[QWORDS_PER_FIELD_ELEMENT]; uint64_t f1_fsqr[QWORDS_PER_FIELD_ELEMENT]; uint64_t out_fsqr2[QWORDS_PER_FIELD_ELEMENT * 2]; uint64_t f1_fsqr2[QWORDS_PER_FIELD_ELEMENT * 2]; uint64_t bit_cswap2; uint64_t p1_cswap2[QWORDS_PER_FIELD_ELEMENT * 2]; uint64_t p2_cswap2[QWORDS_PER_FIELD_ELEMENT * 2]; int main(int argc, char *argv[]) { uint64_t tmp[QWORDS_PER_FIELD_ELEMENT * 4]; fadd(out_fadd, f1_fadd, f2_fadd); store_felem(out_fadd, out_fadd); fsub(out_fsub, f1_fsub, f2_fsub); store_felem(out_fsub, out_fsub); fmul(out_fmul, f1_fmul, f2_fmul, tmp); store_felem(out_fmul, out_fmul); fmul2(out_fmul2, f1_fmul2, f2_fmul2, tmp); store_felem(out_fmul2, out_fmul2); store_felem(out_fmul2 + QWORDS_PER_FIELD_ELEMENT, out_fmul2 + QWORDS_PER_FIELD_ELEMENT); fmul_scalar(out_fmul_scalar, f1_fmul_scalar, f2_fmul_scalar); store_felem(out_fmul_scalar, out_fmul_scalar); fsqr(out_fsqr, f1_fsqr, tmp); store_felem(out_fsqr, out_fsqr); fsqr2(out_fsqr2, f1_fsqr2, tmp); store_felem(out_fsqr2, out_fsqr2); store_felem(out_fsqr2 + QWORDS_PER_FIELD_ELEMENT, out_fsqr2 + QWORDS_PER_FIELD_ELEMENT); cswap2(bit_cswap2, p1_cswap2, p2_cswap2); return 0; }