#include <stdint.h>

/* Computes the addition of four-element f1 with value in f2
 * and returns the carry (if any) */
static uint64_t add_scalar(uint64_t *out, const uint64_t *f1, uint64_t f2)
{
	uint64_t carry_r;

	asm volatile(
		/* Clear registers to propagate the carry bit */
		"  xor %%r8, %%r8;"
		"  xor %%r9, %%r9;"
		"  xor %%r10, %%r10;"
		"  xor %%r11, %%r11;"
		"  xor %1, %1;"

		/* Begin addition chain */
		"  addq 0(%3), %0;"
		"  movq %0, 0(%2);"
		"  adcxq 8(%3), %%r8;"
		"  movq %%r8, 8(%2);"
		"  adcxq 16(%3), %%r9;"
		"  movq %%r9, 16(%2);"
		"  adcxq 24(%3), %%r10;"
		"  movq %%r10, 24(%2);"

		/* Return the carry bit in a register */
		"  adcx %%r11, %1;"
	: "+&r" (f2), "=&r" (carry_r)
	: "r" (out), "r" (f1)
	: "%r8", "%r9", "%r10", "%r11", "memory", "cc"
	);

	return carry_r;
}

/* Computes the field addition of two field elements */
static void fadd(uint64_t *out, const uint64_t *f1, const uint64_t *f2)
{
	asm volatile(
		/* Compute the raw addition of f1 + f2 */
		"  movq 0(%0), %%r8;"
		"  addq 0(%2), %%r8;"
		"  movq 8(%0), %%r9;"
		"  adcxq 8(%2), %%r9;"
		"  movq 16(%0), %%r10;"
		"  adcxq 16(%2), %%r10;"
		"  movq 24(%0), %%r11;"
		"  adcxq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $0, %%rax;"
		"  mov $38, %0;"
		"  cmovc %0, %%rax;"

		/* Step 2: Add carry*38 to the original sum */
		"  xor %%rcx, %%rcx;"
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %0, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
	: "+&r" (f2)
	: "r" (out), "r" (f1)
	: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
	);
}

/* Computes the field substraction of two field elements */
static void fsub(uint64_t *out, const uint64_t *f1, const uint64_t *f2)
{
	asm volatile(
		/* Compute the raw substraction of f1-f2 */
		"  movq 0(%1), %%r8;"
		"  subq 0(%2), %%r8;"
		"  movq 8(%1), %%r9;"
		"  sbbq 8(%2), %%r9;"
		"  movq 16(%1), %%r10;"
		"  sbbq 16(%2), %%r10;"
		"  movq 24(%1), %%r11;"
		"  sbbq 24(%2), %%r11;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $0, %%rax;"
		"  mov $38, %%rcx;"
		"  cmovc %%rcx, %%rax;"

		/* Step 2: Substract carry*38 from the original difference */
		"  sub %%rax, %%r8;"
		"  sbb $0, %%r9;"
		"  sbb $0, %%r10;"
		"  sbb $0, %%r11;"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rcx, %%rax;"
		"  sub %%rax, %%r8;"

		/* Store the result */
		"  movq %%r8, 0(%0);"
		"  movq %%r9, 8(%0);"
		"  movq %%r10, 16(%0);"
		"  movq %%r11, 24(%0);"
	:
	: "r" (out), "r" (f1), "r" (f2)
	: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
	);
}

/* Computes a field multiplication: out <- f1 * f2
 * Uses the 8-element buffer tmp for intermediate results */
static void fmul(uint64_t *out, const uint64_t *f1, const uint64_t *f2, uint64_t *tmp)
{
	asm volatile(
		/* Compute the raw multiplication: tmp <- src1 * src2 */

		/* Compute src1[0] * src2 */
		"  movq 0(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
		"  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"
		/* Compute src1[1] * src2 */
		"  movq 8(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 16(%0);"
		"  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
		/* Compute src1[2] * src2 */
		"  movq 16(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 24(%0);"
		"  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
		/* Compute src1[3] * src2 */
		"  movq 24(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 32(%0);"
		"  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  movq %%r12, 40(%0);"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"
		/* Line up pointers */
		"  mov %0, %1;"
		"  mov %2, %0;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%1), %%r8, %%r13;"
		"  xor %3, %3;"
		"  adoxq 0(%1), %%r8;"
		"  mulxq 40(%1), %%r9, %%r12;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%1), %%r9;"
		"  mulxq 48(%1), %%r10, %%r13;"
		"  adcx %%r12, %%r10;"
		"  adoxq 16(%1), %%r10;"
		"  mulxq 56(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%1), %%r11;"
		"  adcx %3, %%rax;"
		"  adox %3, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %3, %%r9;"
		"  movq %%r9, 8(%0);"
		"  adcx %3, %%r10;"
		"  movq %%r10, 16(%0);"
		"  adcx %3, %%r11;"
		"  movq %%r11, 24(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%0);"
	: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
	:
	: "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
	);
}

/* Computes two field multiplications:
 * out[0] <- f1[0] * f2[0]
 * out[1] <- f1[1] * f2[1]
 * Uses the 16-element buffer tmp for intermediate results. */
static void fmul2(uint64_t *out, const uint64_t *f1, const uint64_t *f2, uint64_t *tmp)
{
	asm volatile(
		/* Compute the raw multiplication tmp[0] <- f1[0] * f2[0] */

		/* Compute src1[0] * src2 */
		"  movq 0(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 0(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 8(%0);"
		"  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"
		/* Compute src1[1] * src2 */
		"  movq 8(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 8(%0), %%r8;"    "  movq %%r8, 8(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 16(%0);"
		"  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
		/* Compute src1[2] * src2 */
		"  movq 16(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 16(%0), %%r8;"    "  movq %%r8, 16(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 24(%0);"
		"  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
		/* Compute src1[3] * src2 */
		"  movq 24(%1), %%rdx;"
		"  mulxq 0(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 24(%0), %%r8;"    "  movq %%r8, 24(%0);"
		"  mulxq 8(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 32(%0);"
		"  mulxq 16(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  movq %%r12, 40(%0);"    "  mov $0, %%r8;"
		"  mulxq 24(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 48(%0);"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 56(%0);"

		/* Compute the raw multiplication tmp[1] <- f1[1] * f2[1] */

		/* Compute src1[0] * src2 */
		"  movq 32(%1), %%rdx;"
		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  movq %%r8, 64(%0);"
		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  movq %%r10, 72(%0);"
		"  mulxq 48(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"
		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"
		/* Compute src1[1] * src2 */
		"  movq 40(%1), %%rdx;"
		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"     "  adcxq 72(%0), %%r8;"    "  movq %%r8, 72(%0);"
		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 80(%0);"
		"  mulxq 48(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
		/* Compute src1[2] * src2 */
		"  movq 48(%1), %%rdx;"
		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 80(%0), %%r8;"    "  movq %%r8, 80(%0);"
		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 88(%0);"
		"  mulxq 48(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  mov $0, %%r8;"
		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"
		/* Compute src1[3] * src2 */
		"  movq 56(%1), %%rdx;"
		"  mulxq 32(%3), %%r8, %%r9;"       "  xor %%r10, %%r10;"    "  adcxq 88(%0), %%r8;"    "  movq %%r8, 88(%0);"
		"  mulxq 40(%3), %%r10, %%r11;"     "  adox %%r9, %%r10;"     "  adcx %%r12, %%r10;"    "  movq %%r10, 96(%0);"
		"  mulxq 48(%3), %%r12, %%r13;"    "  adox %%r11, %%r12;"    "  adcx %%r14, %%r12;"    "  movq %%r12, 104(%0);"    "  mov $0, %%r8;"
		"  mulxq 56(%3), %%r14, %%rdx;"    "  adox %%r13, %%r14;"    "  adcx %%rax, %%r14;"    "  movq %%r14, 112(%0);"    "  mov $0, %%rax;"
		                                   "  adox %%rdx, %%rax;"    "  adcx %%r8, %%rax;"     "  movq %%rax, 120(%0);"
		/* Line up pointers */
		"  mov %0, %1;"
		"  mov %2, %0;"

		/* Wrap the results back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%1), %%r8, %%r13;"
		"  xor %3, %3;"
		"  adoxq 0(%1), %%r8;"
		"  mulxq 40(%1), %%r9, %%r12;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%1), %%r9;"
		"  mulxq 48(%1), %%r10, %%r13;"
		"  adcx %%r12, %%r10;"
		"  adoxq 16(%1), %%r10;"
		"  mulxq 56(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%1), %%r11;"
		"  adcx %3, %%rax;"
		"  adox %3, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %3, %%r9;"
		"  movq %%r9, 8(%0);"
		"  adcx %3, %%r10;"
		"  movq %%r10, 16(%0);"
		"  adcx %3, %%r11;"
		"  movq %%r11, 24(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%0);"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 96(%1), %%r8, %%r13;"
		"  xor %3, %3;"
		"  adoxq 64(%1), %%r8;"
		"  mulxq 104(%1), %%r9, %%r12;"
		"  adcx %%r13, %%r9;"
		"  adoxq 72(%1), %%r9;"
		"  mulxq 112(%1), %%r10, %%r13;"
		"  adcx %%r12, %%r10;"
		"  adoxq 80(%1), %%r10;"
		"  mulxq 120(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 88(%1), %%r11;"
		"  adcx %3, %%rax;"
		"  adox %3, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %3, %%r9;"
		"  movq %%r9, 40(%0);"
		"  adcx %3, %%r10;"
		"  movq %%r10, 48(%0);"
		"  adcx %3, %%r11;"
		"  movq %%r11, 56(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 32(%0);"
	: "+&r" (tmp), "+&r" (f1), "+&r" (out), "+&r" (f2)
	:
	: "%rax", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "memory", "cc"
	);
}

/* Computes the field multiplication of four-element f1 with value in f2 */
static void fmul_scalar(uint64_t *out, const uint64_t *f1, uint64_t f2)
{
	register uint64_t f2_r asm("rdx") = f2;

	asm volatile(
		/* Compute the raw multiplication of f1*f2 */
		"  mulxq 0(%2), %%r8, %%rcx;"      /* f1[0]*f2 */
		"  mulxq 8(%2), %%r9, %%r12;"      /* f1[1]*f2 */
		"  add %%rcx, %%r9;"
		"  mov $0, %%rcx;"
		"  mulxq 16(%2), %%r10, %%r13;"    /* f1[2]*f2 */
		"  adcx %%r12, %%r10;"
		"  mulxq 24(%2), %%r11, %%rax;"    /* f1[3]*f2 */
		"  adcx %%r13, %%r11;"
		"  adcx %%rcx, %%rax;"

		/* Wrap the result back into the field */

		/* Step 1: Compute carry*38 */
		"  mov $38, %%rdx;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%1);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%1);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%1);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%1);"
	: "+&r" (f2_r)
	: "r" (out), "r" (f1)
	: "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "memory", "cc"
	);
}

/* Computes p1 <- bit ? p2 : p1 in constant time */
static void cswap2(uint64_t bit, const uint64_t *p1, const uint64_t *p2)
{
	asm volatile(
		/* Invert the polarity of bit to match cmov expectations */
		"  add $18446744073709551615, %0;"

		/* cswap p1[0], p2[0] */
		"  movq 0(%1), %%r8;"
		"  movq 0(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 0(%1);"
		"  movq %%r9, 0(%2);"

		/* cswap p1[1], p2[1] */
		"  movq 8(%1), %%r8;"
		"  movq 8(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 8(%1);"
		"  movq %%r9, 8(%2);"

		/* cswap p1[2], p2[2] */
		"  movq 16(%1), %%r8;"
		"  movq 16(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 16(%1);"
		"  movq %%r9, 16(%2);"

		/* cswap p1[3], p2[3] */
		"  movq 24(%1), %%r8;"
		"  movq 24(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 24(%1);"
		"  movq %%r9, 24(%2);"

		/* cswap p1[4], p2[4] */
		"  movq 32(%1), %%r8;"
		"  movq 32(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 32(%1);"
		"  movq %%r9, 32(%2);"

		/* cswap p1[5], p2[5] */
		"  movq 40(%1), %%r8;"
		"  movq 40(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 40(%1);"
		"  movq %%r9, 40(%2);"

		/* cswap p1[6], p2[6] */
		"  movq 48(%1), %%r8;"
		"  movq 48(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 48(%1);"
		"  movq %%r9, 48(%2);"

		/* cswap p1[7], p2[7] */
		"  movq 56(%1), %%r8;"
		"  movq 56(%2), %%r9;"
		"  mov %%r8, %%r10;"
		"  cmovc %%r9, %%r8;"
		"  cmovc %%r10, %%r9;"
		"  movq %%r8, 56(%1);"
		"  movq %%r9, 56(%2);"
	: "+&r" (bit)
	: "r" (p1), "r" (p2)
	: "%r8", "%r9", "%r10", "memory", "cc"
	);
}

/* Computes the square of a field element: out <- f * f
 * Uses the 8-element buffer tmp for intermediate results */
static void fsqr(uint64_t *out, const uint64_t *f, uint64_t *tmp)
{
	asm volatile(
		/* Compute the raw multiplication: tmp <- f * f */

		/* Step 1: Compute all partial products */
		"  movq 0(%1), %%rdx;"                                       /* f[0] */
		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
		"  movq 24(%1), %%rdx;"                                      /* f[3] */
		"  mulxq 8(%1), %%r11, %%r12;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
		"  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%r12;"    /* f[2]*f[3] */
		"  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15, %%r15;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%r12;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%r12, %%r12;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
		                           "  movq %%rax, 0(%0);"
		"  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
		"  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
		"  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
		"  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
		"  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
		"  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
		"  adcx %%rcx, %%r12;"     "  movq %%r12, 40(%0);"
		"  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
		"  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
		"  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"

		/* Line up pointers */
		"  mov %0, %1;"
		"  mov %2, %0;"

		/* Wrap the result back into the field */

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%1), %%r8, %%r13;"
		"  xor %%rcx, %%rcx;"
		"  adoxq 0(%1), %%r8;"
		"  mulxq 40(%1), %%r9, %%r12;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%1), %%r9;"
		"  mulxq 48(%1), %%r10, %%r13;"
		"  adcx %%r12, %%r10;"
		"  adoxq 16(%1), %%r10;"
		"  mulxq 56(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%1), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%0);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%0);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%0);"
	: "+&r" (tmp), "+&r" (f), "+&r" (out)
	:
	: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
	);
}

/* Computes two field squarings:
 * out[0] <- f[0] * f[0]
 * out[1] <- f[1] * f[1]
 * Uses the 16-element buffer tmp for intermediate results */
static void fsqr2(uint64_t *out, const uint64_t *f, uint64_t *tmp)
{
	asm volatile(
		/* Step 1: Compute all partial products */
		"  movq 0(%1), %%rdx;"                                       /* f[0] */
		"  mulxq 8(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
		"  mulxq 16(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
		"  mulxq 24(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
		"  movq 24(%1), %%rdx;"                                      /* f[3] */
		"  mulxq 8(%1), %%r11, %%r12;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
		"  mulxq 16(%1), %%rax, %%r13;"    "  adcx %%rax, %%r12;"    /* f[2]*f[3] */
		"  movq 8(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
		"  mulxq 16(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15, %%r15;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%r12;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%r12, %%r12;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 0(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
		                           "  movq %%rax, 0(%0);"
		"  add %%rcx, %%r8;"       "  movq %%r8, 8(%0);"
		"  movq 8(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
		"  adcx %%rax, %%r9;"      "  movq %%r9, 16(%0);"
		"  adcx %%rcx, %%r10;"     "  movq %%r10, 24(%0);"
		"  movq 16(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
		"  adcx %%rax, %%r11;"     "  movq %%r11, 32(%0);"
		"  adcx %%rcx, %%r12;"     "  movq %%r12, 40(%0);"
		"  movq 24(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
		"  adcx %%rax, %%r13;"     "  movq %%r13, 48(%0);"
		"  adcx %%rcx, %%r14;"     "  movq %%r14, 56(%0);"

		/* Step 1: Compute all partial products */
		"  movq 32(%1), %%rdx;"                                       /* f[0] */
		"  mulxq 40(%1), %%r8, %%r14;"      "  xor %%r15, %%r15;"     /* f[1]*f[0] */
		"  mulxq 48(%1), %%r9, %%r10;"     "  adcx %%r14, %%r9;"     /* f[2]*f[0] */
		"  mulxq 56(%1), %%rax, %%rcx;"    "  adcx %%rax, %%r10;"    /* f[3]*f[0] */
		"  movq 56(%1), %%rdx;"                                      /* f[3] */
		"  mulxq 40(%1), %%r11, %%r12;"     "  adcx %%rcx, %%r11;"    /* f[1]*f[3] */
		"  mulxq 48(%1), %%rax, %%r13;"    "  adcx %%rax, %%r12;"    /* f[2]*f[3] */
		"  movq 40(%1), %%rdx;"             "  adcx %%r15, %%r13;"    /* f1 */
		"  mulxq 48(%1), %%rax, %%rcx;"    "  mov $0, %%r14;"        /* f[2]*f[1] */

		/* Step 2: Compute two parallel carry chains */
		"  xor %%r15, %%r15;"
		"  adox %%rax, %%r10;"
		"  adcx %%r8, %%r8;"
		"  adox %%rcx, %%r11;"
		"  adcx %%r9, %%r9;"
		"  adox %%r15, %%r12;"
		"  adcx %%r10, %%r10;"
		"  adox %%r15, %%r13;"
		"  adcx %%r11, %%r11;"
		"  adox %%r15, %%r14;"
		"  adcx %%r12, %%r12;"
		"  adcx %%r13, %%r13;"
		"  adcx %%r14, %%r14;"

		/* Step 3: Compute intermediate squares */
		"  movq 32(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[0]^2 */
		                           "  movq %%rax, 64(%0);"
		"  add %%rcx, %%r8;"       "  movq %%r8, 72(%0);"
		"  movq 40(%1), %%rdx;"     "  mulx %%rdx, %%rax, %%rcx;"    /* f[1]^2 */
		"  adcx %%rax, %%r9;"      "  movq %%r9, 80(%0);"
		"  adcx %%rcx, %%r10;"     "  movq %%r10, 88(%0);"
		"  movq 48(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[2]^2 */
		"  adcx %%rax, %%r11;"     "  movq %%r11, 96(%0);"
		"  adcx %%rcx, %%r12;"     "  movq %%r12, 104(%0);"
		"  movq 56(%1), %%rdx;"    "  mulx %%rdx, %%rax, %%rcx;"    /* f[3]^2 */
		"  adcx %%rax, %%r13;"     "  movq %%r13, 112(%0);"
		"  adcx %%rcx, %%r14;"     "  movq %%r14, 120(%0);"

		/* Line up pointers */
		"  mov %0, %1;"
		"  mov %2, %0;"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 32(%1), %%r8, %%r13;"
		"  xor %%rcx, %%rcx;"
		"  adoxq 0(%1), %%r8;"
		"  mulxq 40(%1), %%r9, %%r12;"
		"  adcx %%r13, %%r9;"
		"  adoxq 8(%1), %%r9;"
		"  mulxq 48(%1), %%r10, %%r13;"
		"  adcx %%r12, %%r10;"
		"  adoxq 16(%1), %%r10;"
		"  mulxq 56(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 24(%1), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 8(%0);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 16(%0);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 24(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 0(%0);"

		/* Step 1: Compute dst + carry == tmp_hi * 38 + tmp_lo */
		"  mov $38, %%rdx;"
		"  mulxq 96(%1), %%r8, %%r13;"
		"  xor %%rcx, %%rcx;"
		"  adoxq 64(%1), %%r8;"
		"  mulxq 104(%1), %%r9, %%r12;"
		"  adcx %%r13, %%r9;"
		"  adoxq 72(%1), %%r9;"
		"  mulxq 112(%1), %%r10, %%r13;"
		"  adcx %%r12, %%r10;"
		"  adoxq 80(%1), %%r10;"
		"  mulxq 120(%1), %%r11, %%rax;"
		"  adcx %%r13, %%r11;"
		"  adoxq 88(%1), %%r11;"
		"  adcx %%rcx, %%rax;"
		"  adox %%rcx, %%rax;"
		"  imul %%rdx, %%rax;"

		/* Step 2: Fold the carry back into dst */
		"  add %%rax, %%r8;"
		"  adcx %%rcx, %%r9;"
		"  movq %%r9, 40(%0);"
		"  adcx %%rcx, %%r10;"
		"  movq %%r10, 48(%0);"
		"  adcx %%rcx, %%r11;"
		"  movq %%r11, 56(%0);"

		/* Step 3: Fold the carry bit back in; guaranteed not to carry at this point */
		"  mov $0, %%rax;"
		"  cmovc %%rdx, %%rax;"
		"  add %%rax, %%r8;"
		"  movq %%r8, 32(%0);"
	: "+&r" (tmp), "+&r" (f), "+&r" (out)
	:
	: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
	);
}

static __always_inline uint64_t eq_mask(uint64_t a, uint64_t b)
{
	uint64_t x = a ^ b;
	uint64_t minus_x = ~x + (uint64_t)1U;
	uint64_t x_or_minus_x = x | minus_x;
	uint64_t xnx = x_or_minus_x >> (uint32_t)63U;
	return xnx - (uint64_t)1U;
}

static __always_inline uint64_t gte_mask(uint64_t a, uint64_t b)
{
	uint64_t x = a;
	uint64_t y = b;
	uint64_t x_xor_y = x ^ y;
	uint64_t x_sub_y = x - y;
	uint64_t x_sub_y_xor_y = x_sub_y ^ y;
	uint64_t q = x_xor_y | x_sub_y_xor_y;
	uint64_t x_xor_q = x ^ q;
	uint64_t x_xor_q_ = x_xor_q >> (uint32_t)63U;
	return x_xor_q_ - (uint64_t)1U;
}

static void store_felem(uint64_t *b, uint64_t *f)
{
	uint64_t f30 = f[3U];
	uint64_t top_bit0 = f30 >> (uint32_t)63U;
	uint64_t carry0;
	uint64_t f31;
	uint64_t top_bit;
	uint64_t carry;
	uint64_t f0;
	uint64_t f1;
	uint64_t f2;
	uint64_t f3;
	uint64_t m0;
	uint64_t m1;
	uint64_t m2;
	uint64_t m3;
	uint64_t mask;
	uint64_t f0_;
	uint64_t f1_;
	uint64_t f2_;
	uint64_t f3_;
	uint64_t o0;
	uint64_t o1;
	uint64_t o2;
	uint64_t o3;
	f[3U] = f30 & (uint64_t)0x7fffffffffffffffU;
	carry0 = add_scalar(f, f, (uint64_t)19U * top_bit0);
	f31 = f[3U];
	top_bit = f31 >> (uint32_t)63U;
	f[3U] = f31 & (uint64_t)0x7fffffffffffffffU;
	carry = add_scalar(f, f, (uint64_t)19U * top_bit);
	f0 = f[0U];
	f1 = f[1U];
	f2 = f[2U];
	f3 = f[3U];
	m0 = gte_mask(f0, (uint64_t)0xffffffffffffffedU);
	m1 = eq_mask(f1, (uint64_t)0xffffffffffffffffU);
	m2 = eq_mask(f2, (uint64_t)0xffffffffffffffffU);
	m3 = eq_mask(f3, (uint64_t)0x7fffffffffffffffU);
	mask = ((m0 & m1) & m2) & m3;
	f0_ = f0 - (mask & (uint64_t)0xffffffffffffffedU);
	f1_ = f1 - (mask & (uint64_t)0xffffffffffffffffU);
	f2_ = f2 - (mask & (uint64_t)0xffffffffffffffffU);
	f3_ = f3 - (mask & (uint64_t)0x7fffffffffffffffU);
	o0 = f0_;
	o1 = f1_;
	o2 = f2_;
	o3 = f3_;
	b[0U] = o0;
	b[1U] = o1;
	b[2U] = o2;
	b[3U] = o3;
}

enum { QWORDS_PER_FIELD_ELEMENT = 4 };

uint64_t out_fadd[QWORDS_PER_FIELD_ELEMENT];
uint64_t f1_fadd[QWORDS_PER_FIELD_ELEMENT];
uint64_t f2_fadd[QWORDS_PER_FIELD_ELEMENT];

uint64_t out_fsub[QWORDS_PER_FIELD_ELEMENT];
uint64_t f1_fsub[QWORDS_PER_FIELD_ELEMENT];
uint64_t f2_fsub[QWORDS_PER_FIELD_ELEMENT];

uint64_t out_fmul[QWORDS_PER_FIELD_ELEMENT];
uint64_t f1_fmul[QWORDS_PER_FIELD_ELEMENT];
uint64_t f2_fmul[QWORDS_PER_FIELD_ELEMENT];

uint64_t out_fmul2[QWORDS_PER_FIELD_ELEMENT * 2];
uint64_t f1_fmul2[QWORDS_PER_FIELD_ELEMENT * 2];
uint64_t f2_fmul2[QWORDS_PER_FIELD_ELEMENT * 2];

uint64_t out_fmul_scalar[QWORDS_PER_FIELD_ELEMENT];
uint64_t f1_fmul_scalar[QWORDS_PER_FIELD_ELEMENT];
uint64_t f2_fmul_scalar;

uint64_t out_fsqr[QWORDS_PER_FIELD_ELEMENT];
uint64_t f1_fsqr[QWORDS_PER_FIELD_ELEMENT];

uint64_t out_fsqr2[QWORDS_PER_FIELD_ELEMENT * 2];
uint64_t f1_fsqr2[QWORDS_PER_FIELD_ELEMENT * 2];

uint64_t bit_cswap2;
uint64_t p1_cswap2[QWORDS_PER_FIELD_ELEMENT * 2];
uint64_t p2_cswap2[QWORDS_PER_FIELD_ELEMENT * 2];

int main(int argc, char *argv[])
{
	uint64_t tmp[QWORDS_PER_FIELD_ELEMENT * 4];

	fadd(out_fadd, f1_fadd, f2_fadd);
	store_felem(out_fadd, out_fadd);
	fsub(out_fsub, f1_fsub, f2_fsub);
	store_felem(out_fsub, out_fsub);
	fmul(out_fmul, f1_fmul, f2_fmul, tmp);
	store_felem(out_fmul, out_fmul);
	fmul2(out_fmul2, f1_fmul2, f2_fmul2, tmp);
	store_felem(out_fmul2, out_fmul2);
	store_felem(out_fmul2 + QWORDS_PER_FIELD_ELEMENT, out_fmul2 + QWORDS_PER_FIELD_ELEMENT);
	fmul_scalar(out_fmul_scalar, f1_fmul_scalar, f2_fmul_scalar);
	store_felem(out_fmul_scalar, out_fmul_scalar);
	fsqr(out_fsqr, f1_fsqr, tmp);
	store_felem(out_fsqr, out_fsqr);
	fsqr2(out_fsqr2, f1_fsqr2, tmp);
	store_felem(out_fsqr2, out_fsqr2);
	store_felem(out_fsqr2 + QWORDS_PER_FIELD_ELEMENT, out_fsqr2 + QWORDS_PER_FIELD_ELEMENT);
	cswap2(bit_cswap2, p1_cswap2, p2_cswap2);

	return 0;
}