Diffstat (limited to 'curve25519-vale-inline.h')
-rw-r--r--  curve25519-vale-inline.h  810
1 file changed, 810 insertions, 0 deletions
diff --git a/curve25519-vale-inline.h b/curve25519-vale-inline.h
new file mode 100644
index 0000000..9817fab
--- /dev/null
+++ b/curve25519-vale-inline.h
@@ -0,0 +1,810 @@
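+/*
+ * Inline x86-64 assembly for Curve25519 field arithmetic modulo
+ * p = 2^255 - 19, apparently extracted from Vale-verified assembly
+ * (per the file name). Field elements are four little-endian 64-bit
+ * limbs. The code requires the MULX/ADCX/ADOX instructions (BMI2 and
+ * ADX extensions) and reduces by folding the high 256 bits back in
+ * with the constant 38, since 2^256 = 38 (mod p).
+ */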
+#ifdef __GNUC__
+#pragma once
+#include <linux/kernel.h>
+#include <linux/string.h>
+typedef __uint128_t uint128_t;
+typedef u64 uint64_t;
+typedef u8 uint8_t;
+
+
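+/* add1: out = f1 + f2, where f2 is a single 64-bit limb added into the
+ * 256-bit value f1; returns the carry out of the top limb. Judging by
+ * the register assignments, arg0 = out, arg1 = f1, arg2 = f2. */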
+static inline uint64_t add1 (uint64_t* arg0, uint64_t* arg1, uint64_t arg2) {
+ register uint64_t* arg0_r asm("rdi") = arg0;
+ register uint64_t* arg1_r asm("rsi") = arg1;
+ register uint64_t arg2_r asm("rdx") = arg2;
+ register uint64_t carry_r asm("rax");
+
+ __asm__ __volatile__(
+ " xor %%r8, %%r8;"
+ " xor %%r9, %%r9;"
+ " xor %%r10, %%r10;"
+ " xor %%r11, %%r11;"
+ " xor %%rax, %%rax;"
+ " addq 0(%%rsi), %%rdx;"
+ " movq %%rdx, 0(%%rdi);"
+ " adcxq 8(%%rsi), %%r8;"
+ " movq %%r8, 8(%%rdi);"
+ " adcxq 16(%%rsi), %%r9;"
+ " movq %%r9, 16(%%rdi);"
+ " adcxq 24(%%rsi), %%r10;"
+ " movq %%r10, 24(%%rdi);"
+ " adcx %%r11, %%rax;"
+ : "+r" (arg2_r), "=r" (carry_r)
+ : "r" (arg0_r), "r" (arg1_r)
+ : "%r8", "%r9", "%r10", "%r11", "memory", "cc"
+ );
+
+ return carry_r;
+}
+
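+/* fadd: out = f1 + f2 (mod p); arg0 = out, arg1 = f1, arg2 = f2. A carry
+ * out of the 256-bit sum is turned into +38 on the low limb (2^256 = 38
+ * mod p); a second conditional +38 absorbs the rare cascading carry. */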
+static inline void fadd (uint64_t* arg0, uint64_t* arg1, uint64_t* arg2) {
+ register uint64_t* arg0_r asm("rdi") = arg0;
+ register uint64_t* arg1_r asm("rsi") = arg1;
+ register uint64_t* arg2_r asm("rdx") = arg2;
+
+ __asm__ __volatile__(
+ " movq 0(%%rdx), %%r8;"
+ " addq 0(%%rsi), %%r8;"
+ " movq 8(%%rdx), %%r9;"
+ " adcxq 8(%%rsi), %%r9;"
+ " movq 16(%%rdx), %%r10;"
+ " adcxq 16(%%rsi), %%r10;"
+ " movq 24(%%rdx), %%r11;"
+ " adcxq 24(%%rsi), %%r11;"
+ " mov $0, %%rax;"
+ " mov $38, %%rdx;"
+ " cmovc %%rdx, %%rax;"
+ " xor %%rcx, %%rcx;"
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%%rdi);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%%rdi);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%%rdi);"
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%%rdi);"
+ : "+r" (arg2_r)
+ : "r" (arg0_r), "r" (arg1_r)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
+ );
+}
+
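+/* fsub: out = f1 - f2 (mod p); arg0 = out, arg1 = f1, arg2 = f2. A borrow
+ * out of the 256-bit difference is corrected by subtracting 38 (again
+ * because 2^256 = 38 mod p), with a second conditional subtraction for a
+ * cascading borrow. */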
+static inline void fsub (uint64_t* arg0, uint64_t* arg1, uint64_t* arg2) {
+ register uint64_t* arg0_r asm("rdi") = arg0;
+ register uint64_t* arg1_r asm("rsi") = arg1;
+ register uint64_t* arg2_r asm("rdx") = arg2;
+
+ __asm__ __volatile__(
+ " movq 0(%%rsi), %%r8;"
+ " subq 0(%%rdx), %%r8;"
+ " movq 8(%%rsi), %%r9;"
+ " sbbq 8(%%rdx), %%r9;"
+ " movq 16(%%rsi), %%r10;"
+ " sbbq 16(%%rdx), %%r10;"
+ " movq 24(%%rsi), %%r11;"
+ " sbbq 24(%%rdx), %%r11;"
+ " mov $0, %%rax;"
+ " mov $38, %%rcx;"
+ " cmovc %%rcx, %%rax;"
+ " sub %%rax, %%r8;"
+ " sbb $0, %%r9;"
+ " sbb $0, %%r10;"
+ " sbb $0, %%r11;"
+ " mov $0, %%rax;"
+ " cmovc %%rcx, %%rax;"
+ " sub %%rax, %%r8;"
+ " movq %%r8, 0(%%rdi);"
+ " movq %%r9, 8(%%rdi);"
+ " movq %%r10, 16(%%rdi);"
+ " movq %%r11, 24(%%rdi);"
+ :
+ : "r" (arg0_r), "r" (arg1_r), "r" (arg2_r)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "memory", "cc"
+ );
+}
+
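+/* fmul: out = f1 * f2 (mod p). From the register assignments: arg2 = out,
+ * arg1 = f1, arg3 = f2, and arg0 is a 64-byte (8-limb) scratch buffer for
+ * the full 512-bit product. A schoolbook 4x4 MULX multiply runs two
+ * independent carry chains (ADCX/ADOX) to fill the scratch buffer; the
+ * high 256 bits are then folded into the low half with a *38 multiply. */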
+static inline void fmul (uint64_t* arg2, uint64_t* arg1, uint64_t* arg3, uint64_t* arg0) {
+ register uint64_t* arg0_r asm("rdi") = arg0;
+ register uint64_t* arg1_r asm("rsi") = arg1;
+ register uint64_t* arg2_r asm("rdx") = arg2;
+ register uint64_t* arg3_r asm("rcx") = arg3;
+
+ __asm__ __volatile__(
+ " mov %%rdx, %%r15;"
+ " movq 0(%%rsi), %%rdx;"
+ " mulxq 0(%%rcx), %%r8, %%r9;"
+ " xor %%r10, %%r10;"
+ " movq %%r8, 0(%%rdi);"
+ " mulxq 8(%%rcx), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 8(%%rdi);"
+ " mulxq 16(%%rcx), %%r12, %%r13;"
+ " adox %%r11, %%r12;"
+ " mulxq 24(%%rcx), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " movq 8(%%rsi), %%rdx;"
+ " mulxq 0(%%rcx), %%r8, %%r9;"
+ " xor %%r10, %%r10;"
+ " adcxq 8(%%rdi), %%r8;"
+ " movq %%r8, 8(%%rdi);"
+ " mulxq 8(%%rcx), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%r12, %%r10;"
+ " movq %%r10, 16(%%rdi);"
+ " mulxq 16(%%rcx), %%r12, %%r13;"
+ " adox %%r11, %%r12;"
+ " adcx %%r14, %%r12;"
+ " mov $0, %%r8;"
+ " mulxq 24(%%rcx), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq 16(%%rsi), %%rdx;"
+ " mulxq 0(%%rcx), %%r8, %%r9;"
+ " xor %%r10, %%r10;"
+ " adcxq 16(%%rdi), %%r8;"
+ " movq %%r8, 16(%%rdi);"
+ " mulxq 8(%%rcx), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%r12, %%r10;"
+ " movq %%r10, 24(%%rdi);"
+ " mulxq 16(%%rcx), %%r12, %%r13;"
+ " adox %%r11, %%r12;"
+ " adcx %%r14, %%r12;"
+ " mov $0, %%r8;"
+ " mulxq 24(%%rcx), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq 24(%%rsi), %%rdx;"
+ " mulxq 0(%%rcx), %%r8, %%r9;"
+ " xor %%r10, %%r10;"
+ " adcxq 24(%%rdi), %%r8;"
+ " movq %%r8, 24(%%rdi);"
+ " mulxq 8(%%rcx), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%r12, %%r10;"
+ " movq %%r10, 32(%%rdi);"
+ " mulxq 16(%%rcx), %%r12, %%r13;"
+ " adox %%r11, %%r12;"
+ " adcx %%r14, %%r12;"
+ " movq %%r12, 40(%%rdi);"
+ " mov $0, %%r8;"
+ " mulxq 24(%%rcx), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 48(%%rdi);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 56(%%rdi);"
+ " mov %%rdi, %%rsi;"
+ " mov %%r15, %%rdi;"
+ " mov $38, %%rdx;"
+ " mulxq 32(%%rsi), %%r8, %%r13;"
+ " xor %%rcx, %%rcx;"
+ " adoxq 0(%%rsi), %%r8;"
+ " mulxq 40(%%rsi), %%r9, %%r12;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%%rsi), %%r9;"
+ " mulxq 48(%%rsi), %%r10, %%r13;"
+ " adcx %%r12, %%r10;"
+ " adoxq 16(%%rsi), %%r10;"
+ " mulxq 56(%%rsi), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%%rsi), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%%rdi);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%%rdi);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%%rdi);"
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%%rdi);"
+ : "+r" (arg0_r), "+r" (arg1_r), "+r" (arg2_r), "+r" (arg3_r)
+ :
+ : "%rax", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
+ );
+}
+
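+/* fmul2: two independent multiplications in one call,
+ * out[0..3] = f1[0..3] * f2[0..3] and out[4..7] = f1[4..7] * f2[4..7],
+ * both mod p; arg2 = out, arg1 = f1, arg3 = f2, and arg0 is a 128-byte
+ * (16-limb) scratch buffer. The pairing presumably serves the (X, Z)
+ * coordinate pairs of the Montgomery ladder. */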
+static inline void fmul2 (uint64_t* arg2, uint64_t* arg1, uint64_t* arg3, uint64_t* arg0) {
+ register uint64_t* arg0_r asm("rdi") = arg0;
+ register uint64_t* arg1_r asm("rsi") = arg1;
+ register uint64_t* arg2_r asm("rdx") = arg2;
+ register uint64_t* arg3_r asm("rcx") = arg3;
+
+ __asm__ __volatile__(
+ " mov %%rdx, %%r15;"
+ " movq 0(%%rsi), %%rdx;"
+ " mulxq 0(%%rcx), %%r8, %%r9;"
+ " xor %%r10, %%r10;"
+ " movq %%r8, 0(%%rdi);"
+ " mulxq 8(%%rcx), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 8(%%rdi);"
+ " mulxq 16(%%rcx), %%r12, %%r13;"
+ " adox %%r11, %%r12;"
+ " mulxq 24(%%rcx), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " movq 8(%%rsi), %%rdx;"
+ " mulxq 0(%%rcx), %%r8, %%r9;"
+ " xor %%r10, %%r10;"
+ " adcxq 8(%%rdi), %%r8;"
+ " movq %%r8, 8(%%rdi);"
+ " mulxq 8(%%rcx), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%r12, %%r10;"
+ " movq %%r10, 16(%%rdi);"
+ " mulxq 16(%%rcx), %%r12, %%r13;"
+ " adox %%r11, %%r12;"
+ " adcx %%r14, %%r12;"
+ " mov $0, %%r8;"
+ " mulxq 24(%%rcx), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq 16(%%rsi), %%rdx;"
+ " mulxq 0(%%rcx), %%r8, %%r9;"
+ " xor %%r10, %%r10;"
+ " adcxq 16(%%rdi), %%r8;"
+ " movq %%r8, 16(%%rdi);"
+ " mulxq 8(%%rcx), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%r12, %%r10;"
+ " movq %%r10, 24(%%rdi);"
+ " mulxq 16(%%rcx), %%r12, %%r13;"
+ " adox %%r11, %%r12;"
+ " adcx %%r14, %%r12;"
+ " mov $0, %%r8;"
+ " mulxq 24(%%rcx), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq 24(%%rsi), %%rdx;"
+ " mulxq 0(%%rcx), %%r8, %%r9;"
+ " xor %%r10, %%r10;"
+ " adcxq 24(%%rdi), %%r8;"
+ " movq %%r8, 24(%%rdi);"
+ " mulxq 8(%%rcx), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%r12, %%r10;"
+ " movq %%r10, 32(%%rdi);"
+ " mulxq 16(%%rcx), %%r12, %%r13;"
+ " adox %%r11, %%r12;"
+ " adcx %%r14, %%r12;"
+ " movq %%r12, 40(%%rdi);"
+ " mov $0, %%r8;"
+ " mulxq 24(%%rcx), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 48(%%rdi);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 56(%%rdi);"
+ " movq 32(%%rsi), %%rdx;"
+ " mulxq 32(%%rcx), %%r8, %%r9;"
+ " xor %%r10, %%r10;"
+ " movq %%r8, 64(%%rdi);"
+ " mulxq 40(%%rcx), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " movq %%r10, 72(%%rdi);"
+ " mulxq 48(%%rcx), %%r12, %%r13;"
+ " adox %%r11, %%r12;"
+ " mulxq 56(%%rcx), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " movq 40(%%rsi), %%rdx;"
+ " mulxq 32(%%rcx), %%r8, %%r9;"
+ " xor %%r10, %%r10;"
+ " adcxq 72(%%rdi), %%r8;"
+ " movq %%r8, 72(%%rdi);"
+ " mulxq 40(%%rcx), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%r12, %%r10;"
+ " movq %%r10, 80(%%rdi);"
+ " mulxq 48(%%rcx), %%r12, %%r13;"
+ " adox %%r11, %%r12;"
+ " adcx %%r14, %%r12;"
+ " mov $0, %%r8;"
+ " mulxq 56(%%rcx), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq 48(%%rsi), %%rdx;"
+ " mulxq 32(%%rcx), %%r8, %%r9;"
+ " xor %%r10, %%r10;"
+ " adcxq 80(%%rdi), %%r8;"
+ " movq %%r8, 80(%%rdi);"
+ " mulxq 40(%%rcx), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%r12, %%r10;"
+ " movq %%r10, 88(%%rdi);"
+ " mulxq 48(%%rcx), %%r12, %%r13;"
+ " adox %%r11, %%r12;"
+ " adcx %%r14, %%r12;"
+ " mov $0, %%r8;"
+ " mulxq 56(%%rcx), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq 56(%%rsi), %%rdx;"
+ " mulxq 32(%%rcx), %%r8, %%r9;"
+ " xor %%r10, %%r10;"
+ " adcxq 88(%%rdi), %%r8;"
+ " movq %%r8, 88(%%rdi);"
+ " mulxq 40(%%rcx), %%r10, %%r11;"
+ " adox %%r9, %%r10;"
+ " adcx %%r12, %%r10;"
+ " movq %%r10, 96(%%rdi);"
+ " mulxq 48(%%rcx), %%r12, %%r13;"
+ " adox %%r11, %%r12;"
+ " adcx %%r14, %%r12;"
+ " movq %%r12, 104(%%rdi);"
+ " mov $0, %%r8;"
+ " mulxq 56(%%rcx), %%r14, %%rdx;"
+ " adox %%r13, %%r14;"
+ " adcx %%rax, %%r14;"
+ " movq %%r14, 112(%%rdi);"
+ " mov $0, %%rax;"
+ " adox %%rdx, %%rax;"
+ " adcx %%r8, %%rax;"
+ " movq %%rax, 120(%%rdi);"
+ " mov %%rdi, %%rsi;"
+ " mov %%r15, %%rdi;"
+ " mov $38, %%rdx;"
+ " mulxq 32(%%rsi), %%r8, %%r13;"
+ " xor %%rcx, %%rcx;"
+ " adoxq 0(%%rsi), %%r8;"
+ " mulxq 40(%%rsi), %%r9, %%r12;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%%rsi), %%r9;"
+ " mulxq 48(%%rsi), %%r10, %%r13;"
+ " adcx %%r12, %%r10;"
+ " adoxq 16(%%rsi), %%r10;"
+ " mulxq 56(%%rsi), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%%rsi), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%%rdi);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%%rdi);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%%rdi);"
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%%rdi);"
+ " mov $38, %%rdx;"
+ " mulxq 96(%%rsi), %%r8, %%r13;"
+ " xor %%rcx, %%rcx;"
+ " adoxq 64(%%rsi), %%r8;"
+ " mulxq 104(%%rsi), %%r9, %%r12;"
+ " adcx %%r13, %%r9;"
+ " adoxq 72(%%rsi), %%r9;"
+ " mulxq 112(%%rsi), %%r10, %%r13;"
+ " adcx %%r12, %%r10;"
+ " adoxq 80(%%rsi), %%r10;"
+ " mulxq 120(%%rsi), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 88(%%rsi), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 40(%%rdi);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 48(%%rdi);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 56(%%rdi);"
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 32(%%rdi);"
+ : "+r" (arg0_r), "+r" (arg1_r), "+r" (arg2_r), "+r" (arg3_r)
+ :
+ : "%rax", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
+ );
+}
+
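+/* fmul1: out = f1 * f2 (mod p) for a small 64-bit scalar f2 (in the
+ * X25519 ladder this is typically 121665); arg0 = out, arg1 = f1,
+ * arg2 = f2. The single overflow limb of the 320-bit product is folded
+ * back in with *38. */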
+static inline void fmul1 (uint64_t* arg0, uint64_t* arg1, uint64_t arg2) {
+ register uint64_t* arg0_r asm("rdi") = arg0;
+ register uint64_t* arg1_r asm("rsi") = arg1;
+ register uint64_t arg2_r asm("rdx") = arg2;
+
+ __asm__ __volatile__(
+ " mulxq 0(%%rsi), %%r8, %%rcx;"
+ " mulxq 8(%%rsi), %%r9, %%r12;"
+ " add %%rcx, %%r9;"
+ " mov $0, %%rcx;"
+ " mulxq 16(%%rsi), %%r10, %%r13;"
+ " adcx %%r12, %%r10;"
+ " mulxq 24(%%rsi), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adcx %%rcx, %%rax;"
+ " mov $38, %%rdx;"
+ " imul %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%%rdi);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%%rdi);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%%rdi);"
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%%rdi);"
+ : "+r" (arg2_r)
+ : "r" (arg0_r), "r" (arg1_r)
+ : "%rax", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "memory", "cc"
+ );
+}
+
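+/* cswap2: constant-time conditional swap of two 64-byte buffers (two
+ * field elements each); arg2 is the swap bit, arg0 and arg1 the buffers.
+ * Adding 2^64 - 1 to arg2 sets the carry flag iff arg2 is nonzero, and
+ * each limb pair is then exchanged with branchless CMOVC moves. */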
+static inline void cswap2 (uint64_t arg2, uint64_t* arg0, uint64_t* arg1) {
+ register uint64_t* arg0_r asm("rdi") = arg0;
+ register uint64_t* arg1_r asm("rsi") = arg1;
+ register uint64_t arg2_r asm("rdx") = arg2;
+
+ __asm__ __volatile__(
+ " add $18446744073709551615, %%rdx;"
+ " movq 0(%%rdi), %%r8;"
+ " movq 0(%%rsi), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 0(%%rdi);"
+ " movq %%r9, 0(%%rsi);"
+ " movq 8(%%rdi), %%r8;"
+ " movq 8(%%rsi), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 8(%%rdi);"
+ " movq %%r9, 8(%%rsi);"
+ " movq 16(%%rdi), %%r8;"
+ " movq 16(%%rsi), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 16(%%rdi);"
+ " movq %%r9, 16(%%rsi);"
+ " movq 24(%%rdi), %%r8;"
+ " movq 24(%%rsi), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 24(%%rdi);"
+ " movq %%r9, 24(%%rsi);"
+ " movq 32(%%rdi), %%r8;"
+ " movq 32(%%rsi), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 32(%%rdi);"
+ " movq %%r9, 32(%%rsi);"
+ " movq 40(%%rdi), %%r8;"
+ " movq 40(%%rsi), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 40(%%rdi);"
+ " movq %%r9, 40(%%rsi);"
+ " movq 48(%%rdi), %%r8;"
+ " movq 48(%%rsi), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 48(%%rdi);"
+ " movq %%r9, 48(%%rsi);"
+ " movq 56(%%rdi), %%r8;"
+ " movq 56(%%rsi), %%r9;"
+ " mov %%r8, %%r10;"
+ " cmovc %%r9, %%r8;"
+ " cmovc %%r10, %%r9;"
+ " movq %%r8, 56(%%rdi);"
+ " movq %%r9, 56(%%rsi);"
+ : "+r" (arg2_r)
+ : "r" (arg0_r), "r" (arg1_r)
+ : "%r8", "%r9", "%r10", "memory", "cc"
+ );
+}
+
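+/* fsqr: out = f1^2 (mod p); arg2 = out, arg1 = f1, and arg0 is a 64-byte
+ * (8-limb) scratch buffer. The off-diagonal partial products are computed
+ * once and doubled with an add-with-carry chain, the squared diagonal
+ * terms are added in, and the high 256 bits are folded in with *38. */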
+static inline void fsqr (uint64_t* arg2, uint64_t* arg1, uint64_t* arg0) {
+ register uint64_t* arg0_r asm("rdi") = arg0;
+ register uint64_t* arg1_r asm("rsi") = arg1;
+ register uint64_t* arg2_r asm("rdx") = arg2;
+
+ __asm__ __volatile__(
+ " mov %%rdx, %%rbx;"
+ " movq 0(%%rsi), %%rdx;"
+ " mulxq 8(%%rsi), %%r8, %%r14;"
+ " xor %%r15, %%r15;"
+ " mulxq 16(%%rsi), %%r9, %%r10;"
+ " adcx %%r14, %%r9;"
+ " mulxq 24(%%rsi), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;"
+ " movq 24(%%rsi), %%rdx;"
+ " mulxq 8(%%rsi), %%r11, %%r12;"
+ " adcx %%rcx, %%r11;"
+ " mulxq 16(%%rsi), %%rax, %%r13;"
+ " adcx %%rax, %%r12;"
+ " movq 8(%%rsi), %%rdx;"
+ " adcx %%r15, %%r13;"
+ " mulxq 16(%%rsi), %%rax, %%rcx;"
+ " mov $0, %%r14;"
+ " xor %%r15, %%r15;"
+ " adox %%rax, %%r10;"
+ " adcx %%r8, %%r8;"
+ " adox %%rcx, %%r11;"
+ " adcx %%r9, %%r9;"
+ " adox %%r15, %%r12;"
+ " adcx %%r10, %%r10;"
+ " adox %%r15, %%r13;"
+ " adcx %%r11, %%r11;"
+ " adox %%r15, %%r14;"
+ " adcx %%r12, %%r12;"
+ " adcx %%r13, %%r13;"
+ " adcx %%r14, %%r14;"
+ " movq 0(%%rsi), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;"
+ " movq %%rax, 0(%%rdi);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 8(%%rdi);"
+ " movq 8(%%rsi), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;"
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 16(%%rdi);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 24(%%rdi);"
+ " movq 16(%%rsi), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;"
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 32(%%rdi);"
+ " adcx %%rcx, %%r12;"
+ " movq %%r12, 40(%%rdi);"
+ " movq 24(%%rsi), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;"
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 48(%%rdi);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 56(%%rdi);"
+ " mov %%rdi, %%rsi;"
+ " mov %%rbx, %%rdi;"
+ " mov $38, %%rdx;"
+ " mulxq 32(%%rsi), %%r8, %%r13;"
+ " xor %%rcx, %%rcx;"
+ " adoxq 0(%%rsi), %%r8;"
+ " mulxq 40(%%rsi), %%r9, %%r12;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%%rsi), %%r9;"
+ " mulxq 48(%%rsi), %%r10, %%r13;"
+ " adcx %%r12, %%r10;"
+ " adoxq 16(%%rsi), %%r10;"
+ " mulxq 56(%%rsi), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%%rsi), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%%rdi);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%%rdi);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%%rdi);"
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%%rdi);"
+ : "+r" (arg0_r), "+r" (arg1_r), "+r" (arg2_r)
+ :
+ : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
+ );
+}
+
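+/* fsqr2: two independent squarings, out[0..3] = f1[0..3]^2 and
+ * out[4..7] = f1[4..7]^2, both mod p; arg2 = out, arg1 = f1, and arg0 is
+ * a 128-byte (16-limb) scratch buffer. Same doubling trick and *38
+ * reduction as fsqr, applied to each half. */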
+static inline void fsqr2 (uint64_t* arg2, uint64_t* arg1, uint64_t* arg0) {
+ register uint64_t* arg0_r asm("rdi") = arg0;
+ register uint64_t* arg1_r asm("rsi") = arg1;
+ register uint64_t* arg2_r asm("rdx") = arg2;
+
+ __asm__ __volatile__(
+ " mov %%rdx, %%rbx;"
+ " movq 0(%%rsi), %%rdx;"
+ " mulxq 8(%%rsi), %%r8, %%r14;"
+ " xor %%r15, %%r15;"
+ " mulxq 16(%%rsi), %%r9, %%r10;"
+ " adcx %%r14, %%r9;"
+ " mulxq 24(%%rsi), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;"
+ " movq 24(%%rsi), %%rdx;"
+ " mulxq 8(%%rsi), %%r11, %%r12;"
+ " adcx %%rcx, %%r11;"
+ " mulxq 16(%%rsi), %%rax, %%r13;"
+ " adcx %%rax, %%r12;"
+ " movq 8(%%rsi), %%rdx;"
+ " adcx %%r15, %%r13;"
+ " mulxq 16(%%rsi), %%rax, %%rcx;"
+ " mov $0, %%r14;"
+ " xor %%r15, %%r15;"
+ " adox %%rax, %%r10;"
+ " adcx %%r8, %%r8;"
+ " adox %%rcx, %%r11;"
+ " adcx %%r9, %%r9;"
+ " adox %%r15, %%r12;"
+ " adcx %%r10, %%r10;"
+ " adox %%r15, %%r13;"
+ " adcx %%r11, %%r11;"
+ " adox %%r15, %%r14;"
+ " adcx %%r12, %%r12;"
+ " adcx %%r13, %%r13;"
+ " adcx %%r14, %%r14;"
+ " movq 0(%%rsi), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;"
+ " movq %%rax, 0(%%rdi);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 8(%%rdi);"
+ " movq 8(%%rsi), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;"
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 16(%%rdi);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 24(%%rdi);"
+ " movq 16(%%rsi), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;"
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 32(%%rdi);"
+ " adcx %%rcx, %%r12;"
+ " movq %%r12, 40(%%rdi);"
+ " movq 24(%%rsi), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;"
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 48(%%rdi);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 56(%%rdi);"
+ " movq 32(%%rsi), %%rdx;"
+ " mulxq 40(%%rsi), %%r8, %%r14;"
+ " xor %%r15, %%r15;"
+ " mulxq 48(%%rsi), %%r9, %%r10;"
+ " adcx %%r14, %%r9;"
+ " mulxq 56(%%rsi), %%rax, %%rcx;"
+ " adcx %%rax, %%r10;"
+ " movq 56(%%rsi), %%rdx;"
+ " mulxq 40(%%rsi), %%r11, %%r12;"
+ " adcx %%rcx, %%r11;"
+ " mulxq 48(%%rsi), %%rax, %%r13;"
+ " adcx %%rax, %%r12;"
+ " movq 40(%%rsi), %%rdx;"
+ " adcx %%r15, %%r13;"
+ " mulxq 48(%%rsi), %%rax, %%rcx;"
+ " mov $0, %%r14;"
+ " xor %%r15, %%r15;"
+ " adox %%rax, %%r10;"
+ " adcx %%r8, %%r8;"
+ " adox %%rcx, %%r11;"
+ " adcx %%r9, %%r9;"
+ " adox %%r15, %%r12;"
+ " adcx %%r10, %%r10;"
+ " adox %%r15, %%r13;"
+ " adcx %%r11, %%r11;"
+ " adox %%r15, %%r14;"
+ " adcx %%r12, %%r12;"
+ " adcx %%r13, %%r13;"
+ " adcx %%r14, %%r14;"
+ " movq 32(%%rsi), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;"
+ " movq %%rax, 64(%%rdi);"
+ " add %%rcx, %%r8;"
+ " movq %%r8, 72(%%rdi);"
+ " movq 40(%%rsi), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;"
+ " adcx %%rax, %%r9;"
+ " movq %%r9, 80(%%rdi);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 88(%%rdi);"
+ " movq 48(%%rsi), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;"
+ " adcx %%rax, %%r11;"
+ " movq %%r11, 96(%%rdi);"
+ " adcx %%rcx, %%r12;"
+ " movq %%r12, 104(%%rdi);"
+ " movq 56(%%rsi), %%rdx;"
+ " mulx %%rdx, %%rax, %%rcx;"
+ " adcx %%rax, %%r13;"
+ " movq %%r13, 112(%%rdi);"
+ " adcx %%rcx, %%r14;"
+ " movq %%r14, 120(%%rdi);"
+ " mov %%rdi, %%rsi;"
+ " mov %%rbx, %%rdi;"
+ " mov $38, %%rdx;"
+ " mulxq 32(%%rsi), %%r8, %%r13;"
+ " xor %%rcx, %%rcx;"
+ " adoxq 0(%%rsi), %%r8;"
+ " mulxq 40(%%rsi), %%r9, %%r12;"
+ " adcx %%r13, %%r9;"
+ " adoxq 8(%%rsi), %%r9;"
+ " mulxq 48(%%rsi), %%r10, %%r13;"
+ " adcx %%r12, %%r10;"
+ " adoxq 16(%%rsi), %%r10;"
+ " mulxq 56(%%rsi), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 24(%%rsi), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 8(%%rdi);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 16(%%rdi);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 24(%%rdi);"
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 0(%%rdi);"
+ " mov $38, %%rdx;"
+ " mulxq 96(%%rsi), %%r8, %%r13;"
+ " xor %%rcx, %%rcx;"
+ " adoxq 64(%%rsi), %%r8;"
+ " mulxq 104(%%rsi), %%r9, %%r12;"
+ " adcx %%r13, %%r9;"
+ " adoxq 72(%%rsi), %%r9;"
+ " mulxq 112(%%rsi), %%r10, %%r13;"
+ " adcx %%r12, %%r10;"
+ " adoxq 80(%%rsi), %%r10;"
+ " mulxq 120(%%rsi), %%r11, %%rax;"
+ " adcx %%r13, %%r11;"
+ " adoxq 88(%%rsi), %%r11;"
+ " adcx %%rcx, %%rax;"
+ " adox %%rcx, %%rax;"
+ " imul %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " adcx %%rcx, %%r9;"
+ " movq %%r9, 40(%%rdi);"
+ " adcx %%rcx, %%r10;"
+ " movq %%r10, 48(%%rdi);"
+ " adcx %%rcx, %%r11;"
+ " movq %%r11, 56(%%rdi);"
+ " mov $0, %%rax;"
+ " cmovc %%rdx, %%rax;"
+ " add %%rax, %%r8;"
+ " movq %%r8, 32(%%rdi);"
+ : "+r" (arg0_r), "+r" (arg1_r), "+r" (arg2_r)
+ :
+ : "%rax", "%rbx", "%rcx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "memory", "cc"
+ );
+}
+
+#endif