 Makefile                 |    2 +-
 curve25519-amd64-asm.S   | 1888 -----------------
 curve25519-amd64.c       |  234 --
 curve25519-donna32.c     |  861 --------
 curve25519-donna64.c     |  414 ----
 curve25519-fiat32.c      |  840 --------
 curve25519-fiat64.c      |  577 -----
 curve25519-hacl64.c      |  755 -------
 curve25519-sandy2x-asm.S | 3261 ------------------------------
 curve25519-sandy2x.c     |  139 -
 curve25519-tweetnacl.c   |  169 --
 main.c                   |   60 -
 12 files changed, 1 insertion(+), 9199 deletions(-)
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 ifneq ($(KERNELRELEASE),)
-kbench9000-y := main.o curve25519-donna64.o curve25519-hacl64.o curve25519-fiat64.o curve25519-sandy2x.o curve25519-sandy2x-asm.o curve25519-amd64.o curve25519-precomp.o curve25519-amd64-asm.o curve25519-donna32.o curve25519-fiat32.o curve25519-tweetnacl.o curve25519-ever64.o
+kbench9000-y := main.o curve25519-precomp.o curve25519-ever64.o
 obj-m := kbench9000.o
 ccflags-y += -O3
 ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt'
diff --git a/curve25519-amd64-asm.S b/curve25519-amd64-asm.S
deleted file mode 100644
index 27a5b6a..0000000
--- a/curve25519-amd64-asm.S
+++ /dev/null
@@ -1,1888 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0
- *
- * Copyright (C) 2015 Google Inc. All Rights Reserved.
- * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * Original author: Peter Schwabe <peter@cryptojedi.org>
- */
-
-/************************************************
- * W A R N I N G
- * W A R N I N G
- * W A R N I N G
- * W A R N I N G
- * W A R N I N G
- *
- * Do not import this file into the kernel as-is,
- * because it makes use of the x86_64 redzone,
- * which will entirely melt the kernel. We're sort
- * of getting away with it here, since interrupts
- * are disabled, but DANGER this will kill kittens.
- *
- * W A R N I N G
- * W A R N I N G
- * W A R N I N G
- * W A R N I N G
- * W A R N I N G
- ************************************************/
-
-.data
-.p2align 4
-
-x25519_x86_64_REDMASK51: .quad 0x0007FFFFFFFFFFFF
-x25519_x86_64_121666_213: .quad 996687872
-x25519_x86_64_2P0: .quad 0xFFFFFFFFFFFDA
-x25519_x86_64_2P1234: .quad 0xFFFFFFFFFFFFE
-x25519_x86_64_4P0: .quad 0x1FFFFFFFFFFFB4
-x25519_x86_64_4P1234: .quad 0x1FFFFFFFFFFFFC
-x25519_x86_64_MU0: .quad 0xED9CE5A30A2C131B
-x25519_x86_64_MU1: .quad 0x2106215D086329A7
-x25519_x86_64_MU2: .quad 0xFFFFFFFFFFFFFFEB
-x25519_x86_64_MU3: .quad 0xFFFFFFFFFFFFFFFF
-x25519_x86_64_MU4: .quad 0x000000000000000F
-x25519_x86_64_ORDER0: .quad 0x5812631A5CF5D3ED
-x25519_x86_64_ORDER1: .quad 0x14DEF9DEA2F79CD6
-x25519_x86_64_ORDER2: .quad 0x0000000000000000
-x25519_x86_64_ORDER3: .quad 0x1000000000000000
-x25519_x86_64_EC2D0: .quad 1859910466990425
-x25519_x86_64_EC2D1: .quad 932731440258426
-x25519_x86_64_EC2D2: .quad 1072319116312658
-x25519_x86_64_EC2D3: .quad 1815898335770999
-x25519_x86_64_EC2D4: .quad 633789495995903
-x25519_x86_64__38: .quad 38
-
-.text
-.p2align 5
-
-.globl x25519_x86_64_freeze
-.hidden x25519_x86_64_freeze
-x25519_x86_64_freeze:
-.cfi_startproc
-/* This is a leaf function and uses the redzone for saving registers. */
-movq %r12,-8(%rsp)
-.cfi_rel_offset r12, -8
-movq 0(%rdi),%rsi
-movq 8(%rdi),%rdx
-movq 16(%rdi),%rcx
-movq 24(%rdi),%r8
-movq 32(%rdi),%r9
-movq x25519_x86_64_REDMASK51(%rip),%rax
-mov %rax,%r10
-sub $18,%r10
-mov $3,%r11
-._reduceloop:
-mov %rsi,%r12
-shr $51,%r12
-and %rax,%rsi
-add %r12,%rdx
-mov %rdx,%r12
-shr $51,%r12
-and %rax,%rdx
-add %r12,%rcx
-mov %rcx,%r12
-shr $51,%r12
-and %rax,%rcx
-add %r12,%r8
-mov %r8,%r12
-shr $51,%r12
-and %rax,%r8
-add %r12,%r9
-mov %r9,%r12
-shr $51,%r12
-and %rax,%r9
-imulq $19,%r12,%r12
-add %r12,%rsi
-sub $1,%r11
-ja ._reduceloop
-mov $1,%r12
-cmp %r10,%rsi
-cmovl %r11,%r12
-cmp %rax,%rdx
-cmovne %r11,%r12
-cmp %rax,%rcx
-cmovne %r11,%r12
-cmp %rax,%r8
-cmovne %r11,%r12
-cmp %rax,%r9
-cmovne %r11,%r12
-neg %r12
-and %r12,%rax
-and %r12,%r10
-sub %r10,%rsi
-sub %rax,%rdx
-sub %rax,%rcx
-sub %rax,%r8
-sub %rax,%r9
-movq %rsi,0(%rdi)
-movq %rdx,8(%rdi)
-movq %rcx,16(%rdi)
-movq %r8,24(%rdi)
-movq %r9,32(%rdi)
-movq -8(%rsp),%r12
-ret
-.cfi_endproc
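
An aside for readers new to this representation: the following standalone C
sketch is a hypothetical illustration, not part of the deleted file. It shows
the same branch-free "freeze" that the routine above implements with cmov,
under the assumption that the element has already been carried so every limb
is below 2^51 (so the value lies in [0, 2p) for p = 2^255 - 19):

#include <stdint.h>

/* Hypothetical sketch, not the deleted code: canonicalize a 5x51-bit
 * field element by subtracting p = 2^255 - 19 behind a mask, so neither
 * branches nor memory accesses depend on the (possibly secret) value. */
static void freeze_sketch(uint64_t a[5])
{
	const uint64_t mask51 = 0x0007ffffffffffffULL; /* 2^51 - 1 */
	uint64_t t[5], borrow, keep;
	int i;

	/* t = a - p, limb by limb; a borrow shows up in bit 63. */
	t[0] = a[0] - (mask51 - 18);            /* p's low limb: 2^51 - 19 */
	borrow = t[0] >> 63;
	t[0] &= mask51;
	for (i = 1; i < 5; ++i) {               /* p's high limbs: 2^51 - 1 */
		t[i] = a[i] - mask51 - borrow;
		borrow = t[i] >> 63;
		t[i] &= mask51;
	}

	/* borrow == 1 iff a < p: keep a; otherwise take a - p. */
	keep = 0 - borrow;                      /* all-ones mask when a < p */
	for (i = 0; i < 5; ++i)
		a[i] = (a[i] & keep) | (t[i] & ~keep);
}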
 [x25519_x86_64_mul, x25519_x86_64_square and x25519_x86_64_ladderstep -- the
  bulk of this file's 1888 deleted lines, consisting of fully unrolled
  5x51-bit limb multiplication, squaring, carry chains and the Montgomery
  ladder step -- are not reproduced here.]
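
The last routine in the file, reproduced below, is the constant-time
conditional swap used by the ladder. As a hypothetical sketch (not the
deleted code itself), the idea in portable C: the secret bit becomes an
all-zeros or all-ones mask, and the two elements are swapped with XORs, so
the access pattern and timing never depend on the bit. The SSE2 version
below applies the same mask 128 bits at a time to two interleaved
five-limb elements:

#include <stdint.h>

/* Hypothetical sketch, not the deleted code: branch-free conditional swap. */
static void cswap_sketch(uint64_t a[5], uint64_t b[5], uint64_t bit)
{
	const uint64_t mask = 0 - bit;          /* bit must be 0 or 1 */
	int i;

	for (i = 0; i < 5; ++i) {
		const uint64_t x = mask & (a[i] ^ b[i]);

		a[i] ^= x;
		b[i] ^= x;
	}
}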
-.p2align 5
-.globl x25519_x86_64_work_cswap
-.hidden x25519_x86_64_work_cswap
-x25519_x86_64_work_cswap:
-.cfi_startproc
-subq $1,%rsi
-notq %rsi
-movq %rsi,%xmm15
-pshufd $0x44,%xmm15,%xmm15
-movdqu 0(%rdi),%xmm0
-movdqu 16(%rdi),%xmm2
-movdqu 32(%rdi),%xmm4
-movdqu 48(%rdi),%xmm6
-movdqu 64(%rdi),%xmm8
-movdqu 80(%rdi),%xmm1
-movdqu 96(%rdi),%xmm3
-movdqu 112(%rdi),%xmm5
-movdqu 128(%rdi),%xmm7
-movdqu 144(%rdi),%xmm9
-movdqa %xmm1,%xmm10
-movdqa %xmm3,%xmm11
-movdqa %xmm5,%xmm12
-movdqa %xmm7,%xmm13
-movdqa %xmm9,%xmm14
-pxor %xmm0,%xmm10
-pxor %xmm2,%xmm11
-pxor %xmm4,%xmm12
-pxor %xmm6,%xmm13
-pxor %xmm8,%xmm14
-pand %xmm15,%xmm10
-pand %xmm15,%xmm11
-pand %xmm15,%xmm12
-pand %xmm15,%xmm13
-pand %xmm15,%xmm14
-pxor %xmm10,%xmm0
-pxor %xmm10,%xmm1
-pxor %xmm11,%xmm2
-pxor %xmm11,%xmm3
-pxor %xmm12,%xmm4
-pxor %xmm12,%xmm5
-pxor %xmm13,%xmm6
-pxor %xmm13,%xmm7
-pxor %xmm14,%xmm8
-pxor %xmm14,%xmm9
-movdqu %xmm0,0(%rdi)
-movdqu %xmm2,16(%rdi)
-movdqu %xmm4,32(%rdi)
-movdqu %xmm6,48(%rdi)
-movdqu %xmm8,64(%rdi)
-movdqu %xmm1,80(%rdi)
-movdqu %xmm3,96(%rdi)
-movdqu %xmm5,112(%rdi)
-movdqu %xmm7,128(%rdi)
-movdqu %xmm9,144(%rdi)
-ret
-.cfi_endproc
diff --git a/curve25519-amd64.c b/curve25519-amd64.c
deleted file mode 100644
index 095b0d2..0000000
--- a/curve25519-amd64.c
+++ /dev/null
@@ -1,234 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0
- *
- * Copyright (C) 2015 Google Inc. All Rights Reserved.
- * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * Original author: Peter Schwabe <peter@cryptojedi.org>
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-typedef struct { uint64_t v[5]; } fe25519;
-
-asmlinkage void x25519_x86_64_work_cswap(fe25519 *, uint64_t);
-asmlinkage void x25519_x86_64_mul(fe25519 *out, const fe25519 *a, const fe25519 *b);
-asmlinkage void x25519_x86_64_square(fe25519 *out, const fe25519 *a);
-asmlinkage void x25519_x86_64_freeze(fe25519 *);
-asmlinkage void x25519_x86_64_ladderstep(fe25519 *work);
-
-enum { CURVE25519_POINT_SIZE = 32 };
-
-static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE])
-{
-	secret[0] &= 248;
-	secret[31] &= 127;
-	secret[31] |= 64;
-}
-
-static void fe25519_setint(fe25519 *r, unsigned v)
-{
-	r->v[0] = v;
-	r->v[1] = 0;
-	r->v[2] = 0;
-	r->v[3] = 0;
-	r->v[4] = 0;
-}
-
 [fe25519_pack and fe25519_unpack -- the 32-byte little-endian <-> 5x51-bit
  limb conversions (pack assumes its input is reduced below 2^255 and calls
  x25519_x86_64_freeze first) -- are not reproduced here.]
-
-static void fe25519_invert(fe25519 *r, const fe25519 *x)
-{
-	fe25519 z2;
-	fe25519 z9;
-	fe25519 z11;
-	fe25519 z2_5_0;
-	fe25519 z2_10_0;
-	fe25519 z2_20_0;
-	fe25519 z2_50_0;
-	fe25519 z2_100_0;
-	fe25519 t;
-	int i;
-
-	/* 2 */ x25519_x86_64_square(&z2, x);
-	/* 4 */ x25519_x86_64_square(&t, &z2);
-	/* 8 */ x25519_x86_64_square(&t, &t);
-	/* 9 */ x25519_x86_64_mul(&z9, &t, x);
-	/* 11 */ x25519_x86_64_mul(&z11, &z9, &z2);
-	/* 22 */ x25519_x86_64_square(&t, &z11);
-	/* 2^5 - 2^0 = 31 */ x25519_x86_64_mul(&z2_5_0, &t, &z9);
-
-	/* 2^6 - 2^1 */ x25519_x86_64_square(&t, &z2_5_0);
-	/* 2^10 - 2^5 */ for (i = 1; i < 5; i++) { x25519_x86_64_square(&t, &t); }
-	/* 2^10 - 2^0 */ x25519_x86_64_mul(&z2_10_0, &t, &z2_5_0);
-
-	/* 2^11 - 2^1 */ x25519_x86_64_square(&t, &z2_10_0);
-	/* 2^20 - 2^10 */ for (i = 1; i < 10; i++) { x25519_x86_64_square(&t, &t); }
-	/* 2^20 - 2^0 */ x25519_x86_64_mul(&z2_20_0, &t, &z2_10_0);
-
-	/* 2^21 - 2^1 */ x25519_x86_64_square(&t, &z2_20_0);
-	/* 2^40 - 2^20 */ for (i = 1; i < 20; i++) { x25519_x86_64_square(&t, &t); }
-	/* 2^40 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_20_0);
-
-	/* 2^41 - 2^1 */ x25519_x86_64_square(&t, &t);
-	/* 2^50 - 2^10 */ for (i = 1; i < 10; i++) { x25519_x86_64_square(&t, &t); }
-	/* 2^50 - 2^0 */ x25519_x86_64_mul(&z2_50_0, &t, &z2_10_0);
-
-	/* 2^51 - 2^1 */ x25519_x86_64_square(&t, &z2_50_0);
-	/* 2^100 - 2^50 */ for (i = 1; i < 50; i++) { x25519_x86_64_square(&t, &t); }
-	/* 2^100 - 2^0 */ x25519_x86_64_mul(&z2_100_0, &t, &z2_50_0);
-
-	/* 2^101 - 2^1 */ x25519_x86_64_square(&t, &z2_100_0);
-	/* 2^200 - 2^100 */ for (i = 1; i < 100; i++) { x25519_x86_64_square(&t, &t); }
-	/* 2^200 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_100_0);
-
-	/* 2^201 - 2^1 */ x25519_x86_64_square(&t, &t);
-	/* 2^250 - 2^50 */ for (i = 1; i < 50; i++) { x25519_x86_64_square(&t, &t); }
-	/* 2^250 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_50_0);
-
-	/* 2^251 - 2^1 */ x25519_x86_64_square(&t, &t);
-	/* 2^252 - 2^2 */ x25519_x86_64_square(&t, &t);
-	/* 2^253 - 2^3 */ x25519_x86_64_square(&t, &t);
-	/* 2^254 - 2^4 */ x25519_x86_64_square(&t, &t);
-	/* 2^255 - 2^5 */ x25519_x86_64_square(&t, &t);
-	/* 2^255 - 21 */ x25519_x86_64_mul(r, &t, &z11);
-}
-
-static void mladder(fe25519 *xr, fe25519 *zr, const uint8_t s[32])
-{
-	int i, j;
-	uint8_t prevbit = 0;
-	fe25519 work[5];
-
-	work[0] = *xr;
-	fe25519_setint(work + 1, 1);
-	fe25519_setint(work + 2, 0);
-	work[3] = *xr;
-	fe25519_setint(work + 4, 1);
-
-	j = 6;
-	for (i = 31; i >= 0; i--) {
-		while (j >= 0) {
-			const uint8_t bit = 1 & (s[i] >> j);
-			const uint64_t swap = bit ^ prevbit;
-			prevbit = bit;
-			x25519_x86_64_work_cswap(work + 1, swap);
-			x25519_x86_64_ladderstep(work);
-			j -= 1;
-		}
-		j = 7;
-	}
-
-	*xr = work[1];
-	*zr = work[2];
-}
-
-bool curve25519_amd64(u8 out[CURVE25519_POINT_SIZE], const u8 scalar[CURVE25519_POINT_SIZE], const u8 point[CURVE25519_POINT_SIZE])
-{
-	fe25519 t;
-	fe25519 z;
-	uint8_t e[32];
-
-	memcpy(e, scalar, sizeof(e));
-	normalize_secret(e);
-
-	fe25519_unpack(&t, point);
-	mladder(&t, &z, e);
-	fe25519_invert(&z, &z);
-	x25519_x86_64_mul(&t, &t, &z);
-	fe25519_pack(out, &t);
-	return true;
-}
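
For orientation, a hypothetical caller of the deleted entry point above --
the wrapper name is illustrative and not from the repository:

/* Hypothetical caller, for illustration only. */
static bool example_shared_secret(u8 secret[CURVE25519_POINT_SIZE],
				  const u8 private_key[CURVE25519_POINT_SIZE],
				  const u8 peer_public[CURVE25519_POINT_SIZE])
{
	/*
	 * curve25519_amd64() copies and clamps the scalar internally via
	 * normalize_secret(), so the caller's key buffer stays untouched.
	 */
	return curve25519_amd64(secret, private_key, peer_public);
}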
--git a/curve25519-donna32.c b/curve25519-donna32.c deleted file mode 100644 index 4721864..0000000 --- a/curve25519-donna32.c +++ /dev/null @@ -1,861 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright (C) 2008 Google Inc. All Rights Reserved. - * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * - * Original author: Adam Langley <agl@imperialviolet.org> - */ - -#include <linux/kernel.h> -#include <linux/string.h> - -enum { CURVE25519_POINT_SIZE = 32 }; - -static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE]) -{ - secret[0] &= 248; - secret[31] &= 127; - secret[31] |= 64; -} - -typedef s64 limb; - -/* Field element representation: - * - * Field elements are written as an array of signed, 64-bit limbs, least - * significant first. The value of the field element is: - * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ... - * - * i.e. the limbs are 26, 25, 26, 25, ... bits wide. - */ - -/* Sum two numbers: output += in */ -static void fsum(limb *output, const limb *in) -{ - unsigned int i; - - for (i = 0; i < 10; i += 2) { - output[0 + i] = output[0 + i] + in[0 + i]; - output[1 + i] = output[1 + i] + in[1 + i]; - } -} - -/* Find the difference of two numbers: output = in - output - * (note the order of the arguments!). - */ -static void fdifference(limb *output, const limb *in) -{ - unsigned int i; - - for (i = 0; i < 10; ++i) - output[i] = in[i] - output[i]; -} - -/* Multiply a number by a scalar: output = in * scalar */ -static void fscalar_product(limb *output, const limb *in, const limb scalar) -{ - unsigned int i; - - for (i = 0; i < 10; ++i) - output[i] = in[i] * scalar; -} - -/* Multiply two numbers: output = in2 * in - * - * output must be distinct to both inputs. The inputs are reduced coefficient - * form, the output is not. - * - * output[x] <= 14 * the largest product of the input limbs. 
- */ -static void fproduct(limb *output, const limb *in2, const limb *in) -{ - output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]); - output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) + - ((limb) ((s32) in2[1])) * ((s32) in[0]); - output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) + - ((limb) ((s32) in2[0])) * ((s32) in[2]) + - ((limb) ((s32) in2[2])) * ((s32) in[0]); - output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) + - ((limb) ((s32) in2[2])) * ((s32) in[1]) + - ((limb) ((s32) in2[0])) * ((s32) in[3]) + - ((limb) ((s32) in2[3])) * ((s32) in[0]); - output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) + - 2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) + - ((limb) ((s32) in2[3])) * ((s32) in[1])) + - ((limb) ((s32) in2[0])) * ((s32) in[4]) + - ((limb) ((s32) in2[4])) * ((s32) in[0]); - output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) + - ((limb) ((s32) in2[3])) * ((s32) in[2]) + - ((limb) ((s32) in2[1])) * ((s32) in[4]) + - ((limb) ((s32) in2[4])) * ((s32) in[1]) + - ((limb) ((s32) in2[0])) * ((s32) in[5]) + - ((limb) ((s32) in2[5])) * ((s32) in[0]); - output[6] = 2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) + - ((limb) ((s32) in2[1])) * ((s32) in[5]) + - ((limb) ((s32) in2[5])) * ((s32) in[1])) + - ((limb) ((s32) in2[2])) * ((s32) in[4]) + - ((limb) ((s32) in2[4])) * ((s32) in[2]) + - ((limb) ((s32) in2[0])) * ((s32) in[6]) + - ((limb) ((s32) in2[6])) * ((s32) in[0]); - output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) + - ((limb) ((s32) in2[4])) * ((s32) in[3]) + - ((limb) ((s32) in2[2])) * ((s32) in[5]) + - ((limb) ((s32) in2[5])) * ((s32) in[2]) + - ((limb) ((s32) in2[1])) * ((s32) in[6]) + - ((limb) ((s32) in2[6])) * ((s32) in[1]) + - ((limb) ((s32) in2[0])) * ((s32) in[7]) + - ((limb) ((s32) in2[7])) * ((s32) in[0]); - output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) + - 2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) + - ((limb) ((s32) in2[5])) * ((s32) in[3]) + - ((limb) ((s32) in2[1])) * ((s32) in[7]) + - ((limb) ((s32) in2[7])) * ((s32) in[1])) + - ((limb) ((s32) in2[2])) * ((s32) in[6]) + - ((limb) ((s32) in2[6])) * ((s32) in[2]) + - ((limb) ((s32) in2[0])) * ((s32) in[8]) + - ((limb) ((s32) in2[8])) * ((s32) in[0]); - output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) + - ((limb) ((s32) in2[5])) * ((s32) in[4]) + - ((limb) ((s32) in2[3])) * ((s32) in[6]) + - ((limb) ((s32) in2[6])) * ((s32) in[3]) + - ((limb) ((s32) in2[2])) * ((s32) in[7]) + - ((limb) ((s32) in2[7])) * ((s32) in[2]) + - ((limb) ((s32) in2[1])) * ((s32) in[8]) + - ((limb) ((s32) in2[8])) * ((s32) in[1]) + - ((limb) ((s32) in2[0])) * ((s32) in[9]) + - ((limb) ((s32) in2[9])) * ((s32) in[0]); - output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) + - ((limb) ((s32) in2[3])) * ((s32) in[7]) + - ((limb) ((s32) in2[7])) * ((s32) in[3]) + - ((limb) ((s32) in2[1])) * ((s32) in[9]) + - ((limb) ((s32) in2[9])) * ((s32) in[1])) + - ((limb) ((s32) in2[4])) * ((s32) in[6]) + - ((limb) ((s32) in2[6])) * ((s32) in[4]) + - ((limb) ((s32) in2[2])) * ((s32) in[8]) + - ((limb) ((s32) in2[8])) * ((s32) in[2]); - output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) + - ((limb) ((s32) in2[6])) * ((s32) in[5]) + - ((limb) ((s32) in2[4])) * ((s32) in[7]) + - ((limb) ((s32) in2[7])) * ((s32) in[4]) + - ((limb) ((s32) in2[3])) * ((s32) in[8]) + - ((limb) ((s32) in2[8])) * ((s32) in[3]) + - ((limb) ((s32) in2[2])) * ((s32) in[9]) + - ((limb) ((s32) in2[9])) * ((s32) in[2]); - output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) + - 2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) + - ((limb) ((s32) in2[7])) * ((s32) in[5]) 
+ - ((limb) ((s32) in2[3])) * ((s32) in[9]) + - ((limb) ((s32) in2[9])) * ((s32) in[3])) + - ((limb) ((s32) in2[4])) * ((s32) in[8]) + - ((limb) ((s32) in2[8])) * ((s32) in[4]); - output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) + - ((limb) ((s32) in2[7])) * ((s32) in[6]) + - ((limb) ((s32) in2[5])) * ((s32) in[8]) + - ((limb) ((s32) in2[8])) * ((s32) in[5]) + - ((limb) ((s32) in2[4])) * ((s32) in[9]) + - ((limb) ((s32) in2[9])) * ((s32) in[4]); - output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) + - ((limb) ((s32) in2[5])) * ((s32) in[9]) + - ((limb) ((s32) in2[9])) * ((s32) in[5])) + - ((limb) ((s32) in2[6])) * ((s32) in[8]) + - ((limb) ((s32) in2[8])) * ((s32) in[6]); - output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) + - ((limb) ((s32) in2[8])) * ((s32) in[7]) + - ((limb) ((s32) in2[6])) * ((s32) in[9]) + - ((limb) ((s32) in2[9])) * ((s32) in[6]); - output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) + - 2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) + - ((limb) ((s32) in2[9])) * ((s32) in[7])); - output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) + - ((limb) ((s32) in2[9])) * ((s32) in[8]); - output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]); -} - -/* Reduce a long form to a short form by taking the input mod 2^255 - 19. - * - * On entry: |output[i]| < 14*2^54 - * On exit: |output[0..8]| < 280*2^54 - */ -static void freduce_degree(limb *output) -{ - /* Each of these shifts and adds ends up multiplying the value by 19. - * - * For output[0..8], the absolute entry value is < 14*2^54 and we add, at - * most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. - */ - output[8] += output[18] << 4; - output[8] += output[18] << 1; - output[8] += output[18]; - output[7] += output[17] << 4; - output[7] += output[17] << 1; - output[7] += output[17]; - output[6] += output[16] << 4; - output[6] += output[16] << 1; - output[6] += output[16]; - output[5] += output[15] << 4; - output[5] += output[15] << 1; - output[5] += output[15]; - output[4] += output[14] << 4; - output[4] += output[14] << 1; - output[4] += output[14]; - output[3] += output[13] << 4; - output[3] += output[13] << 1; - output[3] += output[13]; - output[2] += output[12] << 4; - output[2] += output[12] << 1; - output[2] += output[12]; - output[1] += output[11] << 4; - output[1] += output[11] << 1; - output[1] += output[11]; - output[0] += output[10] << 4; - output[0] += output[10] << 1; - output[0] += output[10]; -} - -/* return v / 2^26, using only shifts and adds. - * - * On entry: v can take any value. - */ -static inline limb div_by_2_26(const limb v) -{ - /* High word of v; no shift needed. */ - const u32 highword = (u32) (((u64) v) >> 32); - /* Set to all 1s if v was negative; else set to 0s. */ - const s32 sign = ((s32) highword) >> 31; - /* Set to 0x3ffffff if v was negative; else set to 0. */ - const s32 roundoff = ((u32) sign) >> 6; - /* Should return v / (1<<26) */ - return (v + roundoff) >> 26; -} - -/* return v / (2^25), using only shifts and adds. - * - * On entry: v can take any value. - */ -static inline limb div_by_2_25(const limb v) -{ - /* High word of v; no shift needed*/ - const u32 highword = (u32) (((u64) v) >> 32); - /* Set to all 1s if v was negative; else set to 0s. */ - const s32 sign = ((s32) highword) >> 31; - /* Set to 0x1ffffff if v was negative; else set to 0. */ - const s32 roundoff = ((u32) sign) >> 7; - /* Should return v / (1<<25) */ - return (v + roundoff) >> 25; -} - -/* Reduce all coefficients of the short form input so that |x| < 2^26. 
- * - * On entry: |output[i]| < 280*2^54 - */ -static void freduce_coefficients(limb *output) -{ - unsigned int i; - - output[10] = 0; - - for (i = 0; i < 10; i += 2) { - limb over = div_by_2_26(output[i]); - /* The entry condition (that |output[i]| < 280*2^54) means that over is, at - * most, 280*2^28 in the first iteration of this loop. This is added to the - * next limb and we can approximate the resulting bound of that limb by - * 281*2^54. - */ - output[i] -= over << 26; - output[i+1] += over; - - /* For the first iteration, |output[i+1]| < 281*2^54, thus |over| < - * 281*2^29. When this is added to the next limb, the resulting bound can - * be approximated as 281*2^54. - * - * For subsequent iterations of the loop, 281*2^54 remains a conservative - * bound and no overflow occurs. - */ - over = div_by_2_25(output[i+1]); - output[i+1] -= over << 25; - output[i+2] += over; - } - /* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */ - output[0] += output[10] << 4; - output[0] += output[10] << 1; - output[0] += output[10]; - - output[10] = 0; - - /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29 - * So |over| will be no more than 2^16. - */ - { - limb over = div_by_2_26(output[0]); - - output[0] -= over << 26; - output[1] += over; - } - - /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The - * bound on |output[1]| is sufficient to meet our needs. - */ -} - -/* A helpful wrapper around fproduct: output = in * in2. - * - * On entry: |in[i]| < 2^27 and |in2[i]| < 2^27. - * - * output must be distinct to both inputs. The output is reduced degree - * (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. - */ -static void fmul(limb *output, const limb *in, const limb *in2) -{ - limb t[19]; - - fproduct(t, in, in2); - /* |t[i]| < 14*2^54 */ - freduce_degree(t); - freduce_coefficients(t); - /* |t[i]| < 2^26 */ - memcpy(output, t, sizeof(limb) * 10); -} - -/* Square a number: output = in**2 - * - * output must be distinct from the input. The inputs are reduced coefficient - * form, the output is not. - * - * output[x] <= 14 * the largest product of the input limbs. 
- */ -static void fsquare_inner(limb *output, const limb *in) -{ - output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]); - output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]); - output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) + - ((limb) ((s32) in[0])) * ((s32) in[2])); - output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) + - ((limb) ((s32) in[0])) * ((s32) in[3])); - output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) + - 4 * ((limb) ((s32) in[1])) * ((s32) in[3]) + - 2 * ((limb) ((s32) in[0])) * ((s32) in[4]); - output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) + - ((limb) ((s32) in[1])) * ((s32) in[4]) + - ((limb) ((s32) in[0])) * ((s32) in[5])); - output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) + - ((limb) ((s32) in[2])) * ((s32) in[4]) + - ((limb) ((s32) in[0])) * ((s32) in[6]) + - 2 * ((limb) ((s32) in[1])) * ((s32) in[5])); - output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) + - ((limb) ((s32) in[2])) * ((s32) in[5]) + - ((limb) ((s32) in[1])) * ((s32) in[6]) + - ((limb) ((s32) in[0])) * ((s32) in[7])); - output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) + - 2 * (((limb) ((s32) in[2])) * ((s32) in[6]) + - ((limb) ((s32) in[0])) * ((s32) in[8]) + - 2 * (((limb) ((s32) in[1])) * ((s32) in[7]) + - ((limb) ((s32) in[3])) * ((s32) in[5]))); - output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) + - ((limb) ((s32) in[3])) * ((s32) in[6]) + - ((limb) ((s32) in[2])) * ((s32) in[7]) + - ((limb) ((s32) in[1])) * ((s32) in[8]) + - ((limb) ((s32) in[0])) * ((s32) in[9])); - output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) + - ((limb) ((s32) in[4])) * ((s32) in[6]) + - ((limb) ((s32) in[2])) * ((s32) in[8]) + - 2 * (((limb) ((s32) in[3])) * ((s32) in[7]) + - ((limb) ((s32) in[1])) * ((s32) in[9]))); - output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) + - ((limb) ((s32) in[4])) * ((s32) in[7]) + - ((limb) ((s32) in[3])) * ((s32) in[8]) + - ((limb) ((s32) in[2])) * ((s32) in[9])); - output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) + - 2 * (((limb) ((s32) in[4])) * ((s32) in[8]) + - 2 * (((limb) ((s32) in[5])) * ((s32) in[7]) + - ((limb) ((s32) in[3])) * ((s32) in[9]))); - output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) + - ((limb) ((s32) in[5])) * ((s32) in[8]) + - ((limb) ((s32) in[4])) * ((s32) in[9])); - output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) + - ((limb) ((s32) in[6])) * ((s32) in[8]) + - 2 * ((limb) ((s32) in[5])) * ((s32) in[9])); - output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) + - ((limb) ((s32) in[6])) * ((s32) in[9])); - output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) + - 4 * ((limb) ((s32) in[7])) * ((s32) in[9]); - output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]); - output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]); -} - -/* fsquare sets output = in^2. - * - * On entry: The |in| argument is in reduced coefficients form and |in[i]| < - * 2^27. - * - * On exit: The |output| argument is in reduced coefficients form (indeed, one - * need only provide storage for 10 limbs) and |out[i]| < 2^26. - */ -static void fsquare(limb *output, const limb *in) -{ - limb t[19]; - - fsquare_inner(t, in); - /* |t[i]| < 14*2^54 because the largest product of two limbs will be < - * 2^(27+27) and fsquare_inner adds together, at most, 14 of those - * products. 
- */ - freduce_degree(t); - freduce_coefficients(t); - /* |t[i]| < 2^26 */ - memcpy(output, t, sizeof(limb) * 10); -} - -/* Take a little-endian, 32-byte number and expand it into polynomial form */ -static inline void fexpand(limb *output, const u8 *input) -{ -#define F(n, start, shift, mask) \ - output[n] = ((((limb) input[start + 0]) | \ - ((limb) input[start + 1]) << 8 | \ - ((limb) input[start + 2]) << 16 | \ - ((limb) input[start + 3]) << 24) >> shift) & mask; - F(0, 0, 0, 0x3ffffff); - F(1, 3, 2, 0x1ffffff); - F(2, 6, 3, 0x3ffffff); - F(3, 9, 5, 0x1ffffff); - F(4, 12, 6, 0x3ffffff); - F(5, 16, 0, 0x1ffffff); - F(6, 19, 1, 0x3ffffff); - F(7, 22, 3, 0x1ffffff); - F(8, 25, 4, 0x3ffffff); - F(9, 28, 6, 0x1ffffff); -#undef F -} - -/* s32_eq returns 0xffffffff iff a == b and zero otherwise. */ -static s32 s32_eq(s32 a, s32 b) -{ - a = ~(a ^ b); - a &= a << 16; - a &= a << 8; - a &= a << 4; - a &= a << 2; - a &= a << 1; - return a >> 31; -} - -/* s32_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are - * both non-negative. - */ -static s32 s32_gte(s32 a, s32 b) -{ - a -= b; - /* a >= 0 iff a >= b. */ - return ~(a >> 31); -} - -/* Take a fully reduced polynomial form number and contract it into a - * little-endian, 32-byte array. - * - * On entry: |input_limbs[i]| < 2^26 - */ -static void fcontract(u8 *output, limb *input_limbs) -{ - int i; - int j; - s32 input[10]; - s32 mask; - - /* |input_limbs[i]| < 2^26, so it's valid to convert to an s32. */ - for (i = 0; i < 10; i++) { - input[i] = input_limbs[i]; - } - - for (j = 0; j < 2; ++j) { - for (i = 0; i < 9; ++i) { - if ((i & 1) == 1) { - /* This calculation is a time-invariant way to make input[i] - * non-negative by borrowing from the next-larger limb. - */ - const s32 mask = input[i] >> 31; - const s32 carry = -((input[i] & mask) >> 25); - - input[i] = input[i] + (carry << 25); - input[i+1] = input[i+1] - carry; - } else { - const s32 mask = input[i] >> 31; - const s32 carry = -((input[i] & mask) >> 26); - - input[i] = input[i] + (carry << 26); - input[i+1] = input[i+1] - carry; - } - } - - /* There's no greater limb for input[9] to borrow from, but we can multiply - * by 19 and borrow from input[0], which is valid mod 2^255-19. - */ - { - const s32 mask = input[9] >> 31; - const s32 carry = -((input[9] & mask) >> 25); - - input[9] = input[9] + (carry << 25); - input[0] = input[0] - (carry * 19); - } - - /* After the first iteration, input[1..9] are non-negative and fit within - * 25 or 26 bits, depending on position. However, input[0] may be - * negative. - */ - } - - /* The first borrow-propagation pass above ended with every limb - except (possibly) input[0] non-negative. - If input[0] was negative after the first pass, then it was because of a - carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most, - one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19. - In the second pass, each limb is decreased by at most one. Thus the second - borrow-propagation pass could only have wrapped around to decrease - input[0] again if the first pass left input[0] negative *and* input[1] - through input[9] were all zero. In that case, input[1] is now 2^25 - 1, - and this last borrow-propagation step will leave input[1] non-negative. */ - { - const s32 mask = input[0] >> 31; - const s32 carry = -((input[0] & mask) >> 26); - - input[0] = input[0] + (carry << 26); - input[1] = input[1] - carry; - } - - /* All input[i] are now non-negative. 
However, there might be values between
- * 2^25 and 2^26 in a limb which is, nominally, 25 bits wide.
- */
-	for (j = 0; j < 2; j++) {
-		for (i = 0; i < 9; i++) {
-			if ((i & 1) == 1) {
-				const s32 carry = input[i] >> 25;
-
-				input[i] &= 0x1ffffff;
-				input[i+1] += carry;
-			} else {
-				const s32 carry = input[i] >> 26;
-
-				input[i] &= 0x3ffffff;
-				input[i+1] += carry;
-			}
-		}
-
-		{
-			const s32 carry = input[9] >> 25;
-
-			input[9] &= 0x1ffffff;
-			input[0] += 19*carry;
-		}
-	}
-
-	/* If the first carry-chain pass, just above, ended up with a carry from
-	 * input[9], and that caused input[0] to be out-of-bounds, then input[0] was
-	 * < 2^26 + 2*19, because the carry was, at most, two.
-	 *
-	 * If the second pass carried from input[9] again then input[0] is < 2*19 and
-	 * the input[9] -> input[0] carry didn't push input[0] out of bounds.
-	 */
-
-	/* It still remains the case that input might be between 2^255-19 and 2^255.
-	 * In this case, input[1..9] must take their maximum value and input[0] must
-	 * be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed.
-	 */
-	mask = s32_gte(input[0], 0x3ffffed);
-	for (i = 1; i < 10; i++) {
-		if ((i & 1) == 1) {
-			mask &= s32_eq(input[i], 0x1ffffff);
-		} else {
-			mask &= s32_eq(input[i], 0x3ffffff);
-		}
-	}
-
-	/* mask is either 0xffffffff (if input >= 2^255-19) or zero otherwise. Thus
-	 * this conditionally subtracts 2^255-19.
-	 */
-	input[0] -= mask & 0x3ffffed;
-
-	for (i = 1; i < 10; i++) {
-		if ((i & 1) == 1) {
-			input[i] -= mask & 0x1ffffff;
-		} else {
-			input[i] -= mask & 0x3ffffff;
-		}
-	}
-
-	input[1] <<= 2;
-	input[2] <<= 3;
-	input[3] <<= 5;
-	input[4] <<= 6;
-	input[6] <<= 1;
-	input[7] <<= 3;
-	input[8] <<= 4;
-	input[9] <<= 6;
-#define F(i, s) \
-	output[s+0] |= input[i] & 0xff; \
-	output[s+1] = (input[i] >> 8) & 0xff; \
-	output[s+2] = (input[i] >> 16) & 0xff; \
-	output[s+3] = (input[i] >> 24) & 0xff;
-	output[0] = 0;
-	output[16] = 0;
-	F(0, 0);
-	F(1, 3);
-	F(2, 6);
-	F(3, 9);
-	F(4, 12);
-	F(5, 16);
-	F(6, 19);
-	F(7, 22);
-	F(8, 25);
-	F(9, 28);
-#undef F
-}
-
-/* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave
- * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid
- * side-channel attacks.
- *
- * NOTE that this function requires that 'iswap' be 1 or 0; other values give
- * wrong results. Also, the two limb arrays must be in reduced-coefficient,
- * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped,
- * and all values in a[0..9],b[0..9] must have magnitude less than
- * INT32_MAX.
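The mask-and-subtract at the end of fcontract above is the standard branchless "compare against p, then conditionally subtract p" finish. A toy model of the same idiom over the one-word field p = 2^15 - 19 (an illustrative sketch; the real code builds the mask limb-wise with s32_gte and s32_eq):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		const uint32_t p = (1u << 15) - 19;	/* toy stand-in for 2^255 - 19 */
		uint32_t x;

		for (x = 0; x < (1u << 15); x++) {
			uint32_t y = x;
			/* all-ones iff y >= p: the unsigned subtract borrows otherwise */
			uint32_t mask = ((y - p) >> 31) - 1;

			y -= mask & p;	/* conditional subtract, no branch */
			assert(y == x % p);
		}
		return 0;
	}
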
- */
-static void swap_conditional(limb a[19], limb b[19], limb iswap)
-{
-	unsigned int i;
-	const s32 swap = (s32) -iswap;
-
-	for (i = 0; i < 10; ++i) {
-		const s32 x = swap & (((s32)a[i]) ^ ((s32)b[i]));
-
-		a[i] = ((s32)a[i]) ^ x;
-		b[i] = ((s32)b[i]) ^ x;
-	}
-}
-
-static void crecip(limb *out, const limb *z)
-{
-	limb z2[10];
-	limb z9[10];
-	limb z11[10];
-	limb z2_5_0[10];
-	limb z2_10_0[10];
-	limb z2_20_0[10];
-	limb z2_50_0[10];
-	limb z2_100_0[10];
-	limb t0[10];
-	limb t1[10];
-	int i;
-
-	/* 2 */ fsquare(z2, z);
-	/* 4 */ fsquare(t1, z2);
-	/* 8 */ fsquare(t0, t1);
-	/* 9 */ fmul(z9, t0, z);
-	/* 11 */ fmul(z11, z9, z2);
-	/* 22 */ fsquare(t0, z11);
-	/* 2^5 - 2^0 = 31 */ fmul(z2_5_0, t0, z9);
-
-	/* 2^6 - 2^1 */ fsquare(t0, z2_5_0);
-	/* 2^7 - 2^2 */ fsquare(t1, t0);
-	/* 2^8 - 2^3 */ fsquare(t0, t1);
-	/* 2^9 - 2^4 */ fsquare(t1, t0);
-	/* 2^10 - 2^5 */ fsquare(t0, t1);
-	/* 2^10 - 2^0 */ fmul(z2_10_0, t0, z2_5_0);
-
-	/* 2^11 - 2^1 */ fsquare(t0, z2_10_0);
-	/* 2^12 - 2^2 */ fsquare(t1, t0);
-	/* 2^20 - 2^10 */ for (i = 2; i < 10; i += 2) { fsquare(t0, t1); fsquare(t1, t0); }
-	/* 2^20 - 2^0 */ fmul(z2_20_0, t1, z2_10_0);
-
-	/* 2^21 - 2^1 */ fsquare(t0, z2_20_0);
-	/* 2^22 - 2^2 */ fsquare(t1, t0);
-	/* 2^40 - 2^20 */ for (i = 2; i < 20; i += 2) { fsquare(t0, t1); fsquare(t1, t0); }
-	/* 2^40 - 2^0 */ fmul(t0, t1, z2_20_0);
-
-	/* 2^41 - 2^1 */ fsquare(t1, t0);
-	/* 2^42 - 2^2 */ fsquare(t0, t1);
-	/* 2^50 - 2^10 */ for (i = 2; i < 10; i += 2) { fsquare(t1, t0); fsquare(t0, t1); }
-	/* 2^50 - 2^0 */ fmul(z2_50_0, t0, z2_10_0);
-
-	/* 2^51 - 2^1 */ fsquare(t0, z2_50_0);
-	/* 2^52 - 2^2 */ fsquare(t1, t0);
-	/* 2^100 - 2^50 */ for (i = 2; i < 50; i += 2) { fsquare(t0, t1); fsquare(t1, t0); }
-	/* 2^100 - 2^0 */ fmul(z2_100_0, t1, z2_50_0);
-
-	/* 2^101 - 2^1 */ fsquare(t1, z2_100_0);
-	/* 2^102 - 2^2 */ fsquare(t0, t1);
-	/* 2^200 - 2^100 */ for (i = 2; i < 100; i += 2) { fsquare(t1, t0); fsquare(t0, t1); }
-	/* 2^200 - 2^0 */ fmul(t1, t0, z2_100_0);
-
-	/* 2^201 - 2^1 */ fsquare(t0, t1);
-	/* 2^202 - 2^2 */ fsquare(t1, t0);
-	/* 2^250 - 2^50 */ for (i = 2; i < 50; i += 2) { fsquare(t0, t1); fsquare(t1, t0); }
-	/* 2^250 - 2^0 */ fmul(t0, t1, z2_50_0);
-
-	/* 2^251 - 2^1 */ fsquare(t1, t0);
-	/* 2^252 - 2^2 */ fsquare(t0, t1);
-	/* 2^253 - 2^3 */ fsquare(t1, t0);
-	/* 2^254 - 2^4 */ fsquare(t0, t1);
-	/* 2^255 - 2^5 */ fsquare(t1, t0);
-	/* 2^255 - 21 */ fmul(out, t1, z11);
-}
-
-
-/* Input: Q, Q', Q-Q'
- * Output: 2Q, Q+Q'
- *
- * x2 z2: long form
- * x3 z3: long form
- * x z: short form, destroyed
- * xprime zprime: short form, destroyed
- * qmqp: short form, preserved
- *
- * On entry and exit, the absolute values of the limbs of all inputs and
- * outputs are < 2^26.
- */ -static void fmonty(limb *x2, limb *z2, /* output 2Q */ - limb *x3, limb *z3, /* output Q + Q' */ - limb *x, limb *z, /* input Q */ - limb *xprime, limb *zprime, /* input Q' */ - - const limb *qmqp /* input Q - Q' */) -{ - limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19], - zzprime[19], zzzprime[19], xxxprime[19]; - - memcpy(origx, x, 10 * sizeof(limb)); - fsum(x, z); - /* |x[i]| < 2^27 */ - fdifference(z, origx); /* does x - z */ - /* |z[i]| < 2^27 */ - - memcpy(origxprime, xprime, sizeof(limb) * 10); - fsum(xprime, zprime); - /* |xprime[i]| < 2^27 */ - fdifference(zprime, origxprime); - /* |zprime[i]| < 2^27 */ - fproduct(xxprime, xprime, z); - /* |xxprime[i]| < 14*2^54: the largest product of two limbs will be < - * 2^(27+27) and fproduct adds together, at most, 14 of those products. - * (Approximating that to 2^58 doesn't work out.) - */ - fproduct(zzprime, x, zprime); - /* |zzprime[i]| < 14*2^54 */ - freduce_degree(xxprime); - freduce_coefficients(xxprime); - /* |xxprime[i]| < 2^26 */ - freduce_degree(zzprime); - freduce_coefficients(zzprime); - /* |zzprime[i]| < 2^26 */ - memcpy(origxprime, xxprime, sizeof(limb) * 10); - fsum(xxprime, zzprime); - /* |xxprime[i]| < 2^27 */ - fdifference(zzprime, origxprime); - /* |zzprime[i]| < 2^27 */ - fsquare(xxxprime, xxprime); - /* |xxxprime[i]| < 2^26 */ - fsquare(zzzprime, zzprime); - /* |zzzprime[i]| < 2^26 */ - fproduct(zzprime, zzzprime, qmqp); - /* |zzprime[i]| < 14*2^52 */ - freduce_degree(zzprime); - freduce_coefficients(zzprime); - /* |zzprime[i]| < 2^26 */ - memcpy(x3, xxxprime, sizeof(limb) * 10); - memcpy(z3, zzprime, sizeof(limb) * 10); - - fsquare(xx, x); - /* |xx[i]| < 2^26 */ - fsquare(zz, z); - /* |zz[i]| < 2^26 */ - fproduct(x2, xx, zz); - /* |x2[i]| < 14*2^52 */ - freduce_degree(x2); - freduce_coefficients(x2); - /* |x2[i]| < 2^26 */ - fdifference(zz, xx); // does zz = xx - zz - /* |zz[i]| < 2^27 */ - memset(zzz + 10, 0, sizeof(limb) * 9); - fscalar_product(zzz, zz, 121665); - /* |zzz[i]| < 2^(27+17) */ - /* No need to call freduce_degree here: - fscalar_product doesn't increase the degree of its input. 
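For reference, the algebra fmonty computes is Montgomery's differential addition-and-doubling, written here to match the variable flow above, with a24 = 121665 = (486662 - 2)/4 (a worked summary, not text from the deleted file):

	A = x + z              B = x - z
	C = xprime + zprime    D = xprime - zprime
	E  = A^2 - B^2
	x2 = A^2 * B^2                      (doubling)
	z2 = E * (A^2 + a24 * E)
	x3 = (C*B + A*D)^2                  (differential addition)
	z3 = x_(Q-Q') * (C*B - A*D)^2

The bound comments above exist to guarantee that every 27x27-bit limb product, and the sum of at most 14 of them, stays inside the 64-bit accumulators of fproduct and fsquare.
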
*/
-	freduce_coefficients(zzz);
-	/* |zzz[i]| < 2^26 */
-	fsum(zzz, xx);
-	/* |zzz[i]| < 2^27 */
-	fproduct(z2, zz, zzz);
-	/* |z2[i]| < 14*2^(26+27) */
-	freduce_degree(z2);
-	freduce_coefficients(z2);
-	/* |z2[i]| < 2^26 */
-}
-
-/* Calculates nQ where Q is the x-coordinate of a point on the curve
- *
- * resultx/resultz: the x coordinate of the resulting curve point (short form)
- * n: a little endian, 32-byte number
- * q: a point of the curve (short form)
- */
-static void cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q)
-{
-	limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0};
-	limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t;
-	limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1};
-	limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h;
-
-	unsigned int i, j;
-
-	memcpy(nqpqx, q, sizeof(limb) * 10);
-
-	for (i = 0; i < 32; ++i) {
-		u8 byte = n[31 - i];
-
-		for (j = 0; j < 8; ++j) {
-			const limb bit = byte >> 7;
-
-			swap_conditional(nqx, nqpqx, bit);
-			swap_conditional(nqz, nqpqz, bit);
-			fmonty(nqx2, nqz2,
-			       nqpqx2, nqpqz2,
-			       nqx, nqz,
-			       nqpqx, nqpqz,
-			       q);
-			swap_conditional(nqx2, nqpqx2, bit);
-			swap_conditional(nqz2, nqpqz2, bit);
-
-			t = nqx;
-			nqx = nqx2;
-			nqx2 = t;
-			t = nqz;
-			nqz = nqz2;
-			nqz2 = t;
-			t = nqpqx;
-			nqpqx = nqpqx2;
-			nqpqx2 = t;
-			t = nqpqz;
-			nqpqz = nqpqz2;
-			nqpqz2 = t;
-
-			byte <<= 1;
-		}
-	}
-
-	memcpy(resultx, nqx, sizeof(limb) * 10);
-	memcpy(resultz, nqz, sizeof(limb) * 10);
-}
-
-bool curve25519_donna32(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE])
-{
-	limb bp[10], x[10], z[11], zmone[10];
-	u8 e[32];
-
-	memcpy(e, secret, 32);
-	normalize_secret(e);
-
-	fexpand(bp, basepoint);
-	cmult(x, z, e, bp);
-	crecip(zmone, z);
-	fmul(z, x, zmone);
-	fcontract(mypublic, z);
-
-	return true;
-}
diff --git a/curve25519-donna64.c b/curve25519-donna64.c
deleted file mode 100644
index f294369..0000000
--- a/curve25519-donna64.c
+++ /dev/null
@@ -1,414 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0
- *
- * Copyright (C) 2008 Google Inc. All Rights Reserved.
- * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * Original author: Adam Langley <agl@imperialviolet.org>
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-enum { CURVE25519_POINT_SIZE = 32 };
-
-typedef u64 limb;
-typedef limb felem[5];
-typedef __uint128_t u128;
-
-static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE])
-{
-	secret[0] &= 248;
-	secret[31] &= 127;
-	secret[31] |= 64;
-}
-
-/* Sum two numbers: output += in */
-static __always_inline void fsum(limb *output, const limb *in)
-{
-	output[0] += in[0];
-	output[1] += in[1];
-	output[2] += in[2];
-	output[3] += in[3];
-	output[4] += in[4];
-}
-
-/* Find the difference of two numbers: output = in - output
- * (note the order of the arguments!)
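The two magic constants in fdifference_backwards below are a multiple of the prime in disguise: 2^54 - 152 = 8 * (2^51 - 19) and 2^54 - 8 = 8 * (2^51 - 1), so across the five radix-2^51 positions the biases add up to

	8 * ((2^51 - 19) + (2^51 - 1) * (2^51 + 2^102 + 2^153 + 2^204))
	    = 8 * (2^255 - 19) = 8p,

which is congruent to zero mod p, yet each per-limb bias exceeds 2^52, so subtracting an out[i] < 2^52 can never underflow (a quick sanity check of the source's own constants).
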
- * - * Assumes that out[i] < 2**52 - * On return, out[i] < 2**55 - */ -static __always_inline void fdifference_backwards(felem out, const felem in) -{ - /* 152 is 19 << 3 */ - static const limb two54m152 = (((limb)1) << 54) - 152; - static const limb two54m8 = (((limb)1) << 54) - 8; - - out[0] = in[0] + two54m152 - out[0]; - out[1] = in[1] + two54m8 - out[1]; - out[2] = in[2] + two54m8 - out[2]; - out[3] = in[3] + two54m8 - out[3]; - out[4] = in[4] + two54m8 - out[4]; -} - -/* Multiply a number by a scalar: output = in * scalar */ -static __always_inline void fscalar_product(felem output, const felem in, const limb scalar) -{ - u128 a; - - a = ((u128) in[0]) * scalar; - output[0] = ((limb)a) & 0x7ffffffffffffUL; - - a = ((u128) in[1]) * scalar + ((limb) (a >> 51)); - output[1] = ((limb)a) & 0x7ffffffffffffUL; - - a = ((u128) in[2]) * scalar + ((limb) (a >> 51)); - output[2] = ((limb)a) & 0x7ffffffffffffUL; - - a = ((u128) in[3]) * scalar + ((limb) (a >> 51)); - output[3] = ((limb)a) & 0x7ffffffffffffUL; - - a = ((u128) in[4]) * scalar + ((limb) (a >> 51)); - output[4] = ((limb)a) & 0x7ffffffffffffUL; - - output[0] += (a >> 51) * 19; -} - -/* Multiply two numbers: output = in2 * in - * - * output must be distinct to both inputs. The inputs are reduced coefficient - * form, the output is not. - * - * Assumes that in[i] < 2**55 and likewise for in2. - * On return, output[i] < 2**52 - */ -static __always_inline void fmul(felem output, const felem in2, const felem in) -{ - u128 t[5]; - limb r0, r1, r2, r3, r4, s0, s1, s2, s3, s4, c; - - r0 = in[0]; - r1 = in[1]; - r2 = in[2]; - r3 = in[3]; - r4 = in[4]; - - s0 = in2[0]; - s1 = in2[1]; - s2 = in2[2]; - s3 = in2[3]; - s4 = in2[4]; - - t[0] = ((u128) r0) * s0; - t[1] = ((u128) r0) * s1 + ((u128) r1) * s0; - t[2] = ((u128) r0) * s2 + ((u128) r2) * s0 + ((u128) r1) * s1; - t[3] = ((u128) r0) * s3 + ((u128) r3) * s0 + ((u128) r1) * s2 + ((u128) r2) * s1; - t[4] = ((u128) r0) * s4 + ((u128) r4) * s0 + ((u128) r3) * s1 + ((u128) r1) * s3 + ((u128) r2) * s2; - - r4 *= 19; - r1 *= 19; - r2 *= 19; - r3 *= 19; - - t[0] += ((u128) r4) * s1 + ((u128) r1) * s4 + ((u128) r2) * s3 + ((u128) r3) * s2; - t[1] += ((u128) r4) * s2 + ((u128) r2) * s4 + ((u128) r3) * s3; - t[2] += ((u128) r4) * s3 + ((u128) r3) * s4; - t[3] += ((u128) r4) * s4; - - r0 = (limb)t[0] & 0x7ffffffffffffUL; c = (limb)(t[0] >> 51); - t[1] += c; r1 = (limb)t[1] & 0x7ffffffffffffUL; c = (limb)(t[1] >> 51); - t[2] += c; r2 = (limb)t[2] & 0x7ffffffffffffUL; c = (limb)(t[2] >> 51); - t[3] += c; r3 = (limb)t[3] & 0x7ffffffffffffUL; c = (limb)(t[3] >> 51); - t[4] += c; r4 = (limb)t[4] & 0x7ffffffffffffUL; c = (limb)(t[4] >> 51); - r0 += c * 19; c = r0 >> 51; r0 = r0 & 0x7ffffffffffffUL; - r1 += c; c = r1 >> 51; r1 = r1 & 0x7ffffffffffffUL; - r2 += c; - - output[0] = r0; - output[1] = r1; - output[2] = r2; - output[3] = r3; - output[4] = r4; -} - -static __always_inline void fsquare_times(felem output, const felem in, limb count) -{ - u128 t[5]; - limb r0, r1, r2, r3, r4, c; - limb d0, d1, d2, d4, d419; - - r0 = in[0]; - r1 = in[1]; - r2 = in[2]; - r3 = in[3]; - r4 = in[4]; - - do { - d0 = r0 * 2; - d1 = r1 * 2; - d2 = r2 * 2 * 19; - d419 = r4 * 19; - d4 = d419 * 2; - - t[0] = ((u128) r0) * r0 + ((u128) d4) * r1 + (((u128) d2) * (r3 )); - t[1] = ((u128) d0) * r1 + ((u128) d4) * r2 + (((u128) r3) * (r3 * 19)); - t[2] = ((u128) d0) * r2 + ((u128) r1) * r1 + (((u128) d4) * (r3 )); - t[3] = ((u128) d0) * r3 + ((u128) d1) * r2 + (((u128) r4) * (d419 )); - t[4] = ((u128) d0) * r4 + ((u128) d1) * r3 + 
(((u128) r2) * (r2 )); - - r0 = (limb)t[0] & 0x7ffffffffffffUL; c = (limb)(t[0] >> 51); - t[1] += c; r1 = (limb)t[1] & 0x7ffffffffffffUL; c = (limb)(t[1] >> 51); - t[2] += c; r2 = (limb)t[2] & 0x7ffffffffffffUL; c = (limb)(t[2] >> 51); - t[3] += c; r3 = (limb)t[3] & 0x7ffffffffffffUL; c = (limb)(t[3] >> 51); - t[4] += c; r4 = (limb)t[4] & 0x7ffffffffffffUL; c = (limb)(t[4] >> 51); - r0 += c * 19; c = r0 >> 51; r0 = r0 & 0x7ffffffffffffUL; - r1 += c; c = r1 >> 51; r1 = r1 & 0x7ffffffffffffUL; - r2 += c; - } while (--count); - - output[0] = r0; - output[1] = r1; - output[2] = r2; - output[3] = r3; - output[4] = r4; -} - -/* Load a little-endian 64-bit number */ -static inline limb load_limb(const u8 *in) -{ - return le64_to_cpu(*(__le64 *)in); -} - -static inline void store_limb(u8 *out, limb in) -{ - *(__le64 *)out = cpu_to_le64(in); -} - -/* Take a little-endian, 32-byte number and expand it into polynomial form */ -static inline void fexpand(limb *output, const u8 *in) -{ - output[0] = load_limb(in) & 0x7ffffffffffffUL; - output[1] = (load_limb(in + 6) >> 3) & 0x7ffffffffffffUL; - output[2] = (load_limb(in + 12) >> 6) & 0x7ffffffffffffUL; - output[3] = (load_limb(in + 19) >> 1) & 0x7ffffffffffffUL; - output[4] = (load_limb(in + 24) >> 12) & 0x7ffffffffffffUL; -} - -/* Take a fully reduced polynomial form number and contract it into a - * little-endian, 32-byte array - */ -static void fcontract(u8 *output, const felem input) -{ - u128 t[5]; - - t[0] = input[0]; - t[1] = input[1]; - t[2] = input[2]; - t[3] = input[3]; - t[4] = input[4]; - - t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffffUL; - t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffffUL; - t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffffUL; - t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffffUL; - t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffffUL; - - t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffffUL; - t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffffUL; - t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffffUL; - t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffffUL; - t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffffUL; - - /* now t is between 0 and 2^255-1, properly carried. */ - /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */ - - t[0] += 19; - - t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffffUL; - t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffffUL; - t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffffUL; - t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffffUL; - t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffffUL; - - /* now between 19 and 2^255-1 in both cases, and offset by 19. */ - - t[0] += 0x8000000000000UL - 19; - t[1] += 0x8000000000000UL - 1; - t[2] += 0x8000000000000UL - 1; - t[3] += 0x8000000000000UL - 1; - t[4] += 0x8000000000000UL - 1; - - /* now between 2^255 and 2^256-20, and offset by 2^255. 
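The contraction sequence above (add 19, carry, add p, then let the final chain drop bit 255) reduces fully without ever comparing against p: the +19 offset folds the two possible input ranges together, and adding p then truncating subtracts exactly 2^255 once. A toy model in a one-word field p = 2^15 - 19 (illustrative sketch only):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		const uint32_t p = (1u << 15) - 19;
		const uint32_t mask15 = (1u << 15) - 1;
		uint32_t x;

		for (x = 0; x < (1u << 15); x++) {
			uint32_t t = x + 19;

			/* fold bit-15 overflow back in as +19, i.e. subtract p */
			t = (t & mask15) + 19 * (t >> 15);
			t += p;		/* value is now x mod p, offset by exactly 2^15 */
			t &= mask15;	/* drop the top bit: subtract 2^15 */
			assert(t == x % p);
		}
		return 0;
	}
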
*/
-
-	t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffffUL;
-	t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffffUL;
-	t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffffUL;
-	t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffffUL;
-	t[4] &= 0x7ffffffffffffUL;
-
-	store_limb(output, t[0] | (t[1] << 51));
-	store_limb(output+8, (t[1] >> 13) | (t[2] << 38));
-	store_limb(output+16, (t[2] >> 26) | (t[3] << 25));
-	store_limb(output+24, (t[3] >> 39) | (t[4] << 12));
-}
-
-/* Input: Q, Q', Q-Q'
- * Output: 2Q, Q+Q'
- *
- * x2 z2: long form
- * x3 z3: long form
- * x z: short form, destroyed
- * xprime zprime: short form, destroyed
- * qmqp: short form, preserved
- */
-static void fmonty(limb *x2, limb *z2, /* output 2Q */
-		   limb *x3, limb *z3, /* output Q + Q' */
-		   limb *x, limb *z, /* input Q */
-		   limb *xprime, limb *zprime, /* input Q' */
-
-		   const limb *qmqp /* input Q - Q' */)
-{
-	limb origx[5], origxprime[5], zzz[5], xx[5], zz[5], xxprime[5], zzprime[5], zzzprime[5];
-
-	memcpy(origx, x, 5 * sizeof(limb));
-	fsum(x, z);
-	fdifference_backwards(z, origx); // does x - z
-
-	memcpy(origxprime, xprime, sizeof(limb) * 5);
-	fsum(xprime, zprime);
-	fdifference_backwards(zprime, origxprime);
-	fmul(xxprime, xprime, z);
-	fmul(zzprime, x, zprime);
-	memcpy(origxprime, xxprime, sizeof(limb) * 5);
-	fsum(xxprime, zzprime);
-	fdifference_backwards(zzprime, origxprime);
-	fsquare_times(x3, xxprime, 1);
-	fsquare_times(zzzprime, zzprime, 1);
-	fmul(z3, zzzprime, qmqp);
-
-	fsquare_times(xx, x, 1);
-	fsquare_times(zz, z, 1);
-	fmul(x2, xx, zz);
-	fdifference_backwards(zz, xx); // does zz = xx - zz
-	fscalar_product(zzz, zz, 121665);
-	fsum(zzz, xx);
-	fmul(z2, zz, zzz);
-}
-
-/* Maybe swap the contents of two limb arrays (@a and @b), each five limbs
- * long. Perform the swap iff @iswap is 1; @iswap must be 0 or 1.
- *
- * This function performs the swap without leaking any side-channel
- * information.
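Both swap_conditional implementations (the donna64 one follows) come down to the same masked-XOR idiom: -iswap is all-zeros or all-ones, so x is either 0 or a^b. A minimal standalone illustration, assuming iswap is exactly 0 or 1:

	#include <assert.h>
	#include <stdint.h>

	static void cswap(uint64_t *a, uint64_t *b, uint64_t iswap)
	{
		const uint64_t mask = -iswap;	/* 0 or 0xffff...ffff */
		uint64_t x = mask & (*a ^ *b);

		*a ^= x;
		*b ^= x;
	}

	int main(void)
	{
		uint64_t a = 11, b = 22;

		cswap(&a, &b, 0);
		assert(a == 11 && b == 22);	/* unchanged */
		cswap(&a, &b, 1);
		assert(a == 22 && b == 11);	/* swapped, no secret-dependent branch */
		return 0;
	}
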
- */ -static void swap_conditional(limb a[5], limb b[5], limb iswap) -{ - unsigned int i; - const limb swap = -iswap; - - for (i = 0; i < 5; ++i) { - const limb x = swap & (a[i] ^ b[i]); - - a[i] ^= x; - b[i] ^= x; - } -} - -/* Calculates nQ where Q is the x-coordinate of a point on the curve - * - * resultx/resultz: the x coordinate of the resulting curve point (short form) - * n: a little endian, 32-byte number - * q: a point of the curve (short form) - */ -static void cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) -{ - limb a[5] = {0}, b[5] = {1}, c[5] = {1}, d[5] = {0}; - limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; - limb e[5] = {0}, f[5] = {1}, g[5] = {0}, h[5] = {1}; - limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h; - - unsigned int i, j; - - memcpy(nqpqx, q, sizeof(limb) * 5); - - for (i = 0; i < 32; ++i) { - u8 byte = n[31 - i]; - - for (j = 0; j < 8; ++j) { - const limb bit = byte >> 7; - - swap_conditional(nqx, nqpqx, bit); - swap_conditional(nqz, nqpqz, bit); - fmonty(nqx2, nqz2, - nqpqx2, nqpqz2, - nqx, nqz, - nqpqx, nqpqz, - q); - swap_conditional(nqx2, nqpqx2, bit); - swap_conditional(nqz2, nqpqz2, bit); - - t = nqx; - nqx = nqx2; - nqx2 = t; - t = nqz; - nqz = nqz2; - nqz2 = t; - t = nqpqx; - nqpqx = nqpqx2; - nqpqx2 = t; - t = nqpqz; - nqpqz = nqpqz2; - nqpqz2 = t; - - byte <<= 1; - } - } - - memcpy(resultx, nqx, sizeof(limb) * 5); - memcpy(resultz, nqz, sizeof(limb) * 5); -} - -static void crecip(felem out, const felem z) -{ - felem a, t0, b, c; - - /* 2 */ fsquare_times(a, z, 1); // a = 2 - /* 8 */ fsquare_times(t0, a, 2); - /* 9 */ fmul(b, t0, z); // b = 9 - /* 11 */ fmul(a, b, a); // a = 11 - /* 22 */ fsquare_times(t0, a, 1); - /* 2^5 - 2^0 = 31 */ fmul(b, t0, b); - /* 2^10 - 2^5 */ fsquare_times(t0, b, 5); - /* 2^10 - 2^0 */ fmul(b, t0, b); - /* 2^20 - 2^10 */ fsquare_times(t0, b, 10); - /* 2^20 - 2^0 */ fmul(c, t0, b); - /* 2^40 - 2^20 */ fsquare_times(t0, c, 20); - /* 2^40 - 2^0 */ fmul(t0, t0, c); - /* 2^50 - 2^10 */ fsquare_times(t0, t0, 10); - /* 2^50 - 2^0 */ fmul(b, t0, b); - /* 2^100 - 2^50 */ fsquare_times(t0, b, 50); - /* 2^100 - 2^0 */ fmul(c, t0, b); - /* 2^200 - 2^100 */ fsquare_times(t0, c, 100); - /* 2^200 - 2^0 */ fmul(t0, t0, c); - /* 2^250 - 2^50 */ fsquare_times(t0, t0, 50); - /* 2^250 - 2^0 */ fmul(t0, t0, b); - /* 2^255 - 2^5 */ fsquare_times(t0, t0, 5); - /* 2^255 - 21 */ fmul(out, t0, a); -} - -bool curve25519_donna64(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]) -{ - limb bp[5], x[5], z[5], zmone[5]; - u8 e[32]; - - memcpy(e, secret, 32); - normalize_secret(e); - - fexpand(bp, basepoint); - cmult(x, z, e, bp); - crecip(zmone, z); - fmul(z, x, zmone); - fcontract(mypublic, z); - - return true; -} diff --git a/curve25519-fiat32.c b/curve25519-fiat32.c deleted file mode 100644 index 69e230e..0000000 --- a/curve25519-fiat32.c +++ /dev/null @@ -1,840 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright (C) 2015-2016 The fiat-crypto Authors. - * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * - * This is a machine-generated formally verified implementation of curve25519 DH from: - * https://github.com/mit-plv/fiat-crypto - */ - -#include <linux/kernel.h> -#include <linux/string.h> - -enum { CURVE25519_POINT_SIZE = 32 }; - -static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE]) -{ - secret[0] &= 248; - secret[31] &= 127; - secret[31] |= 64; -} - -/* fe means field element. 
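The crecip chain just above is Fermat inversion, z^(p-2) with p - 2 = 2^255 - 21. Because fsquare_times(x, k) multiplies the running exponent by 2^k and fmul adds exponents, the chain can be checked by tracking exponents alone. A standalone sketch with a tiny 256-bit exponent type (helper names here are illustrative):

	#include <assert.h>
	#include <stdint.h>

	typedef struct { uint64_t w[4]; } e256;	/* little-endian 256-bit integer */

	static e256 e_add(e256 a, e256 b)
	{
		e256 r;
		uint64_t carry = 0;
		unsigned int i;

		for (i = 0; i < 4; i++) {
			uint64_t s = a.w[i] + carry;

			carry = s < carry;
			r.w[i] = s + b.w[i];
			carry += r.w[i] < s;
		}
		return r;
	}

	static e256 e_shl(e256 a, unsigned int k)	/* multiply by 2^k */
	{
		while (k--)
			a = e_add(a, a);
		return a;
	}

	int main(void)
	{
		const e256 one = { { 1, 0, 0, 0 } };
		e256 z = one, a, b, c, t0;

		a  = e_shl(z, 1);	/* 2 */
		t0 = e_shl(a, 2);	/* 8 */
		b  = e_add(t0, z);	/* 9 */
		a  = e_add(b, a);	/* 11 */
		t0 = e_shl(a, 1);	/* 22 */
		b  = e_add(t0, b);	/* 2^5 - 1 */
		t0 = e_shl(b, 5);   b  = e_add(t0, b);	/* 2^10 - 1 */
		t0 = e_shl(b, 10);  c  = e_add(t0, b);	/* 2^20 - 1 */
		t0 = e_shl(c, 20);  t0 = e_add(t0, c);	/* 2^40 - 1 */
		t0 = e_shl(t0, 10); b  = e_add(t0, b);	/* 2^50 - 1 */
		t0 = e_shl(b, 50);  c  = e_add(t0, b);	/* 2^100 - 1 */
		t0 = e_shl(c, 100); t0 = e_add(t0, c);	/* 2^200 - 1 */
		t0 = e_shl(t0, 50); t0 = e_add(t0, b);	/* 2^250 - 1 */
		t0 = e_shl(t0, 5);			/* 2^255 - 2^5 */
		t0 = e_add(t0, a);			/* 2^255 - 21 = p - 2 */

		assert(t0.w[0] == 0xffffffffffffffebULL);
		assert(t0.w[1] == 0xffffffffffffffffULL);
		assert(t0.w[2] == 0xffffffffffffffffULL);
		assert(t0.w[3] == 0x7fffffffffffffffULL);
		return 0;
	}
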
Here the field is \Z/(2^255-19). An element t, - * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77 - * t[3]+2^102 t[4]+...+2^230 t[9]. - * fe limbs are bounded by 1.125*2^26,1.125*2^25,1.125*2^26,1.125*2^25,etc. - * Multiplication and carrying produce fe from fe_loose. - */ -typedef struct fe { u32 v[10]; } fe; - -/* fe_loose limbs are bounded by 3.375*2^26,3.375*2^25,3.375*2^26,3.375*2^25,etc. - * Addition and subtraction produce fe_loose from (fe, fe). - */ -typedef struct fe_loose { u32 v[10]; } fe_loose; - -static __always_inline void fe_frombytes_impl(u32 h[10], const u8 *s) -{ - /* Ignores top bit of s. */ - u32 a0 = le32_to_cpup((__force __le32 *)(s)); - u32 a1 = le32_to_cpup((__force __le32 *)(s+4)); - u32 a2 = le32_to_cpup((__force __le32 *)(s+8)); - u32 a3 = le32_to_cpup((__force __le32 *)(s+12)); - u32 a4 = le32_to_cpup((__force __le32 *)(s+16)); - u32 a5 = le32_to_cpup((__force __le32 *)(s+20)); - u32 a6 = le32_to_cpup((__force __le32 *)(s+24)); - u32 a7 = le32_to_cpup((__force __le32 *)(s+28)); - h[0] = a0&((1<<26)-1); /* 26 used, 32-26 left. 26 */ - h[1] = (a0>>26) | ((a1&((1<<19)-1))<< 6); /* (32-26) + 19 = 6+19 = 25 */ - h[2] = (a1>>19) | ((a2&((1<<13)-1))<<13); /* (32-19) + 13 = 13+13 = 26 */ - h[3] = (a2>>13) | ((a3&((1<< 6)-1))<<19); /* (32-13) + 6 = 19+ 6 = 25 */ - h[4] = (a3>> 6); /* (32- 6) = 26 */ - h[5] = a4&((1<<25)-1); /* 25 */ - h[6] = (a4>>25) | ((a5&((1<<19)-1))<< 7); /* (32-25) + 19 = 7+19 = 26 */ - h[7] = (a5>>19) | ((a6&((1<<12)-1))<<13); /* (32-19) + 12 = 13+12 = 25 */ - h[8] = (a6>>12) | ((a7&((1<< 6)-1))<<20); /* (32-12) + 6 = 20+ 6 = 26 */ - h[9] = (a7>> 6)&((1<<25)-1); /* 25 */ -} - -static __always_inline void fe_frombytes(fe *h, const u8 *s) -{ - fe_frombytes_impl(h->v, s); -} - -static __always_inline u8 /*bool*/ addcarryx_u25(u8 /*bool*/ c, u32 a, u32 b, u32 *low) -{ - /* This function extracts 25 bits of result and 1 bit of carry (26 total), so - * a 32-bit intermediate is sufficient. - */ - u32 x = a + b + c; - *low = x & ((1 << 25) - 1); - return (x >> 25) & 1; -} - -static __always_inline u8 /*bool*/ addcarryx_u26(u8 /*bool*/ c, u32 a, u32 b, u32 *low) -{ - /* This function extracts 26 bits of result and 1 bit of carry (27 total), so - * a 32-bit intermediate is sufficient. - */ - u32 x = a + b + c; - *low = x & ((1 << 26) - 1); - return (x >> 26) & 1; -} - -static __always_inline u8 /*bool*/ subborrow_u25(u8 /*bool*/ c, u32 a, u32 b, u32 *low) -{ - /* This function extracts 25 bits of result and 1 bit of borrow (26 total), so - * a 32-bit intermediate is sufficient. - */ - u32 x = a - b - c; - *low = x & ((1 << 25) - 1); - return x >> 31; -} - -static __always_inline u8 /*bool*/ subborrow_u26(u8 /*bool*/ c, u32 a, u32 b, u32 *low) -{ - /* This function extracts 26 bits of result and 1 bit of borrow (27 total), so - * a 32-bit intermediate is sufficient. 
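These carry and borrow helpers are the whole calling convention of the generated field code that follows: each limb operation hands back its low 25 or 26 bits plus a single carry bit for the next limb. A minimal standalone illustration of the 26-bit flavor, mirroring addcarryx_u26 above:

	#include <assert.h>
	#include <stdint.h>

	static uint8_t addcarryx_u26(uint8_t c, uint32_t a, uint32_t b, uint32_t *low)
	{
		uint32_t x = a + b + c;	/* fits in 32 bits for in-range inputs */

		*low = x & ((1 << 26) - 1);
		return (x >> 26) & 1;
	}

	int main(void)
	{
		uint32_t low;
		/* (2^26 - 1) + 1 overflows the limb: low part 0, carry out 1 */
		uint8_t c = addcarryx_u26(0, (1 << 26) - 1, 1, &low);

		assert(low == 0 && c == 1);
		c = addcarryx_u26(c, 5, 7, &low);	/* carry-in feeds the next limb */
		assert(low == 13 && c == 0);
		return 0;
	}
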
- */ - u32 x = a - b - c; - *low = x & ((1 << 26) - 1); - return x >> 31; -} - -static __always_inline u32 cmovznz32(u32 t, u32 z, u32 nz) -{ - t = -!!t; /* all set if nonzero, 0 if 0 */ - return (t&nz) | ((~t)&z); -} - -static __always_inline void fe_freeze(u32 out[10], const u32 in1[10]) -{ - { const u32 x17 = in1[9]; - { const u32 x18 = in1[8]; - { const u32 x16 = in1[7]; - { const u32 x14 = in1[6]; - { const u32 x12 = in1[5]; - { const u32 x10 = in1[4]; - { const u32 x8 = in1[3]; - { const u32 x6 = in1[2]; - { const u32 x4 = in1[1]; - { const u32 x2 = in1[0]; - { u32 x20; u8/*bool*/ x21 = subborrow_u26(0x0, x2, 0x3ffffed, &x20); - { u32 x23; u8/*bool*/ x24 = subborrow_u25(x21, x4, 0x1ffffff, &x23); - { u32 x26; u8/*bool*/ x27 = subborrow_u26(x24, x6, 0x3ffffff, &x26); - { u32 x29; u8/*bool*/ x30 = subborrow_u25(x27, x8, 0x1ffffff, &x29); - { u32 x32; u8/*bool*/ x33 = subborrow_u26(x30, x10, 0x3ffffff, &x32); - { u32 x35; u8/*bool*/ x36 = subborrow_u25(x33, x12, 0x1ffffff, &x35); - { u32 x38; u8/*bool*/ x39 = subborrow_u26(x36, x14, 0x3ffffff, &x38); - { u32 x41; u8/*bool*/ x42 = subborrow_u25(x39, x16, 0x1ffffff, &x41); - { u32 x44; u8/*bool*/ x45 = subborrow_u26(x42, x18, 0x3ffffff, &x44); - { u32 x47; u8/*bool*/ x48 = subborrow_u25(x45, x17, 0x1ffffff, &x47); - { u32 x49 = cmovznz32(x48, 0x0, 0xffffffff); - { u32 x50 = (x49 & 0x3ffffed); - { u32 x52; u8/*bool*/ x53 = addcarryx_u26(0x0, x20, x50, &x52); - { u32 x54 = (x49 & 0x1ffffff); - { u32 x56; u8/*bool*/ x57 = addcarryx_u25(x53, x23, x54, &x56); - { u32 x58 = (x49 & 0x3ffffff); - { u32 x60; u8/*bool*/ x61 = addcarryx_u26(x57, x26, x58, &x60); - { u32 x62 = (x49 & 0x1ffffff); - { u32 x64; u8/*bool*/ x65 = addcarryx_u25(x61, x29, x62, &x64); - { u32 x66 = (x49 & 0x3ffffff); - { u32 x68; u8/*bool*/ x69 = addcarryx_u26(x65, x32, x66, &x68); - { u32 x70 = (x49 & 0x1ffffff); - { u32 x72; u8/*bool*/ x73 = addcarryx_u25(x69, x35, x70, &x72); - { u32 x74 = (x49 & 0x3ffffff); - { u32 x76; u8/*bool*/ x77 = addcarryx_u26(x73, x38, x74, &x76); - { u32 x78 = (x49 & 0x1ffffff); - { u32 x80; u8/*bool*/ x81 = addcarryx_u25(x77, x41, x78, &x80); - { u32 x82 = (x49 & 0x3ffffff); - { u32 x84; u8/*bool*/ x85 = addcarryx_u26(x81, x44, x82, &x84); - { u32 x86 = (x49 & 0x1ffffff); - { u32 x88; addcarryx_u25(x85, x47, x86, &x88); - out[0] = x52; - out[1] = x56; - out[2] = x60; - out[3] = x64; - out[4] = x68; - out[5] = x72; - out[6] = x76; - out[7] = x80; - out[8] = x84; - out[9] = x88; - }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}} -} - -static __always_inline void fe_tobytes(u8 s[32], const fe *f) -{ - u32 h[10]; - fe_freeze(h, f->v); - s[0] = h[0] >> 0; - s[1] = h[0] >> 8; - s[2] = h[0] >> 16; - s[3] = (h[0] >> 24) | (h[1] << 2); - s[4] = h[1] >> 6; - s[5] = h[1] >> 14; - s[6] = (h[1] >> 22) | (h[2] << 3); - s[7] = h[2] >> 5; - s[8] = h[2] >> 13; - s[9] = (h[2] >> 21) | (h[3] << 5); - s[10] = h[3] >> 3; - s[11] = h[3] >> 11; - s[12] = (h[3] >> 19) | (h[4] << 6); - s[13] = h[4] >> 2; - s[14] = h[4] >> 10; - s[15] = h[4] >> 18; - s[16] = h[5] >> 0; - s[17] = h[5] >> 8; - s[18] = h[5] >> 16; - s[19] = (h[5] >> 24) | (h[6] << 1); - s[20] = h[6] >> 7; - s[21] = h[6] >> 15; - s[22] = (h[6] >> 23) | (h[7] << 3); - s[23] = h[7] >> 5; - s[24] = h[7] >> 13; - s[25] = (h[7] >> 21) | (h[8] << 4); - s[26] = h[8] >> 4; - s[27] = h[8] >> 12; - s[28] = (h[8] >> 20) | (h[9] << 6); - s[29] = h[9] >> 2; - s[30] = h[9] >> 10; - s[31] = h[9] >> 18; -} - -/* h = f */ -static __always_inline void fe_copy(fe *h, const fe *f) -{ - memmove(h, f, sizeof(fe)); -} - -static 
__always_inline void fe_copy_lt(fe_loose *h, const fe *f) -{ - memmove(h, f, sizeof(fe)); -} - -/* h = 0 */ -static __always_inline void fe_0(fe *h) -{ - memset(h, 0, sizeof(fe)); -} - -/* h = 1 */ -static __always_inline void fe_1(fe *h) -{ - memset(h, 0, sizeof(fe)); - h->v[0] = 1; -} - -static __always_inline void fe_add_impl(u32 out[10], const u32 in1[10], const u32 in2[10]) -{ - { const u32 x20 = in1[9]; - { const u32 x21 = in1[8]; - { const u32 x19 = in1[7]; - { const u32 x17 = in1[6]; - { const u32 x15 = in1[5]; - { const u32 x13 = in1[4]; - { const u32 x11 = in1[3]; - { const u32 x9 = in1[2]; - { const u32 x7 = in1[1]; - { const u32 x5 = in1[0]; - { const u32 x38 = in2[9]; - { const u32 x39 = in2[8]; - { const u32 x37 = in2[7]; - { const u32 x35 = in2[6]; - { const u32 x33 = in2[5]; - { const u32 x31 = in2[4]; - { const u32 x29 = in2[3]; - { const u32 x27 = in2[2]; - { const u32 x25 = in2[1]; - { const u32 x23 = in2[0]; - out[0] = (x5 + x23); - out[1] = (x7 + x25); - out[2] = (x9 + x27); - out[3] = (x11 + x29); - out[4] = (x13 + x31); - out[5] = (x15 + x33); - out[6] = (x17 + x35); - out[7] = (x19 + x37); - out[8] = (x21 + x39); - out[9] = (x20 + x38); - }}}}}}}}}}}}}}}}}}}} -} - -/* h = f + g - * Can overlap h with f or g. - */ -static __always_inline void fe_add(fe_loose *h, const fe *f, const fe *g) -{ - fe_add_impl(h->v, f->v, g->v); -} - -static __always_inline void fe_sub_impl(u32 out[10], const u32 in1[10], const u32 in2[10]) -{ - { const u32 x20 = in1[9]; - { const u32 x21 = in1[8]; - { const u32 x19 = in1[7]; - { const u32 x17 = in1[6]; - { const u32 x15 = in1[5]; - { const u32 x13 = in1[4]; - { const u32 x11 = in1[3]; - { const u32 x9 = in1[2]; - { const u32 x7 = in1[1]; - { const u32 x5 = in1[0]; - { const u32 x38 = in2[9]; - { const u32 x39 = in2[8]; - { const u32 x37 = in2[7]; - { const u32 x35 = in2[6]; - { const u32 x33 = in2[5]; - { const u32 x31 = in2[4]; - { const u32 x29 = in2[3]; - { const u32 x27 = in2[2]; - { const u32 x25 = in2[1]; - { const u32 x23 = in2[0]; - out[0] = ((0x7ffffda + x5) - x23); - out[1] = ((0x3fffffe + x7) - x25); - out[2] = ((0x7fffffe + x9) - x27); - out[3] = ((0x3fffffe + x11) - x29); - out[4] = ((0x7fffffe + x13) - x31); - out[5] = ((0x3fffffe + x15) - x33); - out[6] = ((0x7fffffe + x17) - x35); - out[7] = ((0x3fffffe + x19) - x37); - out[8] = ((0x7fffffe + x21) - x39); - out[9] = ((0x3fffffe + x20) - x38); - }}}}}}}}}}}}}}}}}}}} -} - -/* h = f - g - * Can overlap h with f or g. 
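In fe_mul_impl below (and fe_sqr_impl after it), every (t << 0x4) + (t << 0x1) + t triple is a strength-reduced multiplication by 19 = 16 + 2 + 1: products that land in limbs ten and above fold back to limb k - 10 with a factor of 19, since 2^255 = 19 (mod p). A one-line check of the shift identity:

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t t;

		for (t = 1; t < (1ULL << 40); t = 3 * t + 1)
			assert(((t << 4) + (t << 1) + t) == 19 * t);
		return 0;
	}
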
- */ -static __always_inline void fe_sub(fe_loose *h, const fe *f, const fe *g) -{ - fe_sub_impl(h->v, f->v, g->v); -} - -static __always_inline void fe_mul_impl(u32 out[10], const u32 in1[10], const u32 in2[10]) -{ - { const u32 x20 = in1[9]; - { const u32 x21 = in1[8]; - { const u32 x19 = in1[7]; - { const u32 x17 = in1[6]; - { const u32 x15 = in1[5]; - { const u32 x13 = in1[4]; - { const u32 x11 = in1[3]; - { const u32 x9 = in1[2]; - { const u32 x7 = in1[1]; - { const u32 x5 = in1[0]; - { const u32 x38 = in2[9]; - { const u32 x39 = in2[8]; - { const u32 x37 = in2[7]; - { const u32 x35 = in2[6]; - { const u32 x33 = in2[5]; - { const u32 x31 = in2[4]; - { const u32 x29 = in2[3]; - { const u32 x27 = in2[2]; - { const u32 x25 = in2[1]; - { const u32 x23 = in2[0]; - { u64 x40 = ((u64)x23 * x5); - { u64 x41 = (((u64)x23 * x7) + ((u64)x25 * x5)); - { u64 x42 = ((((u64)(0x2 * x25) * x7) + ((u64)x23 * x9)) + ((u64)x27 * x5)); - { u64 x43 = (((((u64)x25 * x9) + ((u64)x27 * x7)) + ((u64)x23 * x11)) + ((u64)x29 * x5)); - { u64 x44 = (((((u64)x27 * x9) + (0x2 * (((u64)x25 * x11) + ((u64)x29 * x7)))) + ((u64)x23 * x13)) + ((u64)x31 * x5)); - { u64 x45 = (((((((u64)x27 * x11) + ((u64)x29 * x9)) + ((u64)x25 * x13)) + ((u64)x31 * x7)) + ((u64)x23 * x15)) + ((u64)x33 * x5)); - { u64 x46 = (((((0x2 * ((((u64)x29 * x11) + ((u64)x25 * x15)) + ((u64)x33 * x7))) + ((u64)x27 * x13)) + ((u64)x31 * x9)) + ((u64)x23 * x17)) + ((u64)x35 * x5)); - { u64 x47 = (((((((((u64)x29 * x13) + ((u64)x31 * x11)) + ((u64)x27 * x15)) + ((u64)x33 * x9)) + ((u64)x25 * x17)) + ((u64)x35 * x7)) + ((u64)x23 * x19)) + ((u64)x37 * x5)); - { u64 x48 = (((((((u64)x31 * x13) + (0x2 * (((((u64)x29 * x15) + ((u64)x33 * x11)) + ((u64)x25 * x19)) + ((u64)x37 * x7)))) + ((u64)x27 * x17)) + ((u64)x35 * x9)) + ((u64)x23 * x21)) + ((u64)x39 * x5)); - { u64 x49 = (((((((((((u64)x31 * x15) + ((u64)x33 * x13)) + ((u64)x29 * x17)) + ((u64)x35 * x11)) + ((u64)x27 * x19)) + ((u64)x37 * x9)) + ((u64)x25 * x21)) + ((u64)x39 * x7)) + ((u64)x23 * x20)) + ((u64)x38 * x5)); - { u64 x50 = (((((0x2 * ((((((u64)x33 * x15) + ((u64)x29 * x19)) + ((u64)x37 * x11)) + ((u64)x25 * x20)) + ((u64)x38 * x7))) + ((u64)x31 * x17)) + ((u64)x35 * x13)) + ((u64)x27 * x21)) + ((u64)x39 * x9)); - { u64 x51 = (((((((((u64)x33 * x17) + ((u64)x35 * x15)) + ((u64)x31 * x19)) + ((u64)x37 * x13)) + ((u64)x29 * x21)) + ((u64)x39 * x11)) + ((u64)x27 * x20)) + ((u64)x38 * x9)); - { u64 x52 = (((((u64)x35 * x17) + (0x2 * (((((u64)x33 * x19) + ((u64)x37 * x15)) + ((u64)x29 * x20)) + ((u64)x38 * x11)))) + ((u64)x31 * x21)) + ((u64)x39 * x13)); - { u64 x53 = (((((((u64)x35 * x19) + ((u64)x37 * x17)) + ((u64)x33 * x21)) + ((u64)x39 * x15)) + ((u64)x31 * x20)) + ((u64)x38 * x13)); - { u64 x54 = (((0x2 * ((((u64)x37 * x19) + ((u64)x33 * x20)) + ((u64)x38 * x15))) + ((u64)x35 * x21)) + ((u64)x39 * x17)); - { u64 x55 = (((((u64)x37 * x21) + ((u64)x39 * x19)) + ((u64)x35 * x20)) + ((u64)x38 * x17)); - { u64 x56 = (((u64)x39 * x21) + (0x2 * (((u64)x37 * x20) + ((u64)x38 * x19)))); - { u64 x57 = (((u64)x39 * x20) + ((u64)x38 * x21)); - { u64 x58 = ((u64)(0x2 * x38) * x20); - { u64 x59 = (x48 + (x58 << 0x4)); - { u64 x60 = (x59 + (x58 << 0x1)); - { u64 x61 = (x60 + x58); - { u64 x62 = (x47 + (x57 << 0x4)); - { u64 x63 = (x62 + (x57 << 0x1)); - { u64 x64 = (x63 + x57); - { u64 x65 = (x46 + (x56 << 0x4)); - { u64 x66 = (x65 + (x56 << 0x1)); - { u64 x67 = (x66 + x56); - { u64 x68 = (x45 + (x55 << 0x4)); - { u64 x69 = (x68 + (x55 << 0x1)); - { u64 x70 = (x69 + x55); - { u64 x71 = (x44 + (x54 << 
0x4)); - { u64 x72 = (x71 + (x54 << 0x1)); - { u64 x73 = (x72 + x54); - { u64 x74 = (x43 + (x53 << 0x4)); - { u64 x75 = (x74 + (x53 << 0x1)); - { u64 x76 = (x75 + x53); - { u64 x77 = (x42 + (x52 << 0x4)); - { u64 x78 = (x77 + (x52 << 0x1)); - { u64 x79 = (x78 + x52); - { u64 x80 = (x41 + (x51 << 0x4)); - { u64 x81 = (x80 + (x51 << 0x1)); - { u64 x82 = (x81 + x51); - { u64 x83 = (x40 + (x50 << 0x4)); - { u64 x84 = (x83 + (x50 << 0x1)); - { u64 x85 = (x84 + x50); - { u64 x86 = (x85 >> 0x1a); - { u32 x87 = ((u32)x85 & 0x3ffffff); - { u64 x88 = (x86 + x82); - { u64 x89 = (x88 >> 0x19); - { u32 x90 = ((u32)x88 & 0x1ffffff); - { u64 x91 = (x89 + x79); - { u64 x92 = (x91 >> 0x1a); - { u32 x93 = ((u32)x91 & 0x3ffffff); - { u64 x94 = (x92 + x76); - { u64 x95 = (x94 >> 0x19); - { u32 x96 = ((u32)x94 & 0x1ffffff); - { u64 x97 = (x95 + x73); - { u64 x98 = (x97 >> 0x1a); - { u32 x99 = ((u32)x97 & 0x3ffffff); - { u64 x100 = (x98 + x70); - { u64 x101 = (x100 >> 0x19); - { u32 x102 = ((u32)x100 & 0x1ffffff); - { u64 x103 = (x101 + x67); - { u64 x104 = (x103 >> 0x1a); - { u32 x105 = ((u32)x103 & 0x3ffffff); - { u64 x106 = (x104 + x64); - { u64 x107 = (x106 >> 0x19); - { u32 x108 = ((u32)x106 & 0x1ffffff); - { u64 x109 = (x107 + x61); - { u64 x110 = (x109 >> 0x1a); - { u32 x111 = ((u32)x109 & 0x3ffffff); - { u64 x112 = (x110 + x49); - { u64 x113 = (x112 >> 0x19); - { u32 x114 = ((u32)x112 & 0x1ffffff); - { u64 x115 = (x87 + (0x13 * x113)); - { u32 x116 = (u32) (x115 >> 0x1a); - { u32 x117 = ((u32)x115 & 0x3ffffff); - { u32 x118 = (x116 + x90); - { u32 x119 = (x118 >> 0x19); - { u32 x120 = (x118 & 0x1ffffff); - out[0] = x117; - out[1] = x120; - out[2] = (x119 + x93); - out[3] = x96; - out[4] = x99; - out[5] = x102; - out[6] = x105; - out[7] = x108; - out[8] = x111; - out[9] = x114; - }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}} -} - -static __always_inline void fe_mul_ttt(fe *h, const fe *f, const fe *g) -{ - fe_mul_impl(h->v, f->v, g->v); -} - -static __always_inline void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) -{ - fe_mul_impl(h->v, f->v, g->v); -} - -static __always_inline void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) -{ - fe_mul_impl(h->v, f->v, g->v); -} - -static __always_inline void fe_sqr_impl(u32 out[10], const u32 in1[10]) -{ - { const u32 x17 = in1[9]; - { const u32 x18 = in1[8]; - { const u32 x16 = in1[7]; - { const u32 x14 = in1[6]; - { const u32 x12 = in1[5]; - { const u32 x10 = in1[4]; - { const u32 x8 = in1[3]; - { const u32 x6 = in1[2]; - { const u32 x4 = in1[1]; - { const u32 x2 = in1[0]; - { u64 x19 = ((u64)x2 * x2); - { u64 x20 = ((u64)(0x2 * x2) * x4); - { u64 x21 = (0x2 * (((u64)x4 * x4) + ((u64)x2 * x6))); - { u64 x22 = (0x2 * (((u64)x4 * x6) + ((u64)x2 * x8))); - { u64 x23 = ((((u64)x6 * x6) + ((u64)(0x4 * x4) * x8)) + ((u64)(0x2 * x2) * x10)); - { u64 x24 = (0x2 * ((((u64)x6 * x8) + ((u64)x4 * x10)) + ((u64)x2 * x12))); - { u64 x25 = (0x2 * (((((u64)x8 * x8) + ((u64)x6 * x10)) + ((u64)x2 * x14)) + ((u64)(0x2 * x4) * x12))); - { u64 x26 = (0x2 * (((((u64)x8 * x10) + ((u64)x6 * x12)) + ((u64)x4 * x14)) + ((u64)x2 * x16))); - { u64 x27 = (((u64)x10 * x10) + (0x2 * ((((u64)x6 * x14) + ((u64)x2 * x18)) + (0x2 * (((u64)x4 * x16) + ((u64)x8 * x12)))))); - { u64 x28 = (0x2 * ((((((u64)x10 * x12) + ((u64)x8 * x14)) + ((u64)x6 * x16)) + ((u64)x4 * x18)) + ((u64)x2 * x17))); - { u64 x29 = (0x2 * (((((u64)x12 * x12) + ((u64)x10 * x14)) + ((u64)x6 * x18)) + (0x2 * (((u64)x8 * x16) + ((u64)x4 * x17))))); - { u64 x30 
= (0x2 * (((((u64)x12 * x14) + ((u64)x10 * x16)) + ((u64)x8 * x18)) + ((u64)x6 * x17))); - { u64 x31 = (((u64)x14 * x14) + (0x2 * (((u64)x10 * x18) + (0x2 * (((u64)x12 * x16) + ((u64)x8 * x17)))))); - { u64 x32 = (0x2 * ((((u64)x14 * x16) + ((u64)x12 * x18)) + ((u64)x10 * x17))); - { u64 x33 = (0x2 * ((((u64)x16 * x16) + ((u64)x14 * x18)) + ((u64)(0x2 * x12) * x17))); - { u64 x34 = (0x2 * (((u64)x16 * x18) + ((u64)x14 * x17))); - { u64 x35 = (((u64)x18 * x18) + ((u64)(0x4 * x16) * x17)); - { u64 x36 = ((u64)(0x2 * x18) * x17); - { u64 x37 = ((u64)(0x2 * x17) * x17); - { u64 x38 = (x27 + (x37 << 0x4)); - { u64 x39 = (x38 + (x37 << 0x1)); - { u64 x40 = (x39 + x37); - { u64 x41 = (x26 + (x36 << 0x4)); - { u64 x42 = (x41 + (x36 << 0x1)); - { u64 x43 = (x42 + x36); - { u64 x44 = (x25 + (x35 << 0x4)); - { u64 x45 = (x44 + (x35 << 0x1)); - { u64 x46 = (x45 + x35); - { u64 x47 = (x24 + (x34 << 0x4)); - { u64 x48 = (x47 + (x34 << 0x1)); - { u64 x49 = (x48 + x34); - { u64 x50 = (x23 + (x33 << 0x4)); - { u64 x51 = (x50 + (x33 << 0x1)); - { u64 x52 = (x51 + x33); - { u64 x53 = (x22 + (x32 << 0x4)); - { u64 x54 = (x53 + (x32 << 0x1)); - { u64 x55 = (x54 + x32); - { u64 x56 = (x21 + (x31 << 0x4)); - { u64 x57 = (x56 + (x31 << 0x1)); - { u64 x58 = (x57 + x31); - { u64 x59 = (x20 + (x30 << 0x4)); - { u64 x60 = (x59 + (x30 << 0x1)); - { u64 x61 = (x60 + x30); - { u64 x62 = (x19 + (x29 << 0x4)); - { u64 x63 = (x62 + (x29 << 0x1)); - { u64 x64 = (x63 + x29); - { u64 x65 = (x64 >> 0x1a); - { u32 x66 = ((u32)x64 & 0x3ffffff); - { u64 x67 = (x65 + x61); - { u64 x68 = (x67 >> 0x19); - { u32 x69 = ((u32)x67 & 0x1ffffff); - { u64 x70 = (x68 + x58); - { u64 x71 = (x70 >> 0x1a); - { u32 x72 = ((u32)x70 & 0x3ffffff); - { u64 x73 = (x71 + x55); - { u64 x74 = (x73 >> 0x19); - { u32 x75 = ((u32)x73 & 0x1ffffff); - { u64 x76 = (x74 + x52); - { u64 x77 = (x76 >> 0x1a); - { u32 x78 = ((u32)x76 & 0x3ffffff); - { u64 x79 = (x77 + x49); - { u64 x80 = (x79 >> 0x19); - { u32 x81 = ((u32)x79 & 0x1ffffff); - { u64 x82 = (x80 + x46); - { u64 x83 = (x82 >> 0x1a); - { u32 x84 = ((u32)x82 & 0x3ffffff); - { u64 x85 = (x83 + x43); - { u64 x86 = (x85 >> 0x19); - { u32 x87 = ((u32)x85 & 0x1ffffff); - { u64 x88 = (x86 + x40); - { u64 x89 = (x88 >> 0x1a); - { u32 x90 = ((u32)x88 & 0x3ffffff); - { u64 x91 = (x89 + x28); - { u64 x92 = (x91 >> 0x19); - { u32 x93 = ((u32)x91 & 0x1ffffff); - { u64 x94 = (x66 + (0x13 * x92)); - { u32 x95 = (u32) (x94 >> 0x1a); - { u32 x96 = ((u32)x94 & 0x3ffffff); - { u32 x97 = (x95 + x69); - { u32 x98 = (x97 >> 0x19); - { u32 x99 = (x97 & 0x1ffffff); - out[0] = x96; - out[1] = x99; - out[2] = (x98 + x72); - out[3] = x75; - out[4] = x78; - out[5] = x81; - out[6] = x84; - out[7] = x87; - out[8] = x90; - out[9] = x93; - }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}} -} - -static __always_inline void fe_sq_tl(fe *h, const fe_loose *f) -{ - fe_sqr_impl(h->v, f->v); -} - -static __always_inline void fe_sq_tt(fe *h, const fe *f) -{ - fe_sqr_impl(h->v, f->v); -} - -static __always_inline void fe_loose_invert(fe *out, const fe_loose *z) -{ - fe t0; - fe t1; - fe t2; - fe t3; - int i; - - fe_sq_tl(&t0, z); - fe_sq_tt(&t1, &t0); - for (i = 1; i < 2; ++i) - fe_sq_tt(&t1, &t1); - fe_mul_tlt(&t1, z, &t1); - fe_mul_ttt(&t0, &t0, &t1); - fe_sq_tt(&t2, &t0); - fe_mul_ttt(&t1, &t1, &t2); - fe_sq_tt(&t2, &t1); - for (i = 1; i < 5; ++i) - fe_sq_tt(&t2, &t2); - fe_mul_ttt(&t1, &t2, &t1); - fe_sq_tt(&t2, &t1); - for (i = 1; i < 10; ++i) - fe_sq_tt(&t2, &t2); - fe_mul_ttt(&t2, &t2, &t1); - 
fe_sq_tt(&t3, &t2); - for (i = 1; i < 20; ++i) - fe_sq_tt(&t3, &t3); - fe_mul_ttt(&t2, &t3, &t2); - fe_sq_tt(&t2, &t2); - for (i = 1; i < 10; ++i) - fe_sq_tt(&t2, &t2); - fe_mul_ttt(&t1, &t2, &t1); - fe_sq_tt(&t2, &t1); - for (i = 1; i < 50; ++i) - fe_sq_tt(&t2, &t2); - fe_mul_ttt(&t2, &t2, &t1); - fe_sq_tt(&t3, &t2); - for (i = 1; i < 100; ++i) - fe_sq_tt(&t3, &t3); - fe_mul_ttt(&t2, &t3, &t2); - fe_sq_tt(&t2, &t2); - for (i = 1; i < 50; ++i) - fe_sq_tt(&t2, &t2); - fe_mul_ttt(&t1, &t2, &t1); - fe_sq_tt(&t1, &t1); - for (i = 1; i < 5; ++i) - fe_sq_tt(&t1, &t1); - fe_mul_ttt(out, &t1, &t0); -} - -static __always_inline void fe_invert(fe *out, const fe *z) -{ - fe_loose l; - fe_copy_lt(&l, z); - fe_loose_invert(out, &l); -} - -/* Replace (f,g) with (g,f) if b == 1; - * replace (f,g) with (f,g) if b == 0. - * - * Preconditions: b in {0,1} - */ -static __always_inline void fe_cswap(fe *f, fe *g, unsigned int b) -{ - unsigned i; - b = 0-b; - for (i = 0; i < 10; i++) { - u32 x = f->v[i] ^ g->v[i]; - x &= b; - f->v[i] ^= x; - g->v[i] ^= x; - } -} - -/* NOTE: based on fiat-crypto fe_mul, edited for in2=121666, 0, 0.*/ -static __always_inline void fe_mul_121666_impl(u32 out[10], const u32 in1[10]) -{ - { const u32 x20 = in1[9]; - { const u32 x21 = in1[8]; - { const u32 x19 = in1[7]; - { const u32 x17 = in1[6]; - { const u32 x15 = in1[5]; - { const u32 x13 = in1[4]; - { const u32 x11 = in1[3]; - { const u32 x9 = in1[2]; - { const u32 x7 = in1[1]; - { const u32 x5 = in1[0]; - { const u32 x38 = 0; - { const u32 x39 = 0; - { const u32 x37 = 0; - { const u32 x35 = 0; - { const u32 x33 = 0; - { const u32 x31 = 0; - { const u32 x29 = 0; - { const u32 x27 = 0; - { const u32 x25 = 0; - { const u32 x23 = 121666; - { u64 x40 = ((u64)x23 * x5); - { u64 x41 = (((u64)x23 * x7) + ((u64)x25 * x5)); - { u64 x42 = ((((u64)(0x2 * x25) * x7) + ((u64)x23 * x9)) + ((u64)x27 * x5)); - { u64 x43 = (((((u64)x25 * x9) + ((u64)x27 * x7)) + ((u64)x23 * x11)) + ((u64)x29 * x5)); - { u64 x44 = (((((u64)x27 * x9) + (0x2 * (((u64)x25 * x11) + ((u64)x29 * x7)))) + ((u64)x23 * x13)) + ((u64)x31 * x5)); - { u64 x45 = (((((((u64)x27 * x11) + ((u64)x29 * x9)) + ((u64)x25 * x13)) + ((u64)x31 * x7)) + ((u64)x23 * x15)) + ((u64)x33 * x5)); - { u64 x46 = (((((0x2 * ((((u64)x29 * x11) + ((u64)x25 * x15)) + ((u64)x33 * x7))) + ((u64)x27 * x13)) + ((u64)x31 * x9)) + ((u64)x23 * x17)) + ((u64)x35 * x5)); - { u64 x47 = (((((((((u64)x29 * x13) + ((u64)x31 * x11)) + ((u64)x27 * x15)) + ((u64)x33 * x9)) + ((u64)x25 * x17)) + ((u64)x35 * x7)) + ((u64)x23 * x19)) + ((u64)x37 * x5)); - { u64 x48 = (((((((u64)x31 * x13) + (0x2 * (((((u64)x29 * x15) + ((u64)x33 * x11)) + ((u64)x25 * x19)) + ((u64)x37 * x7)))) + ((u64)x27 * x17)) + ((u64)x35 * x9)) + ((u64)x23 * x21)) + ((u64)x39 * x5)); - { u64 x49 = (((((((((((u64)x31 * x15) + ((u64)x33 * x13)) + ((u64)x29 * x17)) + ((u64)x35 * x11)) + ((u64)x27 * x19)) + ((u64)x37 * x9)) + ((u64)x25 * x21)) + ((u64)x39 * x7)) + ((u64)x23 * x20)) + ((u64)x38 * x5)); - { u64 x50 = (((((0x2 * ((((((u64)x33 * x15) + ((u64)x29 * x19)) + ((u64)x37 * x11)) + ((u64)x25 * x20)) + ((u64)x38 * x7))) + ((u64)x31 * x17)) + ((u64)x35 * x13)) + ((u64)x27 * x21)) + ((u64)x39 * x9)); - { u64 x51 = (((((((((u64)x33 * x17) + ((u64)x35 * x15)) + ((u64)x31 * x19)) + ((u64)x37 * x13)) + ((u64)x29 * x21)) + ((u64)x39 * x11)) + ((u64)x27 * x20)) + ((u64)x38 * x9)); - { u64 x52 = (((((u64)x35 * x17) + (0x2 * (((((u64)x33 * x19) + ((u64)x37 * x15)) + ((u64)x29 * x20)) + ((u64)x38 * x11)))) + ((u64)x31 * x21)) + ((u64)x39 * x13)); - { u64 
x53 = (((((((u64)x35 * x19) + ((u64)x37 * x17)) + ((u64)x33 * x21)) + ((u64)x39 * x15)) + ((u64)x31 * x20)) + ((u64)x38 * x13)); - { u64 x54 = (((0x2 * ((((u64)x37 * x19) + ((u64)x33 * x20)) + ((u64)x38 * x15))) + ((u64)x35 * x21)) + ((u64)x39 * x17)); - { u64 x55 = (((((u64)x37 * x21) + ((u64)x39 * x19)) + ((u64)x35 * x20)) + ((u64)x38 * x17)); - { u64 x56 = (((u64)x39 * x21) + (0x2 * (((u64)x37 * x20) + ((u64)x38 * x19)))); - { u64 x57 = (((u64)x39 * x20) + ((u64)x38 * x21)); - { u64 x58 = ((u64)(0x2 * x38) * x20); - { u64 x59 = (x48 + (x58 << 0x4)); - { u64 x60 = (x59 + (x58 << 0x1)); - { u64 x61 = (x60 + x58); - { u64 x62 = (x47 + (x57 << 0x4)); - { u64 x63 = (x62 + (x57 << 0x1)); - { u64 x64 = (x63 + x57); - { u64 x65 = (x46 + (x56 << 0x4)); - { u64 x66 = (x65 + (x56 << 0x1)); - { u64 x67 = (x66 + x56); - { u64 x68 = (x45 + (x55 << 0x4)); - { u64 x69 = (x68 + (x55 << 0x1)); - { u64 x70 = (x69 + x55); - { u64 x71 = (x44 + (x54 << 0x4)); - { u64 x72 = (x71 + (x54 << 0x1)); - { u64 x73 = (x72 + x54); - { u64 x74 = (x43 + (x53 << 0x4)); - { u64 x75 = (x74 + (x53 << 0x1)); - { u64 x76 = (x75 + x53); - { u64 x77 = (x42 + (x52 << 0x4)); - { u64 x78 = (x77 + (x52 << 0x1)); - { u64 x79 = (x78 + x52); - { u64 x80 = (x41 + (x51 << 0x4)); - { u64 x81 = (x80 + (x51 << 0x1)); - { u64 x82 = (x81 + x51); - { u64 x83 = (x40 + (x50 << 0x4)); - { u64 x84 = (x83 + (x50 << 0x1)); - { u64 x85 = (x84 + x50); - { u64 x86 = (x85 >> 0x1a); - { u32 x87 = ((u32)x85 & 0x3ffffff); - { u64 x88 = (x86 + x82); - { u64 x89 = (x88 >> 0x19); - { u32 x90 = ((u32)x88 & 0x1ffffff); - { u64 x91 = (x89 + x79); - { u64 x92 = (x91 >> 0x1a); - { u32 x93 = ((u32)x91 & 0x3ffffff); - { u64 x94 = (x92 + x76); - { u64 x95 = (x94 >> 0x19); - { u32 x96 = ((u32)x94 & 0x1ffffff); - { u64 x97 = (x95 + x73); - { u64 x98 = (x97 >> 0x1a); - { u32 x99 = ((u32)x97 & 0x3ffffff); - { u64 x100 = (x98 + x70); - { u64 x101 = (x100 >> 0x19); - { u32 x102 = ((u32)x100 & 0x1ffffff); - { u64 x103 = (x101 + x67); - { u64 x104 = (x103 >> 0x1a); - { u32 x105 = ((u32)x103 & 0x3ffffff); - { u64 x106 = (x104 + x64); - { u64 x107 = (x106 >> 0x19); - { u32 x108 = ((u32)x106 & 0x1ffffff); - { u64 x109 = (x107 + x61); - { u64 x110 = (x109 >> 0x1a); - { u32 x111 = ((u32)x109 & 0x3ffffff); - { u64 x112 = (x110 + x49); - { u64 x113 = (x112 >> 0x19); - { u32 x114 = ((u32)x112 & 0x1ffffff); - { u64 x115 = (x87 + (0x13 * x113)); - { u32 x116 = (u32) (x115 >> 0x1a); - { u32 x117 = ((u32)x115 & 0x3ffffff); - { u32 x118 = (x116 + x90); - { u32 x119 = (x118 >> 0x19); - { u32 x120 = (x118 & 0x1ffffff); - out[0] = x117; - out[1] = x120; - out[2] = (x119 + x93); - out[3] = x96; - out[4] = x99; - out[5] = x102; - out[6] = x105; - out[7] = x108; - out[8] = x111; - out[9] = x114; - }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}} -} - -static __always_inline void fe_mul121666(fe *h, const fe_loose *f) -{ - fe_mul_121666_impl(h->v, f->v); -} - -bool curve25519_fiat32(u8 out[CURVE25519_POINT_SIZE], const u8 scalar[CURVE25519_POINT_SIZE], const u8 point[CURVE25519_POINT_SIZE]) -{ - fe x1, x2, z2, x3, z3; - fe_loose x2l, z2l, x3l; - unsigned swap = 0; - int pos; - u8 e[32]; - - memcpy(e, scalar, 32); - normalize_secret(e); - - /* The following implementation was transcribed to Coq and proven to - * correspond to unary scalar multiplication in affine coordinates given that - * x1 != 0 is the x coordinate of some point on the curve. 
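The ladder loop below uses a deferred-swap idiom: "swap ^= b; fe_cswap(..., swap); swap = b;" achieves, with a single cswap per iteration, the same selection as the donna loops above, which swap in before and back out after every step. A toy demonstration that the two schedules agree:

	#include <assert.h>
	#include <stdint.h>

	static void cswap(uint32_t *x, uint32_t *y, uint32_t b)
	{
		uint32_t t = (0 - b) & (*x ^ *y);

		*x ^= t;
		*y ^= t;
	}

	int main(void)
	{
		static const uint32_t bits[8] = { 1, 0, 0, 1, 1, 1, 0, 1 };
		uint32_t a0 = 100, b0 = 200;	/* eager: swap in and out each step */
		uint32_t a1 = 100, b1 = 200;	/* deferred, as in the loop below */
		uint32_t swap = 0;
		unsigned int i;

		for (i = 0; i < 8; i++) {
			cswap(&a0, &b0, bits[i]);
			a0 += 1; b0 += 2;	/* stand-in for the ladder step */
			cswap(&a0, &b0, bits[i]);

			swap ^= bits[i];	/* one cswap per iteration */
			cswap(&a1, &b1, swap);
			swap = bits[i];
			a1 += 1; b1 += 2;
		}
		cswap(&a1, &b1, swap);		/* final unswap after the loop */
		assert(a0 == a1 && b0 == b1);
		return 0;
	}
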
It was also checked - * in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 - * = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the - * underlying field, so it applies to Curve25519 itself and the quadratic - * twist of Curve25519. It was not proven in Coq that prime-field arithmetic - * correctly simulates extension-field arithmetic on prime-field values. - * The decoding of the byte array representation of e was not considered. - * Specification of Montgomery curves in affine coordinates: - * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27> - * Proof that these form a group that is isomorphic to a Weierstrass curve: - * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35> - * Coq transcription and correctness proof of the loop (where scalarbits=255): - * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118> - * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278> - * preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 - */ - fe_frombytes(&x1, point); - fe_1(&x2); - fe_0(&z2); - fe_copy(&x3, &x1); - fe_1(&z3); - - for (pos = 254; pos >= 0; --pos) { - fe tmp0, tmp1; - fe_loose tmp0l, tmp1l; - /* loop invariant as of right before the test, for the case where x1 != 0: - * pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero - * let r := e >> (pos+1) in the following equalities of projective points: - * to_xz (r*P) === if swap then (x3, z3) else (x2, z2) - * to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) - * x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) - */ - unsigned b = 1 & (e[pos / 8] >> (pos & 7)); - swap ^= b; - fe_cswap(&x2, &x3, swap); - fe_cswap(&z2, &z3, swap); - swap = b; - /* Coq transcription of ladderstep formula (called from transcribed loop): - * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89> - * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131> - * x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217> - * x1 = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147> - */ - fe_sub(&tmp0l, &x3, &z3); - fe_sub(&tmp1l, &x2, &z2); - fe_add(&x2l, &x2, &z2); - fe_add(&z2l, &x3, &z3); - fe_mul_tll(&z3, &tmp0l, &x2l); - fe_mul_tll(&z2, &z2l, &tmp1l); - fe_sq_tl(&tmp0, &tmp1l); - fe_sq_tl(&tmp1, &x2l); - fe_add(&x3l, &z3, &z2); - fe_sub(&z2l, &z3, &z2); - fe_mul_ttt(&x2, &tmp1, &tmp0); - fe_sub(&tmp1l, &tmp1, &tmp0); - fe_sq_tl(&z2, &z2l); - fe_mul121666(&z3, &tmp1l); - fe_sq_tl(&x3, &x3l); - fe_add(&tmp0l, &tmp0, &z3); - fe_mul_ttt(&z3, &x1, &z2); - fe_mul_tll(&z2, &tmp1l, &tmp0l); - } - /* here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2) */ - fe_cswap(&x2, &x3, swap); - fe_cswap(&z2, &z3, swap); - - fe_invert(&z2, &z2); - fe_mul_ttt(&x2, &x2, &z2); - fe_tobytes(out, &x2); - - return true; -} diff --git a/curve25519-fiat64.c b/curve25519-fiat64.c deleted file mode 100644 index ed8119f..0000000 --- a/curve25519-fiat64.c +++ /dev/null @@ -1,577 +0,0 @@ -/* SPDX-License-Identifier: 
GPL-2.0 - * - * Copyright (C) 2015-2016 The fiat-crypto Authors. - * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * - * This is a machine-generated formally verified implementation of curve25519 DH from: - * https://github.com/mit-plv/fiat-crypto - */ - -#include <linux/kernel.h> -#include <linux/string.h> - -typedef __uint128_t u128; - -enum { CURVE25519_POINT_SIZE = 32 }; - -static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE]) -{ - secret[0] &= 248; - secret[31] &= 127; - secret[31] |= 64; -} - -/* fe means field element. Here the field is \Z/(2^255-19). An element t, - * entries t[0]...t[4], represents the integer t[0]+2^51 t[1]+2^102 t[2]+2^153 - * t[3]+2^204 t[4]. - * fe limbs are bounded by 1.125*2^51. - * Multiplication and carrying produce fe from fe_loose. - */ -typedef struct fe { u64 v[5]; } fe; - -/* fe_loose limbs are bounded by 3.375*2^51. - * Addition and subtraction produce fe_loose from (fe, fe). - */ -typedef struct fe_loose { u64 v[5]; } fe_loose; - -static __always_inline void fe_frombytes_impl(u64 h[5], const u8 *s) -{ - // Ignores top bit of s. - u64 a0 = le64_to_cpup((__force __le64 *)(s)); - u64 a1 = le64_to_cpup((__force __le64 *)(s+8)); - u64 a2 = le64_to_cpup((__force __le64 *)(s+16)); - u64 a3 = le64_to_cpup((__force __le64 *)(s+24)); - // Use 51 bits, 64-51 = 13 left. - h[0] = a0 & ((1ULL << 51) - 1); - // (64-51) + 38 = 13 + 38 = 51 - h[1] = (a0 >> 51) | ((a1 & ((1ULL << 38) - 1)) << 13); - // (64-38) + 25 = 26 + 25 = 51 - h[2] = (a1 >> 38) | ((a2 & ((1ULL << 25) - 1)) << 26); - // (64-25) + 12 = 39 + 12 = 51 - h[3] = (a2 >> 25) | ((a3 & ((1ULL << 12) - 1)) << 39); - // (64-12) = 52, ignore top bit - h[4] = (a3 >> 12) & ((1ULL << 51) - 1); -} - -static __always_inline void fe_frombytes(fe *h, const u8 *s) -{ - fe_frombytes_impl(h->v, s); -} - -static __always_inline u8 /*bool*/ addcarryx_u51(u8 /*bool*/ c, u64 a, u64 b, u64 *low) -{ - /* This function extracts 51 bits of result and 1 bit of carry (52 total), so - *a 64-bit intermediate is sufficient. - */ - u64 x = a + b + c; - *low = x & ((1ULL << 51) - 1); - return (x >> 51) & 1; -} - -static __always_inline u8 /*bool*/ subborrow_u51(u8 /*bool*/ c, u64 a, u64 b, u64 *low) -{ - /* This function extracts 51 bits of result and 1 bit of borrow (52 total), so - * a 64-bit intermediate is sufficient. 
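Where the 32-bit code interleaves 26- and 25-bit limbs, this implementation uses a uniform radix 2^51: limb i begins exactly at bit 51*i, so the shifts in fe_frombytes_impl above are just 51*i decomposed as 64*word + shift. A standalone check:

	#include <assert.h>

	int main(void)
	{
		/* (word, shift) pairs used by fe_frombytes_impl for limbs 0..4 */
		static const int word[5]  = { 0, 0, 1, 2, 3 };
		static const int shift[5] = { 0, 51, 38, 25, 12 };
		int i;

		for (i = 0; i < 5; i++)
			assert(64 * word[i] + shift[i] == 51 * i);
		return 0;
	}
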
- */ - u64 x = a - b - c; - *low = x & ((1ULL << 51) - 1); - return x >> 63; -} - -static __always_inline u64 cmovznz64(u64 t, u64 z, u64 nz) -{ - /* all set if nonzero, 0 if 0 */ - t = -!!t; - return (t&nz) | ((~t)&z); -} - -static __always_inline void fe_freeze(u64 out[5], const u64 in1[5]) -{ - { const u64 x7 = in1[4]; - { const u64 x8 = in1[3]; - { const u64 x6 = in1[2]; - { const u64 x4 = in1[1]; - { const u64 x2 = in1[0]; - { u64 x10; u8/*bool*/ x11 = subborrow_u51(0x0, x2, 0x7ffffffffffed, &x10); - { u64 x13; u8/*bool*/ x14 = subborrow_u51(x11, x4, 0x7ffffffffffff, &x13); - { u64 x16; u8/*bool*/ x17 = subborrow_u51(x14, x6, 0x7ffffffffffff, &x16); - { u64 x19; u8/*bool*/ x20 = subborrow_u51(x17, x8, 0x7ffffffffffff, &x19); - { u64 x22; u8/*bool*/ x23 = subborrow_u51(x20, x7, 0x7ffffffffffff, &x22); - { u64 x24 = cmovznz64(x23, 0x0, 0xffffffffffffffffL); - { u64 x25 = (x24 & 0x7ffffffffffed); - { u64 x27; u8/*bool*/ x28 = addcarryx_u51(0x0, x10, x25, &x27); - { u64 x29 = (x24 & 0x7ffffffffffff); - { u64 x31; u8/*bool*/ x32 = addcarryx_u51(x28, x13, x29, &x31); - { u64 x33 = (x24 & 0x7ffffffffffff); - { u64 x35; u8/*bool*/ x36 = addcarryx_u51(x32, x16, x33, &x35); - { u64 x37 = (x24 & 0x7ffffffffffff); - { u64 x39; u8/*bool*/ x40 = addcarryx_u51(x36, x19, x37, &x39); - { u64 x41 = (x24 & 0x7ffffffffffff); - { u64 x43; addcarryx_u51(x40, x22, x41, &x43); - out[0] = x27; - out[1] = x31; - out[2] = x35; - out[3] = x39; - out[4] = x43; - }}}}}}}}}}}}}}}}}}}}} -} - -static __always_inline void fe_tobytes(u8 s[32], const fe *f) -{ - u64 h[5]; - fe_freeze(h, f->v); - - s[0] = h[0] >> 0; - s[1] = h[0] >> 8; - s[2] = h[0] >> 16; - s[3] = h[0] >> 24; - s[4] = h[0] >> 32; - s[5] = h[0] >> 40; - s[6] = (h[0] >> 48) | (h[1] << 3); - s[7] = h[1] >> 5; - s[8] = h[1] >> 13; - s[9] = h[1] >> 21; - s[10] = h[1] >> 29; - s[11] = h[1] >> 37; - s[12] = (h[1] >> 45) | (h[2] << 6); - s[13] = h[2] >> 2; - s[14] = h[2] >> 10; - s[15] = h[2] >> 18; - s[16] = h[2] >> 26; - s[17] = h[2] >> 34; - s[18] = h[2] >> 42; - s[19] = (h[2] >> 50) | (h[3] << 1); - s[20] = h[3] >> 7; - s[21] = h[3] >> 15; - s[22] = h[3] >> 23; - s[23] = h[3] >> 31; - s[24] = h[3] >> 39; - s[25] = (h[3] >> 47) | (h[4] << 4); - s[26] = h[4] >> 4; - s[27] = h[4] >> 12; - s[28] = h[4] >> 20; - s[29] = h[4] >> 28; - s[30] = h[4] >> 36; - s[31] = h[4] >> 44; -} - -/* h = f */ -static __always_inline void fe_copy(fe *h, const fe *f) -{ - memmove(h, f, sizeof(fe)); -} - -static __always_inline void fe_copy_lt(fe_loose *h, const fe *f) -{ - memmove(h, f, sizeof(fe)); -} - -/* h = 0 */ -static __always_inline void fe_0(fe *h) -{ - memset(h, 0, sizeof(fe)); -} - -/* h = 1 */ -static __always_inline void fe_1(fe *h) -{ - memset(h, 0, sizeof(fe)); - h->v[0] = 1; -} - -static __always_inline void fe_add_impl(u64 out[5], const u64 in1[5], const u64 in2[5]) -{ - { const u64 x10 = in1[4]; - { const u64 x11 = in1[3]; - { const u64 x9 = in1[2]; - { const u64 x7 = in1[1]; - { const u64 x5 = in1[0]; - { const u64 x18 = in2[4]; - { const u64 x19 = in2[3]; - { const u64 x17 = in2[2]; - { const u64 x15 = in2[1]; - { const u64 x13 = in2[0]; - out[0] = (x5 + x13); - out[1] = (x7 + x15); - out[2] = (x9 + x17); - out[3] = (x11 + x19); - out[4] = (x10 + x18); - }}}}}}}}}} -} - -/* h = f + g - * Can overlap h with f or g. 
- */ -static __always_inline void fe_add(fe_loose *h, const fe *f, const fe *g) -{ - fe_add_impl(h->v, f->v, g->v); -} - -static __always_inline void fe_sub_impl(u64 out[5], const u64 in1[5], const u64 in2[5]) -{ - { const u64 x10 = in1[4]; - { const u64 x11 = in1[3]; - { const u64 x9 = in1[2]; - { const u64 x7 = in1[1]; - { const u64 x5 = in1[0]; - { const u64 x18 = in2[4]; - { const u64 x19 = in2[3]; - { const u64 x17 = in2[2]; - { const u64 x15 = in2[1]; - { const u64 x13 = in2[0]; - out[0] = ((0xfffffffffffda + x5) - x13); - out[1] = ((0xffffffffffffe + x7) - x15); - out[2] = ((0xffffffffffffe + x9) - x17); - out[3] = ((0xffffffffffffe + x11) - x19); - out[4] = ((0xffffffffffffe + x10) - x18); - }}}}}}}}}} -} - -/* h = f - g - * Can overlap h with f or g. - */ -static __always_inline void fe_sub(fe_loose *h, const fe *f, const fe *g) -{ - fe_sub_impl(h->v, f->v, g->v); -} - -static __always_inline void fe_mul_impl(u64 out[5], const u64 in1[5], const u64 in2[5]) -{ - { const u64 x10 = in1[4]; - { const u64 x11 = in1[3]; - { const u64 x9 = in1[2]; - { const u64 x7 = in1[1]; - { const u64 x5 = in1[0]; - { const u64 x18 = in2[4]; - { const u64 x19 = in2[3]; - { const u64 x17 = in2[2]; - { const u64 x15 = in2[1]; - { const u64 x13 = in2[0]; - { u128 x20 = ((u128)x5 * x13); - { u128 x21 = (((u128)x5 * x15) + ((u128)x7 * x13)); - { u128 x22 = ((((u128)x5 * x17) + ((u128)x9 * x13)) + ((u128)x7 * x15)); - { u128 x23 = (((((u128)x5 * x19) + ((u128)x11 * x13)) + ((u128)x7 * x17)) + ((u128)x9 * x15)); - { u128 x24 = ((((((u128)x5 * x18) + ((u128)x10 * x13)) + ((u128)x11 * x15)) + ((u128)x7 * x19)) + ((u128)x9 * x17)); - { u64 x25 = (x10 * 0x13); - { u64 x26 = (x7 * 0x13); - { u64 x27 = (x9 * 0x13); - { u64 x28 = (x11 * 0x13); - { u128 x29 = ((((x20 + ((u128)x25 * x15)) + ((u128)x26 * x18)) + ((u128)x27 * x19)) + ((u128)x28 * x17)); - { u128 x30 = (((x21 + ((u128)x25 * x17)) + ((u128)x27 * x18)) + ((u128)x28 * x19)); - { u128 x31 = ((x22 + ((u128)x25 * x19)) + ((u128)x28 * x18)); - { u128 x32 = (x23 + ((u128)x25 * x18)); - { u64 x33 = (u64) (x29 >> 0x33); - { u64 x34 = ((u64)x29 & 0x7ffffffffffff); - { u128 x35 = (x33 + x30); - { u64 x36 = (u64) (x35 >> 0x33); - { u64 x37 = ((u64)x35 & 0x7ffffffffffff); - { u128 x38 = (x36 + x31); - { u64 x39 = (u64) (x38 >> 0x33); - { u64 x40 = ((u64)x38 & 0x7ffffffffffff); - { u128 x41 = (x39 + x32); - { u64 x42 = (u64) (x41 >> 0x33); - { u64 x43 = ((u64)x41 & 0x7ffffffffffff); - { u128 x44 = (x42 + x24); - { u64 x45 = (u64) (x44 >> 0x33); - { u64 x46 = ((u64)x44 & 0x7ffffffffffff); - { u64 x47 = (x34 + (0x13 * x45)); - { u64 x48 = (x47 >> 0x33); - { u64 x49 = (x47 & 0x7ffffffffffff); - { u64 x50 = (x48 + x37); - { u64 x51 = (x50 >> 0x33); - { u64 x52 = (x50 & 0x7ffffffffffff); - out[0] = x49; - out[1] = x52; - out[2] = (x51 + x40); - out[3] = x43; - out[4] = x46; - }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}} -} - -static __always_inline void fe_mul_ttt(fe *h, const fe *f, const fe *g) -{ - fe_mul_impl(h->v, f->v, g->v); -} - -static __always_inline void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) -{ - fe_mul_impl(h->v, f->v, g->v); -} - -static __always_inline void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) -{ - fe_mul_impl(h->v, f->v, g->v); -} - - -static __always_inline void fe_sqr_impl(u64 out[5], const u64 in1[5]) -{ - { const u64 x7 = in1[4]; - { const u64 x8 = in1[3]; - { const u64 x6 = in1[2]; - { const u64 x4 = in1[1]; - { const u64 x2 = in1[0]; - { u64 x9 = (x2 * 0x2); - { u64 x10 = (x4 * 0x2); - { u64 x11 = ((x6 * 0x2) * 0x13); - { 
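/* The pre-doubled and 19-scaled terms below (x9..x13) exploit the symmetry
 * of squaring to halve the number of limb products; products whose weight
 * reaches 2^255 are folded back in with a factor of 19, since
 * 2^255 = 19 (mod 2^255 - 19).
 */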
u64 x12 = (x7 * 0x13); - { u64 x13 = (x12 * 0x2); - { u128 x14 = ((((u128)x2 * x2) + ((u128)x13 * x4)) + ((u128)x11 * x8)); - { u128 x15 = ((((u128)x9 * x4) + ((u128)x13 * x6)) + ((u128)x8 * (x8 * 0x13))); - { u128 x16 = ((((u128)x9 * x6) + ((u128)x4 * x4)) + ((u128)x13 * x8)); - { u128 x17 = ((((u128)x9 * x8) + ((u128)x10 * x6)) + ((u128)x7 * x12)); - { u128 x18 = ((((u128)x9 * x7) + ((u128)x10 * x8)) + ((u128)x6 * x6)); - { u64 x19 = (u64) (x14 >> 0x33); - { u64 x20 = ((u64)x14 & 0x7ffffffffffff); - { u128 x21 = (x19 + x15); - { u64 x22 = (u64) (x21 >> 0x33); - { u64 x23 = ((u64)x21 & 0x7ffffffffffff); - { u128 x24 = (x22 + x16); - { u64 x25 = (u64) (x24 >> 0x33); - { u64 x26 = ((u64)x24 & 0x7ffffffffffff); - { u128 x27 = (x25 + x17); - { u64 x28 = (u64) (x27 >> 0x33); - { u64 x29 = ((u64)x27 & 0x7ffffffffffff); - { u128 x30 = (x28 + x18); - { u64 x31 = (u64) (x30 >> 0x33); - { u64 x32 = ((u64)x30 & 0x7ffffffffffff); - { u64 x33 = (x20 + (0x13 * x31)); - { u64 x34 = (x33 >> 0x33); - { u64 x35 = (x33 & 0x7ffffffffffff); - { u64 x36 = (x34 + x23); - { u64 x37 = (x36 >> 0x33); - { u64 x38 = (x36 & 0x7ffffffffffff); - out[0] = x35; - out[1] = x38; - out[2] = (x37 + x26); - out[3] = x29; - out[4] = x32; - }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}} -} - -static __always_inline void fe_sq_tl(fe *h, const fe_loose *f) -{ - fe_sqr_impl(h->v, f->v); -} - -static __always_inline void fe_sq_tt(fe *h, const fe *f) -{ - fe_sqr_impl(h->v, f->v); -} - -static __always_inline void fe_loose_invert(fe *out, const fe_loose *z) -{ - fe t0; - fe t1; - fe t2; - fe t3; - int i; - - fe_sq_tl(&t0, z); - fe_sq_tt(&t1, &t0); - for (i = 1; i < 2; ++i) - fe_sq_tt(&t1, &t1); - fe_mul_tlt(&t1, z, &t1); - fe_mul_ttt(&t0, &t0, &t1); - fe_sq_tt(&t2, &t0); - fe_mul_ttt(&t1, &t1, &t2); - fe_sq_tt(&t2, &t1); - for (i = 1; i < 5; ++i) - fe_sq_tt(&t2, &t2); - fe_mul_ttt(&t1, &t2, &t1); - fe_sq_tt(&t2, &t1); - for (i = 1; i < 10; ++i) - fe_sq_tt(&t2, &t2); - fe_mul_ttt(&t2, &t2, &t1); - fe_sq_tt(&t3, &t2); - for (i = 1; i < 20; ++i) - fe_sq_tt(&t3, &t3); - fe_mul_ttt(&t2, &t3, &t2); - fe_sq_tt(&t2, &t2); - for (i = 1; i < 10; ++i) - fe_sq_tt(&t2, &t2); - fe_mul_ttt(&t1, &t2, &t1); - fe_sq_tt(&t2, &t1); - for (i = 1; i < 50; ++i) - fe_sq_tt(&t2, &t2); - fe_mul_ttt(&t2, &t2, &t1); - fe_sq_tt(&t3, &t2); - for (i = 1; i < 100; ++i) - fe_sq_tt(&t3, &t3); - fe_mul_ttt(&t2, &t3, &t2); - fe_sq_tt(&t2, &t2); - for (i = 1; i < 50; ++i) - fe_sq_tt(&t2, &t2); - fe_mul_ttt(&t1, &t2, &t1); - fe_sq_tt(&t1, &t1); - for (i = 1; i < 5; ++i) - fe_sq_tt(&t1, &t1); - fe_mul_ttt(out, &t1, &t0); -} - -static __always_inline void fe_invert(fe *out, const fe *z) -{ - fe_loose l; - fe_copy_lt(&l, z); - fe_loose_invert(out, &l); -} - -/* Replace (f,g) with (g,f) if b == 1; - * replace (f,g) with (f,g) if b == 0. 
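 * The swap is branch-free: b is stretched to an all-zero or all-one
 * mask (0 - b) and applied with XOR, so timing is independent of the
 * secret bit.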
- * - * Preconditions: b in {0,1} - */ -static __always_inline void fe_cswap(fe *f, fe *g, u64 b) -{ - unsigned i; - b = 0-b; - for (i = 0; i < 5; i++) { - u64 x = f->v[i] ^ g->v[i]; - x &= b; - f->v[i] ^= x; - g->v[i] ^= x; - } -} - -/* NOTE: based on fiat-crypto fe_mul, edited for in2=121666, 0, 0.*/ -static __always_inline void fe_mul_121666_impl(u64 out[5], const u64 in1[5]) -{ - { const u64 x10 = in1[4]; - { const u64 x11 = in1[3]; - { const u64 x9 = in1[2]; - { const u64 x7 = in1[1]; - { const u64 x5 = in1[0]; - { const u64 x18 = 0; - { const u64 x19 = 0; - { const u64 x17 = 0; - { const u64 x15 = 0; - { const u64 x13 = 121666; - { u128 x20 = ((u128)x5 * x13); - { u128 x21 = (((u128)x5 * x15) + ((u128)x7 * x13)); - { u128 x22 = ((((u128)x5 * x17) + ((u128)x9 * x13)) + ((u128)x7 * x15)); - { u128 x23 = (((((u128)x5 * x19) + ((u128)x11 * x13)) + ((u128)x7 * x17)) + ((u128)x9 * x15)); - { u128 x24 = ((((((u128)x5 * x18) + ((u128)x10 * x13)) + ((u128)x11 * x15)) + ((u128)x7 * x19)) + ((u128)x9 * x17)); - { u64 x25 = (x10 * 0x13); - { u64 x26 = (x7 * 0x13); - { u64 x27 = (x9 * 0x13); - { u64 x28 = (x11 * 0x13); - { u128 x29 = ((((x20 + ((u128)x25 * x15)) + ((u128)x26 * x18)) + ((u128)x27 * x19)) + ((u128)x28 * x17)); - { u128 x30 = (((x21 + ((u128)x25 * x17)) + ((u128)x27 * x18)) + ((u128)x28 * x19)); - { u128 x31 = ((x22 + ((u128)x25 * x19)) + ((u128)x28 * x18)); - { u128 x32 = (x23 + ((u128)x25 * x18)); - { u64 x33 = (u64) (x29 >> 0x33); - { u64 x34 = ((u64)x29 & 0x7ffffffffffff); - { u128 x35 = (x33 + x30); - { u64 x36 = (u64) (x35 >> 0x33); - { u64 x37 = ((u64)x35 & 0x7ffffffffffff); - { u128 x38 = (x36 + x31); - { u64 x39 = (u64) (x38 >> 0x33); - { u64 x40 = ((u64)x38 & 0x7ffffffffffff); - { u128 x41 = (x39 + x32); - { u64 x42 = (u64) (x41 >> 0x33); - { u64 x43 = ((u64)x41 & 0x7ffffffffffff); - { u128 x44 = (x42 + x24); - { u64 x45 = (u64) (x44 >> 0x33); - { u64 x46 = ((u64)x44 & 0x7ffffffffffff); - { u64 x47 = (x34 + (0x13 * x45)); - { u64 x48 = (x47 >> 0x33); - { u64 x49 = (x47 & 0x7ffffffffffff); - { u64 x50 = (x48 + x37); - { u64 x51 = (x50 >> 0x33); - { u64 x52 = (x50 & 0x7ffffffffffff); - out[0] = x49; - out[1] = x52; - out[2] = (x51 + x40); - out[3] = x43; - out[4] = x46; - }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}} -} - -static __always_inline void fe_mul121666(fe *h, const fe_loose *f) -{ - fe_mul_121666_impl(h->v, f->v); -} - -bool curve25519_fiat64(u8 out[CURVE25519_POINT_SIZE], const u8 scalar[CURVE25519_POINT_SIZE], const u8 point[CURVE25519_POINT_SIZE]) -{ - fe x1, x2, z2, x3, z3, tmp0, tmp1; - fe_loose x2l, z2l, x3l, tmp0l, tmp1l; - unsigned swap = 0; - int pos; - u8 e[32]; - - memcpy(e, scalar, 32); - normalize_secret(e); - - /* The following implementation was transcribed to Coq and proven to - * correspond to unary scalar multiplication in affine coordinates given that - * x1 != 0 is the x coordinate of some point on the curve. It was also checked - * in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 - * = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the - * underlying field, so it applies to Curve25519 itself and the quadratic - * twist of Curve25519. It was not proven in Coq that prime-field arithmetic - * correctly simulates extension-field arithmetic on prime-field values. - * The decoding of the byte array representation of e was not considered. 
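 * (The fe_invert(0) = 0 precondition stated below holds here: inversion
 * is a fixed chain of squarings and multiplications, which maps 0 to 0.)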
- * Specification of Montgomery curves in affine coordinates: - * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27> - * Proof that these form a group that is isomorphic to a Weierstrass curve: - * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35> - * Coq transcription and correctness proof of the loop (where scalarbits=255): - * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118> - * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278> - * preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 - */ - fe_frombytes(&x1, point); - fe_1(&x2); - fe_0(&z2); - fe_copy(&x3, &x1); - fe_1(&z3); - - for (pos = 254; pos >= 0; --pos) { - /* loop invariant as of right before the test, for the case where x1 != 0: - * pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero - * let r := e >> (pos+1) in the following equalities of projective points: - * to_xz (r*P) === if swap then (x3, z3) else (x2, z2) - * to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) - * x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) - */ - unsigned b = 1 & (e[pos / 8] >> (pos & 7)); - swap ^= b; - fe_cswap(&x2, &x3, swap); - fe_cswap(&z2, &z3, swap); - swap = b; - /* Coq transcription of ladderstep formula (called from transcribed loop): - * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89> - * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131> - * x1 != 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217> - * x1 = 0 <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147> - */ - fe_sub(&tmp0l, &x3, &z3); - fe_sub(&tmp1l, &x2, &z2); - fe_add(&x2l, &x2, &z2); - fe_add(&z2l, &x3, &z3); - fe_mul_tll(&z3, &tmp0l, &x2l); - fe_mul_tll(&z2, &z2l, &tmp1l); - fe_sq_tl(&tmp0, &tmp1l); - fe_sq_tl(&tmp1, &x2l); - fe_add(&x3l, &z3, &z2); - fe_sub(&z2l, &z3, &z2); - fe_mul_ttt(&x2, &tmp1, &tmp0); - fe_sub(&tmp1l, &tmp1, &tmp0); - fe_sq_tl(&z2, &z2l); - fe_mul121666(&z3, &tmp1l); - fe_sq_tl(&x3, &x3l); - fe_add(&tmp0l, &tmp0, &z3); - fe_mul_ttt(&z3, &x1, &z2); - fe_mul_tll(&z2, &tmp1l, &tmp0l); - } - /* here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2) */ - fe_cswap(&x2, &x3, swap); - fe_cswap(&z2, &z3, swap); - - fe_invert(&z2, &z2); - fe_mul_ttt(&x2, &x2, &z2); - fe_tobytes(out, &x2); - - return true; -} diff --git a/curve25519-hacl64.c b/curve25519-hacl64.c deleted file mode 100644 index 258d1e9..0000000 --- a/curve25519-hacl64.c +++ /dev/null @@ -1,755 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright (C) 2016-2017 INRIA and Microsoft Corporation. - * Copyright (C) 2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. 
- * - * This is a machine-generated formally verified implementation of curve25519 DH from: - * https://github.com/mitls/hacl-star - */ - -#include <linux/kernel.h> -#include <linux/string.h> - -enum { CURVE25519_POINT_SIZE = 32 }; - -static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE]) -{ - secret[0] &= 248; - secret[31] &= 127; - secret[31] |= 64; -} - -typedef __uint128_t u128; -static __always_inline u64 u64_eq_mask(u64 x, u64 y) -{ - x ^= y; - x |= -x; - return (x >> 63) - 1; -} - -static __always_inline u64 u64_gte_mask(u64 x, u64 y) -{ - return ((x ^ ((x ^ y) | ((x - y) ^ y))) >> 63) - 1; -} - -static __always_inline void modulo_carry_top(u64 *b) -{ - u64 b4 = b[4]; - u64 b0 = b[0]; - u64 b4_ = b4 & 0x7ffffffffffffLLU; - u64 b0_ = b0 + 19 * (b4 >> 51); - b[4] = b4_; - b[0] = b0_; -} - -static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input) -{ - { - u128 xi = input[0]; - output[0] = ((u64)(xi)); - } - { - u128 xi = input[1]; - output[1] = ((u64)(xi)); - } - { - u128 xi = input[2]; - output[2] = ((u64)(xi)); - } - { - u128 xi = input[3]; - output[3] = ((u64)(xi)); - } - { - u128 xi = input[4]; - output[4] = ((u64)(xi)); - } -} - -static __always_inline void fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s) -{ - output[0] += (u128)input[0] * s; - output[1] += (u128)input[1] * s; - output[2] += (u128)input[2] * s; - output[3] += (u128)input[3] * s; - output[4] += (u128)input[4] * s; -} - -static __always_inline void fproduct_carry_wide_(u128 *tmp) -{ - { - u32 ctr = 0; - u128 tctr = tmp[ctr]; - u128 tctrp1 = tmp[ctr + 1]; - u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU; - u128 c = ((tctr) >> (51)); - tmp[ctr] = ((u128)(r0)); - tmp[ctr + 1] = ((tctrp1) + (c)); - } - { - u32 ctr = 1; - u128 tctr = tmp[ctr]; - u128 tctrp1 = tmp[ctr + 1]; - u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU; - u128 c = ((tctr) >> (51)); - tmp[ctr] = ((u128)(r0)); - tmp[ctr + 1] = ((tctrp1) + (c)); - } - - { - u32 ctr = 2; - u128 tctr = tmp[ctr]; - u128 tctrp1 = tmp[ctr + 1]; - u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU; - u128 c = ((tctr) >> (51)); - tmp[ctr] = ((u128)(r0)); - tmp[ctr + 1] = ((tctrp1) + (c)); - } - { - u32 ctr = 3; - u128 tctr = tmp[ctr]; - u128 tctrp1 = tmp[ctr + 1]; - u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU; - u128 c = ((tctr) >> (51)); - tmp[ctr] = ((u128)(r0)); - tmp[ctr + 1] = ((tctrp1) + (c)); - } -} - -static __always_inline void fmul_shift_reduce(u64 *output) -{ - u64 tmp = output[4]; - u64 b0; - { - u32 ctr = 5 - 0 - 1; - u64 z = output[ctr - 1]; - output[ctr] = z; - } - { - u32 ctr = 5 - 1 - 1; - u64 z = output[ctr - 1]; - output[ctr] = z; - } - { - u32 ctr = 5 - 2 - 1; - u64 z = output[ctr - 1]; - output[ctr] = z; - } - { - u32 ctr = 5 - 3 - 1; - u64 z = output[ctr - 1]; - output[ctr] = z; - } - output[0] = tmp; - b0 = output[0]; - output[0] = 19 * b0; -} - -static __always_inline void fmul_mul_shift_reduce_(u128 *output, u64 *input, u64 *input21) -{ - u32 i; - u64 input2i; - { - u64 input2i = input21[0]; - fproduct_sum_scalar_multiplication_(output, input, input2i); - fmul_shift_reduce(input); - } - { - u64 input2i = input21[1]; - fproduct_sum_scalar_multiplication_(output, input, input2i); - fmul_shift_reduce(input); - } - { - u64 input2i = input21[2]; - fproduct_sum_scalar_multiplication_(output, input, input2i); - fmul_shift_reduce(input); - } - { - u64 input2i = input21[3]; - fproduct_sum_scalar_multiplication_(output, input, input2i); - fmul_shift_reduce(input); - } - i = 4; - input2i = input21[i]; - 
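	/* Fifth and final limb: accumulate only; no trailing shift-reduce is
	 * needed because "input" is a scratch copy that the caller discards.
	 */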
fproduct_sum_scalar_multiplication_(output, input, input2i); -} - -static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21) -{ - u64 tmp[5]; - memcpy(tmp, input, 5 * sizeof(*input)); - { - u128 b4; - u128 b0; - u128 b4_; - u128 b0_; - u64 i0; - u64 i1; - u64 i0_; - u64 i1_; - u128 t[5] = { 0 }; - fmul_mul_shift_reduce_(t, tmp, input21); - fproduct_carry_wide_(t); - b4 = t[4]; - b0 = t[0]; - b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU)))); - b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51)))))))); - t[4] = b4_; - t[0] = b0_; - fproduct_copy_from_wide_(output, t); - i0 = output[0]; - i1 = output[1]; - i0_ = i0 & 0x7ffffffffffffLLU; - i1_ = i1 + (i0 >> 51); - output[0] = i0_; - output[1] = i1_; - } -} - -static __always_inline void fsquare_fsquare__(u128 *tmp, u64 *output) -{ - u64 r0 = output[0]; - u64 r1 = output[1]; - u64 r2 = output[2]; - u64 r3 = output[3]; - u64 r4 = output[4]; - u64 d0 = r0 * 2; - u64 d1 = r1 * 2; - u64 d2 = r2 * 2 * 19; - u64 d419 = r4 * 19; - u64 d4 = d419 * 2; - u128 s0 = ((((((u128)(r0) * (r0))) + (((u128)(d4) * (r1))))) + (((u128)(d2) * (r3)))); - u128 s1 = ((((((u128)(d0) * (r1))) + (((u128)(d4) * (r2))))) + (((u128)(r3 * 19) * (r3)))); - u128 s2 = ((((((u128)(d0) * (r2))) + (((u128)(r1) * (r1))))) + (((u128)(d4) * (r3)))); - u128 s3 = ((((((u128)(d0) * (r3))) + (((u128)(d1) * (r2))))) + (((u128)(r4) * (d419)))); - u128 s4 = ((((((u128)(d0) * (r4))) + (((u128)(d1) * (r3))))) + (((u128)(r2) * (r2)))); - tmp[0] = s0; - tmp[1] = s1; - tmp[2] = s2; - tmp[3] = s3; - tmp[4] = s4; -} - -static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output) -{ - u128 b4; - u128 b0; - u128 b4_; - u128 b0_; - u64 i0; - u64 i1; - u64 i0_; - u64 i1_; - fsquare_fsquare__(tmp, output); - fproduct_carry_wide_(tmp); - b4 = tmp[4]; - b0 = tmp[0]; - b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU)))); - b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51)))))))); - tmp[4] = b4_; - tmp[0] = b0_; - fproduct_copy_from_wide_(output, tmp); - i0 = output[0]; - i1 = output[1]; - i0_ = i0 & 0x7ffffffffffffLLU; - i1_ = i1 + (i0 >> 51); - output[0] = i0_; - output[1] = i1_; -} - -static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp, u32 count1) -{ - u32 i; - fsquare_fsquare_(tmp, output); - for (i = 1; i < count1; ++i) - fsquare_fsquare_(tmp, output); -} - -static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input, u32 count1) -{ - u128 t[5]; - memcpy(output, input, 5 * sizeof(*input)); - fsquare_fsquare_times_(output, t, count1); -} - -static __always_inline void fsquare_fsquare_times_inplace(u64 *output, u32 count1) -{ - u128 t[5]; - fsquare_fsquare_times_(output, t, count1); -} - -static __always_inline void crecip_crecip(u64 *out, u64 *z) -{ - u64 buf[20] = { 0 }; - u64 *a0 = buf; - u64 *t00 = buf + 5; - u64 *b0 = buf + 10; - u64 *t01; - u64 *b1; - u64 *c0; - u64 *a; - u64 *t0; - u64 *b; - u64 *c; - fsquare_fsquare_times(a0, z, 1); - fsquare_fsquare_times(t00, a0, 2); - fmul_fmul(b0, t00, z); - fmul_fmul(a0, b0, a0); - fsquare_fsquare_times(t00, a0, 1); - fmul_fmul(b0, t00, b0); - fsquare_fsquare_times(t00, b0, 5); - t01 = buf + 5; - b1 = buf + 10; - c0 = buf + 15; - fmul_fmul(b1, t01, b1); - fsquare_fsquare_times(t01, b1, 10); - fmul_fmul(c0, t01, b1); - fsquare_fsquare_times(t01, c0, 20); - fmul_fmul(t01, t01, c0); - fsquare_fsquare_times_inplace(t01, 10); - fmul_fmul(b1, t01, b1); - fsquare_fsquare_times(t01, b1, 50); - a = buf; - t0 = buf + 5; - b = buf + 10; - c = buf + 15; - fmul_fmul(c, t0, b); - fsquare_fsquare_times(t0, c, 100); - 
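	/* At this point c = z^(2^100 - 1) and t0 = z^(2^200 - 2^100); the
	 * multiply below gives t0 = z^(2^200 - 1), continuing the fixed
	 * addition chain that computes z^(2^255 - 21) = z^(p - 2).
	 */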
fmul_fmul(t0, t0, c); - fsquare_fsquare_times_inplace(t0, 50); - fmul_fmul(t0, t0, b); - fsquare_fsquare_times_inplace(t0, 5); - fmul_fmul(out, t0, a); -} - -static __always_inline void fsum(u64 *a, u64 *b) -{ - a[0] += b[0]; - a[1] += b[1]; - a[2] += b[2]; - a[3] += b[3]; - a[4] += b[4]; -} - -static __always_inline void fdifference(u64 *a, u64 *b) -{ - u64 tmp[5] = { 0 }; - u64 b0; - u64 b1; - u64 b2; - u64 b3; - u64 b4; - memcpy(tmp, b, 5 * sizeof(*b)); - b0 = tmp[0]; - b1 = tmp[1]; - b2 = tmp[2]; - b3 = tmp[3]; - b4 = tmp[4]; - tmp[0] = b0 + 0x3fffffffffff68LLU; - tmp[1] = b1 + 0x3ffffffffffff8LLU; - tmp[2] = b2 + 0x3ffffffffffff8LLU; - tmp[3] = b3 + 0x3ffffffffffff8LLU; - tmp[4] = b4 + 0x3ffffffffffff8LLU; - { - u64 xi = a[0]; - u64 yi = tmp[0]; - a[0] = yi - xi; - } - { - u64 xi = a[1]; - u64 yi = tmp[1]; - a[1] = yi - xi; - } - { - u64 xi = a[2]; - u64 yi = tmp[2]; - a[2] = yi - xi; - } - { - u64 xi = a[3]; - u64 yi = tmp[3]; - a[3] = yi - xi; - } - { - u64 xi = a[4]; - u64 yi = tmp[4]; - a[4] = yi - xi; - } -} - -static __always_inline void fscalar(u64 *output, u64 *b, u64 s) -{ - u128 tmp[5]; - u128 b4; - u128 b0; - u128 b4_; - u128 b0_; - { - u64 xi = b[0]; - tmp[0] = ((u128)(xi) * (s)); - } - { - u64 xi = b[1]; - tmp[1] = ((u128)(xi) * (s)); - } - { - u64 xi = b[2]; - tmp[2] = ((u128)(xi) * (s)); - } - { - u64 xi = b[3]; - tmp[3] = ((u128)(xi) * (s)); - } - { - u64 xi = b[4]; - tmp[4] = ((u128)(xi) * (s)); - } - fproduct_carry_wide_(tmp); - b4 = tmp[4]; - b0 = tmp[0]; - b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU)))); - b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51)))))))); - tmp[4] = b4_; - tmp[0] = b0_; - fproduct_copy_from_wide_(output, tmp); -} - -static __always_inline void fmul(u64 *output, u64 *a, u64 *b) -{ - fmul_fmul(output, a, b); -} - -static __always_inline void crecip(u64 *output, u64 *input) -{ - crecip_crecip(output, input); -} - -static __always_inline void point_swap_conditional_step(u64 *a, u64 *b, u64 swap1, u32 ctr) -{ - u32 i = ctr - 1; - u64 ai = a[i]; - u64 bi = b[i]; - u64 x = swap1 & (ai ^ bi); - u64 ai1 = ai ^ x; - u64 bi1 = bi ^ x; - a[i] = ai1; - b[i] = bi1; -} - -static __always_inline void point_swap_conditional5(u64 *a, u64 *b, u64 swap1) -{ - point_swap_conditional_step(a, b, swap1, 5); - point_swap_conditional_step(a, b, swap1, 4); - point_swap_conditional_step(a, b, swap1, 3); - point_swap_conditional_step(a, b, swap1, 2); - point_swap_conditional_step(a, b, swap1, 1); -} - -static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap) -{ - u64 swap1 = 0 - iswap; - point_swap_conditional5(a, b, swap1); - point_swap_conditional5(a + 5, b + 5, swap1); -} - -static __always_inline void point_copy(u64 *output, u64 *input) -{ - memcpy(output, input, 5 * sizeof(*input)); - memcpy(output + 5, input + 5, 5 * sizeof(*input)); -} - -static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p, u64 *pq, u64 *qmqp) -{ - u64 *qx = qmqp; - u64 *x2 = pp; - u64 *z2 = pp + 5; - u64 *x3 = ppq; - u64 *z3 = ppq + 5; - u64 *x = p; - u64 *z = p + 5; - u64 *xprime = pq; - u64 *zprime = pq + 5; - u64 buf[40] = { 0 }; - u64 *origx = buf; - u64 *origxprime0 = buf + 5; - u64 *xxprime0; - u64 *zzprime0; - u64 *origxprime; - xxprime0 = buf + 25; - zzprime0 = buf + 30; - memcpy(origx, x, 5 * sizeof(*x)); - fsum(x, z); - fdifference(z, origx); - memcpy(origxprime0, xprime, 5 * sizeof(*xprime)); - fsum(xprime, zprime); - fdifference(zprime, origxprime0); - fmul(xxprime0, xprime, z); - fmul(zzprime0, x, zprime); - origxprime = buf + 5; - { - u64 
*xx0; - u64 *zz0; - u64 *xxprime; - u64 *zzprime; - u64 *zzzprime; - xx0 = buf + 15; - zz0 = buf + 20; - xxprime = buf + 25; - zzprime = buf + 30; - zzzprime = buf + 35; - memcpy(origxprime, xxprime, 5 * sizeof(*xxprime)); - fsum(xxprime, zzprime); - fdifference(zzprime, origxprime); - fsquare_fsquare_times(x3, xxprime, 1); - fsquare_fsquare_times(zzzprime, zzprime, 1); - fmul(z3, zzzprime, qx); - fsquare_fsquare_times(xx0, x, 1); - fsquare_fsquare_times(zz0, z, 1); - { - u64 *zzz; - u64 *xx; - u64 *zz; - u64 scalar; - zzz = buf + 10; - xx = buf + 15; - zz = buf + 20; - fmul(x2, xx, zz); - fdifference(zz, xx); - scalar = 121665; - fscalar(zzz, zz, scalar); - fsum(zzz, xx); - fmul(z2, zzz, zz); - } - } -} - -static __always_inline void ladder_smallloop_cmult_small_loop_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2, u64 *q, u8 byt) -{ - u64 bit0 = (u64)(byt >> 7); - u64 bit; - point_swap_conditional(nq, nqpq, bit0); - addanddouble_fmonty(nq2, nqpq2, nq, nqpq, q); - bit = (u64)(byt >> 7); - point_swap_conditional(nq2, nqpq2, bit); -} - -static __always_inline void ladder_smallloop_cmult_small_loop_double_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2, u64 *q, u8 byt) -{ - u8 byt1; - ladder_smallloop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt); - byt1 = byt << 1; - ladder_smallloop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1); -} - -static __always_inline void ladder_smallloop_cmult_small_loop(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2, u64 *q, u8 byt, u32 i) -{ - while (i--) { - ladder_smallloop_cmult_small_loop_double_step(nq, nqpq, nq2, nqpq2, q, byt); - byt <<= 2; - } -} - -static __always_inline void ladder_bigloop_cmult_big_loop(u8 *n1, u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2, u64 *q, u32 i) -{ - while (i--) { - u8 byte = n1[i]; - ladder_smallloop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q, byte, 4); - } -} - -static __always_inline void ladder_cmult(u64 *result, u8 *n1, u64 *q) -{ - u64 point_buf[40] = { 0 }; - u64 *nq = point_buf; - u64 *nqpq = point_buf + 10; - u64 *nq2 = point_buf + 20; - u64 *nqpq2 = point_buf + 30; - point_copy(nqpq, q); - nq[0] = 1; - ladder_bigloop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32); - point_copy(result, nq); -} - -static __always_inline void format_fexpand(u64 *output, const u8 *input) -{ - const u8 *x00 = input + 6; - const u8 *x01 = input + 12; - const u8 *x02 = input + 19; - const u8 *x0 = input + 24; - u64 i0, i1, i2, i3, i4, output0, output1, output2, output3, output4; - i0 = le64_to_cpup((__force __le64 *)input); - i1 = le64_to_cpup((__force __le64 *)x00); - i2 = le64_to_cpup((__force __le64 *)x01); - i3 = le64_to_cpup((__force __le64 *)x02); - i4 = le64_to_cpup((__force __le64 *)x0); - output0 = i0 & 0x7ffffffffffffLLU; - output1 = i1 >> 3 & 0x7ffffffffffffLLU; - output2 = i2 >> 6 & 0x7ffffffffffffLLU; - output3 = i3 >> 1 & 0x7ffffffffffffLLU; - output4 = i4 >> 12 & 0x7ffffffffffffLLU; - output[0] = output0; - output[1] = output1; - output[2] = output2; - output[3] = output3; - output[4] = output4; -} - -static __always_inline void format_fcontract_first_carry_pass(u64 *input) -{ - u64 t0 = input[0]; - u64 t1 = input[1]; - u64 t2 = input[2]; - u64 t3 = input[3]; - u64 t4 = input[4]; - u64 t1_ = t1 + (t0 >> 51); - u64 t0_ = t0 & 0x7ffffffffffffLLU; - u64 t2_ = t2 + (t1_ >> 51); - u64 t1__ = t1_ & 0x7ffffffffffffLLU; - u64 t3_ = t3 + (t2_ >> 51); - u64 t2__ = t2_ & 0x7ffffffffffffLLU; - u64 t4_ = t4 + (t3_ >> 51); - u64 t3__ = t3_ & 0x7ffffffffffffLLU; - input[0] = t0_; - input[1] = t1__; - input[2] = t2__; - input[3] = t3__; - 
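	/* t4_ may still exceed 51 bits; the *_carry_full wrappers follow this
	 * pass with modulo_carry_top(), folding the excess into limb 0 times 19.
	 */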
input[4] = t4_; -} - -static __always_inline void format_fcontract_first_carry_full(u64 *input) -{ - format_fcontract_first_carry_pass(input); - modulo_carry_top(input); -} - -static __always_inline void format_fcontract_second_carry_pass(u64 *input) -{ - u64 t0 = input[0]; - u64 t1 = input[1]; - u64 t2 = input[2]; - u64 t3 = input[3]; - u64 t4 = input[4]; - u64 t1_ = t1 + (t0 >> 51); - u64 t0_ = t0 & 0x7ffffffffffffLLU; - u64 t2_ = t2 + (t1_ >> 51); - u64 t1__ = t1_ & 0x7ffffffffffffLLU; - u64 t3_ = t3 + (t2_ >> 51); - u64 t2__ = t2_ & 0x7ffffffffffffLLU; - u64 t4_ = t4 + (t3_ >> 51); - u64 t3__ = t3_ & 0x7ffffffffffffLLU; - input[0] = t0_; - input[1] = t1__; - input[2] = t2__; - input[3] = t3__; - input[4] = t4_; -} - -static __always_inline void format_fcontract_second_carry_full(u64 *input) -{ - u64 i0; - u64 i1; - u64 i0_; - u64 i1_; - format_fcontract_second_carry_pass(input); - modulo_carry_top(input); - i0 = input[0]; - i1 = input[1]; - i0_ = i0 & 0x7ffffffffffffLLU; - i1_ = i1 + (i0 >> 51); - input[0] = i0_; - input[1] = i1_; -} - -static __always_inline void format_fcontract_trim(u64 *input) -{ - u64 a0 = input[0]; - u64 a1 = input[1]; - u64 a2 = input[2]; - u64 a3 = input[3]; - u64 a4 = input[4]; - u64 mask0 = u64_gte_mask(a0, 0x7ffffffffffedLLU); - u64 mask1 = u64_eq_mask(a1, 0x7ffffffffffffLLU); - u64 mask2 = u64_eq_mask(a2, 0x7ffffffffffffLLU); - u64 mask3 = u64_eq_mask(a3, 0x7ffffffffffffLLU); - u64 mask4 = u64_eq_mask(a4, 0x7ffffffffffffLLU); - u64 mask = (((mask0 & mask1) & mask2) & mask3) & mask4; - u64 a0_ = a0 - (0x7ffffffffffedLLU & mask); - u64 a1_ = a1 - (0x7ffffffffffffLLU & mask); - u64 a2_ = a2 - (0x7ffffffffffffLLU & mask); - u64 a3_ = a3 - (0x7ffffffffffffLLU & mask); - u64 a4_ = a4 - (0x7ffffffffffffLLU & mask); - input[0] = a0_; - input[1] = a1_; - input[2] = a2_; - input[3] = a3_; - input[4] = a4_; -} - -static __always_inline void format_fcontract_store(u8 *output, u64 *input) -{ - u64 t0 = input[0]; - u64 t1 = input[1]; - u64 t2 = input[2]; - u64 t3 = input[3]; - u64 t4 = input[4]; - u64 o0 = t1 << 51 | t0; - u64 o1 = t2 << 38 | t1 >> 13; - u64 o2 = t3 << 25 | t2 >> 26; - u64 o3 = t4 << 12 | t3 >> 39; - u8 *b0 = output; - u8 *b1 = output + 8; - u8 *b2 = output + 16; - u8 *b3 = output + 24; - *(__force __le64 *)b0 = cpu_to_le64(o0); - *(__force __le64 *)b1 = cpu_to_le64(o1); - *(__force __le64 *)b2 = cpu_to_le64(o2); - *(__force __le64 *)b3 = cpu_to_le64(o3); -} - -static __always_inline void format_fcontract(u8 *output, u64 *input) -{ - format_fcontract_first_carry_full(input); - format_fcontract_second_carry_full(input); - format_fcontract_trim(input); - format_fcontract_store(output, input); -} - -static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point) -{ - u64 *x = point; - u64 *z = point + 5; - u64 buf[10] __aligned(32) = { 0 }; - u64 *zmone = buf; - u64 *sc = buf + 5; - crecip(zmone, z); - fmul(sc, x, zmone); - format_fcontract(scalar, sc); -} - -bool curve25519_hacl64(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]) -{ - u64 buf0[10] __aligned(32) = { 0 }; - u64 *x0 = buf0; - u64 *z = buf0 + 5; - u64 *q; - format_fexpand(x0, basepoint); - z[0] = 1; - q = buf0; - { - u8 e[32] __aligned(32) = { 0 }; - u8 *scalar; - memcpy(e, secret, 32); - normalize_secret(e); - scalar = e; - { - u64 buf[15] = { 0 }; - u64 *nq = buf; - u64 *x = nq; - x[0] = 1; - ladder_cmult(nq, scalar, q); - format_scalar_of_point(mypublic, nq); - } - } - - return true; -} diff --git 
a/curve25519-sandy2x-asm.S b/curve25519-sandy2x-asm.S deleted file mode 100644 index d710b1b..0000000 --- a/curve25519-sandy2x-asm.S +++ /dev/null @@ -1,3261 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. - * - * Original author: Tung Chou <blueprint@crypto.tw> - */ - -#include <linux/linkage.h> - -.data -.align 16 -curve25519_sandy2x_v0_0: .quad 0, 0 -curve25519_sandy2x_v1_0: .quad 1, 0 -curve25519_sandy2x_v2_1: .quad 2, 1 -curve25519_sandy2x_v9_0: .quad 9, 0 -curve25519_sandy2x_v9_9: .quad 9, 9 -curve25519_sandy2x_v19_19: .quad 19, 19 -curve25519_sandy2x_v38_1: .quad 38, 1 -curve25519_sandy2x_v38_38: .quad 38, 38 -curve25519_sandy2x_v121666_121666: .quad 121666, 121666 -curve25519_sandy2x_m25: .quad 33554431, 33554431 -curve25519_sandy2x_m26: .quad 67108863, 67108863 -curve25519_sandy2x_subc0: .quad 0x07FFFFDA, 0x03FFFFFE -curve25519_sandy2x_subc2: .quad 0x07FFFFFE, 0x03FFFFFE -curve25519_sandy2x_REDMASK51: .quad 0x0007FFFFFFFFFFFF - -.text -.align 32 -#ifdef CONFIG_AS_AVX -SYM_FUNC_START(curve25519_sandy2x_fe51_mul) - push %rbp - mov %rsp,%rbp - sub $96,%rsp - and $-32,%rsp - movq %r11,0(%rsp) - movq %r12,8(%rsp) - movq %r13,16(%rsp) - movq %r14,24(%rsp) - movq %r15,32(%rsp) - movq %rbx,40(%rsp) - movq %rbp,48(%rsp) - movq %rdi,56(%rsp) - mov %rdx,%rcx - movq 24(%rsi),%rdx - imulq $19,%rdx,%rax - movq %rax,64(%rsp) - mulq 16(%rcx) - mov %rax,%r8 - mov %rdx,%r9 - movq 32(%rsi),%rdx - imulq $19,%rdx,%rax - movq %rax,72(%rsp) - mulq 8(%rcx) - add %rax,%r8 - adc %rdx,%r9 - movq 0(%rsi),%rax - mulq 0(%rcx) - add %rax,%r8 - adc %rdx,%r9 - movq 0(%rsi),%rax - mulq 8(%rcx) - mov %rax,%r10 - mov %rdx,%r11 - movq 0(%rsi),%rax - mulq 16(%rcx) - mov %rax,%r12 - mov %rdx,%r13 - movq 0(%rsi),%rax - mulq 24(%rcx) - mov %rax,%r14 - mov %rdx,%r15 - movq 0(%rsi),%rax - mulq 32(%rcx) - mov %rax,%rbx - mov %rdx,%rbp - movq 8(%rsi),%rax - mulq 0(%rcx) - add %rax,%r10 - adc %rdx,%r11 - movq 8(%rsi),%rax - mulq 8(%rcx) - add %rax,%r12 - adc %rdx,%r13 - movq 8(%rsi),%rax - mulq 16(%rcx) - add %rax,%r14 - adc %rdx,%r15 - movq 8(%rsi),%rax - mulq 24(%rcx) - add %rax,%rbx - adc %rdx,%rbp - movq 8(%rsi),%rdx - imulq $19,%rdx,%rax - mulq 32(%rcx) - add %rax,%r8 - adc %rdx,%r9 - movq 16(%rsi),%rax - mulq 0(%rcx) - add %rax,%r12 - adc %rdx,%r13 - movq 16(%rsi),%rax - mulq 8(%rcx) - add %rax,%r14 - adc %rdx,%r15 - movq 16(%rsi),%rax - mulq 16(%rcx) - add %rax,%rbx - adc %rdx,%rbp - movq 16(%rsi),%rdx - imulq $19,%rdx,%rax - mulq 24(%rcx) - add %rax,%r8 - adc %rdx,%r9 - movq 16(%rsi),%rdx - imulq $19,%rdx,%rax - mulq 32(%rcx) - add %rax,%r10 - adc %rdx,%r11 - movq 24(%rsi),%rax - mulq 0(%rcx) - add %rax,%r14 - adc %rdx,%r15 - movq 24(%rsi),%rax - mulq 8(%rcx) - add %rax,%rbx - adc %rdx,%rbp - movq 64(%rsp),%rax - mulq 24(%rcx) - add %rax,%r10 - adc %rdx,%r11 - movq 64(%rsp),%rax - mulq 32(%rcx) - add %rax,%r12 - adc %rdx,%r13 - movq 32(%rsi),%rax - mulq 0(%rcx) - add %rax,%rbx - adc %rdx,%rbp - movq 72(%rsp),%rax - mulq 16(%rcx) - add %rax,%r10 - adc %rdx,%r11 - movq 72(%rsp),%rax - mulq 24(%rcx) - add %rax,%r12 - adc %rdx,%r13 - movq 72(%rsp),%rax - mulq 32(%rcx) - add %rax,%r14 - adc %rdx,%r15 - movq curve25519_sandy2x_REDMASK51(%rip),%rsi - shld $13,%r8,%r9 - and %rsi,%r8 - shld $13,%r10,%r11 - and %rsi,%r10 - add %r9,%r10 - shld $13,%r12,%r13 - and %rsi,%r12 - add %r11,%r12 - shld $13,%r14,%r15 - and %rsi,%r14 - add %r13,%r14 - shld $13,%rbx,%rbp - and %rsi,%rbx - add %r15,%rbx - imulq $19,%rbp,%rdx - add %rdx,%r8 - mov 
%r8,%rdx - shr $51,%rdx - add %r10,%rdx - mov %rdx,%rcx - shr $51,%rdx - and %rsi,%r8 - add %r12,%rdx - mov %rdx,%r9 - shr $51,%rdx - and %rsi,%rcx - add %r14,%rdx - mov %rdx,%rax - shr $51,%rdx - and %rsi,%r9 - add %rbx,%rdx - mov %rdx,%r10 - shr $51,%rdx - and %rsi,%rax - imulq $19,%rdx,%rdx - add %rdx,%r8 - and %rsi,%r10 - movq %r8,0(%rdi) - movq %rcx,8(%rdi) - movq %r9,16(%rdi) - movq %rax,24(%rdi) - movq %r10,32(%rdi) - movq 0(%rsp),%r11 - movq 8(%rsp),%r12 - movq 16(%rsp),%r13 - movq 24(%rsp),%r14 - movq 32(%rsp),%r15 - movq 40(%rsp),%rbx - movq 48(%rsp),%rbp - leave - ret -SYM_FUNC_END(curve25519_sandy2x_fe51_mul) - -.align 32 -SYM_FUNC_START(curve25519_sandy2x_fe51_nsquare) - push %rbp - mov %rsp,%rbp - sub $64,%rsp - and $-32,%rsp - movq %r11,0(%rsp) - movq %r12,8(%rsp) - movq %r13,16(%rsp) - movq %r14,24(%rsp) - movq %r15,32(%rsp) - movq %rbx,40(%rsp) - movq %rbp,48(%rsp) - movq 0(%rsi),%rcx - movq 8(%rsi),%r8 - movq 16(%rsi),%r9 - movq 24(%rsi),%rax - movq 32(%rsi),%rsi - movq %r9,16(%rdi) - movq %rax,24(%rdi) - movq %rsi,32(%rdi) - mov %rdx,%rsi - - .align 16 - .Lloop: - sub $1,%rsi - mov %rcx,%rax - mul %rcx - add %rcx,%rcx - mov %rax,%r9 - mov %rdx,%r10 - mov %rcx,%rax - mul %r8 - mov %rax,%r11 - mov %rdx,%r12 - mov %rcx,%rax - mulq 16(%rdi) - mov %rax,%r13 - mov %rdx,%r14 - mov %rcx,%rax - mulq 24(%rdi) - mov %rax,%r15 - mov %rdx,%rbx - mov %rcx,%rax - mulq 32(%rdi) - mov %rax,%rcx - mov %rdx,%rbp - mov %r8,%rax - mul %r8 - add %r8,%r8 - add %rax,%r13 - adc %rdx,%r14 - mov %r8,%rax - mulq 16(%rdi) - add %rax,%r15 - adc %rdx,%rbx - mov %r8,%rax - imulq $19, %r8,%r8 - mulq 24(%rdi) - add %rax,%rcx - adc %rdx,%rbp - mov %r8,%rax - mulq 32(%rdi) - add %rax,%r9 - adc %rdx,%r10 - movq 16(%rdi),%rax - mulq 16(%rdi) - add %rax,%rcx - adc %rdx,%rbp - shld $13,%rcx,%rbp - movq 16(%rdi),%rax - imulq $38, %rax,%rax - mulq 24(%rdi) - add %rax,%r9 - adc %rdx,%r10 - shld $13,%r9,%r10 - movq 16(%rdi),%rax - imulq $38, %rax,%rax - mulq 32(%rdi) - add %rax,%r11 - adc %rdx,%r12 - movq 24(%rdi),%rax - imulq $19, %rax,%rax - mulq 24(%rdi) - add %rax,%r11 - adc %rdx,%r12 - shld $13,%r11,%r12 - movq 24(%rdi),%rax - imulq $38, %rax,%rax - mulq 32(%rdi) - add %rax,%r13 - adc %rdx,%r14 - shld $13,%r13,%r14 - movq 32(%rdi),%rax - imulq $19, %rax,%rax - mulq 32(%rdi) - add %rax,%r15 - adc %rdx,%rbx - shld $13,%r15,%rbx - movq curve25519_sandy2x_REDMASK51(%rip),%rdx - and %rdx,%rcx - add %rbx,%rcx - and %rdx,%r9 - and %rdx,%r11 - add %r10,%r11 - and %rdx,%r13 - add %r12,%r13 - and %rdx,%r15 - add %r14,%r15 - imulq $19, %rbp,%rbp - lea (%r9,%rbp),%r9 - mov %r9,%rax - shr $51,%r9 - add %r11,%r9 - and %rdx,%rax - mov %r9,%r8 - shr $51,%r9 - add %r13,%r9 - and %rdx,%r8 - mov %r9,%r10 - shr $51,%r9 - add %r15,%r9 - and %rdx,%r10 - movq %r10,16(%rdi) - mov %r9,%r10 - shr $51,%r9 - add %rcx,%r9 - and %rdx,%r10 - movq %r10,24(%rdi) - mov %r9,%r10 - shr $51,%r9 - imulq $19, %r9,%r9 - lea (%rax,%r9),%rcx - and %rdx,%r10 - movq %r10,32(%rdi) - cmp $0,%rsi - jne .Lloop - - movq %rcx,0(%rdi) - movq %r8,8(%rdi) - movq 0(%rsp),%r11 - movq 8(%rsp),%r12 - movq 16(%rsp),%r13 - movq 24(%rsp),%r14 - movq 32(%rsp),%r15 - movq 40(%rsp),%rbx - movq 48(%rsp),%rbp - leave - ret -SYM_FUNC_END(curve25519_sandy2x_fe51_nsquare) - -.align 32 -SYM_FUNC_START(curve25519_sandy2x_fe51_pack) - push %rbp - mov %rsp,%rbp - sub $32,%rsp - and $-32,%rsp - movq %r11,0(%rsp) - movq %r12,8(%rsp) - movq 0(%rsi),%rdx - movq 8(%rsi),%rcx - movq 16(%rsi),%r8 - movq 24(%rsi),%r9 - movq 32(%rsi),%rsi - movq curve25519_sandy2x_REDMASK51(%rip),%rax - 
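	/* rax = 2^51 - 1 (the limb mask); r10 = 2^51 - 19, the low-limb
	 * threshold used below to detect a value >= p = 2^255 - 19 so it
	 * can be reduced before packing. */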
lea -18(%rax),%r10 - mov $3,%r11 - - .align 16 - .Lreduceloop: - mov %rdx,%r12 - shr $51,%r12 - and %rax,%rdx - add %r12,%rcx - mov %rcx,%r12 - shr $51,%r12 - and %rax,%rcx - add %r12,%r8 - mov %r8,%r12 - shr $51,%r12 - and %rax,%r8 - add %r12,%r9 - mov %r9,%r12 - shr $51,%r12 - and %rax,%r9 - add %r12,%rsi - mov %rsi,%r12 - shr $51,%r12 - and %rax,%rsi - imulq $19, %r12,%r12 - add %r12,%rdx - sub $1,%r11 - ja .Lreduceloop - - mov $1,%r12 - cmp %r10,%rdx - cmovl %r11,%r12 - cmp %rax,%rcx - cmovne %r11,%r12 - cmp %rax,%r8 - cmovne %r11,%r12 - cmp %rax,%r9 - cmovne %r11,%r12 - cmp %rax,%rsi - cmovne %r11,%r12 - neg %r12 - and %r12,%rax - and %r12,%r10 - sub %r10,%rdx - sub %rax,%rcx - sub %rax,%r8 - sub %rax,%r9 - sub %rax,%rsi - mov %rdx,%rax - and $0xFF,%eax - movb %al,0(%rdi) - mov %rdx,%rax - shr $8,%rax - and $0xFF,%eax - movb %al,1(%rdi) - mov %rdx,%rax - shr $16,%rax - and $0xFF,%eax - movb %al,2(%rdi) - mov %rdx,%rax - shr $24,%rax - and $0xFF,%eax - movb %al,3(%rdi) - mov %rdx,%rax - shr $32,%rax - and $0xFF,%eax - movb %al,4(%rdi) - mov %rdx,%rax - shr $40,%rax - and $0xFF,%eax - movb %al,5(%rdi) - mov %rdx,%rdx - shr $48,%rdx - mov %rcx,%rax - shl $3,%rax - and $0xF8,%eax - xor %rdx,%rax - movb %al,6(%rdi) - mov %rcx,%rdx - shr $5,%rdx - and $0xFF,%edx - movb %dl,7(%rdi) - mov %rcx,%rdx - shr $13,%rdx - and $0xFF,%edx - movb %dl,8(%rdi) - mov %rcx,%rdx - shr $21,%rdx - and $0xFF,%edx - movb %dl,9(%rdi) - mov %rcx,%rdx - shr $29,%rdx - and $0xFF,%edx - movb %dl,10(%rdi) - mov %rcx,%rdx - shr $37,%rdx - and $0xFF,%edx - movb %dl,11(%rdi) - mov %rcx,%rdx - shr $45,%rdx - mov %r8,%rcx - shl $6,%rcx - and $0xC0,%ecx - xor %rdx,%rcx - movb %cl,12(%rdi) - mov %r8,%rdx - shr $2,%rdx - and $0xFF,%edx - movb %dl,13(%rdi) - mov %r8,%rdx - shr $10,%rdx - and $0xFF,%edx - movb %dl,14(%rdi) - mov %r8,%rdx - shr $18,%rdx - and $0xFF,%edx - movb %dl,15(%rdi) - mov %r8,%rdx - shr $26,%rdx - and $0xFF,%edx - movb %dl,16(%rdi) - mov %r8,%rdx - shr $34,%rdx - and $0xFF,%edx - movb %dl,17(%rdi) - mov %r8,%rdx - shr $42,%rdx - movb %dl,18(%rdi) - mov %r8,%rdx - shr $50,%rdx - mov %r9,%rcx - shl $1,%rcx - and $0xFE,%ecx - xor %rdx,%rcx - movb %cl,19(%rdi) - mov %r9,%rdx - shr $7,%rdx - and $0xFF,%edx - movb %dl,20(%rdi) - mov %r9,%rdx - shr $15,%rdx - and $0xFF,%edx - movb %dl,21(%rdi) - mov %r9,%rdx - shr $23,%rdx - and $0xFF,%edx - movb %dl,22(%rdi) - mov %r9,%rdx - shr $31,%rdx - and $0xFF,%edx - movb %dl,23(%rdi) - mov %r9,%rdx - shr $39,%rdx - and $0xFF,%edx - movb %dl,24(%rdi) - mov %r9,%rdx - shr $47,%rdx - mov %rsi,%rcx - shl $4,%rcx - and $0xF0,%ecx - xor %rdx,%rcx - movb %cl,25(%rdi) - mov %rsi,%rdx - shr $4,%rdx - and $0xFF,%edx - movb %dl,26(%rdi) - mov %rsi,%rdx - shr $12,%rdx - and $0xFF,%edx - movb %dl,27(%rdi) - mov %rsi,%rdx - shr $20,%rdx - and $0xFF,%edx - movb %dl,28(%rdi) - mov %rsi,%rdx - shr $28,%rdx - and $0xFF,%edx - movb %dl,29(%rdi) - mov %rsi,%rdx - shr $36,%rdx - and $0xFF,%edx - movb %dl,30(%rdi) - mov %rsi,%rsi - shr $44,%rsi - movb %sil,31(%rdi) - movq 0(%rsp),%r11 - movq 8(%rsp),%r12 - leave - ret -SYM_FUNC_END(curve25519_sandy2x_fe51_pack) - -.align 32 -SYM_FUNC_START(curve25519_sandy2x_ladder) - push %rbp - mov %rsp,%rbp - sub $1856,%rsp - and $-32,%rsp - movq %r11,1824(%rsp) - movq %r12,1832(%rsp) - movq %r13,1840(%rsp) - movq %r14,1848(%rsp) - vmovdqa curve25519_sandy2x_v0_0(%rip),%xmm0 - vmovdqa curve25519_sandy2x_v1_0(%rip),%xmm1 - vmovdqu 0(%rdi),%xmm2 - vmovdqa %xmm2,0(%rsp) - vmovdqu 16(%rdi),%xmm2 - vmovdqa %xmm2,16(%rsp) - vmovdqu 32(%rdi),%xmm2 - vmovdqa 
%xmm2,32(%rsp) - vmovdqu 48(%rdi),%xmm2 - vmovdqa %xmm2,48(%rsp) - vmovdqu 64(%rdi),%xmm2 - vmovdqa %xmm2,64(%rsp) - vmovdqa %xmm1,80(%rsp) - vmovdqa %xmm0,96(%rsp) - vmovdqa %xmm0,112(%rsp) - vmovdqa %xmm0,128(%rsp) - vmovdqa %xmm0,144(%rsp) - vmovdqa %xmm1,%xmm0 - vpxor %xmm1,%xmm1,%xmm1 - vpxor %xmm2,%xmm2,%xmm2 - vpxor %xmm3,%xmm3,%xmm3 - vpxor %xmm4,%xmm4,%xmm4 - vpxor %xmm5,%xmm5,%xmm5 - vpxor %xmm6,%xmm6,%xmm6 - vpxor %xmm7,%xmm7,%xmm7 - vpxor %xmm8,%xmm8,%xmm8 - vpxor %xmm9,%xmm9,%xmm9 - vmovdqu 0(%rdi),%xmm10 - vmovdqa %xmm10,160(%rsp) - vmovdqu 16(%rdi),%xmm10 - vmovdqa %xmm10,176(%rsp) - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 - vmovdqa %xmm10,192(%rsp) - vmovdqu 32(%rdi),%xmm10 - vmovdqa %xmm10,208(%rsp) - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 - vmovdqa %xmm10,224(%rsp) - vmovdqu 48(%rdi),%xmm10 - vmovdqa %xmm10,240(%rsp) - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 - vmovdqa %xmm10,256(%rsp) - vmovdqu 64(%rdi),%xmm10 - vmovdqa %xmm10,272(%rsp) - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 - vmovdqa %xmm10,288(%rsp) - vmovdqu 8(%rdi),%xmm10 - vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10 - vmovdqa %xmm10,304(%rsp) - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 - vmovdqa %xmm10,320(%rsp) - vmovdqu 24(%rdi),%xmm10 - vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10 - vmovdqa %xmm10,336(%rsp) - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 - vmovdqa %xmm10,352(%rsp) - vmovdqu 40(%rdi),%xmm10 - vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10 - vmovdqa %xmm10,368(%rsp) - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 - vmovdqa %xmm10,384(%rsp) - vmovdqu 56(%rdi),%xmm10 - vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10 - vmovdqa %xmm10,400(%rsp) - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 - vmovdqa %xmm10,416(%rsp) - vmovdqu 0(%rdi),%xmm10 - vmovdqu 64(%rdi),%xmm11 - vblendps $12, %xmm11, %xmm10, %xmm10 - vpshufd $2,%xmm10,%xmm10 - vpmuludq curve25519_sandy2x_v38_1(%rip),%xmm10,%xmm10 - vmovdqa %xmm10,432(%rsp) - movq 0(%rsi),%rdx - movq 8(%rsi),%rcx - movq 16(%rsi),%r8 - movq 24(%rsi),%r9 - shrd $1,%rcx,%rdx - shrd $1,%r8,%rcx - shrd $1,%r9,%r8 - shr $1,%r9 - xorq 0(%rsi),%rdx - xorq 8(%rsi),%rcx - xorq 16(%rsi),%r8 - xorq 24(%rsi),%r9 - leaq 800(%rsp),%rsi - mov $64,%rax - - .align 16 - .Lladder_small_loop: - mov %rdx,%r10 - mov %rcx,%r11 - mov %r8,%r12 - mov %r9,%r13 - shr $1,%rdx - shr $1,%rcx - shr $1,%r8 - shr $1,%r9 - and $1,%r10d - and $1,%r11d - and $1,%r12d - and $1,%r13d - neg %r10 - neg %r11 - neg %r12 - neg %r13 - movl %r10d,0(%rsi) - movl %r11d,256(%rsi) - movl %r12d,512(%rsi) - movl %r13d,768(%rsi) - add $4,%rsi - sub $1,%rax - jne .Lladder_small_loop - mov $255,%rdx - add $760,%rsi - - .align 16 - .Lladder_loop: - sub $1,%rdx - vbroadcastss 0(%rsi),%xmm10 - sub $4,%rsi - vmovdqa 0(%rsp),%xmm11 - vmovdqa 80(%rsp),%xmm12 - vpxor %xmm11,%xmm0,%xmm13 - vpand %xmm10,%xmm13,%xmm13 - vpxor %xmm13,%xmm0,%xmm0 - vpxor %xmm13,%xmm11,%xmm11 - vpxor %xmm12,%xmm1,%xmm13 - vpand %xmm10,%xmm13,%xmm13 - vpxor %xmm13,%xmm1,%xmm1 - vpxor %xmm13,%xmm12,%xmm12 - vmovdqa 16(%rsp),%xmm13 - vmovdqa 96(%rsp),%xmm14 - vpxor %xmm13,%xmm2,%xmm15 - vpand %xmm10,%xmm15,%xmm15 - vpxor %xmm15,%xmm2,%xmm2 - vpxor %xmm15,%xmm13,%xmm13 - vpxor %xmm14,%xmm3,%xmm15 - vpand %xmm10,%xmm15,%xmm15 - vpxor %xmm15,%xmm3,%xmm3 - vpxor %xmm15,%xmm14,%xmm14 - vmovdqa %xmm13,0(%rsp) - vmovdqa %xmm14,16(%rsp) - vmovdqa 32(%rsp),%xmm13 - vmovdqa 112(%rsp),%xmm14 - vpxor %xmm13,%xmm4,%xmm15 - vpand 
%xmm10,%xmm15,%xmm15 - vpxor %xmm15,%xmm4,%xmm4 - vpxor %xmm15,%xmm13,%xmm13 - vpxor %xmm14,%xmm5,%xmm15 - vpand %xmm10,%xmm15,%xmm15 - vpxor %xmm15,%xmm5,%xmm5 - vpxor %xmm15,%xmm14,%xmm14 - vmovdqa %xmm13,32(%rsp) - vmovdqa %xmm14,80(%rsp) - vmovdqa 48(%rsp),%xmm13 - vmovdqa 128(%rsp),%xmm14 - vpxor %xmm13,%xmm6,%xmm15 - vpand %xmm10,%xmm15,%xmm15 - vpxor %xmm15,%xmm6,%xmm6 - vpxor %xmm15,%xmm13,%xmm13 - vpxor %xmm14,%xmm7,%xmm15 - vpand %xmm10,%xmm15,%xmm15 - vpxor %xmm15,%xmm7,%xmm7 - vpxor %xmm15,%xmm14,%xmm14 - vmovdqa %xmm13,48(%rsp) - vmovdqa %xmm14,96(%rsp) - vmovdqa 64(%rsp),%xmm13 - vmovdqa 144(%rsp),%xmm14 - vpxor %xmm13,%xmm8,%xmm15 - vpand %xmm10,%xmm15,%xmm15 - vpxor %xmm15,%xmm8,%xmm8 - vpxor %xmm15,%xmm13,%xmm13 - vpxor %xmm14,%xmm9,%xmm15 - vpand %xmm10,%xmm15,%xmm15 - vpxor %xmm15,%xmm9,%xmm9 - vpxor %xmm15,%xmm14,%xmm14 - vmovdqa %xmm13,64(%rsp) - vmovdqa %xmm14,112(%rsp) - vpaddq curve25519_sandy2x_subc0(%rip),%xmm11,%xmm10 - vpsubq %xmm12,%xmm10,%xmm10 - vpaddq %xmm12,%xmm11,%xmm11 - vpunpckhqdq %xmm10,%xmm11,%xmm12 - vpunpcklqdq %xmm10,%xmm11,%xmm10 - vpaddq %xmm1,%xmm0,%xmm11 - vpaddq curve25519_sandy2x_subc0(%rip),%xmm0,%xmm0 - vpsubq %xmm1,%xmm0,%xmm0 - vpunpckhqdq %xmm11,%xmm0,%xmm1 - vpunpcklqdq %xmm11,%xmm0,%xmm0 - vpmuludq %xmm0,%xmm10,%xmm11 - vpmuludq %xmm1,%xmm10,%xmm13 - vmovdqa %xmm1,128(%rsp) - vpaddq %xmm1,%xmm1,%xmm1 - vpmuludq %xmm0,%xmm12,%xmm14 - vmovdqa %xmm0,144(%rsp) - vpaddq %xmm14,%xmm13,%xmm13 - vpmuludq %xmm1,%xmm12,%xmm0 - vmovdqa %xmm1,448(%rsp) - vpaddq %xmm3,%xmm2,%xmm1 - vpaddq curve25519_sandy2x_subc2(%rip),%xmm2,%xmm2 - vpsubq %xmm3,%xmm2,%xmm2 - vpunpckhqdq %xmm1,%xmm2,%xmm3 - vpunpcklqdq %xmm1,%xmm2,%xmm1 - vpmuludq %xmm1,%xmm10,%xmm2 - vpaddq %xmm2,%xmm0,%xmm0 - vpmuludq %xmm3,%xmm10,%xmm2 - vmovdqa %xmm3,464(%rsp) - vpaddq %xmm3,%xmm3,%xmm3 - vpmuludq %xmm1,%xmm12,%xmm14 - vmovdqa %xmm1,480(%rsp) - vpaddq %xmm14,%xmm2,%xmm2 - vpmuludq %xmm3,%xmm12,%xmm1 - vmovdqa %xmm3,496(%rsp) - vpaddq %xmm5,%xmm4,%xmm3 - vpaddq curve25519_sandy2x_subc2(%rip),%xmm4,%xmm4 - vpsubq %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm3,%xmm4,%xmm5 - vpunpcklqdq %xmm3,%xmm4,%xmm3 - vpmuludq %xmm3,%xmm10,%xmm4 - vpaddq %xmm4,%xmm1,%xmm1 - vpmuludq %xmm5,%xmm10,%xmm4 - vmovdqa %xmm5,512(%rsp) - vpaddq %xmm5,%xmm5,%xmm5 - vpmuludq %xmm3,%xmm12,%xmm14 - vmovdqa %xmm3,528(%rsp) - vpaddq %xmm14,%xmm4,%xmm4 - vpaddq %xmm7,%xmm6,%xmm3 - vpaddq curve25519_sandy2x_subc2(%rip),%xmm6,%xmm6 - vpsubq %xmm7,%xmm6,%xmm6 - vpunpckhqdq %xmm3,%xmm6,%xmm7 - vpunpcklqdq %xmm3,%xmm6,%xmm3 - vpmuludq %xmm3,%xmm10,%xmm6 - vpmuludq %xmm5,%xmm12,%xmm14 - vmovdqa %xmm5,544(%rsp) - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm5,%xmm5 - vmovdqa %xmm5,560(%rsp) - vpaddq %xmm14,%xmm6,%xmm6 - vpmuludq %xmm7,%xmm10,%xmm5 - vmovdqa %xmm7,576(%rsp) - vpaddq %xmm7,%xmm7,%xmm7 - vpmuludq %xmm3,%xmm12,%xmm14 - vmovdqa %xmm3,592(%rsp) - vpaddq %xmm14,%xmm5,%xmm5 - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 - vmovdqa %xmm3,608(%rsp) - vpaddq %xmm9,%xmm8,%xmm3 - vpaddq curve25519_sandy2x_subc2(%rip),%xmm8,%xmm8 - vpsubq %xmm9,%xmm8,%xmm8 - vpunpckhqdq %xmm3,%xmm8,%xmm9 - vpunpcklqdq %xmm3,%xmm8,%xmm3 - vmovdqa %xmm3,624(%rsp) - vpmuludq %xmm7,%xmm12,%xmm8 - vmovdqa %xmm7,640(%rsp) - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm7,%xmm7 - vmovdqa %xmm7,656(%rsp) - vpmuludq %xmm3,%xmm10,%xmm7 - vpaddq %xmm7,%xmm8,%xmm8 - vpmuludq %xmm9,%xmm10,%xmm7 - vmovdqa %xmm9,672(%rsp) - vpaddq %xmm9,%xmm9,%xmm9 - vpmuludq %xmm3,%xmm12,%xmm10 - vpaddq %xmm10,%xmm7,%xmm7 - vpmuludq 
curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 - vmovdqa %xmm3,688(%rsp) - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm12,%xmm12 - vpmuludq %xmm9,%xmm12,%xmm3 - vmovdqa %xmm9,704(%rsp) - vpaddq %xmm3,%xmm11,%xmm11 - vmovdqa 0(%rsp),%xmm3 - vmovdqa 16(%rsp),%xmm9 - vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 - vpsubq %xmm9,%xmm10,%xmm10 - vpaddq %xmm9,%xmm3,%xmm3 - vpunpckhqdq %xmm10,%xmm3,%xmm9 - vpunpcklqdq %xmm10,%xmm3,%xmm3 - vpmuludq 144(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm0,%xmm0 - vpmuludq 128(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm2,%xmm2 - vpmuludq 480(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm1,%xmm1 - vpmuludq 464(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm4,%xmm4 - vpmuludq 528(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm6,%xmm6 - vpmuludq 512(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm5,%xmm5 - vpmuludq 592(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm8,%xmm8 - vpmuludq 576(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm7,%xmm7 - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 - vpmuludq 624(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm11,%xmm11 - vpmuludq 672(%rsp),%xmm3,%xmm3 - vpaddq %xmm3,%xmm13,%xmm13 - vpmuludq 144(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm2,%xmm2 - vpmuludq 448(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm1,%xmm1 - vpmuludq 480(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm4,%xmm4 - vpmuludq 496(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm6,%xmm6 - vpmuludq 528(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm5,%xmm5 - vpmuludq 544(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm8,%xmm8 - vpmuludq 592(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm7,%xmm7 - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 - vpmuludq 640(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm11,%xmm11 - vpmuludq 624(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm13,%xmm13 - vpmuludq 704(%rsp),%xmm9,%xmm9 - vpaddq %xmm9,%xmm0,%xmm0 - vmovdqa 32(%rsp),%xmm3 - vmovdqa 80(%rsp),%xmm9 - vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 - vpsubq %xmm9,%xmm10,%xmm10 - vpaddq %xmm9,%xmm3,%xmm3 - vpunpckhqdq %xmm10,%xmm3,%xmm9 - vpunpcklqdq %xmm10,%xmm3,%xmm3 - vpmuludq 144(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm1,%xmm1 - vpmuludq 128(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm4,%xmm4 - vpmuludq 480(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm6,%xmm6 - vpmuludq 464(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm5,%xmm5 - vpmuludq 528(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm8,%xmm8 - vpmuludq 512(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm7,%xmm7 - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 - vpmuludq 592(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm11,%xmm11 - vpmuludq 576(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm13,%xmm13 - vpmuludq 624(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm0,%xmm0 - vpmuludq 672(%rsp),%xmm3,%xmm3 - vpaddq %xmm3,%xmm2,%xmm2 - vpmuludq 144(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm4,%xmm4 - vpmuludq 448(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm6,%xmm6 - vpmuludq 480(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm5,%xmm5 - vpmuludq 496(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm8,%xmm8 - vpmuludq 528(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm7,%xmm7 - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 - vpmuludq 544(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm11,%xmm11 - vpmuludq 592(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm13,%xmm13 - vpmuludq 640(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm0,%xmm0 - vpmuludq 624(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm2,%xmm2 - vpmuludq 704(%rsp),%xmm9,%xmm9 - vpaddq %xmm9,%xmm1,%xmm1 - vmovdqa 48(%rsp),%xmm3 - vmovdqa 96(%rsp),%xmm9 - vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 - vpsubq %xmm9,%xmm10,%xmm10 - vpaddq %xmm9,%xmm3,%xmm3 - vpunpckhqdq %xmm10,%xmm3,%xmm9 - vpunpcklqdq 
%xmm10,%xmm3,%xmm3 - vpmuludq 144(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm6,%xmm6 - vpmuludq 128(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm5,%xmm5 - vpmuludq 480(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm8,%xmm8 - vpmuludq 464(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm7,%xmm7 - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 - vpmuludq 528(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm11,%xmm11 - vpmuludq 512(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm13,%xmm13 - vpmuludq 592(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm0,%xmm0 - vpmuludq 576(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm2,%xmm2 - vpmuludq 624(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm1,%xmm1 - vpmuludq 672(%rsp),%xmm3,%xmm3 - vpaddq %xmm3,%xmm4,%xmm4 - vpmuludq 144(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm5,%xmm5 - vpmuludq 448(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm8,%xmm8 - vpmuludq 480(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm7,%xmm7 - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 - vpmuludq 496(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm11,%xmm11 - vpmuludq 528(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm13,%xmm13 - vpmuludq 544(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm0,%xmm0 - vpmuludq 592(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm2,%xmm2 - vpmuludq 640(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm1,%xmm1 - vpmuludq 624(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm4,%xmm4 - vpmuludq 704(%rsp),%xmm9,%xmm9 - vpaddq %xmm9,%xmm6,%xmm6 - vmovdqa 64(%rsp),%xmm3 - vmovdqa 112(%rsp),%xmm9 - vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 - vpsubq %xmm9,%xmm10,%xmm10 - vpaddq %xmm9,%xmm3,%xmm3 - vpunpckhqdq %xmm10,%xmm3,%xmm9 - vpunpcklqdq %xmm10,%xmm3,%xmm3 - vpmuludq 144(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm8,%xmm8 - vpmuludq 128(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm7,%xmm7 - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 - vpmuludq 480(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm11,%xmm11 - vpmuludq 464(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm13,%xmm13 - vpmuludq 528(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm0,%xmm0 - vpmuludq 512(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm2,%xmm2 - vpmuludq 592(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm1,%xmm1 - vpmuludq 576(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm4,%xmm4 - vpmuludq 624(%rsp),%xmm3,%xmm10 - vpaddq %xmm10,%xmm6,%xmm6 - vpmuludq 672(%rsp),%xmm3,%xmm3 - vpaddq %xmm3,%xmm5,%xmm5 - vpmuludq 144(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm7,%xmm7 - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 - vpmuludq 448(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm11,%xmm11 - vpmuludq 480(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm13,%xmm13 - vpmuludq 496(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm0,%xmm0 - vpmuludq 528(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm2,%xmm2 - vpmuludq 544(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm1,%xmm1 - vpmuludq 592(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm4,%xmm4 - vpmuludq 640(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm6,%xmm6 - vpmuludq 624(%rsp),%xmm9,%xmm3 - vpaddq %xmm3,%xmm5,%xmm5 - vpmuludq 704(%rsp),%xmm9,%xmm9 - vpaddq %xmm9,%xmm8,%xmm8 - vpsrlq $25,%xmm4,%xmm3 - vpaddq %xmm3,%xmm6,%xmm6 - vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4 - vpsrlq $26,%xmm11,%xmm3 - vpaddq %xmm3,%xmm13,%xmm13 - vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 - vpsrlq $26,%xmm6,%xmm3 - vpaddq %xmm3,%xmm5,%xmm5 - vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 - vpsrlq $25,%xmm13,%xmm3 - vpaddq %xmm3,%xmm0,%xmm0 - vpand curve25519_sandy2x_m25(%rip),%xmm13,%xmm13 - vpsrlq $25,%xmm5,%xmm3 - vpaddq %xmm3,%xmm8,%xmm8 - vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 - vpsrlq $26,%xmm0,%xmm3 - vpaddq %xmm3,%xmm2,%xmm2 - vpand curve25519_sandy2x_m26(%rip),%xmm0,%xmm0 - vpsrlq 
$26,%xmm8,%xmm3 - vpaddq %xmm3,%xmm7,%xmm7 - vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8 - vpsrlq $25,%xmm2,%xmm3 - vpaddq %xmm3,%xmm1,%xmm1 - vpand curve25519_sandy2x_m25(%rip),%xmm2,%xmm2 - vpsrlq $25,%xmm7,%xmm3 - vpsllq $4,%xmm3,%xmm9 - vpaddq %xmm3,%xmm11,%xmm11 - vpsllq $1,%xmm3,%xmm3 - vpaddq %xmm3,%xmm9,%xmm9 - vpaddq %xmm9,%xmm11,%xmm11 - vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7 - vpsrlq $26,%xmm1,%xmm3 - vpaddq %xmm3,%xmm4,%xmm4 - vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 - vpsrlq $26,%xmm11,%xmm3 - vpaddq %xmm3,%xmm13,%xmm13 - vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 - vpsrlq $25,%xmm4,%xmm3 - vpaddq %xmm3,%xmm6,%xmm6 - vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4 - vpunpcklqdq %xmm13,%xmm11,%xmm3 - vpunpckhqdq %xmm13,%xmm11,%xmm9 - vpaddq curve25519_sandy2x_subc0(%rip),%xmm9,%xmm10 - vpsubq %xmm3,%xmm10,%xmm10 - vpaddq %xmm9,%xmm3,%xmm3 - vpunpckhqdq %xmm3,%xmm10,%xmm9 - vpunpcklqdq %xmm3,%xmm10,%xmm10 - vpmuludq %xmm10,%xmm10,%xmm3 - vpaddq %xmm10,%xmm10,%xmm10 - vpmuludq %xmm9,%xmm10,%xmm11 - vpunpcklqdq %xmm2,%xmm0,%xmm12 - vpunpckhqdq %xmm2,%xmm0,%xmm0 - vpaddq curve25519_sandy2x_subc2(%rip),%xmm0,%xmm2 - vpsubq %xmm12,%xmm2,%xmm2 - vpaddq %xmm0,%xmm12,%xmm12 - vpunpckhqdq %xmm12,%xmm2,%xmm0 - vpunpcklqdq %xmm12,%xmm2,%xmm2 - vpmuludq %xmm2,%xmm10,%xmm12 - vpaddq %xmm9,%xmm9,%xmm13 - vpmuludq %xmm13,%xmm9,%xmm9 - vpaddq %xmm9,%xmm12,%xmm12 - vpmuludq %xmm0,%xmm10,%xmm9 - vpmuludq %xmm2,%xmm13,%xmm14 - vpaddq %xmm14,%xmm9,%xmm9 - vpunpcklqdq %xmm4,%xmm1,%xmm14 - vpunpckhqdq %xmm4,%xmm1,%xmm1 - vpaddq curve25519_sandy2x_subc2(%rip),%xmm1,%xmm4 - vpsubq %xmm14,%xmm4,%xmm4 - vpaddq %xmm1,%xmm14,%xmm14 - vpunpckhqdq %xmm14,%xmm4,%xmm1 - vpunpcklqdq %xmm14,%xmm4,%xmm4 - vmovdqa %xmm1,0(%rsp) - vpaddq %xmm1,%xmm1,%xmm1 - vmovdqa %xmm1,16(%rsp) - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 - vmovdqa %xmm1,32(%rsp) - vpmuludq %xmm4,%xmm10,%xmm1 - vpmuludq %xmm2,%xmm2,%xmm14 - vpaddq %xmm14,%xmm1,%xmm1 - vpmuludq 0(%rsp),%xmm10,%xmm14 - vpmuludq %xmm4,%xmm13,%xmm15 - vpaddq %xmm15,%xmm14,%xmm14 - vpunpcklqdq %xmm5,%xmm6,%xmm15 - vpunpckhqdq %xmm5,%xmm6,%xmm5 - vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm6 - vpsubq %xmm15,%xmm6,%xmm6 - vpaddq %xmm5,%xmm15,%xmm15 - vpunpckhqdq %xmm15,%xmm6,%xmm5 - vpunpcklqdq %xmm15,%xmm6,%xmm6 - vmovdqa %xmm6,48(%rsp) - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm6,%xmm6 - vmovdqa %xmm6,64(%rsp) - vmovdqa %xmm5,80(%rsp) - vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm5,%xmm5 - vmovdqa %xmm5,96(%rsp) - vpmuludq 48(%rsp),%xmm10,%xmm5 - vpaddq %xmm0,%xmm0,%xmm6 - vpmuludq %xmm6,%xmm0,%xmm0 - vpaddq %xmm0,%xmm5,%xmm5 - vpmuludq 80(%rsp),%xmm10,%xmm0 - vpmuludq %xmm4,%xmm6,%xmm15 - vpaddq %xmm15,%xmm0,%xmm0 - vpmuludq %xmm6,%xmm13,%xmm15 - vpaddq %xmm15,%xmm1,%xmm1 - vpmuludq %xmm6,%xmm2,%xmm15 - vpaddq %xmm15,%xmm14,%xmm14 - vpunpcklqdq %xmm7,%xmm8,%xmm15 - vpunpckhqdq %xmm7,%xmm8,%xmm7 - vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm8 - vpsubq %xmm15,%xmm8,%xmm8 - vpaddq %xmm7,%xmm15,%xmm15 - vpunpckhqdq %xmm15,%xmm8,%xmm7 - vpunpcklqdq %xmm15,%xmm8,%xmm8 - vmovdqa %xmm8,112(%rsp) - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm8,%xmm8 - vmovdqa %xmm8,448(%rsp) - vpmuludq 112(%rsp),%xmm10,%xmm8 - vpmuludq %xmm7,%xmm10,%xmm10 - vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm7,%xmm15 - vpmuludq %xmm15,%xmm7,%xmm7 - vpaddq %xmm7,%xmm8,%xmm8 - vpmuludq %xmm15,%xmm13,%xmm7 - vpaddq %xmm7,%xmm3,%xmm3 - vpmuludq %xmm15,%xmm2,%xmm7 - vpaddq %xmm7,%xmm11,%xmm11 - vpmuludq 80(%rsp),%xmm13,%xmm7 - vpaddq %xmm7,%xmm7,%xmm7 - 
- vpaddq %xmm7,%xmm8,%xmm8
- vpmuludq 16(%rsp),%xmm13,%xmm7
- vpaddq %xmm7,%xmm5,%xmm5
- vpmuludq 48(%rsp),%xmm13,%xmm7
- vpaddq %xmm7,%xmm0,%xmm0
- vpmuludq 112(%rsp),%xmm13,%xmm7
- vpaddq %xmm7,%xmm10,%xmm10
- vpmuludq %xmm15,%xmm6,%xmm7
- vpaddq %xmm7,%xmm12,%xmm12
- vpmuludq %xmm15,%xmm4,%xmm7
- vpaddq %xmm7,%xmm9,%xmm9
- vpaddq %xmm2,%xmm2,%xmm2
- vpmuludq %xmm4,%xmm2,%xmm7
- vpaddq %xmm7,%xmm5,%xmm5
- vpmuludq 448(%rsp),%xmm2,%xmm7
- vpaddq %xmm7,%xmm3,%xmm3
- vpmuludq 448(%rsp),%xmm6,%xmm7
- vpaddq %xmm7,%xmm11,%xmm11
- vpmuludq 0(%rsp),%xmm2,%xmm7
- vpaddq %xmm7,%xmm0,%xmm0
- vpmuludq 48(%rsp),%xmm2,%xmm7
- vpaddq %xmm7,%xmm8,%xmm8
- vpmuludq 80(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 96(%rsp),%xmm4,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq %xmm4,%xmm4,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpaddq %xmm4,%xmm4,%xmm2
- vpmuludq 448(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vpmuludq 16(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vpmuludq 48(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm14,%xmm14
- vpmuludq 96(%rsp),%xmm6,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vmovdqa 16(%rsp),%xmm4
- vpmuludq 448(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 16(%rsp),%xmm6,%xmm4
- vpaddq %xmm4,%xmm8,%xmm8
- vpmuludq 48(%rsp),%xmm6,%xmm4
- vpaddq %xmm4,%xmm10,%xmm10
- vpmuludq 80(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpaddq %xmm4,%xmm5,%xmm5
- vpmuludq 112(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm0,%xmm0
- vmovdqa 48(%rsp),%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpmuludq 448(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vmovdqa 80(%rsp),%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpmuludq 448(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm14,%xmm14
- vpmuludq 64(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vmovdqa 16(%rsp),%xmm4
- vpmuludq 64(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm11,%xmm11
- vmovdqa 16(%rsp),%xmm4
- vpmuludq 96(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vmovdqa 48(%rsp),%xmm4
- vpmuludq 96(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 0(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vmovdqa 32(%rsp),%xmm2
- vpmuludq 0(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vmovdqa 64(%rsp),%xmm2
- vpmuludq 48(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vmovdqa 96(%rsp),%xmm2
- vpmuludq 80(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vmovdqa 448(%rsp),%xmm2
- vpmuludq 112(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpsrlq $26,%xmm3,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3
- vpsrlq $25,%xmm14,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14
- vpsrlq $25,%xmm11,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpand curve25519_sandy2x_m25(%rip),%xmm11,%xmm11
- vpsrlq $26,%xmm5,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5
- vpsrlq $26,%xmm12,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpand curve25519_sandy2x_m26(%rip),%xmm12,%xmm12
- vpsrlq $25,%xmm0,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0
- vpsrlq $25,%xmm9,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vpand curve25519_sandy2x_m25(%rip),%xmm9,%xmm9
- vpsrlq $26,%xmm8,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8
- vpsrlq $26,%xmm1,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1
- vpsrlq $25,%xmm10,%xmm2
- vpsllq $4,%xmm2,%xmm4
- vpaddq %xmm2,%xmm3,%xmm3
- vpsllq $1,%xmm2,%xmm2
- vpaddq %xmm2,%xmm4,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $25,%xmm14,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14
- vpsrlq $26,%xmm3,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3
- vpunpckhqdq %xmm11,%xmm3,%xmm2
- vmovdqa %xmm2,0(%rsp)
- vpshufd $0,%xmm3,%xmm2
- vpshufd $0,%xmm11,%xmm3
- vpmuludq 160(%rsp),%xmm2,%xmm4
- vpmuludq 432(%rsp),%xmm3,%xmm6
- vpaddq %xmm6,%xmm4,%xmm4
- vpmuludq 176(%rsp),%xmm2,%xmm6
- vpmuludq 304(%rsp),%xmm3,%xmm7
- vpaddq %xmm7,%xmm6,%xmm6
- vpmuludq 208(%rsp),%xmm2,%xmm7
- vpmuludq 336(%rsp),%xmm3,%xmm11
- vpaddq %xmm11,%xmm7,%xmm7
- vpmuludq 240(%rsp),%xmm2,%xmm11
- vpmuludq 368(%rsp),%xmm3,%xmm13
- vpaddq %xmm13,%xmm11,%xmm11
- vpmuludq 272(%rsp),%xmm2,%xmm2
- vpmuludq 400(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpunpckhqdq %xmm9,%xmm12,%xmm3
- vmovdqa %xmm3,16(%rsp)
- vpshufd $0,%xmm12,%xmm3
- vpshufd $0,%xmm9,%xmm9
- vpmuludq 288(%rsp),%xmm3,%xmm12
- vpaddq %xmm12,%xmm4,%xmm4
- vpmuludq 416(%rsp),%xmm9,%xmm12
- vpaddq %xmm12,%xmm4,%xmm4
- vpmuludq 160(%rsp),%xmm3,%xmm12
- vpaddq %xmm12,%xmm6,%xmm6
- vpmuludq 432(%rsp),%xmm9,%xmm12
- vpaddq %xmm12,%xmm6,%xmm6
- vpmuludq 176(%rsp),%xmm3,%xmm12
- vpaddq %xmm12,%xmm7,%xmm7
- vpmuludq 304(%rsp),%xmm9,%xmm12
- vpaddq %xmm12,%xmm7,%xmm7
- vpmuludq 208(%rsp),%xmm3,%xmm12
- vpaddq %xmm12,%xmm11,%xmm11
- vpmuludq 336(%rsp),%xmm9,%xmm12
- vpaddq %xmm12,%xmm11,%xmm11
- vpmuludq 240(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 368(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpunpckhqdq %xmm14,%xmm1,%xmm3
- vmovdqa %xmm3,32(%rsp)
- vpshufd $0,%xmm1,%xmm1
- vpshufd $0,%xmm14,%xmm3
- vpmuludq 256(%rsp),%xmm1,%xmm9
- vpaddq %xmm9,%xmm4,%xmm4
- vpmuludq 384(%rsp),%xmm3,%xmm9
- vpaddq %xmm9,%xmm4,%xmm4
- vpmuludq 288(%rsp),%xmm1,%xmm9
- vpaddq %xmm9,%xmm6,%xmm6
- vpmuludq 416(%rsp),%xmm3,%xmm9
- vpaddq %xmm9,%xmm6,%xmm6
- vpmuludq 160(%rsp),%xmm1,%xmm9
- vpaddq %xmm9,%xmm7,%xmm7
- vpmuludq 432(%rsp),%xmm3,%xmm9
- vpaddq %xmm9,%xmm7,%xmm7
- vpmuludq 176(%rsp),%xmm1,%xmm9
- vpaddq %xmm9,%xmm11,%xmm11
- vpmuludq 304(%rsp),%xmm3,%xmm9
- vpaddq %xmm9,%xmm11,%xmm11
- vpmuludq 208(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm2,%xmm2
- vpmuludq 336(%rsp),%xmm3,%xmm1
- vpaddq %xmm1,%xmm2,%xmm2
- vpunpckhqdq %xmm0,%xmm5,%xmm1
- vmovdqa %xmm1,48(%rsp)
- vpshufd $0,%xmm5,%xmm1
- vpshufd $0,%xmm0,%xmm0
- vpmuludq 224(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 352(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 256(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 384(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 288(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq 416(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq 160(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 432(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 176(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm2,%xmm2
- vpmuludq 304(%rsp),%xmm0,%xmm0
- vpaddq %xmm0,%xmm2,%xmm2
- vpunpckhqdq %xmm10,%xmm8,%xmm0
- vmovdqa %xmm0,64(%rsp)
- vpshufd $0,%xmm8,%xmm0
- vpshufd $0,%xmm10,%xmm1
- vpmuludq 192(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 320(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 224(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 352(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 256(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq 384(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq 288(%rsp),%xmm0,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 416(%rsp),%xmm1,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 160(%rsp),%xmm0,%xmm0
- vpaddq %xmm0,%xmm2,%xmm2
- vpmuludq 432(%rsp),%xmm1,%xmm0
- vpaddq %xmm0,%xmm2,%xmm2
- vmovdqa %xmm4,80(%rsp)
- vmovdqa %xmm6,96(%rsp)
- vmovdqa %xmm7,112(%rsp)
- vmovdqa %xmm11,448(%rsp)
- vmovdqa %xmm2,496(%rsp)
- vmovdqa 144(%rsp),%xmm0
- vpmuludq %xmm0,%xmm0,%xmm1
- vpaddq %xmm0,%xmm0,%xmm0
- vmovdqa 128(%rsp),%xmm2
- vpmuludq %xmm2,%xmm0,%xmm3
- vmovdqa 480(%rsp),%xmm4
- vpmuludq %xmm4,%xmm0,%xmm5
- vmovdqa 464(%rsp),%xmm6
- vpmuludq %xmm6,%xmm0,%xmm7
- vmovdqa 528(%rsp),%xmm8
- vpmuludq %xmm8,%xmm0,%xmm9
- vpmuludq 512(%rsp),%xmm0,%xmm10
- vpmuludq 592(%rsp),%xmm0,%xmm11
- vpmuludq 576(%rsp),%xmm0,%xmm12
- vpmuludq 624(%rsp),%xmm0,%xmm13
- vmovdqa 672(%rsp),%xmm14
- vpmuludq %xmm14,%xmm0,%xmm0
- vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm14,%xmm15
- vpmuludq %xmm15,%xmm14,%xmm14
- vpaddq %xmm14,%xmm13,%xmm13
- vpaddq %xmm6,%xmm6,%xmm14
- vpmuludq %xmm14,%xmm6,%xmm6
- vpaddq %xmm6,%xmm11,%xmm11
- vpaddq %xmm2,%xmm2,%xmm6
- vpmuludq %xmm6,%xmm2,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq %xmm15,%xmm6,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vpmuludq %xmm15,%xmm4,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpmuludq 544(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 592(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 640(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 624(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpmuludq %xmm4,%xmm6,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq %xmm14,%xmm6,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq %xmm8,%xmm6,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq %xmm15,%xmm14,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq %xmm15,%xmm8,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq %xmm4,%xmm4,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq %xmm14,%xmm4,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpaddq %xmm4,%xmm4,%xmm2
- vpmuludq %xmm8,%xmm2,%xmm4
- vpaddq %xmm4,%xmm11,%xmm11
- vpmuludq 688(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vpmuludq 688(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vpmuludq 512(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vpmuludq 592(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm13,%xmm13
- vpmuludq 576(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpmuludq 656(%rsp),%xmm8,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpmuludq %xmm8,%xmm14,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq %xmm8,%xmm8,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpaddq %xmm8,%xmm8,%xmm2
- vpmuludq 688(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm5,%xmm5
- vpmuludq 544(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 592(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm10,%xmm10
- vpmuludq 656(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vmovdqa 544(%rsp),%xmm4
- vpmuludq 688(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm7,%xmm7
- vpmuludq 544(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm13,%xmm13
- vpmuludq 592(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm0,%xmm0
- vpmuludq 640(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm11,%xmm11
- vpmuludq 624(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vmovdqa 592(%rsp),%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpmuludq 688(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 608(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vmovdqa 544(%rsp),%xmm4
- vpmuludq 608(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vmovdqa 544(%rsp),%xmm4
- vpmuludq 656(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm5,%xmm5
- vmovdqa 592(%rsp),%xmm4
- vpmuludq 656(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm7,%xmm7
- vmovdqa 640(%rsp),%xmm4
- vpmuludq 688(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm10,%xmm10
- vpmuludq 512(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vmovdqa 560(%rsp),%xmm2
- vpmuludq 512(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vmovdqa 608(%rsp),%xmm2
- vpmuludq 592(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vmovdqa 656(%rsp),%xmm2
- vpmuludq 576(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vmovdqa 688(%rsp),%xmm2
- vpmuludq 624(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpsrlq $26,%xmm1,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1
- vpsrlq $25,%xmm10,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $25,%xmm3,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3
- vpsrlq $26,%xmm11,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11
- vpsrlq $26,%xmm5,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5
- vpsrlq $25,%xmm12,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12
- vpsrlq $25,%xmm7,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7
- vpsrlq $26,%xmm13,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13
- vpsrlq $26,%xmm9,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9
- vpsrlq $25,%xmm0,%xmm2
- vpsllq $4,%xmm2,%xmm4
- vpaddq %xmm2,%xmm1,%xmm1
- vpsllq $1,%xmm2,%xmm2
- vpaddq %xmm2,%xmm4,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0
- vpsrlq $25,%xmm10,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $26,%xmm1,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1
- vpunpckhqdq %xmm3,%xmm1,%xmm2
- vpunpcklqdq %xmm3,%xmm1,%xmm1
- vmovdqa %xmm1,464(%rsp)
- vpaddq curve25519_sandy2x_subc0(%rip),%xmm2,%xmm3
- vpsubq %xmm1,%xmm3,%xmm3
- vpunpckhqdq %xmm3,%xmm2,%xmm1
- vpunpcklqdq %xmm3,%xmm2,%xmm2
- vmovdqa %xmm2,480(%rsp)
- vmovdqa %xmm1,512(%rsp)
- vpsllq $1,%xmm1,%xmm1
- vmovdqa %xmm1,528(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm3,%xmm3
- vmovdqa 80(%rsp),%xmm1
- vpunpcklqdq %xmm1,%xmm3,%xmm2
- vpunpckhqdq %xmm1,%xmm3,%xmm1
- vpunpckhqdq %xmm7,%xmm5,%xmm3
- vpunpcklqdq %xmm7,%xmm5,%xmm4
- vmovdqa %xmm4,544(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm5
- vpsubq %xmm4,%xmm5,%xmm5
- vpunpckhqdq %xmm5,%xmm3,%xmm4
- vpunpcklqdq %xmm5,%xmm3,%xmm3
- vmovdqa %xmm3,560(%rsp)
- vmovdqa %xmm4,576(%rsp)
- vpsllq $1,%xmm4,%xmm4
- vmovdqa %xmm4,592(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm5,%xmm5
- vmovdqa 96(%rsp),%xmm3
- vpunpcklqdq %xmm3,%xmm5,%xmm4
- vpunpckhqdq %xmm3,%xmm5,%xmm3
- vpunpckhqdq %xmm10,%xmm9,%xmm5
- vpunpcklqdq %xmm10,%xmm9,%xmm6
- vmovdqa %xmm6,608(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm7
- vpsubq %xmm6,%xmm7,%xmm7
- vpunpckhqdq %xmm7,%xmm5,%xmm6
- vpunpcklqdq %xmm7,%xmm5,%xmm5
- vmovdqa %xmm5,624(%rsp)
- vmovdqa %xmm6,640(%rsp)
- vpsllq $1,%xmm6,%xmm6
- vmovdqa %xmm6,656(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm7,%xmm7
- vmovdqa 112(%rsp),%xmm5
- vpunpcklqdq %xmm5,%xmm7,%xmm6
- vpunpckhqdq %xmm5,%xmm7,%xmm5
- vpunpckhqdq %xmm12,%xmm11,%xmm7
- vpunpcklqdq %xmm12,%xmm11,%xmm8
- vmovdqa %xmm8,672(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm9
- vpsubq %xmm8,%xmm9,%xmm9
- vpunpckhqdq %xmm9,%xmm7,%xmm8
- vpunpcklqdq %xmm9,%xmm7,%xmm7
- vmovdqa %xmm7,688(%rsp)
- vmovdqa %xmm8,704(%rsp)
- vpsllq $1,%xmm8,%xmm8
- vmovdqa %xmm8,720(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm9,%xmm9
- vmovdqa 448(%rsp),%xmm7
- vpunpcklqdq %xmm7,%xmm9,%xmm8
- vpunpckhqdq %xmm7,%xmm9,%xmm7
- vpunpckhqdq %xmm0,%xmm13,%xmm9
- vpunpcklqdq %xmm0,%xmm13,%xmm0
- vmovdqa %xmm0,448(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm9,%xmm10
- vpsubq %xmm0,%xmm10,%xmm10
- vpunpckhqdq %xmm10,%xmm9,%xmm0
- vpunpcklqdq %xmm10,%xmm9,%xmm9
- vmovdqa %xmm9,736(%rsp)
- vmovdqa %xmm0,752(%rsp)
- vpsllq $1,%xmm0,%xmm0
- vmovdqa %xmm0,768(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm10,%xmm10
- vmovdqa 496(%rsp),%xmm0
- vpunpcklqdq %xmm0,%xmm10,%xmm9
- vpunpckhqdq %xmm0,%xmm10,%xmm0
- vpsrlq $26,%xmm2,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2
- vpsrlq $25,%xmm5,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5
- vpsrlq $25,%xmm1,%xmm10
- vpaddq %xmm10,%xmm4,%xmm4
- vpand curve25519_sandy2x_m25(%rip),%xmm1,%xmm1
- vpsrlq $26,%xmm8,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8
- vpsrlq $26,%xmm4,%xmm10
- vpaddq %xmm10,%xmm3,%xmm3
- vpand curve25519_sandy2x_m26(%rip),%xmm4,%xmm4
- vpsrlq $25,%xmm7,%xmm10
- vpaddq %xmm10,%xmm9,%xmm9
- vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7
- vpsrlq $25,%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3
- vpsrlq $26,%xmm9,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9
- vpsrlq $26,%xmm6,%xmm10
- vpaddq %xmm10,%xmm5,%xmm5
- vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6
- vpsrlq $25,%xmm0,%xmm10
- vpsllq $4,%xmm10,%xmm11
- vpaddq %xmm10,%xmm2,%xmm2
- vpsllq $1,%xmm10,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpaddq %xmm11,%xmm2,%xmm2
- vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0
- vpsrlq $25,%xmm5,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5
- vpsrlq $26,%xmm2,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2
- vpunpckhqdq %xmm1,%xmm2,%xmm10
- vmovdqa %xmm10,80(%rsp)
- vpunpcklqdq %xmm1,%xmm2,%xmm1
- vpunpckhqdq %xmm3,%xmm4,%xmm2
- vmovdqa %xmm2,96(%rsp)
- vpunpcklqdq %xmm3,%xmm4,%xmm2
- vpunpckhqdq %xmm5,%xmm6,%xmm3
- vmovdqa %xmm3,112(%rsp)
- vpunpcklqdq %xmm5,%xmm6,%xmm3
- vpunpckhqdq %xmm7,%xmm8,%xmm4
- vmovdqa %xmm4,128(%rsp)
- vpunpcklqdq %xmm7,%xmm8,%xmm4
- vpunpckhqdq %xmm0,%xmm9,%xmm5
- vmovdqa %xmm5,144(%rsp)
- vpunpcklqdq %xmm0,%xmm9,%xmm0
- vmovdqa 464(%rsp),%xmm5
- vpaddq %xmm5,%xmm1,%xmm1
- vpunpcklqdq %xmm1,%xmm5,%xmm6
- vpunpckhqdq %xmm1,%xmm5,%xmm1
- vpmuludq 512(%rsp),%xmm6,%xmm5
- vpmuludq 480(%rsp),%xmm1,%xmm7
- vpaddq %xmm7,%xmm5,%xmm5
- vpmuludq 560(%rsp),%xmm6,%xmm7
- vpmuludq 528(%rsp),%xmm1,%xmm8
- vpaddq %xmm8,%xmm7,%xmm7
- vpmuludq 576(%rsp),%xmm6,%xmm8
- vpmuludq 560(%rsp),%xmm1,%xmm9
- vpaddq %xmm9,%xmm8,%xmm8
- vpmuludq 624(%rsp),%xmm6,%xmm9
- vpmuludq 592(%rsp),%xmm1,%xmm10
- vpaddq %xmm10,%xmm9,%xmm9
- vpmuludq 640(%rsp),%xmm6,%xmm10
- vpmuludq 624(%rsp),%xmm1,%xmm11
- vpaddq %xmm11,%xmm10,%xmm10
- vpmuludq 688(%rsp),%xmm6,%xmm11
- vpmuludq 656(%rsp),%xmm1,%xmm12
- vpaddq %xmm12,%xmm11,%xmm11
- vpmuludq 704(%rsp),%xmm6,%xmm12
- vpmuludq 688(%rsp),%xmm1,%xmm13
- vpaddq %xmm13,%xmm12,%xmm12
- vpmuludq 736(%rsp),%xmm6,%xmm13
- vpmuludq 720(%rsp),%xmm1,%xmm14
- vpaddq %xmm14,%xmm13,%xmm13
- vpmuludq 752(%rsp),%xmm6,%xmm14
- vpmuludq 736(%rsp),%xmm1,%xmm15
- vpaddq %xmm15,%xmm14,%xmm14
- vpmuludq 480(%rsp),%xmm6,%xmm6
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vpmuludq 768(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm6,%xmm6
- vmovdqa 544(%rsp),%xmm1
- vpaddq %xmm1,%xmm2,%xmm2
- vpunpcklqdq %xmm2,%xmm1,%xmm15
- vpunpckhqdq %xmm2,%xmm1,%xmm1
- vpmuludq 480(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq 512(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 560(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq 576(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 624(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 640(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 688(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 704(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm15,%xmm15
- vpmuludq 736(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm6,%xmm6
- vpmuludq 752(%rsp),%xmm15,%xmm15
- vpaddq %xmm15,%xmm5,%xmm5
- vpmuludq 480(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 528(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq 560(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 592(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 624(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 656(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 688(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vpmuludq 720(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm6,%xmm6
- vpmuludq 736(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq 768(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm7,%xmm7
- vmovdqa 608(%rsp),%xmm1
- vpaddq %xmm1,%xmm3,%xmm3
- vpunpcklqdq %xmm3,%xmm1,%xmm2
- vpunpckhqdq %xmm3,%xmm1,%xmm1
- vpmuludq 480(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm9,%xmm9
- vpmuludq 512(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm10,%xmm10
- vpmuludq 560(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 576(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm12,%xmm12
- vpmuludq 624(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 640(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2
- vpmuludq 688(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 704(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 736(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq 752(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 480(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 528(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 560(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 592(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 624(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vpmuludq 656(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm6,%xmm6
- vpmuludq 688(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq 720(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq 736(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 768(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm9,%xmm9
- vmovdqa 672(%rsp),%xmm1
- vpaddq %xmm1,%xmm4,%xmm4
- vpunpcklqdq %xmm4,%xmm1,%xmm2
- vpunpckhqdq %xmm4,%xmm1,%xmm1
- vpmuludq 480(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 512(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm12,%xmm12
- vpmuludq 560(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 576(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2
- vpmuludq 624(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 640(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 688(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq 704(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm8,%xmm8
- vpmuludq 736(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm9,%xmm9
- vpmuludq 752(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 480(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 528(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 560(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vpmuludq 592(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm6,%xmm6
- vpmuludq 624(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq 656(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq 688(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 720(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq 736(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 768(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm11,%xmm11
- vmovdqa 448(%rsp),%xmm1
- vpaddq %xmm1,%xmm0,%xmm0
- vpunpcklqdq %xmm0,%xmm1,%xmm2
- vpunpckhqdq %xmm0,%xmm1,%xmm0
- vpmuludq 480(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm13,%xmm13
- vpmuludq 512(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2
- vpmuludq 560(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm6,%xmm6
- vpmuludq 576(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm5,%xmm5
- vpmuludq 624(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm7,%xmm7
- vpmuludq 640(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm8,%xmm8
- vpmuludq 688(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm9,%xmm9
- vpmuludq 704(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm10,%xmm10
- vpmuludq 736(%rsp),%xmm2,%xmm1
- vpaddq %xmm1,%xmm11,%xmm11
- vpmuludq 752(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 480(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm0,%xmm0
- vpmuludq 528(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm6,%xmm6
- vpmuludq 560(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm5,%xmm5
- vpmuludq 592(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm7,%xmm7
- vpmuludq 624(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm8,%xmm8
- vpmuludq 656(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm9,%xmm9
- vpmuludq 688(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm10,%xmm10
- vpmuludq 720(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm11,%xmm11
- vpmuludq 736(%rsp),%xmm0,%xmm1
- vpaddq %xmm1,%xmm12,%xmm12
- vpmuludq 768(%rsp),%xmm0,%xmm0
- vpaddq %xmm0,%xmm13,%xmm13
- vpsrlq $26,%xmm6,%xmm0
- vpaddq %xmm0,%xmm5,%xmm5
- vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6
- vpsrlq $25,%xmm10,%xmm0
- vpaddq %xmm0,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $25,%xmm5,%xmm0
- vpaddq %xmm0,%xmm7,%xmm7
- vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5
- vpsrlq $26,%xmm11,%xmm0
- vpaddq %xmm0,%xmm12,%xmm12
- vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11
- vpsrlq $26,%xmm7,%xmm0
- vpaddq %xmm0,%xmm8,%xmm8
- vpand curve25519_sandy2x_m26(%rip),%xmm7,%xmm7
- vpsrlq $25,%xmm12,%xmm0
- vpaddq %xmm0,%xmm13,%xmm13
- vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12
- vpsrlq $25,%xmm8,%xmm0
- vpaddq %xmm0,%xmm9,%xmm9
- vpand curve25519_sandy2x_m25(%rip),%xmm8,%xmm8
- vpsrlq $26,%xmm13,%xmm0
- vpaddq %xmm0,%xmm14,%xmm14
- vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13
- vpsrlq $26,%xmm9,%xmm0
- vpaddq %xmm0,%xmm10,%xmm10
- vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9
- vpsrlq $25,%xmm14,%xmm0
- vpsllq $4,%xmm0,%xmm1
- vpaddq %xmm0,%xmm6,%xmm6
- vpsllq $1,%xmm0,%xmm0
- vpaddq %xmm0,%xmm1,%xmm1
- vpaddq %xmm1,%xmm6,%xmm6
- vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14
- vpsrlq $25,%xmm10,%xmm0
- vpaddq %xmm0,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $26,%xmm6,%xmm0
- vpaddq %xmm0,%xmm5,%xmm5
- vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6
- vpunpckhqdq %xmm5,%xmm6,%xmm1
- vpunpcklqdq %xmm5,%xmm6,%xmm0
- vpunpckhqdq %xmm8,%xmm7,%xmm3
- vpunpcklqdq %xmm8,%xmm7,%xmm2
- vpunpckhqdq %xmm10,%xmm9,%xmm5
- vpunpcklqdq %xmm10,%xmm9,%xmm4
- vpunpckhqdq %xmm12,%xmm11,%xmm7
- vpunpcklqdq %xmm12,%xmm11,%xmm6
- vpunpckhqdq %xmm14,%xmm13,%xmm9
- vpunpcklqdq %xmm14,%xmm13,%xmm8
- cmp $0,%rdx
- jne .Lladder_loop
- vmovdqu %xmm1,160(%rdi)
- vmovdqu %xmm0,80(%rdi)
- vmovdqu %xmm3,176(%rdi)
- vmovdqu %xmm2,96(%rdi)
- vmovdqu %xmm5,192(%rdi)
- vmovdqu %xmm4,112(%rdi)
- vmovdqu %xmm7,208(%rdi)
- vmovdqu %xmm6,128(%rdi)
- vmovdqu %xmm9,224(%rdi)
- vmovdqu %xmm8,144(%rdi)
- movq 1824(%rsp),%r11
- movq 1832(%rsp),%r12
- movq 1840(%rsp),%r13
- movq 1848(%rsp),%r14
- leave
- ret
-SYM_FUNC_END(curve25519_sandy2x_ladder)
-
-.align 32
-SYM_FUNC_START(curve25519_sandy2x_ladder_base)
- push %rbp
- mov %rsp,%rbp
- sub $1568,%rsp
- and $-32,%rsp
- movq %r11,1536(%rsp)
- movq %r12,1544(%rsp)
- movq %r13,1552(%rsp)
- vmovdqa curve25519_sandy2x_v0_0(%rip),%xmm0
- vmovdqa curve25519_sandy2x_v1_0(%rip),%xmm1
- vmovdqa curve25519_sandy2x_v9_0(%rip),%xmm2
- vmovdqa %xmm2,0(%rsp)
- vmovdqa %xmm0,16(%rsp)
- vmovdqa %xmm0,32(%rsp)
- vmovdqa %xmm0,48(%rsp)
- vmovdqa %xmm0,64(%rsp)
- vmovdqa %xmm1,80(%rsp)
- vmovdqa %xmm0,96(%rsp)
- vmovdqa %xmm0,112(%rsp)
- vmovdqa %xmm0,128(%rsp)
- vmovdqa %xmm0,144(%rsp)
- vmovdqa %xmm1,%xmm0
- vpxor %xmm1,%xmm1,%xmm1
- vpxor %xmm2,%xmm2,%xmm2
- vpxor %xmm3,%xmm3,%xmm3
- vpxor %xmm4,%xmm4,%xmm4
- vpxor %xmm5,%xmm5,%xmm5
- vpxor %xmm6,%xmm6,%xmm6
- vpxor %xmm7,%xmm7,%xmm7
- vpxor %xmm8,%xmm8,%xmm8
- vpxor %xmm9,%xmm9,%xmm9
- movq 0(%rsi),%rdx
- movq 8(%rsi),%rcx
- movq 16(%rsi),%r8
- movq 24(%rsi),%r9
- shrd $1,%rcx,%rdx
- shrd $1,%r8,%rcx
- shrd $1,%r9,%r8
- shr $1,%r9
- xorq 0(%rsi),%rdx
- xorq 8(%rsi),%rcx
- xorq 16(%rsi),%r8
- xorq 24(%rsi),%r9
- leaq 512(%rsp),%rsi
- mov $64,%rax
-
- .align 16
- .Lladder_base_small_loop:
- mov %rdx,%r10
- mov %rcx,%r11
- mov %r8,%r12
- mov %r9,%r13
- shr $1,%rdx
- shr $1,%rcx
- shr $1,%r8
- shr $1,%r9
- and $1,%r10d
- and $1,%r11d
- and $1,%r12d
- and $1,%r13d
- neg %r10
- neg %r11
- neg %r12
- neg %r13
- movl %r10d,0(%rsi)
- movl %r11d,256(%rsi)
- movl %r12d,512(%rsi)
- movl %r13d,768(%rsi)
- add $4,%rsi
- sub $1,%rax
- jne .Lladder_base_small_loop
- mov $255,%rdx
- add $760,%rsi
-
- .align 16
- .Lladder_base_loop:
- sub $1,%rdx
- vbroadcastss 0(%rsi),%xmm10
- sub $4,%rsi
- vmovdqa 0(%rsp),%xmm11
- vmovdqa 80(%rsp),%xmm12
- vpxor %xmm11,%xmm0,%xmm13
- vpand %xmm10,%xmm13,%xmm13
- vpxor %xmm13,%xmm0,%xmm0
- vpxor %xmm13,%xmm11,%xmm11
- vpxor %xmm12,%xmm1,%xmm13
- vpand %xmm10,%xmm13,%xmm13
- vpxor %xmm13,%xmm1,%xmm1
- vpxor %xmm13,%xmm12,%xmm12
- vmovdqa 16(%rsp),%xmm13
- vmovdqa 96(%rsp),%xmm14
- vpxor %xmm13,%xmm2,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm2,%xmm2
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm14,%xmm3,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm3,%xmm3
- vpxor %xmm15,%xmm14,%xmm14
- vmovdqa %xmm13,0(%rsp)
- vmovdqa %xmm14,16(%rsp)
- vmovdqa 32(%rsp),%xmm13
- vmovdqa 112(%rsp),%xmm14
- vpxor %xmm13,%xmm4,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm4,%xmm4
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm14,%xmm5,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm5,%xmm5
- vpxor %xmm15,%xmm14,%xmm14
- vmovdqa %xmm13,32(%rsp)
- vmovdqa %xmm14,80(%rsp)
- vmovdqa 48(%rsp),%xmm13
- vmovdqa 128(%rsp),%xmm14
- vpxor %xmm13,%xmm6,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm6,%xmm6
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm14,%xmm7,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm7,%xmm7
- vpxor %xmm15,%xmm14,%xmm14
- vmovdqa %xmm13,48(%rsp)
- vmovdqa %xmm14,96(%rsp)
- vmovdqa 64(%rsp),%xmm13
- vmovdqa 144(%rsp),%xmm14
- vpxor %xmm13,%xmm8,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm8,%xmm8
- vpxor %xmm15,%xmm13,%xmm13
- vpxor %xmm14,%xmm9,%xmm15
- vpand %xmm10,%xmm15,%xmm15
- vpxor %xmm15,%xmm9,%xmm9
- vpxor %xmm15,%xmm14,%xmm14
- vmovdqa %xmm13,64(%rsp)
- vmovdqa %xmm14,112(%rsp)
- vpaddq curve25519_sandy2x_subc0(%rip),%xmm11,%xmm10
- vpsubq %xmm12,%xmm10,%xmm10
- vpaddq %xmm12,%xmm11,%xmm11
- vpunpckhqdq %xmm10,%xmm11,%xmm12
- vpunpcklqdq %xmm10,%xmm11,%xmm10
- vpaddq %xmm1,%xmm0,%xmm11
- vpaddq curve25519_sandy2x_subc0(%rip),%xmm0,%xmm0
- vpsubq %xmm1,%xmm0,%xmm0
- vpunpckhqdq %xmm11,%xmm0,%xmm1
- vpunpcklqdq %xmm11,%xmm0,%xmm0
- vpmuludq %xmm0,%xmm10,%xmm11
- vpmuludq %xmm1,%xmm10,%xmm13
- vmovdqa %xmm1,128(%rsp)
- vpaddq %xmm1,%xmm1,%xmm1
- vpmuludq %xmm0,%xmm12,%xmm14
- vmovdqa %xmm0,144(%rsp)
- vpaddq %xmm14,%xmm13,%xmm13
- vpmuludq %xmm1,%xmm12,%xmm0
- vmovdqa %xmm1,160(%rsp)
- vpaddq %xmm3,%xmm2,%xmm1
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm2,%xmm2
- vpsubq %xmm3,%xmm2,%xmm2
- vpunpckhqdq %xmm1,%xmm2,%xmm3
- vpunpcklqdq %xmm1,%xmm2,%xmm1
- vpmuludq %xmm1,%xmm10,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpmuludq %xmm3,%xmm10,%xmm2
- vmovdqa %xmm3,176(%rsp)
- vpaddq %xmm3,%xmm3,%xmm3
- vpmuludq %xmm1,%xmm12,%xmm14
- vmovdqa %xmm1,192(%rsp)
- vpaddq %xmm14,%xmm2,%xmm2
- vpmuludq %xmm3,%xmm12,%xmm1
- vmovdqa %xmm3,208(%rsp)
- vpaddq %xmm5,%xmm4,%xmm3
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm4,%xmm4
- vpsubq %xmm5,%xmm4,%xmm4
- vpunpckhqdq %xmm3,%xmm4,%xmm5
- vpunpcklqdq %xmm3,%xmm4,%xmm3
- vpmuludq %xmm3,%xmm10,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vpmuludq %xmm5,%xmm10,%xmm4
- vmovdqa %xmm5,224(%rsp)
- vpaddq %xmm5,%xmm5,%xmm5
- vpmuludq %xmm3,%xmm12,%xmm14
- vmovdqa %xmm3,240(%rsp)
- vpaddq %xmm14,%xmm4,%xmm4
- vpaddq %xmm7,%xmm6,%xmm3
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm6,%xmm6
- vpsubq %xmm7,%xmm6,%xmm6
- vpunpckhqdq %xmm3,%xmm6,%xmm7
- vpunpcklqdq %xmm3,%xmm6,%xmm3
- vpmuludq %xmm3,%xmm10,%xmm6
- vpmuludq %xmm5,%xmm12,%xmm14
- vmovdqa %xmm5,256(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm5,%xmm5
- vmovdqa %xmm5,272(%rsp)
- vpaddq %xmm14,%xmm6,%xmm6
- vpmuludq %xmm7,%xmm10,%xmm5
- vmovdqa %xmm7,288(%rsp)
- vpaddq %xmm7,%xmm7,%xmm7
- vpmuludq %xmm3,%xmm12,%xmm14
- vmovdqa %xmm3,304(%rsp)
- vpaddq %xmm14,%xmm5,%xmm5
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vmovdqa %xmm3,320(%rsp)
- vpaddq %xmm9,%xmm8,%xmm3
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm8,%xmm8
- vpsubq %xmm9,%xmm8,%xmm8
- vpunpckhqdq %xmm3,%xmm8,%xmm9
- vpunpcklqdq %xmm3,%xmm8,%xmm3
- vmovdqa %xmm3,336(%rsp)
- vpmuludq %xmm7,%xmm12,%xmm8
- vmovdqa %xmm7,352(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm7,%xmm7
- vmovdqa %xmm7,368(%rsp)
- vpmuludq %xmm3,%xmm10,%xmm7
- vpaddq %xmm7,%xmm8,%xmm8
- vpmuludq %xmm9,%xmm10,%xmm7
- vmovdqa %xmm9,384(%rsp)
- vpaddq %xmm9,%xmm9,%xmm9
- vpmuludq %xmm3,%xmm12,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vmovdqa %xmm3,400(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm12,%xmm12
- vpmuludq %xmm9,%xmm12,%xmm3
- vmovdqa %xmm9,416(%rsp)
- vpaddq %xmm3,%xmm11,%xmm11
- vmovdqa 0(%rsp),%xmm3
- vmovdqa 16(%rsp),%xmm9
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10
- vpsubq %xmm9,%xmm10,%xmm10
- vpaddq %xmm9,%xmm3,%xmm3
- vpunpckhqdq %xmm10,%xmm3,%xmm9
- vpunpcklqdq %xmm10,%xmm3,%xmm3
- vpmuludq 144(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpmuludq 128(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm2,%xmm2
- vpmuludq 192(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpmuludq 176(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm4,%xmm4
- vpmuludq 240(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpmuludq 224(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm5,%xmm5
- vpmuludq 304(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpmuludq 288(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vpmuludq 336(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpmuludq 384(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 144(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 160(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm1,%xmm1
- vpmuludq 192(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 208(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 240(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 256(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm8,%xmm8
- vpmuludq 304(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9
- vpmuludq 352(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 336(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 416(%rsp),%xmm9,%xmm9
- vpaddq %xmm9,%xmm0,%xmm0
- vmovdqa 32(%rsp),%xmm3
- vmovdqa 80(%rsp),%xmm9
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10
- vpsubq %xmm9,%xmm10,%xmm10
- vpaddq %xmm9,%xmm3,%xmm3
- vpunpckhqdq %xmm10,%xmm3,%xmm9
- vpunpcklqdq %xmm10,%xmm3,%xmm3
- vpmuludq 144(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpmuludq 128(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm4,%xmm4
- vpmuludq 192(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpmuludq 176(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm5,%xmm5
- vpmuludq 240(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpmuludq 224(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vpmuludq 304(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpmuludq 288(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm13,%xmm13
- vpmuludq 336(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpmuludq 384(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 144(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 160(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 192(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 208(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm8,%xmm8
- vpmuludq 240(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9
- vpmuludq 256(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 304(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 352(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm0,%xmm0
- vpmuludq 336(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 416(%rsp),%xmm9,%xmm9
- vpaddq %xmm9,%xmm1,%xmm1
- vmovdqa 48(%rsp),%xmm3
- vmovdqa 96(%rsp),%xmm9
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10
- vpsubq %xmm9,%xmm10,%xmm10
- vpaddq %xmm9,%xmm3,%xmm3
- vpunpckhqdq %xmm10,%xmm3,%xmm9
- vpunpcklqdq %xmm10,%xmm3,%xmm3
- vpmuludq 144(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpmuludq 128(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm5,%xmm5
- vpmuludq 192(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpmuludq 176(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vpmuludq 240(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpmuludq 224(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm13,%xmm13
- vpmuludq 304(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpmuludq 288(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm2,%xmm2
- vpmuludq 336(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpmuludq 384(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 144(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 160(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm8,%xmm8
- vpmuludq 192(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9
- vpmuludq 208(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 240(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 256(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm0,%xmm0
- vpmuludq 304(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 352(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm1,%xmm1
- vpmuludq 336(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 416(%rsp),%xmm9,%xmm9
- vpaddq %xmm9,%xmm6,%xmm6
- vmovdqa 64(%rsp),%xmm3
- vmovdqa 112(%rsp),%xmm9
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10
- vpsubq %xmm9,%xmm10,%xmm10
- vpaddq %xmm9,%xmm3,%xmm3
- vpunpckhqdq %xmm10,%xmm3,%xmm9
- vpunpcklqdq %xmm10,%xmm3,%xmm3
- vpmuludq 144(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpmuludq 128(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3
- vpmuludq 192(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpmuludq 176(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm13,%xmm13
- vpmuludq 240(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpmuludq 224(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm2,%xmm2
- vpmuludq 304(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpmuludq 288(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm4,%xmm4
- vpmuludq 336(%rsp),%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpmuludq 384(%rsp),%xmm3,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 144(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9
- vpmuludq 160(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 192(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 208(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm0,%xmm0
- vpmuludq 240(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpmuludq 256(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm1,%xmm1
- vpmuludq 304(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpmuludq 352(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 336(%rsp),%xmm9,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 416(%rsp),%xmm9,%xmm9
- vpaddq %xmm9,%xmm8,%xmm8
- vpsrlq $25,%xmm4,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4
- vpsrlq $26,%xmm11,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11
- vpsrlq $26,%xmm6,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6
- vpsrlq $25,%xmm13,%xmm3
- vpaddq %xmm3,%xmm0,%xmm0
- vpand curve25519_sandy2x_m25(%rip),%xmm13,%xmm13
- vpsrlq $25,%xmm5,%xmm3
- vpaddq %xmm3,%xmm8,%xmm8
- vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5
- vpsrlq $26,%xmm0,%xmm3
- vpaddq %xmm3,%xmm2,%xmm2
- vpand curve25519_sandy2x_m26(%rip),%xmm0,%xmm0
- vpsrlq $26,%xmm8,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8
- vpsrlq $25,%xmm2,%xmm3
- vpaddq %xmm3,%xmm1,%xmm1
- vpand curve25519_sandy2x_m25(%rip),%xmm2,%xmm2
- vpsrlq $25,%xmm7,%xmm3
- vpsllq $4,%xmm3,%xmm9
- vpaddq %xmm3,%xmm11,%xmm11
- vpsllq $1,%xmm3,%xmm3
- vpaddq %xmm3,%xmm9,%xmm9
- vpaddq %xmm9,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7
- vpsrlq $26,%xmm1,%xmm3
- vpaddq %xmm3,%xmm4,%xmm4
- vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1
- vpsrlq $26,%xmm11,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11
- vpsrlq $25,%xmm4,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4
- vpunpcklqdq %xmm13,%xmm11,%xmm3
- vpunpckhqdq %xmm13,%xmm11,%xmm9
- vpaddq curve25519_sandy2x_subc0(%rip),%xmm9,%xmm10
- vpsubq %xmm3,%xmm10,%xmm10
- vpaddq %xmm9,%xmm3,%xmm3
- vpunpckhqdq %xmm3,%xmm10,%xmm9
- vpunpcklqdq %xmm3,%xmm10,%xmm10
- vpmuludq %xmm10,%xmm10,%xmm3
- vpaddq %xmm10,%xmm10,%xmm10
- vpmuludq %xmm9,%xmm10,%xmm11
- vpunpcklqdq %xmm2,%xmm0,%xmm12
- vpunpckhqdq %xmm2,%xmm0,%xmm0
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm0,%xmm2
- vpsubq %xmm12,%xmm2,%xmm2
- vpaddq %xmm0,%xmm12,%xmm12
- vpunpckhqdq %xmm12,%xmm2,%xmm0
- vpunpcklqdq %xmm12,%xmm2,%xmm2
- vpmuludq %xmm2,%xmm10,%xmm12
- vpaddq %xmm9,%xmm9,%xmm13
- vpmuludq %xmm13,%xmm9,%xmm9
- vpaddq %xmm9,%xmm12,%xmm12
- vpmuludq %xmm0,%xmm10,%xmm9
- vpmuludq %xmm2,%xmm13,%xmm14
- vpaddq %xmm14,%xmm9,%xmm9
- vpunpcklqdq %xmm4,%xmm1,%xmm14
- vpunpckhqdq %xmm4,%xmm1,%xmm1
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm1,%xmm4
- vpsubq %xmm14,%xmm4,%xmm4
- vpaddq %xmm1,%xmm14,%xmm14
- vpunpckhqdq %xmm14,%xmm4,%xmm1
- vpunpcklqdq %xmm14,%xmm4,%xmm4
- vmovdqa %xmm1,0(%rsp)
- vpaddq %xmm1,%xmm1,%xmm1
- vmovdqa %xmm1,16(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vmovdqa %xmm1,32(%rsp)
- vpmuludq %xmm4,%xmm10,%xmm1
- vpmuludq %xmm2,%xmm2,%xmm14
- vpaddq %xmm14,%xmm1,%xmm1
- vpmuludq 0(%rsp),%xmm10,%xmm14
- vpmuludq %xmm4,%xmm13,%xmm15
- vpaddq %xmm15,%xmm14,%xmm14
- vpunpcklqdq %xmm5,%xmm6,%xmm15
- vpunpckhqdq %xmm5,%xmm6,%xmm5
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm6
- vpsubq %xmm15,%xmm6,%xmm6
- vpaddq %xmm5,%xmm15,%xmm15
- vpunpckhqdq %xmm15,%xmm6,%xmm5
- vpunpcklqdq %xmm15,%xmm6,%xmm6
- vmovdqa %xmm6,48(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm6,%xmm6
- vmovdqa %xmm6,64(%rsp)
- vmovdqa %xmm5,80(%rsp)
- vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm5,%xmm5
- vmovdqa %xmm5,96(%rsp)
- vpmuludq 48(%rsp),%xmm10,%xmm5
- vpaddq %xmm0,%xmm0,%xmm6
- vpmuludq %xmm6,%xmm0,%xmm0
- vpaddq %xmm0,%xmm5,%xmm5
- vpmuludq 80(%rsp),%xmm10,%xmm0
- vpmuludq %xmm4,%xmm6,%xmm15
- vpaddq %xmm15,%xmm0,%xmm0
- vpmuludq %xmm6,%xmm13,%xmm15
- vpaddq %xmm15,%xmm1,%xmm1
- vpmuludq %xmm6,%xmm2,%xmm15
- vpaddq %xmm15,%xmm14,%xmm14
- vpunpcklqdq %xmm7,%xmm8,%xmm15
- vpunpckhqdq %xmm7,%xmm8,%xmm7
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm8
- vpsubq %xmm15,%xmm8,%xmm8
- vpaddq %xmm7,%xmm15,%xmm15
- vpunpckhqdq %xmm15,%xmm8,%xmm7
- vpunpcklqdq %xmm15,%xmm8,%xmm8
- vmovdqa %xmm8,112(%rsp)
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm8,%xmm8
- vmovdqa %xmm8,160(%rsp)
- vpmuludq 112(%rsp),%xmm10,%xmm8
- vpmuludq %xmm7,%xmm10,%xmm10
- vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm7,%xmm15
- vpmuludq %xmm15,%xmm7,%xmm7
- vpaddq %xmm7,%xmm8,%xmm8
- vpmuludq %xmm15,%xmm13,%xmm7
- vpaddq %xmm7,%xmm3,%xmm3
- vpmuludq %xmm15,%xmm2,%xmm7
- vpaddq %xmm7,%xmm11,%xmm11
- vpmuludq 80(%rsp),%xmm13,%xmm7
- vpaddq %xmm7,%xmm7,%xmm7
- vpaddq %xmm7,%xmm8,%xmm8
- vpmuludq 16(%rsp),%xmm13,%xmm7
- vpaddq %xmm7,%xmm5,%xmm5
- vpmuludq 48(%rsp),%xmm13,%xmm7
- vpaddq %xmm7,%xmm0,%xmm0
- vpmuludq 112(%rsp),%xmm13,%xmm7
- vpaddq %xmm7,%xmm10,%xmm10
- vpmuludq %xmm15,%xmm6,%xmm7
- vpaddq %xmm7,%xmm12,%xmm12
- vpmuludq %xmm15,%xmm4,%xmm7
- vpaddq %xmm7,%xmm9,%xmm9
- vpaddq %xmm2,%xmm2,%xmm2
- vpmuludq %xmm4,%xmm2,%xmm7
- vpaddq %xmm7,%xmm5,%xmm5
- vpmuludq 160(%rsp),%xmm2,%xmm7
- vpaddq %xmm7,%xmm3,%xmm3
- vpmuludq 160(%rsp),%xmm6,%xmm7
- vpaddq %xmm7,%xmm11,%xmm11
- vpmuludq 0(%rsp),%xmm2,%xmm7
- vpaddq %xmm7,%xmm0,%xmm0
- vpmuludq 48(%rsp),%xmm2,%xmm7
- vpaddq %xmm7,%xmm8,%xmm8
- vpmuludq 80(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 96(%rsp),%xmm4,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq %xmm4,%xmm4,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpaddq %xmm4,%xmm4,%xmm2
- vpmuludq 160(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vpmuludq 16(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vpmuludq 48(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm14,%xmm14
- vpmuludq 96(%rsp),%xmm6,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vmovdqa 16(%rsp),%xmm4
- vpmuludq 160(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 16(%rsp),%xmm6,%xmm4
- vpaddq %xmm4,%xmm8,%xmm8
- vpmuludq 48(%rsp),%xmm6,%xmm4
- vpaddq %xmm4,%xmm10,%xmm10
- vpmuludq 80(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpaddq %xmm4,%xmm5,%xmm5
- vpmuludq 112(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm0,%xmm0
- vmovdqa 48(%rsp),%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpmuludq 160(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vmovdqa 80(%rsp),%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpmuludq 160(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm14,%xmm14
- vpmuludq 64(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vmovdqa 16(%rsp),%xmm4
- vpmuludq 64(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm11,%xmm11
- vmovdqa 16(%rsp),%xmm4
- vpmuludq 96(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vmovdqa 48(%rsp),%xmm4
- vpmuludq 96(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 0(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vmovdqa 32(%rsp),%xmm2
- vpmuludq 0(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vmovdqa 64(%rsp),%xmm2
- vpmuludq 48(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vmovdqa 96(%rsp),%xmm2
- vpmuludq 80(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vmovdqa 160(%rsp),%xmm2
- vpmuludq 112(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpsrlq $26,%xmm3,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3
- vpsrlq $25,%xmm14,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14
- vpsrlq $25,%xmm11,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpand curve25519_sandy2x_m25(%rip),%xmm11,%xmm11
- vpsrlq $26,%xmm5,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5
- vpsrlq $26,%xmm12,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpand curve25519_sandy2x_m26(%rip),%xmm12,%xmm12
- vpsrlq $25,%xmm0,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0
- vpsrlq $25,%xmm9,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vpand curve25519_sandy2x_m25(%rip),%xmm9,%xmm9
- vpsrlq $26,%xmm8,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8
- vpsrlq $26,%xmm1,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1
- vpsrlq $25,%xmm10,%xmm2
- vpsllq $4,%xmm2,%xmm4
- vpaddq %xmm2,%xmm3,%xmm3
- vpsllq $1,%xmm2,%xmm2
- vpaddq %xmm2,%xmm4,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $25,%xmm14,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14
- vpsrlq $26,%xmm3,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3
- vpunpckhqdq %xmm11,%xmm3,%xmm2
- vmovdqa %xmm2,0(%rsp)
- vpunpcklqdq %xmm11,%xmm3,%xmm2
- vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm2,%xmm2
- vmovdqa %xmm2,80(%rsp)
- vpunpckhqdq %xmm9,%xmm12,%xmm2
- vmovdqa %xmm2,16(%rsp)
- vpunpcklqdq %xmm9,%xmm12,%xmm2
- vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm2,%xmm2
- vmovdqa %xmm2,96(%rsp)
- vpunpckhqdq %xmm14,%xmm1,%xmm2
- vmovdqa %xmm2,32(%rsp)
- vpunpcklqdq %xmm14,%xmm1,%xmm1
- vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm1,%xmm1
- vmovdqa %xmm1,112(%rsp)
- vpunpckhqdq %xmm0,%xmm5,%xmm1
- vmovdqa %xmm1,48(%rsp)
- vpunpcklqdq %xmm0,%xmm5,%xmm0
- vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm0,%xmm0
- vmovdqa %xmm0,160(%rsp)
- vpunpckhqdq %xmm10,%xmm8,%xmm0
- vmovdqa %xmm0,64(%rsp)
- vpunpcklqdq %xmm10,%xmm8,%xmm0
- vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm0,%xmm0
- vmovdqa %xmm0,208(%rsp)
- vmovdqa 144(%rsp),%xmm0
- vpmuludq %xmm0,%xmm0,%xmm1
- vpaddq %xmm0,%xmm0,%xmm0
- vmovdqa 128(%rsp),%xmm2
- vpmuludq %xmm2,%xmm0,%xmm3
- vmovdqa 192(%rsp),%xmm4
- vpmuludq %xmm4,%xmm0,%xmm5
- vmovdqa 176(%rsp),%xmm6
- vpmuludq %xmm6,%xmm0,%xmm7
- vmovdqa 240(%rsp),%xmm8
- vpmuludq %xmm8,%xmm0,%xmm9
- vpmuludq 224(%rsp),%xmm0,%xmm10
- vpmuludq 304(%rsp),%xmm0,%xmm11
- vpmuludq 288(%rsp),%xmm0,%xmm12
- vpmuludq 336(%rsp),%xmm0,%xmm13
- vmovdqa 384(%rsp),%xmm14
- vpmuludq %xmm14,%xmm0,%xmm0
- vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm14,%xmm15
- vpmuludq %xmm15,%xmm14,%xmm14
- vpaddq %xmm14,%xmm13,%xmm13
- vpaddq %xmm6,%xmm6,%xmm14
- vpmuludq %xmm14,%xmm6,%xmm6
- vpaddq %xmm6,%xmm11,%xmm11
- vpaddq %xmm2,%xmm2,%xmm6
- vpmuludq %xmm6,%xmm2,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq %xmm15,%xmm6,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vpmuludq %xmm15,%xmm4,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpmuludq 256(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 304(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 352(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 336(%rsp),%xmm6,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpmuludq %xmm4,%xmm6,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq %xmm14,%xmm6,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq %xmm8,%xmm6,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq %xmm15,%xmm14,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq %xmm15,%xmm8,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq %xmm4,%xmm4,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq %xmm14,%xmm4,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpaddq %xmm4,%xmm4,%xmm2
- vpmuludq %xmm8,%xmm2,%xmm4
- vpaddq %xmm4,%xmm11,%xmm11
- vpmuludq 400(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vpmuludq 400(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vpmuludq 224(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vpmuludq 304(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm13,%xmm13
- vpmuludq 288(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpmuludq 368(%rsp),%xmm8,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpmuludq %xmm8,%xmm14,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq %xmm8,%xmm8,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpaddq %xmm8,%xmm8,%xmm2
- vpmuludq 400(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm5,%xmm5
- vpmuludq 256(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 304(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm10,%xmm10
- vpmuludq 368(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vmovdqa 256(%rsp),%xmm4
- vpmuludq 400(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm7,%xmm7
- vpmuludq 256(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm13,%xmm13
- vpmuludq 304(%rsp),%xmm14,%xmm4
- vpaddq %xmm4,%xmm0,%xmm0
- vpmuludq 352(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm11,%xmm11
- vpmuludq 336(%rsp),%xmm15,%xmm4
- vpaddq %xmm4,%xmm12,%xmm12
- vmovdqa 304(%rsp),%xmm4
- vpaddq %xmm4,%xmm4,%xmm4
- vpmuludq 400(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm9,%xmm9
- vpmuludq 320(%rsp),%xmm2,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vmovdqa 256(%rsp),%xmm4
- vpmuludq 320(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm3,%xmm3
- vmovdqa 256(%rsp),%xmm4
- vpmuludq 368(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm5,%xmm5
- vmovdqa 304(%rsp),%xmm4
- vpmuludq 368(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm7,%xmm7
- vmovdqa 352(%rsp),%xmm4
- vpmuludq 400(%rsp),%xmm4,%xmm4
- vpaddq %xmm4,%xmm10,%xmm10
- vpmuludq 224(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vmovdqa 272(%rsp),%xmm2
- vpmuludq 224(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm1,%xmm1
- vmovdqa 320(%rsp),%xmm2
- vpmuludq 304(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vmovdqa 368(%rsp),%xmm2
- vpmuludq 288(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vmovdqa 400(%rsp),%xmm2
- vpmuludq 336(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpsrlq $26,%xmm1,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1
- vpsrlq $25,%xmm10,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $25,%xmm3,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3
- vpsrlq $26,%xmm11,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11
- vpsrlq $26,%xmm5,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5
- vpsrlq $25,%xmm12,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12
- vpsrlq $25,%xmm7,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7
- vpsrlq $26,%xmm13,%xmm2
- vpaddq %xmm2,%xmm0,%xmm0
- vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13
- vpsrlq $26,%xmm9,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9
- vpsrlq $25,%xmm0,%xmm2
- vpsllq $4,%xmm2,%xmm4
- vpaddq %xmm2,%xmm1,%xmm1
- vpsllq $1,%xmm2,%xmm2
- vpaddq %xmm2,%xmm4,%xmm4
- vpaddq %xmm4,%xmm1,%xmm1
- vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0
- vpsrlq $25,%xmm10,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10
- vpsrlq $26,%xmm1,%xmm2
- vpaddq %xmm2,%xmm3,%xmm3
- vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1
- vpunpckhqdq %xmm3,%xmm1,%xmm2
- vpunpcklqdq %xmm3,%xmm1,%xmm1
- vmovdqa %xmm1,176(%rsp)
- vpaddq curve25519_sandy2x_subc0(%rip),%xmm2,%xmm3
- vpsubq %xmm1,%xmm3,%xmm3
- vpunpckhqdq %xmm3,%xmm2,%xmm1
- vpunpcklqdq %xmm3,%xmm2,%xmm2
- vmovdqa %xmm2,192(%rsp)
- vmovdqa %xmm1,224(%rsp)
- vpsllq $1,%xmm1,%xmm1
- vmovdqa %xmm1,240(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm3,%xmm3
- vmovdqa 80(%rsp),%xmm1
- vpunpcklqdq %xmm1,%xmm3,%xmm2
- vpunpckhqdq %xmm1,%xmm3,%xmm1
- vpunpckhqdq %xmm7,%xmm5,%xmm3
- vpunpcklqdq %xmm7,%xmm5,%xmm4
- vmovdqa %xmm4,256(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm5
- vpsubq %xmm4,%xmm5,%xmm5
- vpunpckhqdq %xmm5,%xmm3,%xmm4
- vpunpcklqdq %xmm5,%xmm3,%xmm3
- vmovdqa %xmm3,272(%rsp)
- vmovdqa %xmm4,288(%rsp)
- vpsllq $1,%xmm4,%xmm4
- vmovdqa %xmm4,304(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm5,%xmm5
- vmovdqa 96(%rsp),%xmm3
- vpunpcklqdq %xmm3,%xmm5,%xmm4
- vpunpckhqdq %xmm3,%xmm5,%xmm3
- vpunpckhqdq %xmm10,%xmm9,%xmm5
- vpunpcklqdq %xmm10,%xmm9,%xmm6
- vmovdqa %xmm6,320(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm7
- vpsubq %xmm6,%xmm7,%xmm7
- vpunpckhqdq %xmm7,%xmm5,%xmm6
- vpunpcklqdq %xmm7,%xmm5,%xmm5
- vmovdqa %xmm5,336(%rsp)
- vmovdqa %xmm6,352(%rsp)
- vpsllq $1,%xmm6,%xmm6
- vmovdqa %xmm6,368(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm7,%xmm7
- vmovdqa 112(%rsp),%xmm5
- vpunpcklqdq %xmm5,%xmm7,%xmm6
- vpunpckhqdq %xmm5,%xmm7,%xmm5
- vpunpckhqdq %xmm12,%xmm11,%xmm7
- vpunpcklqdq %xmm12,%xmm11,%xmm8
- vmovdqa %xmm8,384(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm9
- vpsubq %xmm8,%xmm9,%xmm9
- vpunpckhqdq %xmm9,%xmm7,%xmm8
- vpunpcklqdq %xmm9,%xmm7,%xmm7
- vmovdqa %xmm7,400(%rsp)
- vmovdqa %xmm8,416(%rsp)
- vpsllq $1,%xmm8,%xmm8
- vmovdqa %xmm8,432(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm9,%xmm9
- vmovdqa 160(%rsp),%xmm7
- vpunpcklqdq %xmm7,%xmm9,%xmm8
- vpunpckhqdq %xmm7,%xmm9,%xmm7
- vpunpckhqdq %xmm0,%xmm13,%xmm9
- vpunpcklqdq %xmm0,%xmm13,%xmm0
- vmovdqa %xmm0,160(%rsp)
- vpaddq curve25519_sandy2x_subc2(%rip),%xmm9,%xmm10
- vpsubq %xmm0,%xmm10,%xmm10
- vpunpckhqdq %xmm10,%xmm9,%xmm0
- vpunpcklqdq %xmm10,%xmm9,%xmm9
- vmovdqa %xmm9,448(%rsp)
- vmovdqa %xmm0,464(%rsp)
- vpsllq $1,%xmm0,%xmm0
- vmovdqa %xmm0,480(%rsp)
- vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm10,%xmm10
- vmovdqa 208(%rsp),%xmm0
- vpunpcklqdq %xmm0,%xmm10,%xmm9
- vpunpckhqdq %xmm0,%xmm10,%xmm0
- vpsrlq $26,%xmm2,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2
- vpsrlq $25,%xmm5,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5
- vpsrlq $25,%xmm1,%xmm10
- vpaddq %xmm10,%xmm4,%xmm4
- vpand curve25519_sandy2x_m25(%rip),%xmm1,%xmm1
- vpsrlq $26,%xmm8,%xmm10
- vpaddq %xmm10,%xmm7,%xmm7
- vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8
- vpsrlq $26,%xmm4,%xmm10
- vpaddq %xmm10,%xmm3,%xmm3
- vpand curve25519_sandy2x_m26(%rip),%xmm4,%xmm4
- vpsrlq $25,%xmm7,%xmm10
- vpaddq %xmm10,%xmm9,%xmm9
- vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7
- vpsrlq $25,%xmm3,%xmm10
- vpaddq %xmm10,%xmm6,%xmm6
- vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3
- vpsrlq $26,%xmm9,%xmm10
- vpaddq %xmm10,%xmm0,%xmm0
- vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9
- vpsrlq $26,%xmm6,%xmm10
- vpaddq %xmm10,%xmm5,%xmm5
- vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6
- vpsrlq $25,%xmm0,%xmm10
- vpsllq $4,%xmm10,%xmm11
- vpaddq %xmm10,%xmm2,%xmm2
- vpsllq $1,%xmm10,%xmm10
- vpaddq %xmm10,%xmm11,%xmm11
- vpaddq %xmm11,%xmm2,%xmm2
- vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0
- vpsrlq $25,%xmm5,%xmm10
- vpaddq %xmm10,%xmm8,%xmm8
- vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5
- vpsrlq $26,%xmm2,%xmm10
- vpaddq %xmm10,%xmm1,%xmm1
- vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2
- vpunpckhqdq %xmm1,%xmm2,%xmm10
- vmovdqa %xmm10,80(%rsp)
- vpunpcklqdq %xmm1,%xmm2,%xmm1
- vpunpckhqdq %xmm3,%xmm4,%xmm2
- vmovdqa %xmm2,96(%rsp)
- vpunpcklqdq %xmm3,%xmm4,%xmm2
- vpunpckhqdq %xmm5,%xmm6,%xmm3
- vmovdqa %xmm3,112(%rsp)
- vpunpcklqdq %xmm5,%xmm6,%xmm3
- vpunpckhqdq %xmm7,%xmm8,%xmm4
- vmovdqa %xmm4,128(%rsp)
- vpunpcklqdq %xmm7,%xmm8,%xmm4
- vpunpckhqdq %xmm0,%xmm9,%xmm5
- vmovdqa %xmm5,144(%rsp)
- vpunpcklqdq %xmm0,%xmm9,%xmm0
- vmovdqa 176(%rsp),%xmm5
- vpaddq %xmm5,%xmm1,%xmm1
- vpunpcklqdq %xmm1,%xmm5,%xmm6
- vpunpckhqdq %xmm1,%xmm5,%xmm1
- vpmuludq 224(%rsp),%xmm6,%xmm5
- vpmuludq 192(%rsp),%xmm1,%xmm7
- vpaddq %xmm7,%xmm5,%xmm5
- vpmuludq 272(%rsp),%xmm6,%xmm7
- vpmuludq 240(%rsp),%xmm1,%xmm8
- vpaddq %xmm8,%xmm7,%xmm7
- vpmuludq 288(%rsp),%xmm6,%xmm8
- vpmuludq 272(%rsp),%xmm1,%xmm9
- vpaddq %xmm9,%xmm8,%xmm8
- vpmuludq 336(%rsp),%xmm6,%xmm9
- vpmuludq 304(%rsp),%xmm1,%xmm10
- vpaddq %xmm10,%xmm9,%xmm9
- vpmuludq 352(%rsp),%xmm6,%xmm10
- vpmuludq 336(%rsp),%xmm1,%xmm11
- vpaddq %xmm11,%xmm10,%xmm10
- vpmuludq 400(%rsp),%xmm6,%xmm11
- vpmuludq 368(%rsp),%xmm1,%xmm12
- vpaddq %xmm12,%xmm11,%xmm11
- vpmuludq 416(%rsp),%xmm6,%xmm12
- vpmuludq 400(%rsp),%xmm1,%xmm13
- vpaddq %xmm13,%xmm12,%xmm12
- vpmuludq 448(%rsp),%xmm6,%xmm13
- vpmuludq 432(%rsp),%xmm1,%xmm14
- vpaddq %xmm14,%xmm13,%xmm13
- vpmuludq 464(%rsp),%xmm6,%xmm14
- vpmuludq 448(%rsp),%xmm1,%xmm15
- vpaddq %xmm15,%xmm14,%xmm14
- vpmuludq 192(%rsp),%xmm6,%xmm6
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vpmuludq 480(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm6,%xmm6
- vmovdqa 256(%rsp),%xmm1
- vpaddq %xmm1,%xmm2,%xmm2
- vpunpcklqdq %xmm2,%xmm1,%xmm15
- vpunpckhqdq %xmm2,%xmm1,%xmm1
- vpmuludq 192(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq 224(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 272(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq 288(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 336(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 352(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 400(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 416(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm15,%xmm15
- vpmuludq 448(%rsp),%xmm15,%xmm2
- vpaddq %xmm2,%xmm6,%xmm6
- vpmuludq 464(%rsp),%xmm15,%xmm15
- vpaddq %xmm15,%xmm5,%xmm5
- vpmuludq 192(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 240(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm9,%xmm9
- vpmuludq 272(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 304(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 336(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 368(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 400(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vpmuludq 432(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm6,%xmm6
- vpmuludq 448(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq 480(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm7,%xmm7
- vmovdqa 320(%rsp),%xmm1
- vpaddq %xmm1,%xmm3,%xmm3
- vpunpcklqdq %xmm3,%xmm1,%xmm2
- vpunpckhqdq %xmm3,%xmm1,%xmm1
- vpmuludq 192(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm9,%xmm9
- vpmuludq 224(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm10,%xmm10
- vpmuludq 272(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 288(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm12,%xmm12
- vpmuludq 336(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 352(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2
- vpmuludq 400(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm6,%xmm6
- vpmuludq 416(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm5,%xmm5
- vpmuludq 448(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm7,%xmm7
- vpmuludq 464(%rsp),%xmm2,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 192(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm10,%xmm10
- vpmuludq 240(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm11,%xmm11
- vpmuludq 272(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm12,%xmm12
- vpmuludq 304(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm13,%xmm13
- vpmuludq 336(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1
- vpmuludq 368(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm6,%xmm6
- vpmuludq 400(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm5,%xmm5
- vpmuludq 432(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm7,%xmm7
- vpmuludq 448(%rsp),%xmm1,%xmm2
- vpaddq %xmm2,%xmm8,%xmm8
- vpmuludq 480(%rsp),%xmm1,%xmm1
- vpaddq %xmm1,%xmm9,%xmm9
- vmovdqa 384(%rsp),%xmm1
- vpaddq %xmm1,%xmm4,%xmm4
- vpunpcklqdq %xmm4,%xmm1,%xmm2
- vpunpckhqdq %xmm4,%xmm1,%xmm1
- vpmuludq 192(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm11,%xmm11
- vpmuludq 224(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm12,%xmm12
- vpmuludq 272(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm13,%xmm13
- vpmuludq 288(%rsp),%xmm2,%xmm3
- vpaddq %xmm3,%xmm14,%xmm14
- vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2
curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2 - vpmuludq 336(%rsp),%xmm2,%xmm3 - vpaddq %xmm3,%xmm6,%xmm6 - vpmuludq 352(%rsp),%xmm2,%xmm3 - vpaddq %xmm3,%xmm5,%xmm5 - vpmuludq 400(%rsp),%xmm2,%xmm3 - vpaddq %xmm3,%xmm7,%xmm7 - vpmuludq 416(%rsp),%xmm2,%xmm3 - vpaddq %xmm3,%xmm8,%xmm8 - vpmuludq 448(%rsp),%xmm2,%xmm3 - vpaddq %xmm3,%xmm9,%xmm9 - vpmuludq 464(%rsp),%xmm2,%xmm2 - vpaddq %xmm2,%xmm10,%xmm10 - vpmuludq 192(%rsp),%xmm1,%xmm2 - vpaddq %xmm2,%xmm12,%xmm12 - vpmuludq 240(%rsp),%xmm1,%xmm2 - vpaddq %xmm2,%xmm13,%xmm13 - vpmuludq 272(%rsp),%xmm1,%xmm2 - vpaddq %xmm2,%xmm14,%xmm14 - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 - vpmuludq 304(%rsp),%xmm1,%xmm2 - vpaddq %xmm2,%xmm6,%xmm6 - vpmuludq 336(%rsp),%xmm1,%xmm2 - vpaddq %xmm2,%xmm5,%xmm5 - vpmuludq 368(%rsp),%xmm1,%xmm2 - vpaddq %xmm2,%xmm7,%xmm7 - vpmuludq 400(%rsp),%xmm1,%xmm2 - vpaddq %xmm2,%xmm8,%xmm8 - vpmuludq 432(%rsp),%xmm1,%xmm2 - vpaddq %xmm2,%xmm9,%xmm9 - vpmuludq 448(%rsp),%xmm1,%xmm2 - vpaddq %xmm2,%xmm10,%xmm10 - vpmuludq 480(%rsp),%xmm1,%xmm1 - vpaddq %xmm1,%xmm11,%xmm11 - vmovdqa 160(%rsp),%xmm1 - vpaddq %xmm1,%xmm0,%xmm0 - vpunpcklqdq %xmm0,%xmm1,%xmm2 - vpunpckhqdq %xmm0,%xmm1,%xmm0 - vpmuludq 192(%rsp),%xmm2,%xmm1 - vpaddq %xmm1,%xmm13,%xmm13 - vpmuludq 224(%rsp),%xmm2,%xmm1 - vpaddq %xmm1,%xmm14,%xmm14 - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2 - vpmuludq 272(%rsp),%xmm2,%xmm1 - vpaddq %xmm1,%xmm6,%xmm6 - vpmuludq 288(%rsp),%xmm2,%xmm1 - vpaddq %xmm1,%xmm5,%xmm5 - vpmuludq 336(%rsp),%xmm2,%xmm1 - vpaddq %xmm1,%xmm7,%xmm7 - vpmuludq 352(%rsp),%xmm2,%xmm1 - vpaddq %xmm1,%xmm8,%xmm8 - vpmuludq 400(%rsp),%xmm2,%xmm1 - vpaddq %xmm1,%xmm9,%xmm9 - vpmuludq 416(%rsp),%xmm2,%xmm1 - vpaddq %xmm1,%xmm10,%xmm10 - vpmuludq 448(%rsp),%xmm2,%xmm1 - vpaddq %xmm1,%xmm11,%xmm11 - vpmuludq 464(%rsp),%xmm2,%xmm2 - vpaddq %xmm2,%xmm12,%xmm12 - vpmuludq 192(%rsp),%xmm0,%xmm1 - vpaddq %xmm1,%xmm14,%xmm14 - vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm0,%xmm0 - vpmuludq 240(%rsp),%xmm0,%xmm1 - vpaddq %xmm1,%xmm6,%xmm6 - vpmuludq 272(%rsp),%xmm0,%xmm1 - vpaddq %xmm1,%xmm5,%xmm5 - vpmuludq 304(%rsp),%xmm0,%xmm1 - vpaddq %xmm1,%xmm7,%xmm7 - vpmuludq 336(%rsp),%xmm0,%xmm1 - vpaddq %xmm1,%xmm8,%xmm8 - vpmuludq 368(%rsp),%xmm0,%xmm1 - vpaddq %xmm1,%xmm9,%xmm9 - vpmuludq 400(%rsp),%xmm0,%xmm1 - vpaddq %xmm1,%xmm10,%xmm10 - vpmuludq 432(%rsp),%xmm0,%xmm1 - vpaddq %xmm1,%xmm11,%xmm11 - vpmuludq 448(%rsp),%xmm0,%xmm1 - vpaddq %xmm1,%xmm12,%xmm12 - vpmuludq 480(%rsp),%xmm0,%xmm0 - vpaddq %xmm0,%xmm13,%xmm13 - vpsrlq $26,%xmm6,%xmm0 - vpaddq %xmm0,%xmm5,%xmm5 - vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 - vpsrlq $25,%xmm10,%xmm0 - vpaddq %xmm0,%xmm11,%xmm11 - vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 - vpsrlq $25,%xmm5,%xmm0 - vpaddq %xmm0,%xmm7,%xmm7 - vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 - vpsrlq $26,%xmm11,%xmm0 - vpaddq %xmm0,%xmm12,%xmm12 - vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 - vpsrlq $26,%xmm7,%xmm0 - vpaddq %xmm0,%xmm8,%xmm8 - vpand curve25519_sandy2x_m26(%rip),%xmm7,%xmm7 - vpsrlq $25,%xmm12,%xmm0 - vpaddq %xmm0,%xmm13,%xmm13 - vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12 - vpsrlq $25,%xmm8,%xmm0 - vpaddq %xmm0,%xmm9,%xmm9 - vpand curve25519_sandy2x_m25(%rip),%xmm8,%xmm8 - vpsrlq $26,%xmm13,%xmm0 - vpaddq %xmm0,%xmm14,%xmm14 - vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13 - vpsrlq $26,%xmm9,%xmm0 - vpaddq %xmm0,%xmm10,%xmm10 - vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9 - vpsrlq $25,%xmm14,%xmm0 - vpsllq $4,%xmm0,%xmm1 - vpaddq %xmm0,%xmm6,%xmm6 - vpsllq $1,%xmm0,%xmm0 - 
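A note on the carry chain that closes the ladder above: the ten limbs alternate between 26 and 25 bits (hence the m26/m25 masks), and the carry out of the top limb wraps around to limb 0 multiplied by 19, because 2^255 is congruent to 19 mod 2^255 - 19. The asm performs that multiply-by-19 without an extra vpmuludq, using only shifts and adds (vpsllq $4, vpsllq $1, vpaddq), since 16c + 2c + c = 19c. A scalar C rendering of that one step; the function name and test values below are mine, not from the asm:

#include <assert.h>
#include <stdint.h>

/* Scalar equivalent of the vector sequence
 *   vpsrlq $25 -> c;  h0 += c;  h0 += (c << 4) + (c << 1);
 * The carry out of the top 25-bit limb is a multiple of 2^255, and
 * 2^255 == 19 (mod 2^255 - 19), so it folds back into limb 0 as 19*c. */
static void carry_fold_top(uint64_t h[10])
{
	uint64_t c = h[9] >> 25;

	h[9] &= (1ULL << 25) - 1;        /* curve25519_sandy2x_m25 */
	h[0] += c + (c << 4) + (c << 1); /* h0 += 19*c, no multiply needed */
}

int main(void)
{
	uint64_t h[10] = { 0 };

	h[9] = (7ULL << 25) | 123;       /* a carry of 7 pending in the top limb */
	carry_fold_top(h);
	assert(h[9] == 123 && h[0] == 19 * 7);
	return 0;
}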
diff --git a/curve25519-sandy2x.c b/curve25519-sandy2x.c
deleted file mode 100644
index e8d5d2b..0000000
--- a/curve25519-sandy2x.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0
- *
- * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * Original author: Tung Chou <blueprint@crypto.tw>
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-enum { CURVE25519_POINT_SIZE = 32 };
-
-static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE])
-{
-	secret[0] &= 248;
-	secret[31] &= 127;
-	secret[31] |= 64;
-}
-
-typedef u64 fe[10];
-typedef u64 fe51[5];
-asmlinkage void curve25519_sandy2x_ladder(fe *, const u8 *);
-asmlinkage void curve25519_sandy2x_ladder_base(fe *, const u8 *);
-asmlinkage void curve25519_sandy2x_fe51_pack(u8 *, const fe51 *);
-asmlinkage void curve25519_sandy2x_fe51_mul(fe51 *, const fe51 *, const fe51 *);
-asmlinkage void curve25519_sandy2x_fe51_nsquare(fe51 *, const fe51 *, int);
-
-static inline u32 le24_to_cpupv(const u8 *in)
-{
-	return le16_to_cpup((__le16 *)in) | ((u32)in[2]) << 16;
-}
-
-static inline void fe_frombytes(fe h, const u8 *s)
-{
-	u64 h0 = le32_to_cpup((__le32 *)s);
-	u64 h1 = le24_to_cpupv(s + 4) << 6;
-	u64 h2 = le24_to_cpupv(s + 7) << 5;
-	u64 h3 = le24_to_cpupv(s + 10) << 3;
-	u64 h4 = le24_to_cpupv(s + 13) << 2;
-	u64 h5 = le32_to_cpup((__le32 *)(s + 16));
-	u64 h6 = le24_to_cpupv(s + 20) << 7;
-	u64 h7 = le24_to_cpupv(s + 23) << 5;
-	u64 h8 = le24_to_cpupv(s + 26) << 4;
-	u64 h9 = (le24_to_cpupv(s + 29) & 8388607) << 2;
-	u64 carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9;
-
-	carry9 = h9 >> 25; h0 += carry9 * 19; h9 &= 0x1FFFFFF;
-	carry1 = h1 >> 25; h2 += carry1; h1 &= 0x1FFFFFF;
-	carry3 = h3 >> 25; h4 += carry3; h3 &= 0x1FFFFFF;
-	carry5 = h5 >> 25; h6 += carry5; h5 &= 0x1FFFFFF;
-	carry7 = h7 >> 25; h8 += carry7; h7 &= 0x1FFFFFF;
-
-	carry0 = h0 >> 26; h1 += carry0; h0 &= 0x3FFFFFF;
-	carry2 = h2 >> 26; h3 += carry2; h2 &= 0x3FFFFFF;
-	carry4 = h4 >> 26; h5 += carry4; h4 &= 0x3FFFFFF;
-	carry6 = h6 >> 26; h7 += carry6; h6 &= 0x3FFFFFF;
-	carry8 = h8 >> 26; h9 += carry8; h8 &= 0x3FFFFFF;
-
-	h[0] = h0;
-	h[1] = h1;
-	h[2] = h2;
-	h[3] = h3;
-	h[4] = h4;
-	h[5] = h5;
-	h[6] = h6;
-	h[7] = h7;
-	h[8] = h8;
-	h[9] = h9;
-}
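The load offsets and shifts in fe_frombytes are worth a gloss: limb i of the radix-2^25.5 representation starts at global bit ceil(25.5 * i), so each 24-bit (or 32-bit) little-endian load at byte offset b is shifted left by s such that 8*b == start(i) + s, and any high bits that spill past the limb's width are swept onward by the carry pass that follows. A minimal check of those hard-coded (byte offset, shift) pairs; the arrays and function are mine, for illustration only:

#include <assert.h>

/* Verify that each hard-coded load in fe_frombytes lands its window's
 * first bit (global bit 8*b) on the limb's start bit plus the shift s,
 * i.e. 8*b == ceil(25.5*i) + s. */
int main(void)
{
	static const int byte_off[10] = { 0, 4, 7, 10, 13, 16, 20, 23, 26, 29 };
	static const int shift[10]    = { 0, 6, 5,  3,  2,  0,  7,  5,  4,  2 };
	int i;

	for (i = 0; i < 10; ++i) {
		int start = (51 * i + 1) / 2; /* ceil(25.5 * i) */

		assert(8 * byte_off[i] == start + shift[i]);
	}
	return 0;
}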
-
-static inline void fe51_invert(fe51 *r, const fe51 *x)
-{
-	fe51 z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t;
-
-	/* 2 */ curve25519_sandy2x_fe51_nsquare(&z2, x, 1);
-	/* 4 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2, 1);
-	/* 8 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 1);
-	/* 9 */ curve25519_sandy2x_fe51_mul(&z9, (const fe51 *)&t, x);
-	/* 11 */ curve25519_sandy2x_fe51_mul(&z11, (const fe51 *)&z9, (const fe51 *)&z2);
-	/* 22 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z11, 1);
-	/* 2^5 - 2^0 = 31 */ curve25519_sandy2x_fe51_mul(&z2_5_0, (const fe51 *)&t, (const fe51 *)&z9);
-
-	/* 2^10 - 2^5 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_5_0, 5);
-	/* 2^10 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_10_0, (const fe51 *)&t, (const fe51 *)&z2_5_0);
-
-	/* 2^20 - 2^10 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_10_0, 10);
-	/* 2^20 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_20_0, (const fe51 *)&t, (const fe51 *)&z2_10_0);
-
-	/* 2^40 - 2^20 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_20_0, 20);
-	/* 2^40 - 2^0 */ curve25519_sandy2x_fe51_mul(&t, (const fe51 *)&t, (const fe51 *)&z2_20_0);
-
-	/* 2^50 - 2^10 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 10);
-	/* 2^50 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_50_0, (const fe51 *)&t, (const fe51 *)&z2_10_0);
-
-	/* 2^100 - 2^50 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_50_0, 50);
-	/* 2^100 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_100_0, (const fe51 *)&t, (const fe51 *)&z2_50_0);
-
-	/* 2^200 - 2^100 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_100_0, 100);
-	/* 2^200 - 2^0 */ curve25519_sandy2x_fe51_mul(&t, (const fe51 *)&t, (const fe51 *)&z2_100_0);
-
-	/* 2^250 - 2^50 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 50);
-	/* 2^250 - 2^0 */ curve25519_sandy2x_fe51_mul(&t, (const fe51 *)&t, (const fe51 *)&z2_50_0);
-
-	/* 2^255 - 2^5 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 5);
-	/* 2^255 - 21 */ curve25519_sandy2x_fe51_mul(r, (const fe51 *)&t, (const fe51 *)&z11);
-}
-
-bool curve25519_sandy2x(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE])
-{
-	u8 e[32];
-	fe var[3];
-	fe51 x_51, z_51;
-
-	memcpy(e, secret, 32);
-	normalize_secret(e);
-#define x1 var[0]
-#define x2 var[1]
-#define z2 var[2]
-	fe_frombytes(x1, basepoint);
-	curve25519_sandy2x_ladder(var, e);
-	z_51[0] = (z2[1] << 26) + z2[0];
-	z_51[1] = (z2[3] << 26) + z2[2];
-	z_51[2] = (z2[5] << 26) + z2[4];
-	z_51[3] = (z2[7] << 26) + z2[6];
-	z_51[4] = (z2[9] << 26) + z2[8];
-	x_51[0] = (x2[1] << 26) + x2[0];
-	x_51[1] = (x2[3] << 26) + x2[2];
-	x_51[2] = (x2[5] << 26) + x2[4];
-	x_51[3] = (x2[7] << 26) + x2[6];
-	x_51[4] = (x2[9] << 26) + x2[8];
-#undef x1
-#undef x2
-#undef z2
-	fe51_invert(&z_51, (const fe51 *)&z_51);
-	curve25519_sandy2x_fe51_mul(&x_51, (const fe51 *)&x_51, (const fe51 *)&z_51);
-	curve25519_sandy2x_fe51_pack(mypublic, (const fe51 *)&x_51);
-
-	return true;
-}
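The 10-to-5 limb conversion in curve25519_sandy2x() works because limb 2i (26 bits wide, at bit 51*i) and limb 2i+1 (25 bits wide, at bit 51*i + 26) together form exactly one 51-bit digit, so (z2[2i+1] << 26) + z2[2i] is a pure repacking with no arithmetic. A small demonstration on the low two digits, which fit in an unsigned __int128; the sample limb values are arbitrary but within the 26/25-bit bounds:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* Limbs 0..3 of a reduced radix-2^25.5 element (26, 25, 26, 25 bits). */
	uint64_t z[4] = { 0x3abcdef, 0x1234567, 0x2fedcba, 0x0abcde1 };
	static const int start[4] = { 0, 26, 51, 77 }; /* ceil(25.5 * i) */
	unsigned __int128 from10 = 0, from51;
	uint64_t z51[2];
	int i;

	/* Value as the radix-2^25.5 code computes it. */
	for (i = 0; i < 4; ++i)
		from10 += (unsigned __int128)z[i] << start[i];

	/* Value after the repacking used above. */
	z51[0] = (z[1] << 26) + z[0];
	z51[1] = (z[3] << 26) + z[2];
	from51 = ((unsigned __int128)z51[1] << 51) | z51[0];

	assert(from10 == from51);
	return 0;
}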
diff --git a/curve25519-tweetnacl.c b/curve25519-tweetnacl.c
deleted file mode 100644
index 91e86a3..0000000
--- a/curve25519-tweetnacl.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0
- *
- * Copyright (C) 2015-2018 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
- *
- * Curve25519 ECDH functions, based on TweetNaCl but cleaned up.
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-
-enum { CURVE25519_POINT_SIZE = 32 };
-
-static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE])
-{
-	secret[0] &= 248;
-	secret[31] &= 127;
-	secret[31] |= 64;
-}
-
-typedef s64 fe[16];
-
-static inline void carry(fe o)
-{
-	int i;
-
-	for (i = 0; i < 16; ++i) {
-		o[(i + 1) % 16] += (i == 15 ? 38 : 1) * (o[i] >> 16);
-		o[i] &= 0xffff;
-	}
-}
-
-static inline void cswap(fe p, fe q, int b)
-{
-	int i;
-	s64 t, c = ~(b - 1);
-
-	for (i = 0; i < 16; ++i) {
-		t = c & (p[i] ^ q[i]);
-		p[i] ^= t;
-		q[i] ^= t;
-	}
-}
-
-static inline void pack(u8 *o, const fe n)
-{
-	int i, j, b;
-	fe m, t;
-
-	memcpy(t, n, sizeof(t));
-	carry(t);
-	carry(t);
-	carry(t);
-	for (j = 0; j < 2; ++j) {
-		m[0] = t[0] - 0xffed;
-		for (i = 1; i < 15; ++i) {
-			m[i] = t[i] - 0xffff - ((m[i - 1] >> 16) & 1);
-			m[i - 1] &= 0xffff;
-		}
-		m[15] = t[15] - 0x7fff - ((m[14] >> 16) & 1);
-		b = (m[15] >> 16) & 1;
-		m[14] &= 0xffff;
-		cswap(t, m, 1 - b);
-	}
-	for (i = 0; i < 16; ++i) {
-		o[2 * i] = t[i] & 0xff;
-		o[2 * i + 1] = t[i] >> 8;
-	}
-}
-
-static inline void unpack(fe o, const u8 *n)
-{
-	int i;
-
-	for (i = 0; i < 16; ++i)
-		o[i] = n[2 * i] + ((s64)n[2 * i + 1] << 8);
-	o[15] &= 0x7fff;
-}
-
-static inline void add(fe o, const fe a, const fe b)
-{
-	int i;
-
-	for (i = 0; i < 16; ++i)
-		o[i] = a[i] + b[i];
-}
-
-static inline void subtract(fe o, const fe a, const fe b)
-{
-	int i;
-
-	for (i = 0; i < 16; ++i)
-		o[i] = a[i] - b[i];
-}
-
-static inline void multmod(fe o, const fe a, const fe b)
-{
-	int i, j;
-	s64 t[31] = { 0 };
-
-	for (i = 0; i < 16; ++i) {
-		for (j = 0; j < 16; ++j)
-			t[i + j] += a[i] * b[j];
-	}
-	for (i = 0; i < 15; ++i)
-		t[i] += 38 * t[i + 16];
-	memcpy(o, t, sizeof(fe));
-	carry(o);
-	carry(o);
-}
-
-static inline void invert(fe o, const fe i)
-{
-	fe c;
-	int a;
-
-	memcpy(c, i, sizeof(c));
-	for (a = 253; a >= 0; --a) {
-		multmod(c, c, c);
-		if (a != 2 && a != 4)
-			multmod(c, c, i);
-	}
-	memcpy(o, c, sizeof(fe));
-}
-
-bool curve25519_tweetnacl(u8 shared_secret[CURVE25519_POINT_SIZE], const u8 private_key[CURVE25519_POINT_SIZE], const u8 public_key[CURVE25519_POINT_SIZE])
-{
-	static const fe a24 = { 0xdb41, 1 };
-	u8 z[32];
-	s64 r;
-	int i;
-	fe a = { 1 }, b, c = { 0 }, d = { 1 }, e, f, x;
-
-	memcpy(z, private_key, sizeof(z));
-	normalize_secret(z);
-
-	unpack(x, public_key);
-	memcpy(b, x, sizeof(b));
-
-	for (i = 254; i >= 0; --i) {
-		r = (z[i >> 3] >> (i & 7)) & 1;
-		cswap(a, b, r);
-		cswap(c, d, r);
-		add(e, a, c);
-		subtract(a, a, c);
-		add(c, b, d);
-		subtract(b, b, d);
-		multmod(d, e, e);
-		multmod(f, a, a);
-		multmod(a, c, a);
-		multmod(c, b, e);
-		add(e, a, c);
-		subtract(a, a, c);
-		multmod(b, a, a);
-		subtract(c, d, f);
-		multmod(a, c, a24);
-		add(a, a, d);
-		multmod(c, c, a);
-		multmod(a, d, f);
-		multmod(d, b, x);
-		multmod(b, e, e);
-		cswap(a, b, r);
-		cswap(c, d, r);
-	}
-	invert(c, c);
-	multmod(a, a, c);
-	pack(shared_secret, a);
-
-	return true;
-}
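Two details worth noting in the file above: multmod() folds t[i + 16] back with a factor of 38 because the limbs are 16-bit digits, so index 16 stands for 2^256, and 2^256 = 2 * 2^255, which is congruent to 2 * 19 = 38 mod 2^255 - 19; and cswap() swaps with a mask rather than a branch, so the Montgomery ladder runs in constant time regardless of the secret bits. As a usage sketch, here is the ECDH round trip one could run against curve25519_tweetnacl(), assuming a userspace build where the kernel types are typedef'd and the function above is linked in; the test scalars are arbitrary (the function clamps its own copy internally):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

typedef uint8_t u8; /* stand-in for the kernel type used above */

/* Declared here so the sketch is self-contained; link against the
 * function from the file above. */
bool curve25519_tweetnacl(u8 shared_secret[32], const u8 private_key[32],
			  const u8 public_key[32]);

int main(void)
{
	static const u8 basepoint[32] = { 9 }; /* Curve25519 base point u = 9 */
	u8 a[32] = { 1 }, b[32] = { 2 };       /* arbitrary test scalars */
	u8 pub_a[32], pub_b[32], ab[32], ba[32];

	/* Public keys: scalar * basepoint. */
	curve25519_tweetnacl(pub_a, a, basepoint);
	curve25519_tweetnacl(pub_b, b, basepoint);

	/* Both sides must derive the same secret: a*(b*P) == b*(a*P). */
	curve25519_tweetnacl(ab, a, pub_b);
	curve25519_tweetnacl(ba, b, pub_a);
	assert(!memcmp(ab, ba, 32));
	return 0;
}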
diff --git a/main.c b/main.c
--- a/main.c
+++ b/main.c
@@ -62,17 +62,8 @@ static __always_inline int name(void) \
 } while (0)
 
-declare_it(donna64)
-declare_it(hacl64)
-declare_it(fiat64)
-declare_it(sandy2x)
-declare_it(amd64)
-declare_it(precomp_bmi2)
 declare_it(precomp_adx)
 declare_it(ever64)
-declare_it(fiat32)
-declare_it(donna32)
-declare_it(tweetnacl)
 
 static int compare_cycles(const void *a, const void *b)
 {
 	return *((cycles_t *)a) - *((cycles_t *)b);
@@ -86,22 +77,10 @@ static bool verify(void)
 	u8 out[CURVE25519_POINT_SIZE];
 
 	for (i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) {
-		test_it(donna64, {}, {});
-		test_it(hacl64, {}, {});
-		test_it(fiat64, {}, {});
-		if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
-			test_it(sandy2x, kernel_fpu_begin(), kernel_fpu_end());
-		if (boot_cpu_has(X86_FEATURE_BMI2))
-			test_it(precomp_bmi2, {}, {});
 		if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX)) {
 			test_it(precomp_adx, {}, {});
 			test_it(ever64, {}, {});
 		}
-		if (dangerous)
-			test_it(amd64, {}, {});
-		test_it(fiat32, {}, {});
-		test_it(donna32, {}, {});
-		test_it(tweetnacl, {}, {});
 	}
 	return true;
 }
@@ -111,17 +90,8 @@ static int __init mod_init(void)
 	enum { WARMUP = 6000, TRIALS = 5000, IDLE = 1 * 1000 };
 	int ret = 0, i;
 	cycles_t *trial_times;
-	cycles_t median_donna64 = 0;
-	cycles_t median_hacl64 = 0;
-	cycles_t median_fiat64 = 0;
-	cycles_t median_sandy2x = 0;
-	cycles_t median_amd64 = 0;
-	cycles_t median_precomp_bmi2 = 0;
 	cycles_t median_precomp_adx = 0;
 	cycles_t median_ever64 = 0;
-	cycles_t median_fiat32 = 0;
-	cycles_t median_donna32 = 0;
-	cycles_t median_tweetnacl = 0;
 	unsigned long flags;
 	DEFINE_SPINLOCK(lock);
 
@@ -135,46 +105,16 @@ static int __init mod_init(void)
 	msleep(IDLE);
 
 	spin_lock_irqsave(&lock, flags);
-
-	do_it(donna64);
-	do_it(hacl64);
-	do_it(fiat64);
-	if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) {
-		kernel_fpu_begin();
-		do_it(sandy2x);
-		kernel_fpu_end();
-	}
-	if (boot_cpu_has(X86_FEATURE_BMI2))
-		do_it(precomp_bmi2);
 	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX)) {
 		do_it(precomp_adx);
 		do_it(ever64);
 	}
-	if (dangerous)
-		do_it(amd64);
-	do_it(fiat32);
-	do_it(donna32);
-	do_it(tweetnacl);
-
 	spin_unlock_irqrestore(&lock, flags);
 
-	report_it(donna64);
-	report_it(hacl64);
-	report_it(fiat64);
-	if (boot_cpu_has(X86_FEATURE_AVX) && cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL))
-		report_it(sandy2x);
-	if (boot_cpu_has(X86_FEATURE_BMI2))
-		report_it(precomp_bmi2);
 	if (boot_cpu_has(X86_FEATURE_BMI2) && boot_cpu_has(X86_FEATURE_ADX)) {
 		report_it(precomp_adx);
 		report_it(ever64);
 	}
-	if (dangerous)
-		report_it(amd64);
-	report_it(fiat32);
-	report_it(donna32);
-	report_it(tweetnacl);
-
 	/* Don't let compiler be too clever. */
 	dummy = ret;
 	kfree(trial_times);
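What remains after this commit is a harness that, per the main.c hunks above, runs each surviving implementation TRIALS times with interrupts disabled under a spinlock and reports a median cycle count sorted via compare_cycles(). A condensed sketch of that median-of-cycles pattern follows; the helper below is illustrative and is not the actual do_it/report_it macros, which are defined earlier in main.c and not shown in this diff:

#include <linux/kernel.h>
#include <linux/sort.h>
#include <linux/timex.h>

/* Illustrative median-of-cycles measurement: time a function many
 * times, sort the samples, and take the middle element so outliers
 * (stray interrupts, SMIs) do not skew the result. */
static int cmp_cycles_sketch(const void *a, const void *b)
{
	const cycles_t x = *(const cycles_t *)a, y = *(const cycles_t *)b;

	return x < y ? -1 : x > y;
}

static cycles_t median_cycles(int (*fn)(void), cycles_t *trial_times,
			      size_t trials)
{
	cycles_t start;
	size_t i;

	for (i = 0; i < trials; ++i) {
		start = get_cycles();
		fn();
		trial_times[i] = get_cycles() - start;
	}
	sort(trial_times, trials, sizeof(cycles_t), cmp_cycles_sketch, NULL);
	return trial_times[trials / 2];
}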