From c61c26b80479082f4d8b708ae808774ec2ba3a6c Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 18 Jan 2018 13:06:26 +0100 Subject: Import other curves for comparison --- Makefile | 2 +- curve25519-amd64-asm.S | 1888 +++++++++++++++++++++++++++ curve25519-amd64.c | 234 ++++ curve25519-donna32.c | 861 ++++++++++++ curve25519-donna64.c | 414 ++++++ curve25519-fiat32.c | 838 ++++++++++++ curve25519-hacl64.c | 751 +++++++++++ curve25519-sandy2x-asm.S | 3261 ++++++++++++++++++++++++++++++++++++++++++++++ curve25519-sandy2x.c | 139 ++ curve25519-u128.c | 420 ------ function.h | 68 - main.c | 99 +- test_vectors.h | 48 + 13 files changed, 8524 insertions(+), 499 deletions(-) create mode 100644 curve25519-amd64-asm.S create mode 100644 curve25519-amd64.c create mode 100644 curve25519-donna32.c create mode 100644 curve25519-donna64.c create mode 100644 curve25519-fiat32.c create mode 100644 curve25519-hacl64.c create mode 100644 curve25519-sandy2x-asm.S create mode 100644 curve25519-sandy2x.c delete mode 100644 curve25519-u128.c delete mode 100644 function.h create mode 100644 test_vectors.h diff --git a/Makefile b/Makefile index 1c9e6ec..16e5c52 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ ifneq ($(KERNELRELEASE),) -kbench9000-y := main.o curve25519-u128.o +kbench9000-y := main.o curve25519-donna64.o curve25519-hacl64.o curve25519-sandy2x.o curve25519-sandy2x-asm.o curve25519-amd64.o curve25519-amd64-asm.o curve25519-donna32.o curve25519-fiat32.o obj-m := kbench9000.o ccflags-y += -O3 ccflags-y += -D'pr_fmt(fmt)=KBUILD_MODNAME ": " fmt' diff --git a/curve25519-amd64-asm.S b/curve25519-amd64-asm.S new file mode 100644 index 0000000..27a5b6a --- /dev/null +++ b/curve25519-amd64-asm.S @@ -0,0 +1,1888 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2015 Google Inc. All Rights Reserved. + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + * + * Original author: Peter Schwabe + */ + +/************************************************ + * W A R N I N G + * W A R N I N G + * W A R N I N G + * W A R N I N G + * W A R N I N G + * + * Do not import this file into the kernel as-is, + * because it makes use of the x86_64 redzone, + * which will entirely melt the kernel. We're sort + * of getting away with it here, since interrupts + * are disabled, but DANGER this will kill kittens. 
+ * + * W A R N I N G + * W A R N I N G + * W A R N I N G + * W A R N I N G + * W A R N I N G + ************************************************/ + +.data +.p2align 4 + +x25519_x86_64_REDMASK51: .quad 0x0007FFFFFFFFFFFF +x25519_x86_64_121666_213: .quad 996687872 +x25519_x86_64_2P0: .quad 0xFFFFFFFFFFFDA +x25519_x86_64_2P1234: .quad 0xFFFFFFFFFFFFE +x25519_x86_64_4P0: .quad 0x1FFFFFFFFFFFB4 +x25519_x86_64_4P1234: .quad 0x1FFFFFFFFFFFFC +x25519_x86_64_MU0: .quad 0xED9CE5A30A2C131B +x25519_x86_64_MU1: .quad 0x2106215D086329A7 +x25519_x86_64_MU2: .quad 0xFFFFFFFFFFFFFFEB +x25519_x86_64_MU3: .quad 0xFFFFFFFFFFFFFFFF +x25519_x86_64_MU4: .quad 0x000000000000000F +x25519_x86_64_ORDER0: .quad 0x5812631A5CF5D3ED +x25519_x86_64_ORDER1: .quad 0x14DEF9DEA2F79CD6 +x25519_x86_64_ORDER2: .quad 0x0000000000000000 +x25519_x86_64_ORDER3: .quad 0x1000000000000000 +x25519_x86_64_EC2D0: .quad 1859910466990425 +x25519_x86_64_EC2D1: .quad 932731440258426 +x25519_x86_64_EC2D2: .quad 1072319116312658 +x25519_x86_64_EC2D3: .quad 1815898335770999 +x25519_x86_64_EC2D4: .quad 633789495995903 +x25519_x86_64__38: .quad 38 + +.text +.p2align 5 + +.globl x25519_x86_64_freeze +.hidden x25519_x86_64_freeze +x25519_x86_64_freeze: +.cfi_startproc +/* This is a leaf function and uses the redzone for saving registers. */ +movq %r12,-8(%rsp) +.cfi_rel_offset r12, -8 +movq 0(%rdi),%rsi +movq 8(%rdi),%rdx +movq 16(%rdi),%rcx +movq 24(%rdi),%r8 +movq 32(%rdi),%r9 +movq x25519_x86_64_REDMASK51(%rip),%rax +mov %rax,%r10 +sub $18,%r10 +mov $3,%r11 +._reduceloop: +mov %rsi,%r12 +shr $51,%r12 +and %rax,%rsi +add %r12,%rdx +mov %rdx,%r12 +shr $51,%r12 +and %rax,%rdx +add %r12,%rcx +mov %rcx,%r12 +shr $51,%r12 +and %rax,%rcx +add %r12,%r8 +mov %r8,%r12 +shr $51,%r12 +and %rax,%r8 +add %r12,%r9 +mov %r9,%r12 +shr $51,%r12 +and %rax,%r9 +imulq $19,%r12,%r12 +add %r12,%rsi +sub $1,%r11 +ja ._reduceloop +mov $1,%r12 +cmp %r10,%rsi +cmovl %r11,%r12 +cmp %rax,%rdx +cmovne %r11,%r12 +cmp %rax,%rcx +cmovne %r11,%r12 +cmp %rax,%r8 +cmovne %r11,%r12 +cmp %rax,%r9 +cmovne %r11,%r12 +neg %r12 +and %r12,%rax +and %r12,%r10 +sub %r10,%rsi +sub %rax,%rdx +sub %rax,%rcx +sub %rax,%r8 +sub %rax,%r9 +movq %rsi,0(%rdi) +movq %rdx,8(%rdi) +movq %rcx,16(%rdi) +movq %r8,24(%rdi) +movq %r9,32(%rdi) +movq -8(%rsp),%r12 +ret +.cfi_endproc + +.p2align 5 +.globl x25519_x86_64_mul +.hidden x25519_x86_64_mul +x25519_x86_64_mul: +.cfi_startproc +/* This is a leaf function and uses the redzone for saving registers. 
*/ +movq %r12,-8(%rsp) +.cfi_rel_offset r12, -8 +movq %r13,-16(%rsp) +.cfi_rel_offset r13, -16 +movq %r14,-24(%rsp) +.cfi_rel_offset r14, -24 +movq %r15,-32(%rsp) +.cfi_rel_offset r15, -32 +movq %rbx,-40(%rsp) +.cfi_rel_offset rbx, -40 +movq %rbp,-48(%rsp) +.cfi_rel_offset rbp, -48 +mov %rdx,%rcx +movq 24(%rsi),%rdx +imulq $19,%rdx,%rax +movq %rax,-64(%rsp) +mulq 16(%rcx) +mov %rax,%r8 +mov %rdx,%r9 +movq 32(%rsi),%rdx +imulq $19,%rdx,%rax +movq %rax,-72(%rsp) +mulq 8(%rcx) +add %rax,%r8 +adc %rdx,%r9 +movq 0(%rsi),%rax +mulq 0(%rcx) +add %rax,%r8 +adc %rdx,%r9 +movq 0(%rsi),%rax +mulq 8(%rcx) +mov %rax,%r10 +mov %rdx,%r11 +movq 0(%rsi),%rax +mulq 16(%rcx) +mov %rax,%r12 +mov %rdx,%r13 +movq 0(%rsi),%rax +mulq 24(%rcx) +mov %rax,%r14 +mov %rdx,%r15 +movq 0(%rsi),%rax +mulq 32(%rcx) +mov %rax,%rbx +mov %rdx,%rbp +movq 8(%rsi),%rax +mulq 0(%rcx) +add %rax,%r10 +adc %rdx,%r11 +movq 8(%rsi),%rax +mulq 8(%rcx) +add %rax,%r12 +adc %rdx,%r13 +movq 8(%rsi),%rax +mulq 16(%rcx) +add %rax,%r14 +adc %rdx,%r15 +movq 8(%rsi),%rax +mulq 24(%rcx) +add %rax,%rbx +adc %rdx,%rbp +movq 8(%rsi),%rdx +imulq $19,%rdx,%rax +mulq 32(%rcx) +add %rax,%r8 +adc %rdx,%r9 +movq 16(%rsi),%rax +mulq 0(%rcx) +add %rax,%r12 +adc %rdx,%r13 +movq 16(%rsi),%rax +mulq 8(%rcx) +add %rax,%r14 +adc %rdx,%r15 +movq 16(%rsi),%rax +mulq 16(%rcx) +add %rax,%rbx +adc %rdx,%rbp +movq 16(%rsi),%rdx +imulq $19,%rdx,%rax +mulq 24(%rcx) +add %rax,%r8 +adc %rdx,%r9 +movq 16(%rsi),%rdx +imulq $19,%rdx,%rax +mulq 32(%rcx) +add %rax,%r10 +adc %rdx,%r11 +movq 24(%rsi),%rax +mulq 0(%rcx) +add %rax,%r14 +adc %rdx,%r15 +movq 24(%rsi),%rax +mulq 8(%rcx) +add %rax,%rbx +adc %rdx,%rbp +movq -64(%rsp),%rax +mulq 24(%rcx) +add %rax,%r10 +adc %rdx,%r11 +movq -64(%rsp),%rax +mulq 32(%rcx) +add %rax,%r12 +adc %rdx,%r13 +movq 32(%rsi),%rax +mulq 0(%rcx) +add %rax,%rbx +adc %rdx,%rbp +movq -72(%rsp),%rax +mulq 16(%rcx) +add %rax,%r10 +adc %rdx,%r11 +movq -72(%rsp),%rax +mulq 24(%rcx) +add %rax,%r12 +adc %rdx,%r13 +movq -72(%rsp),%rax +mulq 32(%rcx) +add %rax,%r14 +adc %rdx,%r15 +movq x25519_x86_64_REDMASK51(%rip),%rsi +shld $13,%r8,%r9 +and %rsi,%r8 +shld $13,%r10,%r11 +and %rsi,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rsi,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rsi,%r14 +add %r13,%r14 +shld $13,%rbx,%rbp +and %rsi,%rbx +add %r15,%rbx +imulq $19,%rbp,%rdx +add %rdx,%r8 +mov %r8,%rdx +shr $51,%rdx +add %r10,%rdx +mov %rdx,%rcx +shr $51,%rdx +and %rsi,%r8 +add %r12,%rdx +mov %rdx,%r9 +shr $51,%rdx +and %rsi,%rcx +add %r14,%rdx +mov %rdx,%rax +shr $51,%rdx +and %rsi,%r9 +add %rbx,%rdx +mov %rdx,%r10 +shr $51,%rdx +and %rsi,%rax +imulq $19,%rdx,%rdx +add %rdx,%r8 +and %rsi,%r10 +movq %r8,0(%rdi) +movq %rcx,8(%rdi) +movq %r9,16(%rdi) +movq %rax,24(%rdi) +movq %r10,32(%rdi) +movq -8(%rsp),%r12 +movq -16(%rsp),%r13 +movq -24(%rsp),%r14 +movq -32(%rsp),%r15 +movq -40(%rsp),%rbx +movq -48(%rsp),%rbp +ret +.cfi_endproc + +.p2align 5 +.globl x25519_x86_64_square +.hidden x25519_x86_64_square +x25519_x86_64_square: +.cfi_startproc +/* This is a leaf function and uses the redzone for saving registers. 
*/ +movq %r12,-8(%rsp) +.cfi_rel_offset r12, -8 +movq %r13,-16(%rsp) +.cfi_rel_offset r13, -16 +movq %r14,-24(%rsp) +.cfi_rel_offset r14, -24 +movq %r15,-32(%rsp) +.cfi_rel_offset r15, -32 +movq %rbx,-40(%rsp) +.cfi_rel_offset rbx, -40 +movq 0(%rsi),%rax +mulq 0(%rsi) +mov %rax,%rcx +mov %rdx,%r8 +movq 0(%rsi),%rax +shl $1,%rax +mulq 8(%rsi) +mov %rax,%r9 +mov %rdx,%r10 +movq 0(%rsi),%rax +shl $1,%rax +mulq 16(%rsi) +mov %rax,%r11 +mov %rdx,%r12 +movq 0(%rsi),%rax +shl $1,%rax +mulq 24(%rsi) +mov %rax,%r13 +mov %rdx,%r14 +movq 0(%rsi),%rax +shl $1,%rax +mulq 32(%rsi) +mov %rax,%r15 +mov %rdx,%rbx +movq 8(%rsi),%rax +mulq 8(%rsi) +add %rax,%r11 +adc %rdx,%r12 +movq 8(%rsi),%rax +shl $1,%rax +mulq 16(%rsi) +add %rax,%r13 +adc %rdx,%r14 +movq 8(%rsi),%rax +shl $1,%rax +mulq 24(%rsi) +add %rax,%r15 +adc %rdx,%rbx +movq 8(%rsi),%rdx +imulq $38,%rdx,%rax +mulq 32(%rsi) +add %rax,%rcx +adc %rdx,%r8 +movq 16(%rsi),%rax +mulq 16(%rsi) +add %rax,%r15 +adc %rdx,%rbx +movq 16(%rsi),%rdx +imulq $38,%rdx,%rax +mulq 24(%rsi) +add %rax,%rcx +adc %rdx,%r8 +movq 16(%rsi),%rdx +imulq $38,%rdx,%rax +mulq 32(%rsi) +add %rax,%r9 +adc %rdx,%r10 +movq 24(%rsi),%rdx +imulq $19,%rdx,%rax +mulq 24(%rsi) +add %rax,%r9 +adc %rdx,%r10 +movq 24(%rsi),%rdx +imulq $38,%rdx,%rax +mulq 32(%rsi) +add %rax,%r11 +adc %rdx,%r12 +movq 32(%rsi),%rdx +imulq $19,%rdx,%rax +mulq 32(%rsi) +add %rax,%r13 +adc %rdx,%r14 +movq x25519_x86_64_REDMASK51(%rip),%rsi +shld $13,%rcx,%r8 +and %rsi,%rcx +shld $13,%r9,%r10 +and %rsi,%r9 +add %r8,%r9 +shld $13,%r11,%r12 +and %rsi,%r11 +add %r10,%r11 +shld $13,%r13,%r14 +and %rsi,%r13 +add %r12,%r13 +shld $13,%r15,%rbx +and %rsi,%r15 +add %r14,%r15 +imulq $19,%rbx,%rdx +add %rdx,%rcx +mov %rcx,%rdx +shr $51,%rdx +add %r9,%rdx +and %rsi,%rcx +mov %rdx,%r8 +shr $51,%rdx +add %r11,%rdx +and %rsi,%r8 +mov %rdx,%r9 +shr $51,%rdx +add %r13,%rdx +and %rsi,%r9 +mov %rdx,%rax +shr $51,%rdx +add %r15,%rdx +and %rsi,%rax +mov %rdx,%r10 +shr $51,%rdx +imulq $19,%rdx,%rdx +add %rdx,%rcx +and %rsi,%r10 +movq %rcx,0(%rdi) +movq %r8,8(%rdi) +movq %r9,16(%rdi) +movq %rax,24(%rdi) +movq %r10,32(%rdi) +movq -8(%rsp),%r12 +movq -16(%rsp),%r13 +movq -24(%rsp),%r14 +movq -32(%rsp),%r15 +movq -40(%rsp),%rbx +ret +.cfi_endproc + +.p2align 5 +.globl x25519_x86_64_ladderstep +.hidden x25519_x86_64_ladderstep +x25519_x86_64_ladderstep: +.cfi_startproc +sub $344,%rsp +.cfi_adjust_cfa_offset 344 +movq %r12,296(%rsp) +.cfi_rel_offset r12, 296 +movq %r13,304(%rsp) +.cfi_rel_offset r13, 304 +movq %r14,312(%rsp) +.cfi_rel_offset r14, 312 +movq %r15,320(%rsp) +.cfi_rel_offset r15, 320 +movq %rbx,328(%rsp) +.cfi_rel_offset rbx, 328 +movq %rbp,336(%rsp) +.cfi_rel_offset rbp, 336 +movq 40(%rdi),%rsi +movq 48(%rdi),%rdx +movq 56(%rdi),%rcx +movq 64(%rdi),%r8 +movq 72(%rdi),%r9 +mov %rsi,%rax +mov %rdx,%r10 +mov %rcx,%r11 +mov %r8,%r12 +mov %r9,%r13 +add x25519_x86_64_2P0(%rip),%rax +add x25519_x86_64_2P1234(%rip),%r10 +add x25519_x86_64_2P1234(%rip),%r11 +add x25519_x86_64_2P1234(%rip),%r12 +add x25519_x86_64_2P1234(%rip),%r13 +addq 80(%rdi),%rsi +addq 88(%rdi),%rdx +addq 96(%rdi),%rcx +addq 104(%rdi),%r8 +addq 112(%rdi),%r9 +subq 80(%rdi),%rax +subq 88(%rdi),%r10 +subq 96(%rdi),%r11 +subq 104(%rdi),%r12 +subq 112(%rdi),%r13 +movq %rsi,0(%rsp) +movq %rdx,8(%rsp) +movq %rcx,16(%rsp) +movq %r8,24(%rsp) +movq %r9,32(%rsp) +movq %rax,40(%rsp) +movq %r10,48(%rsp) +movq %r11,56(%rsp) +movq %r12,64(%rsp) +movq %r13,72(%rsp) +movq 40(%rsp),%rax +mulq 40(%rsp) +mov %rax,%rsi +mov %rdx,%rcx +movq 40(%rsp),%rax +shl $1,%rax +mulq 48(%rsp) +mov 
%rax,%r8 +mov %rdx,%r9 +movq 40(%rsp),%rax +shl $1,%rax +mulq 56(%rsp) +mov %rax,%r10 +mov %rdx,%r11 +movq 40(%rsp),%rax +shl $1,%rax +mulq 64(%rsp) +mov %rax,%r12 +mov %rdx,%r13 +movq 40(%rsp),%rax +shl $1,%rax +mulq 72(%rsp) +mov %rax,%r14 +mov %rdx,%r15 +movq 48(%rsp),%rax +mulq 48(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 48(%rsp),%rax +shl $1,%rax +mulq 56(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 48(%rsp),%rax +shl $1,%rax +mulq 64(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 48(%rsp),%rdx +imulq $38,%rdx,%rax +mulq 72(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 56(%rsp),%rax +mulq 56(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 56(%rsp),%rdx +imulq $38,%rdx,%rax +mulq 64(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 56(%rsp),%rdx +imulq $38,%rdx,%rax +mulq 72(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 64(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 64(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 64(%rsp),%rdx +imulq $38,%rdx,%rax +mulq 72(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 72(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 72(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +and %rdx,%rsi +mov %rcx,%r8 +shr $51,%rcx +add %r10,%rcx +and %rdx,%r8 +mov %rcx,%r9 +shr $51,%rcx +add %r12,%rcx +and %rdx,%r9 +mov %rcx,%rax +shr $51,%rcx +add %r14,%rcx +and %rdx,%rax +mov %rcx,%r10 +shr $51,%rcx +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +movq %rsi,80(%rsp) +movq %r8,88(%rsp) +movq %r9,96(%rsp) +movq %rax,104(%rsp) +movq %r10,112(%rsp) +movq 0(%rsp),%rax +mulq 0(%rsp) +mov %rax,%rsi +mov %rdx,%rcx +movq 0(%rsp),%rax +shl $1,%rax +mulq 8(%rsp) +mov %rax,%r8 +mov %rdx,%r9 +movq 0(%rsp),%rax +shl $1,%rax +mulq 16(%rsp) +mov %rax,%r10 +mov %rdx,%r11 +movq 0(%rsp),%rax +shl $1,%rax +mulq 24(%rsp) +mov %rax,%r12 +mov %rdx,%r13 +movq 0(%rsp),%rax +shl $1,%rax +mulq 32(%rsp) +mov %rax,%r14 +mov %rdx,%r15 +movq 8(%rsp),%rax +mulq 8(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 8(%rsp),%rax +shl $1,%rax +mulq 16(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 8(%rsp),%rax +shl $1,%rax +mulq 24(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 8(%rsp),%rdx +imulq $38,%rdx,%rax +mulq 32(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 16(%rsp),%rax +mulq 16(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 16(%rsp),%rdx +imulq $38,%rdx,%rax +mulq 24(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 16(%rsp),%rdx +imulq $38,%rdx,%rax +mulq 32(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 24(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 24(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 24(%rsp),%rdx +imulq $38,%rdx,%rax +mulq 32(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 32(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 32(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +and %rdx,%rsi +mov %rcx,%r8 +shr $51,%rcx +add %r10,%rcx +and %rdx,%r8 +mov %rcx,%r9 +shr $51,%rcx +add %r12,%rcx +and %rdx,%r9 +mov %rcx,%rax +shr $51,%rcx +add %r14,%rcx +and %rdx,%rax +mov %rcx,%r10 +shr $51,%rcx +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 
+movq %rsi,120(%rsp) +movq %r8,128(%rsp) +movq %r9,136(%rsp) +movq %rax,144(%rsp) +movq %r10,152(%rsp) +mov %rsi,%rsi +mov %r8,%rdx +mov %r9,%rcx +mov %rax,%r8 +mov %r10,%r9 +add x25519_x86_64_2P0(%rip),%rsi +add x25519_x86_64_2P1234(%rip),%rdx +add x25519_x86_64_2P1234(%rip),%rcx +add x25519_x86_64_2P1234(%rip),%r8 +add x25519_x86_64_2P1234(%rip),%r9 +subq 80(%rsp),%rsi +subq 88(%rsp),%rdx +subq 96(%rsp),%rcx +subq 104(%rsp),%r8 +subq 112(%rsp),%r9 +movq %rsi,160(%rsp) +movq %rdx,168(%rsp) +movq %rcx,176(%rsp) +movq %r8,184(%rsp) +movq %r9,192(%rsp) +movq 120(%rdi),%rsi +movq 128(%rdi),%rdx +movq 136(%rdi),%rcx +movq 144(%rdi),%r8 +movq 152(%rdi),%r9 +mov %rsi,%rax +mov %rdx,%r10 +mov %rcx,%r11 +mov %r8,%r12 +mov %r9,%r13 +add x25519_x86_64_2P0(%rip),%rax +add x25519_x86_64_2P1234(%rip),%r10 +add x25519_x86_64_2P1234(%rip),%r11 +add x25519_x86_64_2P1234(%rip),%r12 +add x25519_x86_64_2P1234(%rip),%r13 +addq 160(%rdi),%rsi +addq 168(%rdi),%rdx +addq 176(%rdi),%rcx +addq 184(%rdi),%r8 +addq 192(%rdi),%r9 +subq 160(%rdi),%rax +subq 168(%rdi),%r10 +subq 176(%rdi),%r11 +subq 184(%rdi),%r12 +subq 192(%rdi),%r13 +movq %rsi,200(%rsp) +movq %rdx,208(%rsp) +movq %rcx,216(%rsp) +movq %r8,224(%rsp) +movq %r9,232(%rsp) +movq %rax,240(%rsp) +movq %r10,248(%rsp) +movq %r11,256(%rsp) +movq %r12,264(%rsp) +movq %r13,272(%rsp) +movq 224(%rsp),%rsi +imulq $19,%rsi,%rax +movq %rax,280(%rsp) +mulq 56(%rsp) +mov %rax,%rsi +mov %rdx,%rcx +movq 232(%rsp),%rdx +imulq $19,%rdx,%rax +movq %rax,288(%rsp) +mulq 48(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 200(%rsp),%rax +mulq 40(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 200(%rsp),%rax +mulq 48(%rsp) +mov %rax,%r8 +mov %rdx,%r9 +movq 200(%rsp),%rax +mulq 56(%rsp) +mov %rax,%r10 +mov %rdx,%r11 +movq 200(%rsp),%rax +mulq 64(%rsp) +mov %rax,%r12 +mov %rdx,%r13 +movq 200(%rsp),%rax +mulq 72(%rsp) +mov %rax,%r14 +mov %rdx,%r15 +movq 208(%rsp),%rax +mulq 40(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 208(%rsp),%rax +mulq 48(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 208(%rsp),%rax +mulq 56(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 208(%rsp),%rax +mulq 64(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 208(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 72(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 216(%rsp),%rax +mulq 40(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 216(%rsp),%rax +mulq 48(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 216(%rsp),%rax +mulq 56(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 216(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 64(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 216(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 72(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 224(%rsp),%rax +mulq 40(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 224(%rsp),%rax +mulq 48(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 280(%rsp),%rax +mulq 64(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 280(%rsp),%rax +mulq 72(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 232(%rsp),%rax +mulq 40(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 288(%rsp),%rax +mulq 56(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 288(%rsp),%rax +mulq 64(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 288(%rsp),%rax +mulq 72(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +mov %rcx,%r8 +shr $51,%rcx +and %rdx,%rsi +add %r10,%rcx +mov %rcx,%r9 +shr $51,%rcx 
+and %rdx,%r8 +add %r12,%rcx +mov %rcx,%rax +shr $51,%rcx +and %rdx,%r9 +add %r14,%rcx +mov %rcx,%r10 +shr $51,%rcx +and %rdx,%rax +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +movq %rsi,40(%rsp) +movq %r8,48(%rsp) +movq %r9,56(%rsp) +movq %rax,64(%rsp) +movq %r10,72(%rsp) +movq 264(%rsp),%rsi +imulq $19,%rsi,%rax +movq %rax,200(%rsp) +mulq 16(%rsp) +mov %rax,%rsi +mov %rdx,%rcx +movq 272(%rsp),%rdx +imulq $19,%rdx,%rax +movq %rax,208(%rsp) +mulq 8(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 240(%rsp),%rax +mulq 0(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 240(%rsp),%rax +mulq 8(%rsp) +mov %rax,%r8 +mov %rdx,%r9 +movq 240(%rsp),%rax +mulq 16(%rsp) +mov %rax,%r10 +mov %rdx,%r11 +movq 240(%rsp),%rax +mulq 24(%rsp) +mov %rax,%r12 +mov %rdx,%r13 +movq 240(%rsp),%rax +mulq 32(%rsp) +mov %rax,%r14 +mov %rdx,%r15 +movq 248(%rsp),%rax +mulq 0(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 248(%rsp),%rax +mulq 8(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 248(%rsp),%rax +mulq 16(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 248(%rsp),%rax +mulq 24(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 248(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 32(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 256(%rsp),%rax +mulq 0(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 256(%rsp),%rax +mulq 8(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 256(%rsp),%rax +mulq 16(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 256(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 24(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 256(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 32(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 264(%rsp),%rax +mulq 0(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 264(%rsp),%rax +mulq 8(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 200(%rsp),%rax +mulq 24(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 200(%rsp),%rax +mulq 32(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 272(%rsp),%rax +mulq 0(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 208(%rsp),%rax +mulq 16(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 208(%rsp),%rax +mulq 24(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 208(%rsp),%rax +mulq 32(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +mov %rcx,%r8 +shr $51,%rcx +and %rdx,%rsi +add %r10,%rcx +mov %rcx,%r9 +shr $51,%rcx +and %rdx,%r8 +add %r12,%rcx +mov %rcx,%rax +shr $51,%rcx +and %rdx,%r9 +add %r14,%rcx +mov %rcx,%r10 +shr $51,%rcx +and %rdx,%rax +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +mov %rsi,%rdx +mov %r8,%rcx +mov %r9,%r11 +mov %rax,%r12 +mov %r10,%r13 +add x25519_x86_64_2P0(%rip),%rdx +add x25519_x86_64_2P1234(%rip),%rcx +add x25519_x86_64_2P1234(%rip),%r11 +add x25519_x86_64_2P1234(%rip),%r12 +add x25519_x86_64_2P1234(%rip),%r13 +addq 40(%rsp),%rsi +addq 48(%rsp),%r8 +addq 56(%rsp),%r9 +addq 64(%rsp),%rax +addq 72(%rsp),%r10 +subq 40(%rsp),%rdx +subq 48(%rsp),%rcx +subq 56(%rsp),%r11 +subq 64(%rsp),%r12 +subq 72(%rsp),%r13 +movq %rsi,120(%rdi) +movq %r8,128(%rdi) +movq %r9,136(%rdi) +movq %rax,144(%rdi) +movq %r10,152(%rdi) +movq %rdx,160(%rdi) +movq %rcx,168(%rdi) +movq %r11,176(%rdi) +movq %r12,184(%rdi) +movq %r13,192(%rdi) +movq 120(%rdi),%rax +mulq 120(%rdi) +mov %rax,%rsi +mov %rdx,%rcx +movq 120(%rdi),%rax +shl $1,%rax +mulq 128(%rdi) +mov %rax,%r8 +mov %rdx,%r9 +movq 120(%rdi),%rax +shl $1,%rax +mulq 136(%rdi) +mov %rax,%r10 +mov 
%rdx,%r11 +movq 120(%rdi),%rax +shl $1,%rax +mulq 144(%rdi) +mov %rax,%r12 +mov %rdx,%r13 +movq 120(%rdi),%rax +shl $1,%rax +mulq 152(%rdi) +mov %rax,%r14 +mov %rdx,%r15 +movq 128(%rdi),%rax +mulq 128(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 128(%rdi),%rax +shl $1,%rax +mulq 136(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq 128(%rdi),%rax +shl $1,%rax +mulq 144(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 128(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 152(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 136(%rdi),%rax +mulq 136(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 136(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 144(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 136(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 152(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 144(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 144(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 144(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 152(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 152(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 152(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +and %rdx,%rsi +mov %rcx,%r8 +shr $51,%rcx +add %r10,%rcx +and %rdx,%r8 +mov %rcx,%r9 +shr $51,%rcx +add %r12,%rcx +and %rdx,%r9 +mov %rcx,%rax +shr $51,%rcx +add %r14,%rcx +and %rdx,%rax +mov %rcx,%r10 +shr $51,%rcx +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +movq %rsi,120(%rdi) +movq %r8,128(%rdi) +movq %r9,136(%rdi) +movq %rax,144(%rdi) +movq %r10,152(%rdi) +movq 160(%rdi),%rax +mulq 160(%rdi) +mov %rax,%rsi +mov %rdx,%rcx +movq 160(%rdi),%rax +shl $1,%rax +mulq 168(%rdi) +mov %rax,%r8 +mov %rdx,%r9 +movq 160(%rdi),%rax +shl $1,%rax +mulq 176(%rdi) +mov %rax,%r10 +mov %rdx,%r11 +movq 160(%rdi),%rax +shl $1,%rax +mulq 184(%rdi) +mov %rax,%r12 +mov %rdx,%r13 +movq 160(%rdi),%rax +shl $1,%rax +mulq 192(%rdi) +mov %rax,%r14 +mov %rdx,%r15 +movq 168(%rdi),%rax +mulq 168(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 168(%rdi),%rax +shl $1,%rax +mulq 176(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq 168(%rdi),%rax +shl $1,%rax +mulq 184(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 168(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 192(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 176(%rdi),%rax +mulq 176(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 176(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 184(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 176(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 192(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 184(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 184(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 184(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 192(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 192(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 192(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +and %rdx,%rsi +mov %rcx,%r8 +shr $51,%rcx +add %r10,%rcx +and %rdx,%r8 +mov %rcx,%r9 +shr $51,%rcx +add %r12,%rcx +and %rdx,%r9 +mov %rcx,%rax +shr $51,%rcx +add %r14,%rcx +and %rdx,%rax +mov %rcx,%r10 +shr $51,%rcx +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +movq %rsi,160(%rdi) 
+movq %r8,168(%rdi) +movq %r9,176(%rdi) +movq %rax,184(%rdi) +movq %r10,192(%rdi) +movq 184(%rdi),%rsi +imulq $19,%rsi,%rax +movq %rax,0(%rsp) +mulq 16(%rdi) +mov %rax,%rsi +mov %rdx,%rcx +movq 192(%rdi),%rdx +imulq $19,%rdx,%rax +movq %rax,8(%rsp) +mulq 8(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 160(%rdi),%rax +mulq 0(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 160(%rdi),%rax +mulq 8(%rdi) +mov %rax,%r8 +mov %rdx,%r9 +movq 160(%rdi),%rax +mulq 16(%rdi) +mov %rax,%r10 +mov %rdx,%r11 +movq 160(%rdi),%rax +mulq 24(%rdi) +mov %rax,%r12 +mov %rdx,%r13 +movq 160(%rdi),%rax +mulq 32(%rdi) +mov %rax,%r14 +mov %rdx,%r15 +movq 168(%rdi),%rax +mulq 0(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 168(%rdi),%rax +mulq 8(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 168(%rdi),%rax +mulq 16(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq 168(%rdi),%rax +mulq 24(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 168(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 32(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 176(%rdi),%rax +mulq 0(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 176(%rdi),%rax +mulq 8(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq 176(%rdi),%rax +mulq 16(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 176(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 24(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 176(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 32(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 184(%rdi),%rax +mulq 0(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq 184(%rdi),%rax +mulq 8(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 0(%rsp),%rax +mulq 24(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 0(%rsp),%rax +mulq 32(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 192(%rdi),%rax +mulq 0(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 8(%rsp),%rax +mulq 16(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 8(%rsp),%rax +mulq 24(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 8(%rsp),%rax +mulq 32(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +mov %rcx,%r8 +shr $51,%rcx +and %rdx,%rsi +add %r10,%rcx +mov %rcx,%r9 +shr $51,%rcx +and %rdx,%r8 +add %r12,%rcx +mov %rcx,%rax +shr $51,%rcx +and %rdx,%r9 +add %r14,%rcx +mov %rcx,%r10 +shr $51,%rcx +and %rdx,%rax +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +movq %rsi,160(%rdi) +movq %r8,168(%rdi) +movq %r9,176(%rdi) +movq %rax,184(%rdi) +movq %r10,192(%rdi) +movq 144(%rsp),%rsi +imulq $19,%rsi,%rax +movq %rax,0(%rsp) +mulq 96(%rsp) +mov %rax,%rsi +mov %rdx,%rcx +movq 152(%rsp),%rdx +imulq $19,%rdx,%rax +movq %rax,8(%rsp) +mulq 88(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 120(%rsp),%rax +mulq 80(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 120(%rsp),%rax +mulq 88(%rsp) +mov %rax,%r8 +mov %rdx,%r9 +movq 120(%rsp),%rax +mulq 96(%rsp) +mov %rax,%r10 +mov %rdx,%r11 +movq 120(%rsp),%rax +mulq 104(%rsp) +mov %rax,%r12 +mov %rdx,%r13 +movq 120(%rsp),%rax +mulq 112(%rsp) +mov %rax,%r14 +mov %rdx,%r15 +movq 128(%rsp),%rax +mulq 80(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 128(%rsp),%rax +mulq 88(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 128(%rsp),%rax +mulq 96(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 128(%rsp),%rax +mulq 104(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 128(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 112(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 136(%rsp),%rax +mulq 80(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 136(%rsp),%rax +mulq 
88(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 136(%rsp),%rax +mulq 96(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 136(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 104(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 136(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 112(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 144(%rsp),%rax +mulq 80(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 144(%rsp),%rax +mulq 88(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 0(%rsp),%rax +mulq 104(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 0(%rsp),%rax +mulq 112(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 152(%rsp),%rax +mulq 80(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 8(%rsp),%rax +mulq 96(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 8(%rsp),%rax +mulq 104(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 8(%rsp),%rax +mulq 112(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +mov %rcx,%r8 +shr $51,%rcx +and %rdx,%rsi +add %r10,%rcx +mov %rcx,%r9 +shr $51,%rcx +and %rdx,%r8 +add %r12,%rcx +mov %rcx,%rax +shr $51,%rcx +and %rdx,%r9 +add %r14,%rcx +mov %rcx,%r10 +shr $51,%rcx +and %rdx,%rax +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +movq %rsi,40(%rdi) +movq %r8,48(%rdi) +movq %r9,56(%rdi) +movq %rax,64(%rdi) +movq %r10,72(%rdi) +movq 160(%rsp),%rax +mulq x25519_x86_64_121666_213(%rip) +shr $13,%rax +mov %rax,%rsi +mov %rdx,%rcx +movq 168(%rsp),%rax +mulq x25519_x86_64_121666_213(%rip) +shr $13,%rax +add %rax,%rcx +mov %rdx,%r8 +movq 176(%rsp),%rax +mulq x25519_x86_64_121666_213(%rip) +shr $13,%rax +add %rax,%r8 +mov %rdx,%r9 +movq 184(%rsp),%rax +mulq x25519_x86_64_121666_213(%rip) +shr $13,%rax +add %rax,%r9 +mov %rdx,%r10 +movq 192(%rsp),%rax +mulq x25519_x86_64_121666_213(%rip) +shr $13,%rax +add %rax,%r10 +imulq $19,%rdx,%rdx +add %rdx,%rsi +addq 80(%rsp),%rsi +addq 88(%rsp),%rcx +addq 96(%rsp),%r8 +addq 104(%rsp),%r9 +addq 112(%rsp),%r10 +movq %rsi,80(%rdi) +movq %rcx,88(%rdi) +movq %r8,96(%rdi) +movq %r9,104(%rdi) +movq %r10,112(%rdi) +movq 104(%rdi),%rsi +imulq $19,%rsi,%rax +movq %rax,0(%rsp) +mulq 176(%rsp) +mov %rax,%rsi +mov %rdx,%rcx +movq 112(%rdi),%rdx +imulq $19,%rdx,%rax +movq %rax,8(%rsp) +mulq 168(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 80(%rdi),%rax +mulq 160(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 80(%rdi),%rax +mulq 168(%rsp) +mov %rax,%r8 +mov %rdx,%r9 +movq 80(%rdi),%rax +mulq 176(%rsp) +mov %rax,%r10 +mov %rdx,%r11 +movq 80(%rdi),%rax +mulq 184(%rsp) +mov %rax,%r12 +mov %rdx,%r13 +movq 80(%rdi),%rax +mulq 192(%rsp) +mov %rax,%r14 +mov %rdx,%r15 +movq 88(%rdi),%rax +mulq 160(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 88(%rdi),%rax +mulq 168(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 88(%rdi),%rax +mulq 176(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 88(%rdi),%rax +mulq 184(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 88(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 192(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 96(%rdi),%rax +mulq 160(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 96(%rdi),%rax +mulq 168(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 96(%rdi),%rax +mulq 176(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 96(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 184(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 96(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 192(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 104(%rdi),%rax +mulq 160(%rsp) +add %rax,%r12 
+adc %rdx,%r13 +movq 104(%rdi),%rax +mulq 168(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 0(%rsp),%rax +mulq 184(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 0(%rsp),%rax +mulq 192(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 112(%rdi),%rax +mulq 160(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 8(%rsp),%rax +mulq 176(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 8(%rsp),%rax +mulq 184(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 8(%rsp),%rax +mulq 192(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +mov %rcx,%r8 +shr $51,%rcx +and %rdx,%rsi +add %r10,%rcx +mov %rcx,%r9 +shr $51,%rcx +and %rdx,%r8 +add %r12,%rcx +mov %rcx,%rax +shr $51,%rcx +and %rdx,%r9 +add %r14,%rcx +mov %rcx,%r10 +shr $51,%rcx +and %rdx,%rax +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +movq %rsi,80(%rdi) +movq %r8,88(%rdi) +movq %r9,96(%rdi) +movq %rax,104(%rdi) +movq %r10,112(%rdi) +movq 296(%rsp),%r12 +movq 304(%rsp),%r13 +movq 312(%rsp),%r14 +movq 320(%rsp),%r15 +movq 328(%rsp),%rbx +movq 336(%rsp),%rbp +add $344,%rsp +.cfi_adjust_cfa_offset -344 +ret +.cfi_endproc + +.p2align 5 +.globl x25519_x86_64_work_cswap +.hidden x25519_x86_64_work_cswap +x25519_x86_64_work_cswap: +.cfi_startproc +subq $1,%rsi +notq %rsi +movq %rsi,%xmm15 +pshufd $0x44,%xmm15,%xmm15 +movdqu 0(%rdi),%xmm0 +movdqu 16(%rdi),%xmm2 +movdqu 32(%rdi),%xmm4 +movdqu 48(%rdi),%xmm6 +movdqu 64(%rdi),%xmm8 +movdqu 80(%rdi),%xmm1 +movdqu 96(%rdi),%xmm3 +movdqu 112(%rdi),%xmm5 +movdqu 128(%rdi),%xmm7 +movdqu 144(%rdi),%xmm9 +movdqa %xmm1,%xmm10 +movdqa %xmm3,%xmm11 +movdqa %xmm5,%xmm12 +movdqa %xmm7,%xmm13 +movdqa %xmm9,%xmm14 +pxor %xmm0,%xmm10 +pxor %xmm2,%xmm11 +pxor %xmm4,%xmm12 +pxor %xmm6,%xmm13 +pxor %xmm8,%xmm14 +pand %xmm15,%xmm10 +pand %xmm15,%xmm11 +pand %xmm15,%xmm12 +pand %xmm15,%xmm13 +pand %xmm15,%xmm14 +pxor %xmm10,%xmm0 +pxor %xmm10,%xmm1 +pxor %xmm11,%xmm2 +pxor %xmm11,%xmm3 +pxor %xmm12,%xmm4 +pxor %xmm12,%xmm5 +pxor %xmm13,%xmm6 +pxor %xmm13,%xmm7 +pxor %xmm14,%xmm8 +pxor %xmm14,%xmm9 +movdqu %xmm0,0(%rdi) +movdqu %xmm2,16(%rdi) +movdqu %xmm4,32(%rdi) +movdqu %xmm6,48(%rdi) +movdqu %xmm8,64(%rdi) +movdqu %xmm1,80(%rdi) +movdqu %xmm3,96(%rdi) +movdqu %xmm5,112(%rdi) +movdqu %xmm7,128(%rdi) +movdqu %xmm9,144(%rdi) +ret +.cfi_endproc diff --git a/curve25519-amd64.c b/curve25519-amd64.c new file mode 100644 index 0000000..095b0d2 --- /dev/null +++ b/curve25519-amd64.c @@ -0,0 +1,234 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2015 Google Inc. All Rights Reserved. + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. 
+ * + * Original author: Peter Schwabe + */ + +#include +#include + +typedef struct { uint64_t v[5]; } fe25519; + +asmlinkage void x25519_x86_64_work_cswap(fe25519 *, uint64_t); +asmlinkage void x25519_x86_64_mul(fe25519 *out, const fe25519 *a, const fe25519 *b); +asmlinkage void x25519_x86_64_square(fe25519 *out, const fe25519 *a); +asmlinkage void x25519_x86_64_freeze(fe25519 *); +asmlinkage void x25519_x86_64_ladderstep(fe25519 *work); + +enum { CURVE25519_POINT_SIZE = 32 }; + +static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE]) +{ + secret[0] &= 248; + secret[31] &= 127; + secret[31] |= 64; +} + +static void fe25519_setint(fe25519 *r, unsigned v) +{ + r->v[0] = v; + r->v[1] = 0; + r->v[2] = 0; + r->v[3] = 0; + r->v[4] = 0; +} + +// Assumes input x being reduced below 2^255 +static void fe25519_pack(unsigned char r[32], const fe25519 *x) +{ + fe25519 t; + t = *x; + x25519_x86_64_freeze(&t); + + r[0] = (uint8_t)(t.v[0] & 0xff); + r[1] = (uint8_t)((t.v[0] >> 8) & 0xff); + r[2] = (uint8_t)((t.v[0] >> 16) & 0xff); + r[3] = (uint8_t)((t.v[0] >> 24) & 0xff); + r[4] = (uint8_t)((t.v[0] >> 32) & 0xff); + r[5] = (uint8_t)((t.v[0] >> 40) & 0xff); + r[6] = (uint8_t)((t.v[0] >> 48)); + + r[6] ^= (uint8_t)((t.v[1] << 3) & 0xf8); + r[7] = (uint8_t)((t.v[1] >> 5) & 0xff); + r[8] = (uint8_t)((t.v[1] >> 13) & 0xff); + r[9] = (uint8_t)((t.v[1] >> 21) & 0xff); + r[10] = (uint8_t)((t.v[1] >> 29) & 0xff); + r[11] = (uint8_t)((t.v[1] >> 37) & 0xff); + r[12] = (uint8_t)((t.v[1] >> 45)); + + r[12] ^= (uint8_t)((t.v[2] << 6) & 0xc0); + r[13] = (uint8_t)((t.v[2] >> 2) & 0xff); + r[14] = (uint8_t)((t.v[2] >> 10) & 0xff); + r[15] = (uint8_t)((t.v[2] >> 18) & 0xff); + r[16] = (uint8_t)((t.v[2] >> 26) & 0xff); + r[17] = (uint8_t)((t.v[2] >> 34) & 0xff); + r[18] = (uint8_t)((t.v[2] >> 42) & 0xff); + r[19] = (uint8_t)((t.v[2] >> 50)); + + r[19] ^= (uint8_t)((t.v[3] << 1) & 0xfe); + r[20] = (uint8_t)((t.v[3] >> 7) & 0xff); + r[21] = (uint8_t)((t.v[3] >> 15) & 0xff); + r[22] = (uint8_t)((t.v[3] >> 23) & 0xff); + r[23] = (uint8_t)((t.v[3] >> 31) & 0xff); + r[24] = (uint8_t)((t.v[3] >> 39) & 0xff); + r[25] = (uint8_t)((t.v[3] >> 47)); + + r[25] ^= (uint8_t)((t.v[4] << 4) & 0xf0); + r[26] = (uint8_t)((t.v[4] >> 4) & 0xff); + r[27] = (uint8_t)((t.v[4] >> 12) & 0xff); + r[28] = (uint8_t)((t.v[4] >> 20) & 0xff); + r[29] = (uint8_t)((t.v[4] >> 28) & 0xff); + r[30] = (uint8_t)((t.v[4] >> 36) & 0xff); + r[31] = (uint8_t)((t.v[4] >> 44)); +} + +static void fe25519_unpack(fe25519 *r, const uint8_t x[32]) +{ + r->v[0] = x[0]; + r->v[0] += (uint64_t)x[1] << 8; + r->v[0] += (uint64_t)x[2] << 16; + r->v[0] += (uint64_t)x[3] << 24; + r->v[0] += (uint64_t)x[4] << 32; + r->v[0] += (uint64_t)x[5] << 40; + r->v[0] += ((uint64_t)x[6] & 7) << 48; + + r->v[1] = x[6] >> 3; + r->v[1] += (uint64_t)x[7] << 5; + r->v[1] += (uint64_t)x[8] << 13; + r->v[1] += (uint64_t)x[9] << 21; + r->v[1] += (uint64_t)x[10] << 29; + r->v[1] += (uint64_t)x[11] << 37; + r->v[1] += ((uint64_t)x[12] & 63) << 45; + + r->v[2] = x[12] >> 6; + r->v[2] += (uint64_t)x[13] << 2; + r->v[2] += (uint64_t)x[14] << 10; + r->v[2] += (uint64_t)x[15] << 18; + r->v[2] += (uint64_t)x[16] << 26; + r->v[2] += (uint64_t)x[17] << 34; + r->v[2] += (uint64_t)x[18] << 42; + r->v[2] += ((uint64_t)x[19] & 1) << 50; + + r->v[3] = x[19] >> 1; + r->v[3] += (uint64_t)x[20] << 7; + r->v[3] += (uint64_t)x[21] << 15; + r->v[3] += (uint64_t)x[22] << 23; + r->v[3] += (uint64_t)x[23] << 31; + r->v[3] += (uint64_t)x[24] << 39; + r->v[3] += ((uint64_t)x[25] & 15) << 47; + + 
r->v[4] = x[25] >> 4; + r->v[4] += (uint64_t)x[26] << 4; + r->v[4] += (uint64_t)x[27] << 12; + r->v[4] += (uint64_t)x[28] << 20; + r->v[4] += (uint64_t)x[29] << 28; + r->v[4] += (uint64_t)x[30] << 36; + r->v[4] += ((uint64_t)x[31] & 127) << 44; +} + +static void fe25519_invert(fe25519 *r, const fe25519 *x) +{ + fe25519 z2; + fe25519 z9; + fe25519 z11; + fe25519 z2_5_0; + fe25519 z2_10_0; + fe25519 z2_20_0; + fe25519 z2_50_0; + fe25519 z2_100_0; + fe25519 t; + int i; + + /* 2 */ x25519_x86_64_square(&z2, x); + /* 4 */ x25519_x86_64_square(&t, &z2); + /* 8 */ x25519_x86_64_square(&t, &t); + /* 9 */ x25519_x86_64_mul(&z9, &t, x); + /* 11 */ x25519_x86_64_mul(&z11, &z9, &z2); + /* 22 */ x25519_x86_64_square(&t, &z11); + /* 2^5 - 2^0 = 31 */ x25519_x86_64_mul(&z2_5_0, &t, &z9); + + /* 2^6 - 2^1 */ x25519_x86_64_square(&t, &z2_5_0); + /* 2^20 - 2^10 */ for (i = 1; i < 5; i++) { x25519_x86_64_square(&t, &t); } + /* 2^10 - 2^0 */ x25519_x86_64_mul(&z2_10_0, &t, &z2_5_0); + + /* 2^11 - 2^1 */ x25519_x86_64_square(&t, &z2_10_0); + /* 2^20 - 2^10 */ for (i = 1; i < 10; i++) { x25519_x86_64_square(&t, &t); } + /* 2^20 - 2^0 */ x25519_x86_64_mul(&z2_20_0, &t, &z2_10_0); + + /* 2^21 - 2^1 */ x25519_x86_64_square(&t, &z2_20_0); + /* 2^40 - 2^20 */ for (i = 1; i < 20; i++) { x25519_x86_64_square(&t, &t); } + /* 2^40 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_20_0); + + /* 2^41 - 2^1 */ x25519_x86_64_square(&t, &t); + /* 2^50 - 2^10 */ for (i = 1; i < 10; i++) { x25519_x86_64_square(&t, &t); } + /* 2^50 - 2^0 */ x25519_x86_64_mul(&z2_50_0, &t, &z2_10_0); + + /* 2^51 - 2^1 */ x25519_x86_64_square(&t, &z2_50_0); + /* 2^100 - 2^50 */ for (i = 1; i < 50; i++) { x25519_x86_64_square(&t, &t); } + /* 2^100 - 2^0 */ x25519_x86_64_mul(&z2_100_0, &t, &z2_50_0); + + /* 2^101 - 2^1 */ x25519_x86_64_square(&t, &z2_100_0); + /* 2^200 - 2^100 */ for (i = 1; i < 100; i++) { + x25519_x86_64_square(&t, &t); + } + /* 2^200 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_100_0); + + /* 2^201 - 2^1 */ x25519_x86_64_square(&t, &t); + /* 2^250 - 2^50 */ for (i = 1; i < 50; i++) { x25519_x86_64_square(&t, &t); } + /* 2^250 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_50_0); + + /* 2^251 - 2^1 */ x25519_x86_64_square(&t, &t); + /* 2^252 - 2^2 */ x25519_x86_64_square(&t, &t); + /* 2^253 - 2^3 */ x25519_x86_64_square(&t, &t); + + /* 2^254 - 2^4 */ x25519_x86_64_square(&t, &t); + + /* 2^255 - 2^5 */ x25519_x86_64_square(&t, &t); + /* 2^255 - 21 */ x25519_x86_64_mul(r, &t, &z11); +} + +static void mladder(fe25519 *xr, fe25519 *zr, const uint8_t s[32]) +{ + int i, j; + uint8_t prevbit = 0; + fe25519 work[5]; + + work[0] = *xr; + fe25519_setint(work + 1, 1); + fe25519_setint(work + 2, 0); + work[3] = *xr; + fe25519_setint(work + 4, 1); + + j = 6; + for (i = 31; i >= 0; i--) { + while (j >= 0) { + const uint8_t bit = 1 & (s[i] >> j); + const uint64_t swap = bit ^ prevbit; + prevbit = bit; + x25519_x86_64_work_cswap(work + 1, swap); + x25519_x86_64_ladderstep(work); + j -= 1; + } + j = 7; + } + + *xr = work[1]; + *zr = work[2]; +} +bool curve25519_amd64(u8 out[CURVE25519_POINT_SIZE], const u8 scalar[CURVE25519_POINT_SIZE], const u8 point[CURVE25519_POINT_SIZE]) +{ + fe25519 t; + fe25519 z; + uint8_t e[32]; + memcpy(e, scalar, sizeof(e)); + normalize_secret(e); + + fe25519_unpack(&t, point); + mladder(&t, &z, e); + fe25519_invert(&z, &z); + x25519_x86_64_mul(&t, &t, &z); + fe25519_pack(out, &t); + return true; +} diff --git a/curve25519-donna32.c b/curve25519-donna32.c new file mode 100644 index 0000000..4721864 --- /dev/null +++ b/curve25519-donna32.c @@ 
-0,0 +1,861 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2008 Google Inc. All Rights Reserved. + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + * + * Original author: Adam Langley + */ + +#include +#include + +enum { CURVE25519_POINT_SIZE = 32 }; + +static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE]) +{ + secret[0] &= 248; + secret[31] &= 127; + secret[31] |= 64; +} + +typedef s64 limb; + +/* Field element representation: + * + * Field elements are written as an array of signed, 64-bit limbs, least + * significant first. The value of the field element is: + * x[0] + 2^26·x[1] + x^51·x[2] + 2^102·x[3] + ... + * + * i.e. the limbs are 26, 25, 26, 25, ... bits wide. + */ + +/* Sum two numbers: output += in */ +static void fsum(limb *output, const limb *in) +{ + unsigned int i; + + for (i = 0; i < 10; i += 2) { + output[0 + i] = output[0 + i] + in[0 + i]; + output[1 + i] = output[1 + i] + in[1 + i]; + } +} + +/* Find the difference of two numbers: output = in - output + * (note the order of the arguments!). + */ +static void fdifference(limb *output, const limb *in) +{ + unsigned int i; + + for (i = 0; i < 10; ++i) + output[i] = in[i] - output[i]; +} + +/* Multiply a number by a scalar: output = in * scalar */ +static void fscalar_product(limb *output, const limb *in, const limb scalar) +{ + unsigned int i; + + for (i = 0; i < 10; ++i) + output[i] = in[i] * scalar; +} + +/* Multiply two numbers: output = in2 * in + * + * output must be distinct to both inputs. The inputs are reduced coefficient + * form, the output is not. + * + * output[x] <= 14 * the largest product of the input limbs. + */ +static void fproduct(limb *output, const limb *in2, const limb *in) +{ + output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]); + output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) + + ((limb) ((s32) in2[1])) * ((s32) in[0]); + output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) + + ((limb) ((s32) in2[0])) * ((s32) in[2]) + + ((limb) ((s32) in2[2])) * ((s32) in[0]); + output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) + + ((limb) ((s32) in2[2])) * ((s32) in[1]) + + ((limb) ((s32) in2[0])) * ((s32) in[3]) + + ((limb) ((s32) in2[3])) * ((s32) in[0]); + output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) + + 2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) + + ((limb) ((s32) in2[3])) * ((s32) in[1])) + + ((limb) ((s32) in2[0])) * ((s32) in[4]) + + ((limb) ((s32) in2[4])) * ((s32) in[0]); + output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) + + ((limb) ((s32) in2[3])) * ((s32) in[2]) + + ((limb) ((s32) in2[1])) * ((s32) in[4]) + + ((limb) ((s32) in2[4])) * ((s32) in[1]) + + ((limb) ((s32) in2[0])) * ((s32) in[5]) + + ((limb) ((s32) in2[5])) * ((s32) in[0]); + output[6] = 2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) + + ((limb) ((s32) in2[1])) * ((s32) in[5]) + + ((limb) ((s32) in2[5])) * ((s32) in[1])) + + ((limb) ((s32) in2[2])) * ((s32) in[4]) + + ((limb) ((s32) in2[4])) * ((s32) in[2]) + + ((limb) ((s32) in2[0])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[0]); + output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) + + ((limb) ((s32) in2[4])) * ((s32) in[3]) + + ((limb) ((s32) in2[2])) * ((s32) in[5]) + + ((limb) ((s32) in2[5])) * ((s32) in[2]) + + ((limb) ((s32) in2[1])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[1]) + + ((limb) ((s32) in2[0])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[0]); + output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) + + 2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) + + 
((limb) ((s32) in2[5])) * ((s32) in[3]) + + ((limb) ((s32) in2[1])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[1])) + + ((limb) ((s32) in2[2])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[2]) + + ((limb) ((s32) in2[0])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[0]); + output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) + + ((limb) ((s32) in2[5])) * ((s32) in[4]) + + ((limb) ((s32) in2[3])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[3]) + + ((limb) ((s32) in2[2])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[2]) + + ((limb) ((s32) in2[1])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[1]) + + ((limb) ((s32) in2[0])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[0]); + output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) + + ((limb) ((s32) in2[3])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[3]) + + ((limb) ((s32) in2[1])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[1])) + + ((limb) ((s32) in2[4])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[4]) + + ((limb) ((s32) in2[2])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[2]); + output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[5]) + + ((limb) ((s32) in2[4])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[4]) + + ((limb) ((s32) in2[3])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[3]) + + ((limb) ((s32) in2[2])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[2]); + output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) + + 2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[5]) + + ((limb) ((s32) in2[3])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[3])) + + ((limb) ((s32) in2[4])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[4]); + output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[6]) + + ((limb) ((s32) in2[5])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[5]) + + ((limb) ((s32) in2[4])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[4]); + output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) + + ((limb) ((s32) in2[5])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[5])) + + ((limb) ((s32) in2[6])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[6]); + output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[7]) + + ((limb) ((s32) in2[6])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[6]); + output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) + + 2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[7])); + output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[8]); + output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]); +} + +/* Reduce a long form to a short form by taking the input mod 2^255 - 19. + * + * On entry: |output[i]| < 14*2^54 + * On exit: |output[0..8]| < 280*2^54 + */ +static void freduce_degree(limb *output) +{ + /* Each of these shifts and adds ends up multiplying the value by 19. + * + * For output[0..8], the absolute entry value is < 14*2^54 and we add, at + * most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. 
+ */ + output[8] += output[18] << 4; + output[8] += output[18] << 1; + output[8] += output[18]; + output[7] += output[17] << 4; + output[7] += output[17] << 1; + output[7] += output[17]; + output[6] += output[16] << 4; + output[6] += output[16] << 1; + output[6] += output[16]; + output[5] += output[15] << 4; + output[5] += output[15] << 1; + output[5] += output[15]; + output[4] += output[14] << 4; + output[4] += output[14] << 1; + output[4] += output[14]; + output[3] += output[13] << 4; + output[3] += output[13] << 1; + output[3] += output[13]; + output[2] += output[12] << 4; + output[2] += output[12] << 1; + output[2] += output[12]; + output[1] += output[11] << 4; + output[1] += output[11] << 1; + output[1] += output[11]; + output[0] += output[10] << 4; + output[0] += output[10] << 1; + output[0] += output[10]; +} + +/* return v / 2^26, using only shifts and adds. + * + * On entry: v can take any value. + */ +static inline limb div_by_2_26(const limb v) +{ + /* High word of v; no shift needed. */ + const u32 highword = (u32) (((u64) v) >> 32); + /* Set to all 1s if v was negative; else set to 0s. */ + const s32 sign = ((s32) highword) >> 31; + /* Set to 0x3ffffff if v was negative; else set to 0. */ + const s32 roundoff = ((u32) sign) >> 6; + /* Should return v / (1<<26) */ + return (v + roundoff) >> 26; +} + +/* return v / (2^25), using only shifts and adds. + * + * On entry: v can take any value. + */ +static inline limb div_by_2_25(const limb v) +{ + /* High word of v; no shift needed*/ + const u32 highword = (u32) (((u64) v) >> 32); + /* Set to all 1s if v was negative; else set to 0s. */ + const s32 sign = ((s32) highword) >> 31; + /* Set to 0x1ffffff if v was negative; else set to 0. */ + const s32 roundoff = ((u32) sign) >> 7; + /* Should return v / (1<<25) */ + return (v + roundoff) >> 25; +} + +/* Reduce all coefficients of the short form input so that |x| < 2^26. + * + * On entry: |output[i]| < 280*2^54 + */ +static void freduce_coefficients(limb *output) +{ + unsigned int i; + + output[10] = 0; + + for (i = 0; i < 10; i += 2) { + limb over = div_by_2_26(output[i]); + /* The entry condition (that |output[i]| < 280*2^54) means that over is, at + * most, 280*2^28 in the first iteration of this loop. This is added to the + * next limb and we can approximate the resulting bound of that limb by + * 281*2^54. + */ + output[i] -= over << 26; + output[i+1] += over; + + /* For the first iteration, |output[i+1]| < 281*2^54, thus |over| < + * 281*2^29. When this is added to the next limb, the resulting bound can + * be approximated as 281*2^54. + * + * For subsequent iterations of the loop, 281*2^54 remains a conservative + * bound and no overflow occurs. + */ + over = div_by_2_25(output[i+1]); + output[i+1] -= over << 25; + output[i+2] += over; + } + /* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */ + output[0] += output[10] << 4; + output[0] += output[10] << 1; + output[0] += output[10]; + + output[10] = 0; + + /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29 + * So |over| will be no more than 2^16. + */ + { + limb over = div_by_2_26(output[0]); + + output[0] -= over << 26; + output[1] += over; + } + + /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The + * bound on |output[1]| is sufficient to meet our needs. + */ +} + +/* A helpful wrapper around fproduct: output = in * in2. + * + * On entry: |in[i]| < 2^27 and |in2[i]| < 2^27. + * + * output must be distinct to both inputs. 
The output is reduced degree + * (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. + */ +static void fmul(limb *output, const limb *in, const limb *in2) +{ + limb t[19]; + + fproduct(t, in, in2); + /* |t[i]| < 14*2^54 */ + freduce_degree(t); + freduce_coefficients(t); + /* |t[i]| < 2^26 */ + memcpy(output, t, sizeof(limb) * 10); +} + +/* Square a number: output = in**2 + * + * output must be distinct from the input. The inputs are reduced coefficient + * form, the output is not. + * + * output[x] <= 14 * the largest product of the input limbs. + */ +static void fsquare_inner(limb *output, const limb *in) +{ + output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]); + output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]); + output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) + + ((limb) ((s32) in[0])) * ((s32) in[2])); + output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) + + ((limb) ((s32) in[0])) * ((s32) in[3])); + output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) + + 4 * ((limb) ((s32) in[1])) * ((s32) in[3]) + + 2 * ((limb) ((s32) in[0])) * ((s32) in[4]); + output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) + + ((limb) ((s32) in[1])) * ((s32) in[4]) + + ((limb) ((s32) in[0])) * ((s32) in[5])); + output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) + + ((limb) ((s32) in[2])) * ((s32) in[4]) + + ((limb) ((s32) in[0])) * ((s32) in[6]) + + 2 * ((limb) ((s32) in[1])) * ((s32) in[5])); + output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) + + ((limb) ((s32) in[2])) * ((s32) in[5]) + + ((limb) ((s32) in[1])) * ((s32) in[6]) + + ((limb) ((s32) in[0])) * ((s32) in[7])); + output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) + + 2 * (((limb) ((s32) in[2])) * ((s32) in[6]) + + ((limb) ((s32) in[0])) * ((s32) in[8]) + + 2 * (((limb) ((s32) in[1])) * ((s32) in[7]) + + ((limb) ((s32) in[3])) * ((s32) in[5]))); + output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) + + ((limb) ((s32) in[3])) * ((s32) in[6]) + + ((limb) ((s32) in[2])) * ((s32) in[7]) + + ((limb) ((s32) in[1])) * ((s32) in[8]) + + ((limb) ((s32) in[0])) * ((s32) in[9])); + output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) + + ((limb) ((s32) in[4])) * ((s32) in[6]) + + ((limb) ((s32) in[2])) * ((s32) in[8]) + + 2 * (((limb) ((s32) in[3])) * ((s32) in[7]) + + ((limb) ((s32) in[1])) * ((s32) in[9]))); + output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) + + ((limb) ((s32) in[4])) * ((s32) in[7]) + + ((limb) ((s32) in[3])) * ((s32) in[8]) + + ((limb) ((s32) in[2])) * ((s32) in[9])); + output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) + + 2 * (((limb) ((s32) in[4])) * ((s32) in[8]) + + 2 * (((limb) ((s32) in[5])) * ((s32) in[7]) + + ((limb) ((s32) in[3])) * ((s32) in[9]))); + output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) + + ((limb) ((s32) in[5])) * ((s32) in[8]) + + ((limb) ((s32) in[4])) * ((s32) in[9])); + output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) + + ((limb) ((s32) in[6])) * ((s32) in[8]) + + 2 * ((limb) ((s32) in[5])) * ((s32) in[9])); + output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) + + ((limb) ((s32) in[6])) * ((s32) in[9])); + output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) + + 4 * ((limb) ((s32) in[7])) * ((s32) in[9]); + output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]); + output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]); +} + +/* fsquare sets output = in^2. + * + * On entry: The |in| argument is in reduced coefficients form and |in[i]| < + * 2^27. 
+ * + * On exit: The |output| argument is in reduced coefficients form (indeed, one + * need only provide storage for 10 limbs) and |out[i]| < 2^26. + */ +static void fsquare(limb *output, const limb *in) +{ + limb t[19]; + + fsquare_inner(t, in); + /* |t[i]| < 14*2^54 because the largest product of two limbs will be < + * 2^(27+27) and fsquare_inner adds together, at most, 14 of those + * products. + */ + freduce_degree(t); + freduce_coefficients(t); + /* |t[i]| < 2^26 */ + memcpy(output, t, sizeof(limb) * 10); +} + +/* Take a little-endian, 32-byte number and expand it into polynomial form */ +static inline void fexpand(limb *output, const u8 *input) +{ +#define F(n, start, shift, mask) \ + output[n] = ((((limb) input[start + 0]) | \ + ((limb) input[start + 1]) << 8 | \ + ((limb) input[start + 2]) << 16 | \ + ((limb) input[start + 3]) << 24) >> shift) & mask; + F(0, 0, 0, 0x3ffffff); + F(1, 3, 2, 0x1ffffff); + F(2, 6, 3, 0x3ffffff); + F(3, 9, 5, 0x1ffffff); + F(4, 12, 6, 0x3ffffff); + F(5, 16, 0, 0x1ffffff); + F(6, 19, 1, 0x3ffffff); + F(7, 22, 3, 0x1ffffff); + F(8, 25, 4, 0x3ffffff); + F(9, 28, 6, 0x1ffffff); +#undef F +} + +/* s32_eq returns 0xffffffff iff a == b and zero otherwise. */ +static s32 s32_eq(s32 a, s32 b) +{ + a = ~(a ^ b); + a &= a << 16; + a &= a << 8; + a &= a << 4; + a &= a << 2; + a &= a << 1; + return a >> 31; +} + +/* s32_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are + * both non-negative. + */ +static s32 s32_gte(s32 a, s32 b) +{ + a -= b; + /* a >= 0 iff a >= b. */ + return ~(a >> 31); +} + +/* Take a fully reduced polynomial form number and contract it into a + * little-endian, 32-byte array. + * + * On entry: |input_limbs[i]| < 2^26 + */ +static void fcontract(u8 *output, limb *input_limbs) +{ + int i; + int j; + s32 input[10]; + s32 mask; + + /* |input_limbs[i]| < 2^26, so it's valid to convert to an s32. */ + for (i = 0; i < 10; i++) { + input[i] = input_limbs[i]; + } + + for (j = 0; j < 2; ++j) { + for (i = 0; i < 9; ++i) { + if ((i & 1) == 1) { + /* This calculation is a time-invariant way to make input[i] + * non-negative by borrowing from the next-larger limb. + */ + const s32 mask = input[i] >> 31; + const s32 carry = -((input[i] & mask) >> 25); + + input[i] = input[i] + (carry << 25); + input[i+1] = input[i+1] - carry; + } else { + const s32 mask = input[i] >> 31; + const s32 carry = -((input[i] & mask) >> 26); + + input[i] = input[i] + (carry << 26); + input[i+1] = input[i+1] - carry; + } + } + + /* There's no greater limb for input[9] to borrow from, but we can multiply + * by 19 and borrow from input[0], which is valid mod 2^255-19. + */ + { + const s32 mask = input[9] >> 31; + const s32 carry = -((input[9] & mask) >> 25); + + input[9] = input[9] + (carry << 25); + input[0] = input[0] - (carry * 19); + } + + /* After the first iteration, input[1..9] are non-negative and fit within + * 25 or 26 bits, depending on position. However, input[0] may be + * negative. + */ + } + + /* The first borrow-propagation pass above ended with every limb + except (possibly) input[0] non-negative. + If input[0] was negative after the first pass, then it was because of a + carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most, + one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19. + In the second pass, each limb is decreased by at most one. 
Thus the second + borrow-propagation pass could only have wrapped around to decrease + input[0] again if the first pass left input[0] negative *and* input[1] + through input[9] were all zero. In that case, input[1] is now 2^25 - 1, + and this last borrow-propagation step will leave input[1] non-negative. */ + { + const s32 mask = input[0] >> 31; + const s32 carry = -((input[0] & mask) >> 26); + + input[0] = input[0] + (carry << 26); + input[1] = input[1] - carry; + } + + /* All input[i] are now non-negative. However, there might be values between + * 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. + */ + for (j = 0; j < 2; j++) { + for (i = 0; i < 9; i++) { + if ((i & 1) == 1) { + const s32 carry = input[i] >> 25; + + input[i] &= 0x1ffffff; + input[i+1] += carry; + } else { + const s32 carry = input[i] >> 26; + + input[i] &= 0x3ffffff; + input[i+1] += carry; + } + } + + { + const s32 carry = input[9] >> 25; + + input[9] &= 0x1ffffff; + input[0] += 19*carry; + } + } + + /* If the first carry-chain pass, just above, ended up with a carry from + * input[9], and that caused input[0] to be out-of-bounds, then input[0] was + * < 2^26 + 2*19, because the carry was, at most, two. + * + * If the second pass carried from input[9] again then input[0] is < 2*19 and + * the input[9] -> input[0] carry didn't push input[0] out of bounds. + */ + + /* It still remains the case that input might be between 2^255-19 and 2^255. + * In this case, input[1..9] must take their maximum value and input[0] must + * be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. + */ + mask = s32_gte(input[0], 0x3ffffed); + for (i = 1; i < 10; i++) { + if ((i & 1) == 1) { + mask &= s32_eq(input[i], 0x1ffffff); + } else { + mask &= s32_eq(input[i], 0x3ffffff); + } + } + + /* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus + * this conditionally subtracts 2^255-19. + */ + input[0] -= mask & 0x3ffffed; + + for (i = 1; i < 10; i++) { + if ((i & 1) == 1) { + input[i] -= mask & 0x1ffffff; + } else { + input[i] -= mask & 0x3ffffff; + } + } + + input[1] <<= 2; + input[2] <<= 3; + input[3] <<= 5; + input[4] <<= 6; + input[6] <<= 1; + input[7] <<= 3; + input[8] <<= 4; + input[9] <<= 6; +#define F(i, s) \ + output[s+0] |= input[i] & 0xff; \ + output[s+1] = (input[i] >> 8) & 0xff; \ + output[s+2] = (input[i] >> 16) & 0xff; \ + output[s+3] = (input[i] >> 24) & 0xff; + output[0] = 0; + output[16] = 0; + F(0, 0); + F(1, 3); + F(2, 6); + F(3, 9); + F(4, 12); + F(5, 16); + F(6, 19); + F(7, 22); + F(8, 25); + F(9, 28); +#undef F +} + +/* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave + * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid + * side-channel attacks. + * + * NOTE that this function requires that 'iswap' be 1 or 0; other values give + * wrong results. Also, the two limb arrays must be in reduced-coefficient, + * reduced-degree form: the values in a[10..19] or b[10..19] aren't swapped, + * and all all values in a[0..9],b[0..9] must have magnitude less than + * INT32_MAX. 
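+ * For example, swap_conditional(a, b, 1) exchanges a[0..9] with b[0..9],
+ * while swap_conditional(a, b, 0) leaves both arrays untouched; both calls
+ * execute exactly the same sequence of loads, XORs and stores.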
+ */ +static void swap_conditional(limb a[19], limb b[19], limb iswap) +{ + unsigned int i; + const s32 swap = (s32) -iswap; + + for (i = 0; i < 10; ++i) { + const s32 x = swap & (((s32)a[i]) ^ ((s32)b[i])); + + a[i] = ((s32)a[i]) ^ x; + b[i] = ((s32)b[i]) ^ x; + } +} + +static void crecip(limb *out, const limb *z) +{ + limb z2[10]; + limb z9[10]; + limb z11[10]; + limb z2_5_0[10]; + limb z2_10_0[10]; + limb z2_20_0[10]; + limb z2_50_0[10]; + limb z2_100_0[10]; + limb t0[10]; + limb t1[10]; + int i; + + /* 2 */ fsquare(z2, z); + /* 4 */ fsquare(t1, z2); + /* 8 */ fsquare(t0, t1); + /* 9 */ fmul(z9, t0, z); + /* 11 */ fmul(z11, z9, z2); + /* 22 */ fsquare(t0, z11); + /* 2^5 - 2^0 = 31 */ fmul(z2_5_0, t0, z9); + + /* 2^6 - 2^1 */ fsquare(t0, z2_5_0); + /* 2^7 - 2^2 */ fsquare(t1, t0); + /* 2^8 - 2^3 */ fsquare(t0, t1); + /* 2^9 - 2^4 */ fsquare(t1, t0); + /* 2^10 - 2^5 */ fsquare(t0, t1); + /* 2^10 - 2^0 */ fmul(z2_10_0, t0, z2_5_0); + + /* 2^11 - 2^1 */ fsquare(t0, z2_10_0); + /* 2^12 - 2^2 */ fsquare(t1, t0); + /* 2^20 - 2^10 */ for (i = 2; i < 10; i += 2) { fsquare(t0, t1); fsquare(t1, t0); } + /* 2^20 - 2^0 */ fmul(z2_20_0, t1, z2_10_0); + + /* 2^21 - 2^1 */ fsquare(t0, z2_20_0); + /* 2^22 - 2^2 */ fsquare(t1, t0); + /* 2^40 - 2^20 */ for (i = 2; i < 20; i += 2) { fsquare(t0, t1); fsquare(t1, t0); } + /* 2^40 - 2^0 */ fmul(t0, t1, z2_20_0); + + /* 2^41 - 2^1 */ fsquare(t1, t0); + /* 2^42 - 2^2 */ fsquare(t0, t1); + /* 2^50 - 2^10 */ for (i = 2; i < 10; i += 2) { fsquare(t1, t0); fsquare(t0, t1); } + /* 2^50 - 2^0 */ fmul(z2_50_0, t0, z2_10_0); + + /* 2^51 - 2^1 */ fsquare(t0, z2_50_0); + /* 2^52 - 2^2 */ fsquare(t1, t0); + /* 2^100 - 2^50 */ for (i = 2; i < 50; i += 2) { fsquare(t0, t1); fsquare(t1, t0); } + /* 2^100 - 2^0 */ fmul(z2_100_0, t1, z2_50_0); + + /* 2^101 - 2^1 */ fsquare(t1, z2_100_0); + /* 2^102 - 2^2 */ fsquare(t0, t1); + /* 2^200 - 2^100 */ for (i = 2; i < 100; i += 2) { fsquare(t1, t0); fsquare(t0, t1); } + /* 2^200 - 2^0 */ fmul(t1, t0, z2_100_0); + + /* 2^201 - 2^1 */ fsquare(t0, t1); + /* 2^202 - 2^2 */ fsquare(t1, t0); + /* 2^250 - 2^50 */ for (i = 2; i < 50; i += 2) { fsquare(t0, t1); fsquare(t1, t0); } + /* 2^250 - 2^0 */ fmul(t0, t1, z2_50_0); + + /* 2^251 - 2^1 */ fsquare(t1, t0); + /* 2^252 - 2^2 */ fsquare(t0, t1); + /* 2^253 - 2^3 */ fsquare(t1, t0); + /* 2^254 - 2^4 */ fsquare(t0, t1); + /* 2^255 - 2^5 */ fsquare(t1, t0); + /* 2^255 - 21 */ fmul(out, t1, z11); +} + + +/* Input: Q, Q', Q-Q' + * Output: 2Q, Q+Q' + * + * x2 z3: long form + * x3 z3: long form + * x z: short form, destroyed + * xprime zprime: short form, destroyed + * qmqp: short form, preserved + * + * On entry and exit, the absolute value of the limbs of all inputs and outputs + * are < 2^26. 
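+ * In other words, a single call performs one step of the Montgomery ladder:
+ * the doubling 2Q and the differential addition Q + Q', computed from x and
+ * z coordinates only, using the known difference Q - Q' (qmqp).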
+ */ +static void fmonty(limb *x2, limb *z2, /* output 2Q */ + limb *x3, limb *z3, /* output Q + Q' */ + limb *x, limb *z, /* input Q */ + limb *xprime, limb *zprime, /* input Q' */ + + const limb *qmqp /* input Q - Q' */) +{ + limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19], + zzprime[19], zzzprime[19], xxxprime[19]; + + memcpy(origx, x, 10 * sizeof(limb)); + fsum(x, z); + /* |x[i]| < 2^27 */ + fdifference(z, origx); /* does x - z */ + /* |z[i]| < 2^27 */ + + memcpy(origxprime, xprime, sizeof(limb) * 10); + fsum(xprime, zprime); + /* |xprime[i]| < 2^27 */ + fdifference(zprime, origxprime); + /* |zprime[i]| < 2^27 */ + fproduct(xxprime, xprime, z); + /* |xxprime[i]| < 14*2^54: the largest product of two limbs will be < + * 2^(27+27) and fproduct adds together, at most, 14 of those products. + * (Approximating that to 2^58 doesn't work out.) + */ + fproduct(zzprime, x, zprime); + /* |zzprime[i]| < 14*2^54 */ + freduce_degree(xxprime); + freduce_coefficients(xxprime); + /* |xxprime[i]| < 2^26 */ + freduce_degree(zzprime); + freduce_coefficients(zzprime); + /* |zzprime[i]| < 2^26 */ + memcpy(origxprime, xxprime, sizeof(limb) * 10); + fsum(xxprime, zzprime); + /* |xxprime[i]| < 2^27 */ + fdifference(zzprime, origxprime); + /* |zzprime[i]| < 2^27 */ + fsquare(xxxprime, xxprime); + /* |xxxprime[i]| < 2^26 */ + fsquare(zzzprime, zzprime); + /* |zzzprime[i]| < 2^26 */ + fproduct(zzprime, zzzprime, qmqp); + /* |zzprime[i]| < 14*2^52 */ + freduce_degree(zzprime); + freduce_coefficients(zzprime); + /* |zzprime[i]| < 2^26 */ + memcpy(x3, xxxprime, sizeof(limb) * 10); + memcpy(z3, zzprime, sizeof(limb) * 10); + + fsquare(xx, x); + /* |xx[i]| < 2^26 */ + fsquare(zz, z); + /* |zz[i]| < 2^26 */ + fproduct(x2, xx, zz); + /* |x2[i]| < 14*2^52 */ + freduce_degree(x2); + freduce_coefficients(x2); + /* |x2[i]| < 2^26 */ + fdifference(zz, xx); // does zz = xx - zz + /* |zz[i]| < 2^27 */ + memset(zzz + 10, 0, sizeof(limb) * 9); + fscalar_product(zzz, zz, 121665); + /* |zzz[i]| < 2^(27+17) */ + /* No need to call freduce_degree here: + fscalar_product doesn't increase the degree of its input. 
*/ + freduce_coefficients(zzz); + /* |zzz[i]| < 2^26 */ + fsum(zzz, xx); + /* |zzz[i]| < 2^27 */ + fproduct(z2, zz, zzz); + /* |z2[i]| < 14*2^(26+27) */ + freduce_degree(z2); + freduce_coefficients(z2); + /* |z2|i| < 2^26 */ +} + +/* Calculates nQ where Q is the x-coordinate of a point on the curve + * + * resultx/resultz: the x coordinate of the resulting curve point (short form) + * n: a little endian, 32-byte number + * q: a point of the curve (short form) + */ +static void cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) +{ + limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0}; + limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; + limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1}; + limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h; + + unsigned int i, j; + + memcpy(nqpqx, q, sizeof(limb) * 10); + + for (i = 0; i < 32; ++i) { + u8 byte = n[31 - i]; + + for (j = 0; j < 8; ++j) { + const limb bit = byte >> 7; + + swap_conditional(nqx, nqpqx, bit); + swap_conditional(nqz, nqpqz, bit); + fmonty(nqx2, nqz2, + nqpqx2, nqpqz2, + nqx, nqz, + nqpqx, nqpqz, + q); + swap_conditional(nqx2, nqpqx2, bit); + swap_conditional(nqz2, nqpqz2, bit); + + t = nqx; + nqx = nqx2; + nqx2 = t; + t = nqz; + nqz = nqz2; + nqz2 = t; + t = nqpqx; + nqpqx = nqpqx2; + nqpqx2 = t; + t = nqpqz; + nqpqz = nqpqz2; + nqpqz2 = t; + + byte <<= 1; + } + } + + memcpy(resultx, nqx, sizeof(limb) * 10); + memcpy(resultz, nqz, sizeof(limb) * 10); +} + +bool curve25519_donna32(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]) +{ + limb bp[10], x[10], z[11], zmone[10]; + u8 e[32]; + + memcpy(e, secret, 32); + normalize_secret(e); + + fexpand(bp, basepoint); + cmult(x, z, e, bp); + crecip(zmone, z); + fmul(z, x, zmone); + fcontract(mypublic, z); + + return true; +} diff --git a/curve25519-donna64.c b/curve25519-donna64.c new file mode 100644 index 0000000..f294369 --- /dev/null +++ b/curve25519-donna64.c @@ -0,0 +1,414 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2008 Google Inc. All Rights Reserved. + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + * + * Original author: Adam Langley + */ + +#include +#include + +enum { CURVE25519_POINT_SIZE = 32 }; + +typedef u64 limb; +typedef limb felem[5]; +typedef __uint128_t u128; + +static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE]) +{ + secret[0] &= 248; + secret[31] &= 127; + secret[31] |= 64; +} + +/* Sum two numbers: output += in */ +static __always_inline void fsum(limb *output, const limb *in) +{ + output[0] += in[0]; + output[1] += in[1]; + output[2] += in[2]; + output[3] += in[3]; + output[4] += in[4]; +} + +/* Find the difference of two numbers: output = in - output + * (note the order of the arguments!) 
+ * + * Assumes that out[i] < 2**52 + * On return, out[i] < 2**55 + */ +static __always_inline void fdifference_backwards(felem out, const felem in) +{ + /* 152 is 19 << 3 */ + static const limb two54m152 = (((limb)1) << 54) - 152; + static const limb two54m8 = (((limb)1) << 54) - 8; + + out[0] = in[0] + two54m152 - out[0]; + out[1] = in[1] + two54m8 - out[1]; + out[2] = in[2] + two54m8 - out[2]; + out[3] = in[3] + two54m8 - out[3]; + out[4] = in[4] + two54m8 - out[4]; +} + +/* Multiply a number by a scalar: output = in * scalar */ +static __always_inline void fscalar_product(felem output, const felem in, const limb scalar) +{ + u128 a; + + a = ((u128) in[0]) * scalar; + output[0] = ((limb)a) & 0x7ffffffffffffUL; + + a = ((u128) in[1]) * scalar + ((limb) (a >> 51)); + output[1] = ((limb)a) & 0x7ffffffffffffUL; + + a = ((u128) in[2]) * scalar + ((limb) (a >> 51)); + output[2] = ((limb)a) & 0x7ffffffffffffUL; + + a = ((u128) in[3]) * scalar + ((limb) (a >> 51)); + output[3] = ((limb)a) & 0x7ffffffffffffUL; + + a = ((u128) in[4]) * scalar + ((limb) (a >> 51)); + output[4] = ((limb)a) & 0x7ffffffffffffUL; + + output[0] += (a >> 51) * 19; +} + +/* Multiply two numbers: output = in2 * in + * + * output must be distinct to both inputs. The inputs are reduced coefficient + * form, the output is not. + * + * Assumes that in[i] < 2**55 and likewise for in2. + * On return, output[i] < 2**52 + */ +static __always_inline void fmul(felem output, const felem in2, const felem in) +{ + u128 t[5]; + limb r0, r1, r2, r3, r4, s0, s1, s2, s3, s4, c; + + r0 = in[0]; + r1 = in[1]; + r2 = in[2]; + r3 = in[3]; + r4 = in[4]; + + s0 = in2[0]; + s1 = in2[1]; + s2 = in2[2]; + s3 = in2[3]; + s4 = in2[4]; + + t[0] = ((u128) r0) * s0; + t[1] = ((u128) r0) * s1 + ((u128) r1) * s0; + t[2] = ((u128) r0) * s2 + ((u128) r2) * s0 + ((u128) r1) * s1; + t[3] = ((u128) r0) * s3 + ((u128) r3) * s0 + ((u128) r1) * s2 + ((u128) r2) * s1; + t[4] = ((u128) r0) * s4 + ((u128) r4) * s0 + ((u128) r3) * s1 + ((u128) r1) * s3 + ((u128) r2) * s2; + + r4 *= 19; + r1 *= 19; + r2 *= 19; + r3 *= 19; + + t[0] += ((u128) r4) * s1 + ((u128) r1) * s4 + ((u128) r2) * s3 + ((u128) r3) * s2; + t[1] += ((u128) r4) * s2 + ((u128) r2) * s4 + ((u128) r3) * s3; + t[2] += ((u128) r4) * s3 + ((u128) r3) * s4; + t[3] += ((u128) r4) * s4; + + r0 = (limb)t[0] & 0x7ffffffffffffUL; c = (limb)(t[0] >> 51); + t[1] += c; r1 = (limb)t[1] & 0x7ffffffffffffUL; c = (limb)(t[1] >> 51); + t[2] += c; r2 = (limb)t[2] & 0x7ffffffffffffUL; c = (limb)(t[2] >> 51); + t[3] += c; r3 = (limb)t[3] & 0x7ffffffffffffUL; c = (limb)(t[3] >> 51); + t[4] += c; r4 = (limb)t[4] & 0x7ffffffffffffUL; c = (limb)(t[4] >> 51); + r0 += c * 19; c = r0 >> 51; r0 = r0 & 0x7ffffffffffffUL; + r1 += c; c = r1 >> 51; r1 = r1 & 0x7ffffffffffffUL; + r2 += c; + + output[0] = r0; + output[1] = r1; + output[2] = r2; + output[3] = r3; + output[4] = r4; +} + +static __always_inline void fsquare_times(felem output, const felem in, limb count) +{ + u128 t[5]; + limb r0, r1, r2, r3, r4, c; + limb d0, d1, d2, d4, d419; + + r0 = in[0]; + r1 = in[1]; + r2 = in[2]; + r3 = in[3]; + r4 = in[4]; + + do { + d0 = r0 * 2; + d1 = r1 * 2; + d2 = r2 * 2 * 19; + d419 = r4 * 19; + d4 = d419 * 2; + + t[0] = ((u128) r0) * r0 + ((u128) d4) * r1 + (((u128) d2) * (r3 )); + t[1] = ((u128) d0) * r1 + ((u128) d4) * r2 + (((u128) r3) * (r3 * 19)); + t[2] = ((u128) d0) * r2 + ((u128) r1) * r1 + (((u128) d4) * (r3 )); + t[3] = ((u128) d0) * r3 + ((u128) d1) * r2 + (((u128) r4) * (d419 )); + t[4] = ((u128) d0) * r4 + ((u128) d1) * r3 + 
(((u128) r2) * (r2 )); + + r0 = (limb)t[0] & 0x7ffffffffffffUL; c = (limb)(t[0] >> 51); + t[1] += c; r1 = (limb)t[1] & 0x7ffffffffffffUL; c = (limb)(t[1] >> 51); + t[2] += c; r2 = (limb)t[2] & 0x7ffffffffffffUL; c = (limb)(t[2] >> 51); + t[3] += c; r3 = (limb)t[3] & 0x7ffffffffffffUL; c = (limb)(t[3] >> 51); + t[4] += c; r4 = (limb)t[4] & 0x7ffffffffffffUL; c = (limb)(t[4] >> 51); + r0 += c * 19; c = r0 >> 51; r0 = r0 & 0x7ffffffffffffUL; + r1 += c; c = r1 >> 51; r1 = r1 & 0x7ffffffffffffUL; + r2 += c; + } while (--count); + + output[0] = r0; + output[1] = r1; + output[2] = r2; + output[3] = r3; + output[4] = r4; +} + +/* Load a little-endian 64-bit number */ +static inline limb load_limb(const u8 *in) +{ + return le64_to_cpu(*(__le64 *)in); +} + +static inline void store_limb(u8 *out, limb in) +{ + *(__le64 *)out = cpu_to_le64(in); +} + +/* Take a little-endian, 32-byte number and expand it into polynomial form */ +static inline void fexpand(limb *output, const u8 *in) +{ + output[0] = load_limb(in) & 0x7ffffffffffffUL; + output[1] = (load_limb(in + 6) >> 3) & 0x7ffffffffffffUL; + output[2] = (load_limb(in + 12) >> 6) & 0x7ffffffffffffUL; + output[3] = (load_limb(in + 19) >> 1) & 0x7ffffffffffffUL; + output[4] = (load_limb(in + 24) >> 12) & 0x7ffffffffffffUL; +} + +/* Take a fully reduced polynomial form number and contract it into a + * little-endian, 32-byte array + */ +static void fcontract(u8 *output, const felem input) +{ + u128 t[5]; + + t[0] = input[0]; + t[1] = input[1]; + t[2] = input[2]; + t[3] = input[3]; + t[4] = input[4]; + + t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffffUL; + t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffffUL; + t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffffUL; + t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffffUL; + t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffffUL; + + t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffffUL; + t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffffUL; + t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffffUL; + t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffffUL; + t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffffUL; + + /* now t is between 0 and 2^255-1, properly carried. */ + /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */ + + t[0] += 19; + + t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffffUL; + t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffffUL; + t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffffUL; + t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffffUL; + t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffffUL; + + /* now between 19 and 2^255-1 in both cases, and offset by 19. */ + + t[0] += 0x8000000000000UL - 19; + t[1] += 0x8000000000000UL - 1; + t[2] += 0x8000000000000UL - 1; + t[3] += 0x8000000000000UL - 1; + t[4] += 0x8000000000000UL - 1; + + /* now between 2^255 and 2^256-20, and offset by 2^255. 
*/ + + t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffffUL; + t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffffUL; + t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffffUL; + t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffffUL; + t[4] &= 0x7ffffffffffffUL; + + store_limb(output, t[0] | (t[1] << 51)); + store_limb(output+8, (t[1] >> 13) | (t[2] << 38)); + store_limb(output+16, (t[2] >> 26) | (t[3] << 25)); + store_limb(output+24, (t[3] >> 39) | (t[4] << 12)); +} + +/* Input: Q, Q', Q-Q' + * Output: 2Q, Q+Q' + * + * x2 z3: long form + * x3 z3: long form + * x z: short form, destroyed + * xprime zprime: short form, destroyed + * qmqp: short form, preserved + */ +static void fmonty(limb *x2, limb *z2, /* output 2Q */ + limb *x3, limb *z3, /* output Q + Q' */ + limb *x, limb *z, /* input Q */ + limb *xprime, limb *zprime, /* input Q' */ + + const limb *qmqp /* input Q - Q' */) +{ + limb origx[5], origxprime[5], zzz[5], xx[5], zz[5], xxprime[5], zzprime[5], zzzprime[5]; + + memcpy(origx, x, 5 * sizeof(limb)); + fsum(x, z); + fdifference_backwards(z, origx); // does x - z + + memcpy(origxprime, xprime, sizeof(limb) * 5); + fsum(xprime, zprime); + fdifference_backwards(zprime, origxprime); + fmul(xxprime, xprime, z); + fmul(zzprime, x, zprime); + memcpy(origxprime, xxprime, sizeof(limb) * 5); + fsum(xxprime, zzprime); + fdifference_backwards(zzprime, origxprime); + fsquare_times(x3, xxprime, 1); + fsquare_times(zzzprime, zzprime, 1); + fmul(z3, zzzprime, qmqp); + + fsquare_times(xx, x, 1); + fsquare_times(zz, z, 1); + fmul(x2, xx, zz); + fdifference_backwards(zz, xx); // does zz = xx - zz + fscalar_product(zzz, zz, 121665); + fsum(zzz, xx); + fmul(z2, zz, zzz); +} + +/* Maybe swap the contents of two limb arrays (@a and @b), each @len elements + * long. Perform the swap iff @swap is non-zero. + * + * This function performs the swap without leaking any side-channel + * information. 
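+ * Note that @iswap must be exactly 0 or 1: the mask is built as -iswap, so
+ * any other value would swap some bits of the limbs but not others. Each
+ * array here holds the five 51-bit limbs of one field element.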
+ */ +static void swap_conditional(limb a[5], limb b[5], limb iswap) +{ + unsigned int i; + const limb swap = -iswap; + + for (i = 0; i < 5; ++i) { + const limb x = swap & (a[i] ^ b[i]); + + a[i] ^= x; + b[i] ^= x; + } +} + +/* Calculates nQ where Q is the x-coordinate of a point on the curve + * + * resultx/resultz: the x coordinate of the resulting curve point (short form) + * n: a little endian, 32-byte number + * q: a point of the curve (short form) + */ +static void cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) +{ + limb a[5] = {0}, b[5] = {1}, c[5] = {1}, d[5] = {0}; + limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; + limb e[5] = {0}, f[5] = {1}, g[5] = {0}, h[5] = {1}; + limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h; + + unsigned int i, j; + + memcpy(nqpqx, q, sizeof(limb) * 5); + + for (i = 0; i < 32; ++i) { + u8 byte = n[31 - i]; + + for (j = 0; j < 8; ++j) { + const limb bit = byte >> 7; + + swap_conditional(nqx, nqpqx, bit); + swap_conditional(nqz, nqpqz, bit); + fmonty(nqx2, nqz2, + nqpqx2, nqpqz2, + nqx, nqz, + nqpqx, nqpqz, + q); + swap_conditional(nqx2, nqpqx2, bit); + swap_conditional(nqz2, nqpqz2, bit); + + t = nqx; + nqx = nqx2; + nqx2 = t; + t = nqz; + nqz = nqz2; + nqz2 = t; + t = nqpqx; + nqpqx = nqpqx2; + nqpqx2 = t; + t = nqpqz; + nqpqz = nqpqz2; + nqpqz2 = t; + + byte <<= 1; + } + } + + memcpy(resultx, nqx, sizeof(limb) * 5); + memcpy(resultz, nqz, sizeof(limb) * 5); +} + +static void crecip(felem out, const felem z) +{ + felem a, t0, b, c; + + /* 2 */ fsquare_times(a, z, 1); // a = 2 + /* 8 */ fsquare_times(t0, a, 2); + /* 9 */ fmul(b, t0, z); // b = 9 + /* 11 */ fmul(a, b, a); // a = 11 + /* 22 */ fsquare_times(t0, a, 1); + /* 2^5 - 2^0 = 31 */ fmul(b, t0, b); + /* 2^10 - 2^5 */ fsquare_times(t0, b, 5); + /* 2^10 - 2^0 */ fmul(b, t0, b); + /* 2^20 - 2^10 */ fsquare_times(t0, b, 10); + /* 2^20 - 2^0 */ fmul(c, t0, b); + /* 2^40 - 2^20 */ fsquare_times(t0, c, 20); + /* 2^40 - 2^0 */ fmul(t0, t0, c); + /* 2^50 - 2^10 */ fsquare_times(t0, t0, 10); + /* 2^50 - 2^0 */ fmul(b, t0, b); + /* 2^100 - 2^50 */ fsquare_times(t0, b, 50); + /* 2^100 - 2^0 */ fmul(c, t0, b); + /* 2^200 - 2^100 */ fsquare_times(t0, c, 100); + /* 2^200 - 2^0 */ fmul(t0, t0, c); + /* 2^250 - 2^50 */ fsquare_times(t0, t0, 50); + /* 2^250 - 2^0 */ fmul(t0, t0, b); + /* 2^255 - 2^5 */ fsquare_times(t0, t0, 5); + /* 2^255 - 21 */ fmul(out, t0, a); +} + +bool curve25519_donna64(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]) +{ + limb bp[5], x[5], z[5], zmone[5]; + u8 e[32]; + + memcpy(e, secret, 32); + normalize_secret(e); + + fexpand(bp, basepoint); + cmult(x, z, e, bp); + crecip(zmone, z); + fmul(z, x, zmone); + fcontract(mypublic, z); + + return true; +} diff --git a/curve25519-fiat32.c b/curve25519-fiat32.c new file mode 100644 index 0000000..ac6c6e2 --- /dev/null +++ b/curve25519-fiat32.c @@ -0,0 +1,838 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2015-2016 The fiat-crypto Authors. + * Copyright (C) 2018 Jason A. Donenfeld . All Rights Reserved. + * + * This is a machine-generated formally verified implementation of curve25519 DH from: + * https://github.com/mit-plv/fiat-crypto + */ + +#include +#include + +enum { CURVE25519_POINT_SIZE = 32 }; + +static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE]) +{ + secret[0] &= 248; + secret[31] &= 127; + secret[31] |= 64; +} + +/* fe means field element. Here the field is \Z/(2^255-19). 
An element t, + * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77 + * t[3]+2^102 t[4]+...+2^230 t[9]. + * fe limbs are bounded by 1.125*2^26,1.125*2^25,1.125*2^26,1.125*2^25,etc. + * Multiplication and carrying produce fe from fe_loose. + */ +typedef struct fe { u32 v[10]; } fe; + +/* fe_loose limbs are bounded by 3.375*2^26,3.375*2^25,3.375*2^26,3.375*2^25,etc. + * Addition and subtraction produce fe_loose from (fe, fe). + */ +typedef struct fe_loose { u32 v[10]; } fe_loose; + +static __always_inline void fe_frombytes_impl(u32 h[10], const u8 *s) +{ + /* Ignores top bit of s. */ + u32 a0 = le32_to_cpup((__force __le32 *)(s)); + u32 a1 = le32_to_cpup((__force __le32 *)(s+4)); + u32 a2 = le32_to_cpup((__force __le32 *)(s+8)); + u32 a3 = le32_to_cpup((__force __le32 *)(s+12)); + u32 a4 = le32_to_cpup((__force __le32 *)(s+16)); + u32 a5 = le32_to_cpup((__force __le32 *)(s+20)); + u32 a6 = le32_to_cpup((__force __le32 *)(s+24)); + u32 a7 = le32_to_cpup((__force __le32 *)(s+28)); + h[0] = a0&((1<<26)-1); /* 26 used, 32-26 left. 26 */ + h[1] = (a0>>26) | ((a1&((1<<19)-1))<< 6); /* (32-26) + 19 = 6+19 = 25 */ + h[2] = (a1>>19) | ((a2&((1<<13)-1))<<13); /* (32-19) + 13 = 13+13 = 26 */ + h[3] = (a2>>13) | ((a3&((1<< 6)-1))<<19); /* (32-13) + 6 = 19+ 6 = 25 */ + h[4] = (a3>> 6); /* (32- 6) = 26 */ + h[5] = a4&((1<<25)-1); /* 25 */ + h[6] = (a4>>25) | ((a5&((1<<19)-1))<< 7); /* (32-25) + 19 = 7+19 = 26 */ + h[7] = (a5>>19) | ((a6&((1<<12)-1))<<13); /* (32-19) + 12 = 13+12 = 25 */ + h[8] = (a6>>12) | ((a7&((1<< 6)-1))<<20); /* (32-12) + 6 = 20+ 6 = 26 */ + h[9] = (a7>> 6)&((1<<25)-1); /* 25 */ +} + +static __always_inline void fe_frombytes(fe *h, const u8 *s) +{ + fe_frombytes_impl(h->v, s); +} + +static __always_inline u8 /*bool*/ addcarryx_u25(u8 /*bool*/ c, u32 a, u32 b, u32 *low) +{ + /* This function extracts 25 bits of result and 1 bit of carry (26 total), so + * a 32-bit intermediate is sufficient. + */ + u32 x = a + b + c; + *low = x & ((1 << 25) - 1); + return (x >> 25) & 1; +} + +static __always_inline u8 /*bool*/ addcarryx_u26(u8 /*bool*/ c, u32 a, u32 b, u32 *low) +{ + /* This function extracts 26 bits of result and 1 bit of carry (27 total), so + * a 32-bit intermediate is sufficient. + */ + u32 x = a + b + c; + *low = x & ((1 << 26) - 1); + return (x >> 26) & 1; +} + +static __always_inline u8 /*bool*/ subborrow_u25(u8 /*bool*/ c, u32 a, u32 b, u32 *low) +{ + /* This function extracts 25 bits of result and 1 bit of borrow (26 total), so + * a 32-bit intermediate is sufficient. + */ + u32 x = a - b - c; + *low = x & ((1 << 25) - 1); + return x >> 31; +} + +static __always_inline u8 /*bool*/ subborrow_u26(u8 /*bool*/ c, u32 a, u32 b, u32 *low) +{ + /* This function extracts 26 bits of result and 1 bit of borrow (27 total), so + * a 32-bit intermediate is sufficient. 
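+ * For example, subborrow_u26(0, 5, 7, &low) stores (5 - 7) mod 2^26 =
+ * 0x3fffffe in *low and returns 1 to signal the borrow.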
+ */ + u32 x = a - b - c; + *low = x & ((1 << 26) - 1); + return x >> 31; +} + +static __always_inline u32 cmovznz32(u32 t, u32 z, u32 nz) +{ + t = -!!t; /* all set if nonzero, 0 if 0 */ + return (t&nz) | ((~t)&z); +} + +static __always_inline void fe_freeze(u32 out[10], const u32 in1[10]) +{ + { const u32 x17 = in1[9]; + { const u32 x18 = in1[8]; + { const u32 x16 = in1[7]; + { const u32 x14 = in1[6]; + { const u32 x12 = in1[5]; + { const u32 x10 = in1[4]; + { const u32 x8 = in1[3]; + { const u32 x6 = in1[2]; + { const u32 x4 = in1[1]; + { const u32 x2 = in1[0]; + { u32 x20; u8/*bool*/ x21 = subborrow_u26(0x0, x2, 0x3ffffed, &x20); + { u32 x23; u8/*bool*/ x24 = subborrow_u25(x21, x4, 0x1ffffff, &x23); + { u32 x26; u8/*bool*/ x27 = subborrow_u26(x24, x6, 0x3ffffff, &x26); + { u32 x29; u8/*bool*/ x30 = subborrow_u25(x27, x8, 0x1ffffff, &x29); + { u32 x32; u8/*bool*/ x33 = subborrow_u26(x30, x10, 0x3ffffff, &x32); + { u32 x35; u8/*bool*/ x36 = subborrow_u25(x33, x12, 0x1ffffff, &x35); + { u32 x38; u8/*bool*/ x39 = subborrow_u26(x36, x14, 0x3ffffff, &x38); + { u32 x41; u8/*bool*/ x42 = subborrow_u25(x39, x16, 0x1ffffff, &x41); + { u32 x44; u8/*bool*/ x45 = subborrow_u26(x42, x18, 0x3ffffff, &x44); + { u32 x47; u8/*bool*/ x48 = subborrow_u25(x45, x17, 0x1ffffff, &x47); + { u32 x49 = cmovznz32(x48, 0x0, 0xffffffff); + { u32 x50 = (x49 & 0x3ffffed); + { u32 x52; u8/*bool*/ x53 = addcarryx_u26(0x0, x20, x50, &x52); + { u32 x54 = (x49 & 0x1ffffff); + { u32 x56; u8/*bool*/ x57 = addcarryx_u25(x53, x23, x54, &x56); + { u32 x58 = (x49 & 0x3ffffff); + { u32 x60; u8/*bool*/ x61 = addcarryx_u26(x57, x26, x58, &x60); + { u32 x62 = (x49 & 0x1ffffff); + { u32 x64; u8/*bool*/ x65 = addcarryx_u25(x61, x29, x62, &x64); + { u32 x66 = (x49 & 0x3ffffff); + { u32 x68; u8/*bool*/ x69 = addcarryx_u26(x65, x32, x66, &x68); + { u32 x70 = (x49 & 0x1ffffff); + { u32 x72; u8/*bool*/ x73 = addcarryx_u25(x69, x35, x70, &x72); + { u32 x74 = (x49 & 0x3ffffff); + { u32 x76; u8/*bool*/ x77 = addcarryx_u26(x73, x38, x74, &x76); + { u32 x78 = (x49 & 0x1ffffff); + { u32 x80; u8/*bool*/ x81 = addcarryx_u25(x77, x41, x78, &x80); + { u32 x82 = (x49 & 0x3ffffff); + { u32 x84; u8/*bool*/ x85 = addcarryx_u26(x81, x44, x82, &x84); + { u32 x86 = (x49 & 0x1ffffff); + { u32 x88; addcarryx_u25(x85, x47, x86, &x88); + out[0] = x52; + out[1] = x56; + out[2] = x60; + out[3] = x64; + out[4] = x68; + out[5] = x72; + out[6] = x76; + out[7] = x80; + out[8] = x84; + out[9] = x88; + }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}} +} + +static __always_inline void fe_tobytes(u8 s[32], const fe *f) +{ + u32 h[10]; + fe_freeze(h, f->v); + s[0] = h[0] >> 0; + s[1] = h[0] >> 8; + s[2] = h[0] >> 16; + s[3] = (h[0] >> 24) | (h[1] << 2); + s[4] = h[1] >> 6; + s[5] = h[1] >> 14; + s[6] = (h[1] >> 22) | (h[2] << 3); + s[7] = h[2] >> 5; + s[8] = h[2] >> 13; + s[9] = (h[2] >> 21) | (h[3] << 5); + s[10] = h[3] >> 3; + s[11] = h[3] >> 11; + s[12] = (h[3] >> 19) | (h[4] << 6); + s[13] = h[4] >> 2; + s[14] = h[4] >> 10; + s[15] = h[4] >> 18; + s[16] = h[5] >> 0; + s[17] = h[5] >> 8; + s[18] = h[5] >> 16; + s[19] = (h[5] >> 24) | (h[6] << 1); + s[20] = h[6] >> 7; + s[21] = h[6] >> 15; + s[22] = (h[6] >> 23) | (h[7] << 3); + s[23] = h[7] >> 5; + s[24] = h[7] >> 13; + s[25] = (h[7] >> 21) | (h[8] << 4); + s[26] = h[8] >> 4; + s[27] = h[8] >> 12; + s[28] = (h[8] >> 20) | (h[9] << 6); + s[29] = h[9] >> 2; + s[30] = h[9] >> 10; + s[31] = h[9] >> 18; +} + +/* h = f */ +static __always_inline void fe_copy(fe *h, const fe *f) +{ + memmove(h, f, sizeof(u32) * 10); +} + +static 
__always_inline void fe_copy_lt(fe_loose *h, const fe *f) +{ + memmove(h, f, sizeof(u32) * 10); +} + +/* h = 0 */ +static __always_inline void fe_0(fe *h) +{ + memset(h, 0, sizeof(u32) * 10); +} + +/* h = 1 */ +static __always_inline void fe_1(fe *h) +{ + memset(h, 0, sizeof(u32) * 10); + h->v[0] = 1; +} + +static __always_inline void fe_add_impl(u32 out[10], const u32 in1[10], const u32 in2[10]) +{ + { const u32 x20 = in1[9]; + { const u32 x21 = in1[8]; + { const u32 x19 = in1[7]; + { const u32 x17 = in1[6]; + { const u32 x15 = in1[5]; + { const u32 x13 = in1[4]; + { const u32 x11 = in1[3]; + { const u32 x9 = in1[2]; + { const u32 x7 = in1[1]; + { const u32 x5 = in1[0]; + { const u32 x38 = in2[9]; + { const u32 x39 = in2[8]; + { const u32 x37 = in2[7]; + { const u32 x35 = in2[6]; + { const u32 x33 = in2[5]; + { const u32 x31 = in2[4]; + { const u32 x29 = in2[3]; + { const u32 x27 = in2[2]; + { const u32 x25 = in2[1]; + { const u32 x23 = in2[0]; + out[0] = (x5 + x23); + out[1] = (x7 + x25); + out[2] = (x9 + x27); + out[3] = (x11 + x29); + out[4] = (x13 + x31); + out[5] = (x15 + x33); + out[6] = (x17 + x35); + out[7] = (x19 + x37); + out[8] = (x21 + x39); + out[9] = (x20 + x38); + }}}}}}}}}}}}}}}}}}}} +} + +/* h = f + g + * Can overlap h with f or g. + */ +static __always_inline void fe_add(fe_loose *h, const fe *f, const fe *g) +{ + fe_add_impl(h->v, f->v, g->v); +} + +static __always_inline void fe_sub_impl(u32 out[10], const u32 in1[10], const u32 in2[10]) +{ + { const u32 x20 = in1[9]; + { const u32 x21 = in1[8]; + { const u32 x19 = in1[7]; + { const u32 x17 = in1[6]; + { const u32 x15 = in1[5]; + { const u32 x13 = in1[4]; + { const u32 x11 = in1[3]; + { const u32 x9 = in1[2]; + { const u32 x7 = in1[1]; + { const u32 x5 = in1[0]; + { const u32 x38 = in2[9]; + { const u32 x39 = in2[8]; + { const u32 x37 = in2[7]; + { const u32 x35 = in2[6]; + { const u32 x33 = in2[5]; + { const u32 x31 = in2[4]; + { const u32 x29 = in2[3]; + { const u32 x27 = in2[2]; + { const u32 x25 = in2[1]; + { const u32 x23 = in2[0]; + out[0] = ((0x7ffffda + x5) - x23); + out[1] = ((0x3fffffe + x7) - x25); + out[2] = ((0x7fffffe + x9) - x27); + out[3] = ((0x3fffffe + x11) - x29); + out[4] = ((0x7fffffe + x13) - x31); + out[5] = ((0x3fffffe + x15) - x33); + out[6] = ((0x7fffffe + x17) - x35); + out[7] = ((0x3fffffe + x19) - x37); + out[8] = ((0x7fffffe + x21) - x39); + out[9] = ((0x3fffffe + x20) - x38); + }}}}}}}}}}}}}}}}}}}} +} + +/* h = f - g + * Can overlap h with f or g. 
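+ *
+ * The per-limb constants added in fe_sub_impl (0x7ffffda, 0x3fffffe, ...)
+ * together encode 2*(2^255 - 19), so the result equals f - g + 2p: the same
+ * value mod p, with every limb kept non-negative.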
+ */ +static __always_inline void fe_sub(fe_loose *h, const fe *f, const fe *g) +{ + fe_sub_impl(h->v, f->v, g->v); +} + +static __always_inline void fe_mul_impl(u32 out[10], const u32 in1[10], const u32 in2[10]) +{ + { const u32 x20 = in1[9]; + { const u32 x21 = in1[8]; + { const u32 x19 = in1[7]; + { const u32 x17 = in1[6]; + { const u32 x15 = in1[5]; + { const u32 x13 = in1[4]; + { const u32 x11 = in1[3]; + { const u32 x9 = in1[2]; + { const u32 x7 = in1[1]; + { const u32 x5 = in1[0]; + { const u32 x38 = in2[9]; + { const u32 x39 = in2[8]; + { const u32 x37 = in2[7]; + { const u32 x35 = in2[6]; + { const u32 x33 = in2[5]; + { const u32 x31 = in2[4]; + { const u32 x29 = in2[3]; + { const u32 x27 = in2[2]; + { const u32 x25 = in2[1]; + { const u32 x23 = in2[0]; + { u64 x40 = ((u64)x23 * x5); + { u64 x41 = (((u64)x23 * x7) + ((u64)x25 * x5)); + { u64 x42 = ((((u64)(0x2 * x25) * x7) + ((u64)x23 * x9)) + ((u64)x27 * x5)); + { u64 x43 = (((((u64)x25 * x9) + ((u64)x27 * x7)) + ((u64)x23 * x11)) + ((u64)x29 * x5)); + { u64 x44 = (((((u64)x27 * x9) + (0x2 * (((u64)x25 * x11) + ((u64)x29 * x7)))) + ((u64)x23 * x13)) + ((u64)x31 * x5)); + { u64 x45 = (((((((u64)x27 * x11) + ((u64)x29 * x9)) + ((u64)x25 * x13)) + ((u64)x31 * x7)) + ((u64)x23 * x15)) + ((u64)x33 * x5)); + { u64 x46 = (((((0x2 * ((((u64)x29 * x11) + ((u64)x25 * x15)) + ((u64)x33 * x7))) + ((u64)x27 * x13)) + ((u64)x31 * x9)) + ((u64)x23 * x17)) + ((u64)x35 * x5)); + { u64 x47 = (((((((((u64)x29 * x13) + ((u64)x31 * x11)) + ((u64)x27 * x15)) + ((u64)x33 * x9)) + ((u64)x25 * x17)) + ((u64)x35 * x7)) + ((u64)x23 * x19)) + ((u64)x37 * x5)); + { u64 x48 = (((((((u64)x31 * x13) + (0x2 * (((((u64)x29 * x15) + ((u64)x33 * x11)) + ((u64)x25 * x19)) + ((u64)x37 * x7)))) + ((u64)x27 * x17)) + ((u64)x35 * x9)) + ((u64)x23 * x21)) + ((u64)x39 * x5)); + { u64 x49 = (((((((((((u64)x31 * x15) + ((u64)x33 * x13)) + ((u64)x29 * x17)) + ((u64)x35 * x11)) + ((u64)x27 * x19)) + ((u64)x37 * x9)) + ((u64)x25 * x21)) + ((u64)x39 * x7)) + ((u64)x23 * x20)) + ((u64)x38 * x5)); + { u64 x50 = (((((0x2 * ((((((u64)x33 * x15) + ((u64)x29 * x19)) + ((u64)x37 * x11)) + ((u64)x25 * x20)) + ((u64)x38 * x7))) + ((u64)x31 * x17)) + ((u64)x35 * x13)) + ((u64)x27 * x21)) + ((u64)x39 * x9)); + { u64 x51 = (((((((((u64)x33 * x17) + ((u64)x35 * x15)) + ((u64)x31 * x19)) + ((u64)x37 * x13)) + ((u64)x29 * x21)) + ((u64)x39 * x11)) + ((u64)x27 * x20)) + ((u64)x38 * x9)); + { u64 x52 = (((((u64)x35 * x17) + (0x2 * (((((u64)x33 * x19) + ((u64)x37 * x15)) + ((u64)x29 * x20)) + ((u64)x38 * x11)))) + ((u64)x31 * x21)) + ((u64)x39 * x13)); + { u64 x53 = (((((((u64)x35 * x19) + ((u64)x37 * x17)) + ((u64)x33 * x21)) + ((u64)x39 * x15)) + ((u64)x31 * x20)) + ((u64)x38 * x13)); + { u64 x54 = (((0x2 * ((((u64)x37 * x19) + ((u64)x33 * x20)) + ((u64)x38 * x15))) + ((u64)x35 * x21)) + ((u64)x39 * x17)); + { u64 x55 = (((((u64)x37 * x21) + ((u64)x39 * x19)) + ((u64)x35 * x20)) + ((u64)x38 * x17)); + { u64 x56 = (((u64)x39 * x21) + (0x2 * (((u64)x37 * x20) + ((u64)x38 * x19)))); + { u64 x57 = (((u64)x39 * x20) + ((u64)x38 * x21)); + { u64 x58 = ((u64)(0x2 * x38) * x20); + { u64 x59 = (x48 + (x58 << 0x4)); + { u64 x60 = (x59 + (x58 << 0x1)); + { u64 x61 = (x60 + x58); + { u64 x62 = (x47 + (x57 << 0x4)); + { u64 x63 = (x62 + (x57 << 0x1)); + { u64 x64 = (x63 + x57); + { u64 x65 = (x46 + (x56 << 0x4)); + { u64 x66 = (x65 + (x56 << 0x1)); + { u64 x67 = (x66 + x56); + { u64 x68 = (x45 + (x55 << 0x4)); + { u64 x69 = (x68 + (x55 << 0x1)); + { u64 x70 = (x69 + x55); + { u64 x71 = (x44 + (x54 << 
0x4)); + { u64 x72 = (x71 + (x54 << 0x1)); + { u64 x73 = (x72 + x54); + { u64 x74 = (x43 + (x53 << 0x4)); + { u64 x75 = (x74 + (x53 << 0x1)); + { u64 x76 = (x75 + x53); + { u64 x77 = (x42 + (x52 << 0x4)); + { u64 x78 = (x77 + (x52 << 0x1)); + { u64 x79 = (x78 + x52); + { u64 x80 = (x41 + (x51 << 0x4)); + { u64 x81 = (x80 + (x51 << 0x1)); + { u64 x82 = (x81 + x51); + { u64 x83 = (x40 + (x50 << 0x4)); + { u64 x84 = (x83 + (x50 << 0x1)); + { u64 x85 = (x84 + x50); + { u64 x86 = (x85 >> 0x1a); + { u32 x87 = ((u32)x85 & 0x3ffffff); + { u64 x88 = (x86 + x82); + { u64 x89 = (x88 >> 0x19); + { u32 x90 = ((u32)x88 & 0x1ffffff); + { u64 x91 = (x89 + x79); + { u64 x92 = (x91 >> 0x1a); + { u32 x93 = ((u32)x91 & 0x3ffffff); + { u64 x94 = (x92 + x76); + { u64 x95 = (x94 >> 0x19); + { u32 x96 = ((u32)x94 & 0x1ffffff); + { u64 x97 = (x95 + x73); + { u64 x98 = (x97 >> 0x1a); + { u32 x99 = ((u32)x97 & 0x3ffffff); + { u64 x100 = (x98 + x70); + { u64 x101 = (x100 >> 0x19); + { u32 x102 = ((u32)x100 & 0x1ffffff); + { u64 x103 = (x101 + x67); + { u64 x104 = (x103 >> 0x1a); + { u32 x105 = ((u32)x103 & 0x3ffffff); + { u64 x106 = (x104 + x64); + { u64 x107 = (x106 >> 0x19); + { u32 x108 = ((u32)x106 & 0x1ffffff); + { u64 x109 = (x107 + x61); + { u64 x110 = (x109 >> 0x1a); + { u32 x111 = ((u32)x109 & 0x3ffffff); + { u64 x112 = (x110 + x49); + { u64 x113 = (x112 >> 0x19); + { u32 x114 = ((u32)x112 & 0x1ffffff); + { u64 x115 = (x87 + (0x13 * x113)); + { u32 x116 = (u32) (x115 >> 0x1a); + { u32 x117 = ((u32)x115 & 0x3ffffff); + { u32 x118 = (x116 + x90); + { u32 x119 = (x118 >> 0x19); + { u32 x120 = (x118 & 0x1ffffff); + out[0] = x117; + out[1] = x120; + out[2] = (x119 + x93); + out[3] = x96; + out[4] = x99; + out[5] = x102; + out[6] = x105; + out[7] = x108; + out[8] = x111; + out[9] = x114; + }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}} +} + +static __always_inline void fe_mul_ttt(fe *h, const fe *f, const fe *g) +{ + fe_mul_impl(h->v, f->v, g->v); +} + +static __always_inline void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) +{ + fe_mul_impl(h->v, f->v, g->v); +} + +static __always_inline void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) +{ + fe_mul_impl(h->v, f->v, g->v); +} + +static __always_inline void fe_sqr_impl(u32 out[10], const u32 in1[10]) +{ + { const u32 x17 = in1[9]; + { const u32 x18 = in1[8]; + { const u32 x16 = in1[7]; + { const u32 x14 = in1[6]; + { const u32 x12 = in1[5]; + { const u32 x10 = in1[4]; + { const u32 x8 = in1[3]; + { const u32 x6 = in1[2]; + { const u32 x4 = in1[1]; + { const u32 x2 = in1[0]; + { u64 x19 = ((u64)x2 * x2); + { u64 x20 = ((u64)(0x2 * x2) * x4); + { u64 x21 = (0x2 * (((u64)x4 * x4) + ((u64)x2 * x6))); + { u64 x22 = (0x2 * (((u64)x4 * x6) + ((u64)x2 * x8))); + { u64 x23 = ((((u64)x6 * x6) + ((u64)(0x4 * x4) * x8)) + ((u64)(0x2 * x2) * x10)); + { u64 x24 = (0x2 * ((((u64)x6 * x8) + ((u64)x4 * x10)) + ((u64)x2 * x12))); + { u64 x25 = (0x2 * (((((u64)x8 * x8) + ((u64)x6 * x10)) + ((u64)x2 * x14)) + ((u64)(0x2 * x4) * x12))); + { u64 x26 = (0x2 * (((((u64)x8 * x10) + ((u64)x6 * x12)) + ((u64)x4 * x14)) + ((u64)x2 * x16))); + { u64 x27 = (((u64)x10 * x10) + (0x2 * ((((u64)x6 * x14) + ((u64)x2 * x18)) + (0x2 * (((u64)x4 * x16) + ((u64)x8 * x12)))))); + { u64 x28 = (0x2 * ((((((u64)x10 * x12) + ((u64)x8 * x14)) + ((u64)x6 * x16)) + ((u64)x4 * x18)) + ((u64)x2 * x17))); + { u64 x29 = (0x2 * (((((u64)x12 * x12) + ((u64)x10 * x14)) + ((u64)x6 * x18)) + (0x2 * (((u64)x8 * x16) + ((u64)x4 * x17))))); + { u64 x30 
= (0x2 * (((((u64)x12 * x14) + ((u64)x10 * x16)) + ((u64)x8 * x18)) + ((u64)x6 * x17))); + { u64 x31 = (((u64)x14 * x14) + (0x2 * (((u64)x10 * x18) + (0x2 * (((u64)x12 * x16) + ((u64)x8 * x17)))))); + { u64 x32 = (0x2 * ((((u64)x14 * x16) + ((u64)x12 * x18)) + ((u64)x10 * x17))); + { u64 x33 = (0x2 * ((((u64)x16 * x16) + ((u64)x14 * x18)) + ((u64)(0x2 * x12) * x17))); + { u64 x34 = (0x2 * (((u64)x16 * x18) + ((u64)x14 * x17))); + { u64 x35 = (((u64)x18 * x18) + ((u64)(0x4 * x16) * x17)); + { u64 x36 = ((u64)(0x2 * x18) * x17); + { u64 x37 = ((u64)(0x2 * x17) * x17); + { u64 x38 = (x27 + (x37 << 0x4)); + { u64 x39 = (x38 + (x37 << 0x1)); + { u64 x40 = (x39 + x37); + { u64 x41 = (x26 + (x36 << 0x4)); + { u64 x42 = (x41 + (x36 << 0x1)); + { u64 x43 = (x42 + x36); + { u64 x44 = (x25 + (x35 << 0x4)); + { u64 x45 = (x44 + (x35 << 0x1)); + { u64 x46 = (x45 + x35); + { u64 x47 = (x24 + (x34 << 0x4)); + { u64 x48 = (x47 + (x34 << 0x1)); + { u64 x49 = (x48 + x34); + { u64 x50 = (x23 + (x33 << 0x4)); + { u64 x51 = (x50 + (x33 << 0x1)); + { u64 x52 = (x51 + x33); + { u64 x53 = (x22 + (x32 << 0x4)); + { u64 x54 = (x53 + (x32 << 0x1)); + { u64 x55 = (x54 + x32); + { u64 x56 = (x21 + (x31 << 0x4)); + { u64 x57 = (x56 + (x31 << 0x1)); + { u64 x58 = (x57 + x31); + { u64 x59 = (x20 + (x30 << 0x4)); + { u64 x60 = (x59 + (x30 << 0x1)); + { u64 x61 = (x60 + x30); + { u64 x62 = (x19 + (x29 << 0x4)); + { u64 x63 = (x62 + (x29 << 0x1)); + { u64 x64 = (x63 + x29); + { u64 x65 = (x64 >> 0x1a); + { u32 x66 = ((u32)x64 & 0x3ffffff); + { u64 x67 = (x65 + x61); + { u64 x68 = (x67 >> 0x19); + { u32 x69 = ((u32)x67 & 0x1ffffff); + { u64 x70 = (x68 + x58); + { u64 x71 = (x70 >> 0x1a); + { u32 x72 = ((u32)x70 & 0x3ffffff); + { u64 x73 = (x71 + x55); + { u64 x74 = (x73 >> 0x19); + { u32 x75 = ((u32)x73 & 0x1ffffff); + { u64 x76 = (x74 + x52); + { u64 x77 = (x76 >> 0x1a); + { u32 x78 = ((u32)x76 & 0x3ffffff); + { u64 x79 = (x77 + x49); + { u64 x80 = (x79 >> 0x19); + { u32 x81 = ((u32)x79 & 0x1ffffff); + { u64 x82 = (x80 + x46); + { u64 x83 = (x82 >> 0x1a); + { u32 x84 = ((u32)x82 & 0x3ffffff); + { u64 x85 = (x83 + x43); + { u64 x86 = (x85 >> 0x19); + { u32 x87 = ((u32)x85 & 0x1ffffff); + { u64 x88 = (x86 + x40); + { u64 x89 = (x88 >> 0x1a); + { u32 x90 = ((u32)x88 & 0x3ffffff); + { u64 x91 = (x89 + x28); + { u64 x92 = (x91 >> 0x19); + { u32 x93 = ((u32)x91 & 0x1ffffff); + { u64 x94 = (x66 + (0x13 * x92)); + { u32 x95 = (u32) (x94 >> 0x1a); + { u32 x96 = ((u32)x94 & 0x3ffffff); + { u32 x97 = (x95 + x69); + { u32 x98 = (x97 >> 0x19); + { u32 x99 = (x97 & 0x1ffffff); + out[0] = x96; + out[1] = x99; + out[2] = (x98 + x72); + out[3] = x75; + out[4] = x78; + out[5] = x81; + out[6] = x84; + out[7] = x87; + out[8] = x90; + out[9] = x93; + }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}} +} + +static __always_inline void fe_sq_tl(fe *h, const fe_loose *f) +{ + fe_sqr_impl(h->v, f->v); +} + +static __always_inline void fe_sq_tt(fe *h, const fe *f) +{ + fe_sqr_impl(h->v, f->v); +} + +static __always_inline void fe_loose_invert(fe *out, const fe_loose *z) +{ + fe t0; + fe t1; + fe t2; + fe t3; + int i; + + fe_sq_tl(&t0, z); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 2; ++i) + fe_sq_tt(&t1, &t1); + fe_mul_tlt(&t1, z, &t1); + fe_mul_ttt(&t0, &t0, &t1); + fe_sq_tt(&t2, &t0); + fe_mul_ttt(&t1, &t1, &t2); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 5; ++i) + fe_sq_tt(&t2, &t2); + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 10; ++i) + fe_sq_tt(&t2, &t2); + fe_mul_ttt(&t2, &t2, &t1); + 
fe_sq_tt(&t3, &t2); + for (i = 1; i < 20; ++i) + fe_sq_tt(&t3, &t3); + fe_mul_ttt(&t2, &t3, &t2); + fe_sq_tt(&t2, &t2); + for (i = 1; i < 10; ++i) + fe_sq_tt(&t2, &t2); + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 50; ++i) + fe_sq_tt(&t2, &t2); + fe_mul_ttt(&t2, &t2, &t1); + fe_sq_tt(&t3, &t2); + for (i = 1; i < 100; ++i) + fe_sq_tt(&t3, &t3); + fe_mul_ttt(&t2, &t3, &t2); + fe_sq_tt(&t2, &t2); + for (i = 1; i < 50; ++i) + fe_sq_tt(&t2, &t2); + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t1, &t1); + for (i = 1; i < 5; ++i) + fe_sq_tt(&t1, &t1); + fe_mul_ttt(out, &t1, &t0); +} + +static __always_inline void fe_invert(fe *out, const fe *z) +{ + fe_loose l; + fe_copy_lt(&l, z); + fe_loose_invert(out, &l); +} + +/* Replace (f,g) with (g,f) if b == 1; + * replace (f,g) with (f,g) if b == 0. + * + * Preconditions: b in {0,1} + */ +static __always_inline void fe_cswap(fe *f, fe *g, unsigned int b) +{ + unsigned i; + b = 0-b; + for (i = 0; i < 10; i++) { + u32 x = f->v[i] ^ g->v[i]; + x &= b; + f->v[i] ^= x; + g->v[i] ^= x; + } +} + +/* NOTE: based on fiat-crypto fe_mul, edited for in2=121666, 0, 0.*/ +static __always_inline void fe_mul_121666_impl(u32 out[10], const u32 in1[10]) +{ + { const u32 x20 = in1[9]; + { const u32 x21 = in1[8]; + { const u32 x19 = in1[7]; + { const u32 x17 = in1[6]; + { const u32 x15 = in1[5]; + { const u32 x13 = in1[4]; + { const u32 x11 = in1[3]; + { const u32 x9 = in1[2]; + { const u32 x7 = in1[1]; + { const u32 x5 = in1[0]; + { const u32 x38 = 0; + { const u32 x39 = 0; + { const u32 x37 = 0; + { const u32 x35 = 0; + { const u32 x33 = 0; + { const u32 x31 = 0; + { const u32 x29 = 0; + { const u32 x27 = 0; + { const u32 x25 = 0; + { const u32 x23 = 121666; + { u64 x40 = ((u64)x23 * x5); + { u64 x41 = (((u64)x23 * x7) + ((u64)x25 * x5)); + { u64 x42 = ((((u64)(0x2 * x25) * x7) + ((u64)x23 * x9)) + ((u64)x27 * x5)); + { u64 x43 = (((((u64)x25 * x9) + ((u64)x27 * x7)) + ((u64)x23 * x11)) + ((u64)x29 * x5)); + { u64 x44 = (((((u64)x27 * x9) + (0x2 * (((u64)x25 * x11) + ((u64)x29 * x7)))) + ((u64)x23 * x13)) + ((u64)x31 * x5)); + { u64 x45 = (((((((u64)x27 * x11) + ((u64)x29 * x9)) + ((u64)x25 * x13)) + ((u64)x31 * x7)) + ((u64)x23 * x15)) + ((u64)x33 * x5)); + { u64 x46 = (((((0x2 * ((((u64)x29 * x11) + ((u64)x25 * x15)) + ((u64)x33 * x7))) + ((u64)x27 * x13)) + ((u64)x31 * x9)) + ((u64)x23 * x17)) + ((u64)x35 * x5)); + { u64 x47 = (((((((((u64)x29 * x13) + ((u64)x31 * x11)) + ((u64)x27 * x15)) + ((u64)x33 * x9)) + ((u64)x25 * x17)) + ((u64)x35 * x7)) + ((u64)x23 * x19)) + ((u64)x37 * x5)); + { u64 x48 = (((((((u64)x31 * x13) + (0x2 * (((((u64)x29 * x15) + ((u64)x33 * x11)) + ((u64)x25 * x19)) + ((u64)x37 * x7)))) + ((u64)x27 * x17)) + ((u64)x35 * x9)) + ((u64)x23 * x21)) + ((u64)x39 * x5)); + { u64 x49 = (((((((((((u64)x31 * x15) + ((u64)x33 * x13)) + ((u64)x29 * x17)) + ((u64)x35 * x11)) + ((u64)x27 * x19)) + ((u64)x37 * x9)) + ((u64)x25 * x21)) + ((u64)x39 * x7)) + ((u64)x23 * x20)) + ((u64)x38 * x5)); + { u64 x50 = (((((0x2 * ((((((u64)x33 * x15) + ((u64)x29 * x19)) + ((u64)x37 * x11)) + ((u64)x25 * x20)) + ((u64)x38 * x7))) + ((u64)x31 * x17)) + ((u64)x35 * x13)) + ((u64)x27 * x21)) + ((u64)x39 * x9)); + { u64 x51 = (((((((((u64)x33 * x17) + ((u64)x35 * x15)) + ((u64)x31 * x19)) + ((u64)x37 * x13)) + ((u64)x29 * x21)) + ((u64)x39 * x11)) + ((u64)x27 * x20)) + ((u64)x38 * x9)); + { u64 x52 = (((((u64)x35 * x17) + (0x2 * (((((u64)x33 * x19) + ((u64)x37 * x15)) + ((u64)x29 * x20)) + ((u64)x38 * x11)))) + ((u64)x31 * x21)) + ((u64)x39 * x13)); + { u64 
x53 = (((((((u64)x35 * x19) + ((u64)x37 * x17)) + ((u64)x33 * x21)) + ((u64)x39 * x15)) + ((u64)x31 * x20)) + ((u64)x38 * x13)); + { u64 x54 = (((0x2 * ((((u64)x37 * x19) + ((u64)x33 * x20)) + ((u64)x38 * x15))) + ((u64)x35 * x21)) + ((u64)x39 * x17)); + { u64 x55 = (((((u64)x37 * x21) + ((u64)x39 * x19)) + ((u64)x35 * x20)) + ((u64)x38 * x17)); + { u64 x56 = (((u64)x39 * x21) + (0x2 * (((u64)x37 * x20) + ((u64)x38 * x19)))); + { u64 x57 = (((u64)x39 * x20) + ((u64)x38 * x21)); + { u64 x58 = ((u64)(0x2 * x38) * x20); + { u64 x59 = (x48 + (x58 << 0x4)); + { u64 x60 = (x59 + (x58 << 0x1)); + { u64 x61 = (x60 + x58); + { u64 x62 = (x47 + (x57 << 0x4)); + { u64 x63 = (x62 + (x57 << 0x1)); + { u64 x64 = (x63 + x57); + { u64 x65 = (x46 + (x56 << 0x4)); + { u64 x66 = (x65 + (x56 << 0x1)); + { u64 x67 = (x66 + x56); + { u64 x68 = (x45 + (x55 << 0x4)); + { u64 x69 = (x68 + (x55 << 0x1)); + { u64 x70 = (x69 + x55); + { u64 x71 = (x44 + (x54 << 0x4)); + { u64 x72 = (x71 + (x54 << 0x1)); + { u64 x73 = (x72 + x54); + { u64 x74 = (x43 + (x53 << 0x4)); + { u64 x75 = (x74 + (x53 << 0x1)); + { u64 x76 = (x75 + x53); + { u64 x77 = (x42 + (x52 << 0x4)); + { u64 x78 = (x77 + (x52 << 0x1)); + { u64 x79 = (x78 + x52); + { u64 x80 = (x41 + (x51 << 0x4)); + { u64 x81 = (x80 + (x51 << 0x1)); + { u64 x82 = (x81 + x51); + { u64 x83 = (x40 + (x50 << 0x4)); + { u64 x84 = (x83 + (x50 << 0x1)); + { u64 x85 = (x84 + x50); + { u64 x86 = (x85 >> 0x1a); + { u32 x87 = ((u32)x85 & 0x3ffffff); + { u64 x88 = (x86 + x82); + { u64 x89 = (x88 >> 0x19); + { u32 x90 = ((u32)x88 & 0x1ffffff); + { u64 x91 = (x89 + x79); + { u64 x92 = (x91 >> 0x1a); + { u32 x93 = ((u32)x91 & 0x3ffffff); + { u64 x94 = (x92 + x76); + { u64 x95 = (x94 >> 0x19); + { u32 x96 = ((u32)x94 & 0x1ffffff); + { u64 x97 = (x95 + x73); + { u64 x98 = (x97 >> 0x1a); + { u32 x99 = ((u32)x97 & 0x3ffffff); + { u64 x100 = (x98 + x70); + { u64 x101 = (x100 >> 0x19); + { u32 x102 = ((u32)x100 & 0x1ffffff); + { u64 x103 = (x101 + x67); + { u64 x104 = (x103 >> 0x1a); + { u32 x105 = ((u32)x103 & 0x3ffffff); + { u64 x106 = (x104 + x64); + { u64 x107 = (x106 >> 0x19); + { u32 x108 = ((u32)x106 & 0x1ffffff); + { u64 x109 = (x107 + x61); + { u64 x110 = (x109 >> 0x1a); + { u32 x111 = ((u32)x109 & 0x3ffffff); + { u64 x112 = (x110 + x49); + { u64 x113 = (x112 >> 0x19); + { u32 x114 = ((u32)x112 & 0x1ffffff); + { u64 x115 = (x87 + (0x13 * x113)); + { u32 x116 = (u32) (x115 >> 0x1a); + { u32 x117 = ((u32)x115 & 0x3ffffff); + { u32 x118 = (x116 + x90); + { u32 x119 = (x118 >> 0x19); + { u32 x120 = (x118 & 0x1ffffff); + out[0] = x117; + out[1] = x120; + out[2] = (x119 + x93); + out[3] = x96; + out[4] = x99; + out[5] = x102; + out[6] = x105; + out[7] = x108; + out[8] = x111; + out[9] = x114; + }}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}} +} + +static __always_inline void fe_mul121666(fe *h, const fe_loose *f) +{ + fe_mul_121666_impl(h->v, f->v); +} + +bool curve25519_fiat32(u8 out[CURVE25519_POINT_SIZE], const u8 scalar[CURVE25519_POINT_SIZE], const u8 point[CURVE25519_POINT_SIZE]) +{ + fe x1, x2, z2, x3, z3, tmp0, tmp1; + fe_loose x2l, z2l, x3l, tmp0l, tmp1l; + unsigned swap = 0; + int pos; + u8 e[32]; + + memcpy(e, scalar, 32); + normalize_secret(e); + + /* The following implementation was transcribed to Coq and proven to + * correspond to unary scalar multiplication in affine coordinates given that + * x1 != 0 is the x coordinate of some point on the curve. 
It was also checked + * in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 + * = z3 = 0 gives z2' = z3' = 0. The statement was quantified over the + * underlying field, so it applies to Curve25519 itself and the quadratic + * twist of Curve25519. It was not proven in Coq that prime-field arithmetic + * correctly simulates extension-field arithmetic on prime-field values. + * The decoding of the byte array representation of e was not considered. + * Specification of Montgomery curves in affine coordinates: + * + * Proof that these form a group that is isomorphic to a Weierstrass curve: + * + * Coq transcription and correctness proof of the loop (where scalarbits=255): + * + * + * preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 + */ + fe_frombytes(&x1, point); + fe_1(&x2); + fe_0(&z2); + fe_copy(&x3, &x1); + fe_1(&z3); + + for (pos = 254; pos >= 0; --pos) { + /* loop invariant as of right before the test, for the case where x1 != 0: + * pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero + * let r := e >> (pos+1) in the following equalities of projective points: + * to_xz (r*P) === if swap then (x3, z3) else (x2, z2) + * to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) + * x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) + */ + unsigned b = 1 & (e[pos / 8] >> (pos & 7)); + swap ^= b; + fe_cswap(&x2, &x3, swap); + fe_cswap(&z2, &z3, swap); + swap = b; + /* Coq transcription of ladderstep formula (called from transcribed loop): + * + * + * x1 != 0 + * x1 = 0 + */ + fe_sub(&tmp0l, &x3, &z3); + fe_sub(&tmp1l, &x2, &z2); + fe_add(&x2l, &x2, &z2); + fe_add(&z2l, &x3, &z3); + fe_mul_tll(&z3, &tmp0l, &x2l); + fe_mul_tll(&z2, &z2l, &tmp1l); + fe_sq_tl(&tmp0, &tmp1l); + fe_sq_tl(&tmp1, &x2l); + fe_add(&x3l, &z3, &z2); + fe_sub(&z2l, &z3, &z2); + fe_mul_ttt(&x2, &tmp1, &tmp0); + fe_sub(&tmp1l, &tmp1, &tmp0); + fe_sq_tl(&z2, &z2l); + fe_mul121666(&z3, &tmp1l); + fe_sq_tl(&x3, &x3l); + fe_add(&tmp0l, &tmp0, &z3); + fe_mul_ttt(&z3, &x1, &z2); + fe_mul_tll(&z2, &tmp1l, &tmp0l); + } + /* here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2) */ + fe_cswap(&x2, &x3, swap); + fe_cswap(&z2, &z3, swap); + + fe_invert(&z2, &z2); + fe_mul_ttt(&x2, &x2, &z2); + fe_tobytes(out, &x2); + + return true; +} diff --git a/curve25519-hacl64.c b/curve25519-hacl64.c new file mode 100644 index 0000000..af2460b --- /dev/null +++ b/curve25519-hacl64.c @@ -0,0 +1,751 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2016-2017 INRIA and Microsoft Corporation. + * Copyright (C) 2018 Jason A. Donenfeld . All Rights Reserved. 
+ * + * This is a machine-generated formally verified implementation of curve25519 DH from: + * https://github.com/mitls/hacl-star + */ + +#include +#include + +enum { CURVE25519_POINT_SIZE = 32 }; + +static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE]) +{ + secret[0] &= 248; + secret[31] &= 127; + secret[31] |= 64; +} + +typedef __uint128_t u128; + +static __always_inline u64 u64_eq_mask(u64 x, u64 y) +{ + x = ~(x ^ y); + x &= x << 32; + x &= x << 16; + x &= x << 8; + x &= x << 4; + x &= x << 2; + x &= x << 1; + return ((s64)x) >> 63; +} + +static __always_inline u64 u64_gte_mask(u64 x, u64 y) +{ + u64 low63 = ~((u64)((s64)((s64)(x & 0x7fffffffffffffffLLU) - (s64)(y & 0x7fffffffffffffffLLU)) >> 63)); + u64 high_bit = ~((u64)((s64)((s64)(x & 0x8000000000000000LLU) - (s64)(y & 0x8000000000000000LLU)) >> 63)); + return low63 & high_bit; +} + +static __always_inline void modulo_carry_top(u64 *b) +{ + u64 b4 = b[4]; + u64 b0 = b[0]; + u64 b4_ = b4 & 0x7ffffffffffffLLU; + u64 b0_ = b0 + 19 * (b4 >> 51); + b[4] = b4_; + b[0] = b0_; +} + +static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input) +{ + { + u128 xi = input[0]; + output[0] = ((u64)(xi)); + } + { + u128 xi = input[1]; + output[1] = ((u64)(xi)); + } + { + u128 xi = input[2]; + output[2] = ((u64)(xi)); + } + { + u128 xi = input[3]; + output[3] = ((u64)(xi)); + } + { + u128 xi = input[4]; + output[4] = ((u64)(xi)); + } +} + +static __always_inline void fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s) +{ + u32 i; + for (i = 0; i < 5; ++i) { + u128 xi = output[i]; + u64 yi = input[i]; + output[i] = ((xi) + (((u128)(yi) * (s)))); + } +} + +static __always_inline void fproduct_carry_wide_(u128 *tmp) +{ + u32 i; + for (i = 0; i < 4; ++i) { + u32 ctr = i; + u128 tctr = tmp[ctr]; + u128 tctrp1 = tmp[ctr + 1]; + u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU; + u128 c = ((tctr) >> (51)); + tmp[ctr] = ((u128)(r0)); + tmp[ctr + 1] = ((tctrp1) + (c)); + } +} + +static __always_inline void fmul_shift_reduce(u64 *output) +{ + u64 tmp = output[4]; + u64 b0; + { + u32 ctr = 5 - 0 - 1; + u64 z = output[ctr - 1]; + output[ctr] = z; + } + { + u32 ctr = 5 - 1 - 1; + u64 z = output[ctr - 1]; + output[ctr] = z; + } + { + u32 ctr = 5 - 2 - 1; + u64 z = output[ctr - 1]; + output[ctr] = z; + } + { + u32 ctr = 5 - 3 - 1; + u64 z = output[ctr - 1]; + output[ctr] = z; + } + output[0] = tmp; + b0 = output[0]; + output[0] = 19 * b0; +} + +static __always_inline void fmul_mul_shift_reduce_(u128 *output, u64 *input, u64 *input21) +{ + u32 i; + u64 input2i; + { + u64 input2i = input21[0]; + fproduct_sum_scalar_multiplication_(output, input, input2i); + fmul_shift_reduce(input); + } + { + u64 input2i = input21[1]; + fproduct_sum_scalar_multiplication_(output, input, input2i); + fmul_shift_reduce(input); + } + { + u64 input2i = input21[2]; + fproduct_sum_scalar_multiplication_(output, input, input2i); + fmul_shift_reduce(input); + } + { + u64 input2i = input21[3]; + fproduct_sum_scalar_multiplication_(output, input, input2i); + fmul_shift_reduce(input); + } + i = 4; + input2i = input21[i]; + fproduct_sum_scalar_multiplication_(output, input, input2i); +} + +static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21) +{ + u64 tmp[5]; + memcpy(tmp, input, 5 * sizeof(*input)); + { + u128 b4; + u128 b0; + u128 b4_; + u128 b0_; + u64 i0; + u64 i1; + u64 i0_; + u64 i1_; + u128 t[5]; + { + u32 _i; + for (_i = 0; _i < 5; ++_i) + t[_i] = ((u128)(0)); + } + fmul_mul_shift_reduce_(t, tmp, input21); + 
fproduct_carry_wide_(t); + b4 = t[4]; + b0 = t[0]; + b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU)))); + b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51)))))))); + t[4] = b4_; + t[0] = b0_; + fproduct_copy_from_wide_(output, t); + i0 = output[0]; + i1 = output[1]; + i0_ = i0 & 0x7ffffffffffffLLU; + i1_ = i1 + (i0 >> 51); + output[0] = i0_; + output[1] = i1_; + } +} + +static __always_inline void fsquare_fsquare__(u128 *tmp, u64 *output) +{ + u64 r0 = output[0]; + u64 r1 = output[1]; + u64 r2 = output[2]; + u64 r3 = output[3]; + u64 r4 = output[4]; + u64 d0 = r0 * 2; + u64 d1 = r1 * 2; + u64 d2 = r2 * 2 * 19; + u64 d419 = r4 * 19; + u64 d4 = d419 * 2; + u128 s0 = ((((((u128)(r0) * (r0))) + (((u128)(d4) * (r1))))) + (((u128)(d2) * (r3)))); + u128 s1 = ((((((u128)(d0) * (r1))) + (((u128)(d4) * (r2))))) + (((u128)(r3 * 19) * (r3)))); + u128 s2 = ((((((u128)(d0) * (r2))) + (((u128)(r1) * (r1))))) + (((u128)(d4) * (r3)))); + u128 s3 = ((((((u128)(d0) * (r3))) + (((u128)(d1) * (r2))))) + (((u128)(r4) * (d419)))); + u128 s4 = ((((((u128)(d0) * (r4))) + (((u128)(d1) * (r3))))) + (((u128)(r2) * (r2)))); + tmp[0] = s0; + tmp[1] = s1; + tmp[2] = s2; + tmp[3] = s3; + tmp[4] = s4; +} + +static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output) +{ + u128 b4; + u128 b0; + u128 b4_; + u128 b0_; + u64 i0; + u64 i1; + u64 i0_; + u64 i1_; + fsquare_fsquare__(tmp, output); + fproduct_carry_wide_(tmp); + b4 = tmp[4]; + b0 = tmp[0]; + b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU)))); + b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51)))))))); + tmp[4] = b4_; + tmp[0] = b0_; + fproduct_copy_from_wide_(output, tmp); + i0 = output[0]; + i1 = output[1]; + i0_ = i0 & 0x7ffffffffffffLLU; + i1_ = i1 + (i0 >> 51); + output[0] = i0_; + output[1] = i1_; +} + +static __always_inline void fsquare_fsquare_times_(u64 *input, u128 *tmp, u32 count1) +{ + u32 i; + fsquare_fsquare_(tmp, input); + for (i = 1; i < count1; ++i) + fsquare_fsquare_(tmp, input); +} + +static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input, u32 count1) +{ + u128 t[5]; + { + u32 _i; + for (_i = 0; _i < 5; ++_i) + t[_i] = ((u128)(0)); + } + memcpy(output, input, 5 * sizeof(*input)); + fsquare_fsquare_times_(output, t, count1); +} + +static __always_inline void fsquare_fsquare_times_inplace(u64 *output, u32 count1) +{ + u128 t[5]; + { + u32 _i; + for (_i = 0; _i < 5; ++_i) + t[_i] = ((u128)(0)); + } + fsquare_fsquare_times_(output, t, count1); +} + +static __always_inline void crecip_crecip(u64 *out, u64 *z) +{ + u64 buf[20] = { 0 }; + u64 *a0 = buf; + u64 *t00 = buf + 5; + u64 *b0 = buf + 10; + u64 *t01; + u64 *b1; + u64 *c0; + u64 *a; + u64 *t0; + u64 *b; + u64 *c; + fsquare_fsquare_times(a0, z, 1); + fsquare_fsquare_times(t00, a0, 2); + fmul_fmul(b0, t00, z); + fmul_fmul(a0, b0, a0); + fsquare_fsquare_times(t00, a0, 1); + fmul_fmul(b0, t00, b0); + fsquare_fsquare_times(t00, b0, 5); + t01 = buf + 5; + b1 = buf + 10; + c0 = buf + 15; + fmul_fmul(b1, t01, b1); + fsquare_fsquare_times(t01, b1, 10); + fmul_fmul(c0, t01, b1); + fsquare_fsquare_times(t01, c0, 20); + fmul_fmul(t01, t01, c0); + fsquare_fsquare_times_inplace(t01, 10); + fmul_fmul(b1, t01, b1); + fsquare_fsquare_times(t01, b1, 50); + a = buf; + t0 = buf + 5; + b = buf + 10; + c = buf + 15; + fmul_fmul(c, t0, b); + fsquare_fsquare_times(t0, c, 100); + fmul_fmul(t0, t0, c); + fsquare_fsquare_times_inplace(t0, 50); + fmul_fmul(t0, t0, b); + fsquare_fsquare_times_inplace(t0, 5); + fmul_fmul(out, t0, a); +} + +static __always_inline void fsum(u64 *a, u64 *b) +{ + u32 i; + for 
(i = 0; i < 5; ++i) { + u64 xi = a[i]; + u64 yi = b[i]; + a[i] = xi + yi; + } +} + +static __always_inline void fdifference(u64 *a, u64 *b) +{ + u64 tmp[5] = { 0 }; + u64 b0; + u64 b1; + u64 b2; + u64 b3; + u64 b4; + memcpy(tmp, b, 5 * sizeof(*b)); + b0 = tmp[0]; + b1 = tmp[1]; + b2 = tmp[2]; + b3 = tmp[3]; + b4 = tmp[4]; + tmp[0] = b0 + 0x3fffffffffff68LLU; + tmp[1] = b1 + 0x3ffffffffffff8LLU; + tmp[2] = b2 + 0x3ffffffffffff8LLU; + tmp[3] = b3 + 0x3ffffffffffff8LLU; + tmp[4] = b4 + 0x3ffffffffffff8LLU; + { + u64 xi = a[0]; + u64 yi = tmp[0]; + a[0] = yi - xi; + } + { + u64 xi = a[1]; + u64 yi = tmp[1]; + a[1] = yi - xi; + } + { + u64 xi = a[2]; + u64 yi = tmp[2]; + a[2] = yi - xi; + } + { + u64 xi = a[3]; + u64 yi = tmp[3]; + a[3] = yi - xi; + } + { + u64 xi = a[4]; + u64 yi = tmp[4]; + a[4] = yi - xi; + } +} + +static __always_inline void fscalar(u64 *output, u64 *b, u64 s) +{ + u128 tmp[5]; + u128 b4; + u128 b0; + u128 b4_; + u128 b0_; + { + u64 xi = b[0]; + tmp[0] = ((u128)(xi) * (s)); + } + { + u64 xi = b[1]; + tmp[1] = ((u128)(xi) * (s)); + } + { + u64 xi = b[2]; + tmp[2] = ((u128)(xi) * (s)); + } + { + u64 xi = b[3]; + tmp[3] = ((u128)(xi) * (s)); + } + { + u64 xi = b[4]; + tmp[4] = ((u128)(xi) * (s)); + } + fproduct_carry_wide_(tmp); + b4 = tmp[4]; + b0 = tmp[0]; + b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU)))); + b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51)))))))); + tmp[4] = b4_; + tmp[0] = b0_; + fproduct_copy_from_wide_(output, tmp); +} + +static __always_inline void fmul(u64 *output, u64 *a, u64 *b) +{ + fmul_fmul(output, a, b); +} + +static __always_inline void crecip(u64 *output, u64 *input) +{ + crecip_crecip(output, input); +} + +static __always_inline void point_swap_conditional_step(u64 *a, u64 *b, u64 swap1, u32 ctr) +{ + u32 i = ctr - 1; + u64 ai = a[i]; + u64 bi = b[i]; + u64 x = swap1 & (ai ^ bi); + u64 ai1 = ai ^ x; + u64 bi1 = bi ^ x; + a[i] = ai1; + b[i] = bi1; +} + +static __always_inline void point_swap_conditional_(u64 *a, u64 *b, u64 swap1, u32 ctr) +{ + u32 i; + for (i = ctr; i > 0; --i) + point_swap_conditional_step(a, b, swap1, i); +} + +static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap) +{ + u64 swap1 = 0 - iswap; + point_swap_conditional_(a, b, swap1, 5); + point_swap_conditional_(a + 5, b + 5, swap1, 5); +} + +static __always_inline void point_copy(u64 *output, u64 *input) +{ + memcpy(output, input, 5 * sizeof(*input)); + memcpy(output + 5, input + 5, 5 * sizeof(*input)); +} + +static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p, u64 *pq, u64 *qmqp) +{ + u64 *qx = qmqp; + u64 *x2 = pp; + u64 *z2 = pp + 5; + u64 *x3 = ppq; + u64 *z3 = ppq + 5; + u64 *x = p; + u64 *z = p + 5; + u64 *xprime = pq; + u64 *zprime = pq + 5; + u64 buf[40] = { 0 }; + u64 *origx = buf; + u64 *origxprime0 = buf + 5; + u64 *xxprime0; + u64 *zzprime0; + u64 *origxprime; + xxprime0 = buf + 25; + zzprime0 = buf + 30; + memcpy(origx, x, 5 * sizeof(*x)); + fsum(x, z); + fdifference(z, origx); + memcpy(origxprime0, xprime, 5 * sizeof(*xprime)); + fsum(xprime, zprime); + fdifference(zprime, origxprime0); + fmul(xxprime0, xprime, z); + fmul(zzprime0, x, zprime); + origxprime = buf + 5; + { + u64 *xx0; + u64 *zz0; + u64 *xxprime; + u64 *zzprime; + u64 *zzzprime; + xx0 = buf + 15; + zz0 = buf + 20; + xxprime = buf + 25; + zzprime = buf + 30; + zzzprime = buf + 35; + memcpy(origxprime, xxprime, 5 * sizeof(*xxprime)); + fsum(xxprime, zzprime); + fdifference(zzprime, origxprime); + fsquare_fsquare_times(x3, xxprime, 1); + 
fsquare_fsquare_times(zzzprime, zzprime, 1); + fmul(z3, zzzprime, qx); + fsquare_fsquare_times(xx0, x, 1); + fsquare_fsquare_times(zz0, z, 1); + { + u64 *zzz; + u64 *xx; + u64 *zz; + u64 scalar; + zzz = buf + 10; + xx = buf + 15; + zz = buf + 20; + fmul(x2, xx, zz); + fdifference(zz, xx); + scalar = 121665; + fscalar(zzz, zz, scalar); + fsum(zzz, xx); + fmul(z2, zzz, zz); + } + } +} + +static __always_inline void ladder_smallloop_cmult_small_loop_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2, u64 *q, u8 byt) +{ + u64 bit0 = (u64)(byt >> 7); + u64 bit; + point_swap_conditional(nq, nqpq, bit0); + addanddouble_fmonty(nq2, nqpq2, nq, nqpq, q); + bit = (u64)(byt >> 7); + point_swap_conditional(nq2, nqpq2, bit); +} + +static __always_inline void ladder_smallloop_cmult_small_loop_double_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2, u64 *q, u8 byt) +{ + u8 byt1; + ladder_smallloop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt); + byt1 = byt << 1; + ladder_smallloop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1); +} + +static __always_inline void ladder_smallloop_cmult_small_loop(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2, u64 *q, u8 byt, u32 i) +{ + while (i--) { + ladder_smallloop_cmult_small_loop_double_step(nq, nqpq, nq2, nqpq2, q, byt); + byt <<= 2; + } +} + +static __always_inline void ladder_bigloop_cmult_big_loop(u8 *n1, u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2, u64 *q, u32 i) +{ + while (i--) { + u8 byte = n1[i]; + ladder_smallloop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q, byte, 4); + } +} + +static __always_inline void ladder_cmult(u64 *result, u8 *n1, u64 *q) +{ + u64 point_buf[40] = { 0 }; + u64 *nq = point_buf; + u64 *nqpq = point_buf + 10; + u64 *nq2 = point_buf + 20; + u64 *nqpq2 = point_buf + 30; + point_copy(nqpq, q); + nq[0] = 1; + ladder_bigloop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32); + point_copy(result, nq); +} + +static __always_inline void format_fexpand(u64 *output, const u8 *input) +{ + const u8 *x00 = input + 6; + const u8 *x01 = input + 12; + const u8 *x02 = input + 19; + const u8 *x0 = input + 24; + u64 i0, i1, i2, i3, i4, output0, output1, output2, output3, output4; + i0 = le64_to_cpup((__force __le64 *)input); + i1 = le64_to_cpup((__force __le64 *)x00); + i2 = le64_to_cpup((__force __le64 *)x01); + i3 = le64_to_cpup((__force __le64 *)x02); + i4 = le64_to_cpup((__force __le64 *)x0); + output0 = i0 & 0x7ffffffffffffLLU; + output1 = i1 >> 3 & 0x7ffffffffffffLLU; + output2 = i2 >> 6 & 0x7ffffffffffffLLU; + output3 = i3 >> 1 & 0x7ffffffffffffLLU; + output4 = i4 >> 12 & 0x7ffffffffffffLLU; + output[0] = output0; + output[1] = output1; + output[2] = output2; + output[3] = output3; + output[4] = output4; +} + +static __always_inline void format_fcontract_first_carry_pass(u64 *input) +{ + u64 t0 = input[0]; + u64 t1 = input[1]; + u64 t2 = input[2]; + u64 t3 = input[3]; + u64 t4 = input[4]; + u64 t1_ = t1 + (t0 >> 51); + u64 t0_ = t0 & 0x7ffffffffffffLLU; + u64 t2_ = t2 + (t1_ >> 51); + u64 t1__ = t1_ & 0x7ffffffffffffLLU; + u64 t3_ = t3 + (t2_ >> 51); + u64 t2__ = t2_ & 0x7ffffffffffffLLU; + u64 t4_ = t4 + (t3_ >> 51); + u64 t3__ = t3_ & 0x7ffffffffffffLLU; + input[0] = t0_; + input[1] = t1__; + input[2] = t2__; + input[3] = t3__; + input[4] = t4_; +} + +static __always_inline void format_fcontract_first_carry_full(u64 *input) +{ + format_fcontract_first_carry_pass(input); + modulo_carry_top(input); +} + +static __always_inline void format_fcontract_second_carry_pass(u64 *input) +{ + u64 t0 = input[0]; + u64 t1 = input[1]; + u64 t2 = input[2]; + u64 t3 = 
input[3]; + u64 t4 = input[4]; + u64 t1_ = t1 + (t0 >> 51); + u64 t0_ = t0 & 0x7ffffffffffffLLU; + u64 t2_ = t2 + (t1_ >> 51); + u64 t1__ = t1_ & 0x7ffffffffffffLLU; + u64 t3_ = t3 + (t2_ >> 51); + u64 t2__ = t2_ & 0x7ffffffffffffLLU; + u64 t4_ = t4 + (t3_ >> 51); + u64 t3__ = t3_ & 0x7ffffffffffffLLU; + input[0] = t0_; + input[1] = t1__; + input[2] = t2__; + input[3] = t3__; + input[4] = t4_; +} + +static __always_inline void format_fcontract_second_carry_full(u64 *input) +{ + u64 i0; + u64 i1; + u64 i0_; + u64 i1_; + format_fcontract_second_carry_pass(input); + modulo_carry_top(input); + i0 = input[0]; + i1 = input[1]; + i0_ = i0 & 0x7ffffffffffffLLU; + i1_ = i1 + (i0 >> 51); + input[0] = i0_; + input[1] = i1_; +} + +static __always_inline void format_fcontract_trim(u64 *input) +{ + u64 a0 = input[0]; + u64 a1 = input[1]; + u64 a2 = input[2]; + u64 a3 = input[3]; + u64 a4 = input[4]; + u64 mask0 = u64_gte_mask(a0, 0x7ffffffffffedLLU); + u64 mask1 = u64_eq_mask(a1, 0x7ffffffffffffLLU); + u64 mask2 = u64_eq_mask(a2, 0x7ffffffffffffLLU); + u64 mask3 = u64_eq_mask(a3, 0x7ffffffffffffLLU); + u64 mask4 = u64_eq_mask(a4, 0x7ffffffffffffLLU); + u64 mask = (((mask0 & mask1) & mask2) & mask3) & mask4; + u64 a0_ = a0 - (0x7ffffffffffedLLU & mask); + u64 a1_ = a1 - (0x7ffffffffffffLLU & mask); + u64 a2_ = a2 - (0x7ffffffffffffLLU & mask); + u64 a3_ = a3 - (0x7ffffffffffffLLU & mask); + u64 a4_ = a4 - (0x7ffffffffffffLLU & mask); + input[0] = a0_; + input[1] = a1_; + input[2] = a2_; + input[3] = a3_; + input[4] = a4_; +} + +static __always_inline void format_fcontract_store(u8 *output, u64 *input) +{ + u64 t0 = input[0]; + u64 t1 = input[1]; + u64 t2 = input[2]; + u64 t3 = input[3]; + u64 t4 = input[4]; + u64 o0 = t1 << 51 | t0; + u64 o1 = t2 << 38 | t1 >> 13; + u64 o2 = t3 << 25 | t2 >> 26; + u64 o3 = t4 << 12 | t3 >> 39; + u8 *b0 = output; + u8 *b1 = output + 8; + u8 *b2 = output + 16; + u8 *b3 = output + 24; + *(__force __le64 *)b0 = cpu_to_le64(o0); + *(__force __le64 *)b1 = cpu_to_le64(o1); + *(__force __le64 *)b2 = cpu_to_le64(o2); + *(__force __le64 *)b3 = cpu_to_le64(o3); +} + +static __always_inline void format_fcontract(u8 *output, u64 *input) +{ + format_fcontract_first_carry_full(input); + format_fcontract_second_carry_full(input); + format_fcontract_trim(input); + format_fcontract_store(output, input); +} + +static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point) +{ + u64 *x = point; + u64 *z = point + 5; + u64 buf[10] __aligned(32) = { 0 }; + u64 *zmone = buf; + u64 *sc = buf + 5; + crecip(zmone, z); + fmul(sc, x, zmone); + format_fcontract(scalar, sc); +} + +bool curve25519_hacl64(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]) +{ + u64 buf0[10] __aligned(32) = { 0 }; + u64 *x0 = buf0; + u64 *z = buf0 + 5; + u64 *q; + format_fexpand(x0, basepoint); + z[0] = 1; + q = buf0; + { + u8 e[32] __aligned(32) = { 0 }; + u8 *scalar; + memcpy(e, secret, 32); + normalize_secret(e); + scalar = e; + { + u64 buf[15] = { 0 }; + u64 *nq = buf; + u64 *x = nq; + x[0] = 1; + ladder_cmult(nq, scalar, q); + format_scalar_of_point(mypublic, nq); + } + } + + return true; +} diff --git a/curve25519-sandy2x-asm.S b/curve25519-sandy2x-asm.S new file mode 100644 index 0000000..f2e466b --- /dev/null +++ b/curve25519-sandy2x-asm.S @@ -0,0 +1,3261 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. 
+ * + * Original author: Tung Chou + */ + +#include <linux/linkage.h> + +.data +.align 16 +curve25519_sandy2x_v0_0: .quad 0, 0 +curve25519_sandy2x_v1_0: .quad 1, 0 +curve25519_sandy2x_v2_1: .quad 2, 1 +curve25519_sandy2x_v9_0: .quad 9, 0 +curve25519_sandy2x_v9_9: .quad 9, 9 +curve25519_sandy2x_v19_19: .quad 19, 19 +curve25519_sandy2x_v38_1: .quad 38, 1 +curve25519_sandy2x_v38_38: .quad 38, 38 +curve25519_sandy2x_v121666_121666: .quad 121666, 121666 +curve25519_sandy2x_m25: .quad 33554431, 33554431 +curve25519_sandy2x_m26: .quad 67108863, 67108863 +curve25519_sandy2x_subc0: .quad 0x07FFFFDA, 0x03FFFFFE +curve25519_sandy2x_subc2: .quad 0x07FFFFFE, 0x03FFFFFE +curve25519_sandy2x_REDMASK51: .quad 0x0007FFFFFFFFFFFF + +.text +.align 32 +#ifdef CONFIG_AS_AVX +ENTRY(curve25519_sandy2x_fe51_mul) + push %rbp + mov %rsp,%rbp + sub $96,%rsp + and $-32,%rsp + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbx,40(%rsp) + movq %rbp,48(%rsp) + movq %rdi,56(%rsp) + mov %rdx,%rcx + movq 24(%rsi),%rdx + imulq $19,%rdx,%rax + movq %rax,64(%rsp) + mulq 16(%rcx) + mov %rax,%r8 + mov %rdx,%r9 + movq 32(%rsi),%rdx + imulq $19,%rdx,%rax + movq %rax,72(%rsp) + mulq 8(%rcx) + add %rax,%r8 + adc %rdx,%r9 + movq 0(%rsi),%rax + mulq 0(%rcx) + add %rax,%r8 + adc %rdx,%r9 + movq 0(%rsi),%rax + mulq 8(%rcx) + mov %rax,%r10 + mov %rdx,%r11 + movq 0(%rsi),%rax + mulq 16(%rcx) + mov %rax,%r12 + mov %rdx,%r13 + movq 0(%rsi),%rax + mulq 24(%rcx) + mov %rax,%r14 + mov %rdx,%r15 + movq 0(%rsi),%rax + mulq 32(%rcx) + mov %rax,%rbx + mov %rdx,%rbp + movq 8(%rsi),%rax + mulq 0(%rcx) + add %rax,%r10 + adc %rdx,%r11 + movq 8(%rsi),%rax + mulq 8(%rcx) + add %rax,%r12 + adc %rdx,%r13 + movq 8(%rsi),%rax + mulq 16(%rcx) + add %rax,%r14 + adc %rdx,%r15 + movq 8(%rsi),%rax + mulq 24(%rcx) + add %rax,%rbx + adc %rdx,%rbp + movq 8(%rsi),%rdx + imulq $19,%rdx,%rax + mulq 32(%rcx) + add %rax,%r8 + adc %rdx,%r9 + movq 16(%rsi),%rax + mulq 0(%rcx) + add %rax,%r12 + adc %rdx,%r13 + movq 16(%rsi),%rax + mulq 8(%rcx) + add %rax,%r14 + adc %rdx,%r15 + movq 16(%rsi),%rax + mulq 16(%rcx) + add %rax,%rbx + adc %rdx,%rbp + movq 16(%rsi),%rdx + imulq $19,%rdx,%rax + mulq 24(%rcx) + add %rax,%r8 + adc %rdx,%r9 + movq 16(%rsi),%rdx + imulq $19,%rdx,%rax + mulq 32(%rcx) + add %rax,%r10 + adc %rdx,%r11 + movq 24(%rsi),%rax + mulq 0(%rcx) + add %rax,%r14 + adc %rdx,%r15 + movq 24(%rsi),%rax + mulq 8(%rcx) + add %rax,%rbx + adc %rdx,%rbp + movq 64(%rsp),%rax + mulq 24(%rcx) + add %rax,%r10 + adc %rdx,%r11 + movq 64(%rsp),%rax + mulq 32(%rcx) + add %rax,%r12 + adc %rdx,%r13 + movq 32(%rsi),%rax + mulq 0(%rcx) + add %rax,%rbx + adc %rdx,%rbp + movq 72(%rsp),%rax + mulq 16(%rcx) + add %rax,%r10 + adc %rdx,%r11 + movq 72(%rsp),%rax + mulq 24(%rcx) + add %rax,%r12 + adc %rdx,%r13 + movq 72(%rsp),%rax + mulq 32(%rcx) + add %rax,%r14 + adc %rdx,%r15 + movq curve25519_sandy2x_REDMASK51(%rip),%rsi + shld $13,%r8,%r9 + and %rsi,%r8 + shld $13,%r10,%r11 + and %rsi,%r10 + add %r9,%r10 + shld $13,%r12,%r13 + and %rsi,%r12 + add %r11,%r12 + shld $13,%r14,%r15 + and %rsi,%r14 + add %r13,%r14 + shld $13,%rbx,%rbp + and %rsi,%rbx + add %r15,%rbx + imulq $19,%rbp,%rdx + add %rdx,%r8 + mov %r8,%rdx + shr $51,%rdx + add %r10,%rdx + mov %rdx,%rcx + shr $51,%rdx + and %rsi,%r8 + add %r12,%rdx + mov %rdx,%r9 + shr $51,%rdx + and %rsi,%rcx + add %r14,%rdx + mov %rdx,%rax + shr $51,%rdx + and %rsi,%r9 + add %rbx,%rdx + mov %rdx,%r10 + shr $51,%rdx + and %rsi,%rax + imulq $19,%rdx,%rdx + add %rdx,%r8 + and %rsi,%r10 + movq %r8,0(%rdi) +
movq %rcx,8(%rdi) + movq %r9,16(%rdi) + movq %rax,24(%rdi) + movq %r10,32(%rdi) + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbx + movq 48(%rsp),%rbp + leave + ret +ENDPROC(curve25519_sandy2x_fe51_mul) + +.align 32 +ENTRY(curve25519_sandy2x_fe51_nsquare) + push %rbp + mov %rsp,%rbp + sub $64,%rsp + and $-32,%rsp + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq %r13,16(%rsp) + movq %r14,24(%rsp) + movq %r15,32(%rsp) + movq %rbx,40(%rsp) + movq %rbp,48(%rsp) + movq 0(%rsi),%rcx + movq 8(%rsi),%r8 + movq 16(%rsi),%r9 + movq 24(%rsi),%rax + movq 32(%rsi),%rsi + movq %r9,16(%rdi) + movq %rax,24(%rdi) + movq %rsi,32(%rdi) + mov %rdx,%rsi + + .align 16 + .Lloop: + sub $1,%rsi + mov %rcx,%rax + mul %rcx + add %rcx,%rcx + mov %rax,%r9 + mov %rdx,%r10 + mov %rcx,%rax + mul %r8 + mov %rax,%r11 + mov %rdx,%r12 + mov %rcx,%rax + mulq 16(%rdi) + mov %rax,%r13 + mov %rdx,%r14 + mov %rcx,%rax + mulq 24(%rdi) + mov %rax,%r15 + mov %rdx,%rbx + mov %rcx,%rax + mulq 32(%rdi) + mov %rax,%rcx + mov %rdx,%rbp + mov %r8,%rax + mul %r8 + add %r8,%r8 + add %rax,%r13 + adc %rdx,%r14 + mov %r8,%rax + mulq 16(%rdi) + add %rax,%r15 + adc %rdx,%rbx + mov %r8,%rax + imulq $19, %r8,%r8 + mulq 24(%rdi) + add %rax,%rcx + adc %rdx,%rbp + mov %r8,%rax + mulq 32(%rdi) + add %rax,%r9 + adc %rdx,%r10 + movq 16(%rdi),%rax + mulq 16(%rdi) + add %rax,%rcx + adc %rdx,%rbp + shld $13,%rcx,%rbp + movq 16(%rdi),%rax + imulq $38, %rax,%rax + mulq 24(%rdi) + add %rax,%r9 + adc %rdx,%r10 + shld $13,%r9,%r10 + movq 16(%rdi),%rax + imulq $38, %rax,%rax + mulq 32(%rdi) + add %rax,%r11 + adc %rdx,%r12 + movq 24(%rdi),%rax + imulq $19, %rax,%rax + mulq 24(%rdi) + add %rax,%r11 + adc %rdx,%r12 + shld $13,%r11,%r12 + movq 24(%rdi),%rax + imulq $38, %rax,%rax + mulq 32(%rdi) + add %rax,%r13 + adc %rdx,%r14 + shld $13,%r13,%r14 + movq 32(%rdi),%rax + imulq $19, %rax,%rax + mulq 32(%rdi) + add %rax,%r15 + adc %rdx,%rbx + shld $13,%r15,%rbx + movq curve25519_sandy2x_REDMASK51(%rip),%rdx + and %rdx,%rcx + add %rbx,%rcx + and %rdx,%r9 + and %rdx,%r11 + add %r10,%r11 + and %rdx,%r13 + add %r12,%r13 + and %rdx,%r15 + add %r14,%r15 + imulq $19, %rbp,%rbp + lea (%r9,%rbp),%r9 + mov %r9,%rax + shr $51,%r9 + add %r11,%r9 + and %rdx,%rax + mov %r9,%r8 + shr $51,%r9 + add %r13,%r9 + and %rdx,%r8 + mov %r9,%r10 + shr $51,%r9 + add %r15,%r9 + and %rdx,%r10 + movq %r10,16(%rdi) + mov %r9,%r10 + shr $51,%r9 + add %rcx,%r9 + and %rdx,%r10 + movq %r10,24(%rdi) + mov %r9,%r10 + shr $51,%r9 + imulq $19, %r9,%r9 + lea (%rax,%r9),%rcx + and %rdx,%r10 + movq %r10,32(%rdi) + cmp $0,%rsi + jne .Lloop + + movq %rcx,0(%rdi) + movq %r8,8(%rdi) + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + movq 16(%rsp),%r13 + movq 24(%rsp),%r14 + movq 32(%rsp),%r15 + movq 40(%rsp),%rbx + movq 48(%rsp),%rbp + leave + ret +ENDPROC(curve25519_sandy2x_fe51_nsquare) + +.align 32 +ENTRY(curve25519_sandy2x_fe51_pack) + push %rbp + mov %rsp,%rbp + sub $32,%rsp + and $-32,%rsp + movq %r11,0(%rsp) + movq %r12,8(%rsp) + movq 0(%rsi),%rdx + movq 8(%rsi),%rcx + movq 16(%rsi),%r8 + movq 24(%rsi),%r9 + movq 32(%rsi),%rsi + movq curve25519_sandy2x_REDMASK51(%rip),%rax + lea -18(%rax),%r10 + mov $3,%r11 + + .align 16 + .Lreduceloop: + mov %rdx,%r12 + shr $51,%r12 + and %rax,%rdx + add %r12,%rcx + mov %rcx,%r12 + shr $51,%r12 + and %rax,%rcx + add %r12,%r8 + mov %r8,%r12 + shr $51,%r12 + and %rax,%r8 + add %r12,%r9 + mov %r9,%r12 + shr $51,%r12 + and %rax,%r9 + add %r12,%rsi + mov %rsi,%r12 + shr $51,%r12 + and %rax,%rsi + imulq $19, 
%r12,%r12 + add %r12,%rdx + sub $1,%r11 + ja .Lreduceloop + + mov $1,%r12 + cmp %r10,%rdx + cmovl %r11,%r12 + cmp %rax,%rcx + cmovne %r11,%r12 + cmp %rax,%r8 + cmovne %r11,%r12 + cmp %rax,%r9 + cmovne %r11,%r12 + cmp %rax,%rsi + cmovne %r11,%r12 + neg %r12 + and %r12,%rax + and %r12,%r10 + sub %r10,%rdx + sub %rax,%rcx + sub %rax,%r8 + sub %rax,%r9 + sub %rax,%rsi + mov %rdx,%rax + and $0xFF,%eax + movb %al,0(%rdi) + mov %rdx,%rax + shr $8,%rax + and $0xFF,%eax + movb %al,1(%rdi) + mov %rdx,%rax + shr $16,%rax + and $0xFF,%eax + movb %al,2(%rdi) + mov %rdx,%rax + shr $24,%rax + and $0xFF,%eax + movb %al,3(%rdi) + mov %rdx,%rax + shr $32,%rax + and $0xFF,%eax + movb %al,4(%rdi) + mov %rdx,%rax + shr $40,%rax + and $0xFF,%eax + movb %al,5(%rdi) + mov %rdx,%rdx + shr $48,%rdx + mov %rcx,%rax + shl $3,%rax + and $0xF8,%eax + xor %rdx,%rax + movb %al,6(%rdi) + mov %rcx,%rdx + shr $5,%rdx + and $0xFF,%edx + movb %dl,7(%rdi) + mov %rcx,%rdx + shr $13,%rdx + and $0xFF,%edx + movb %dl,8(%rdi) + mov %rcx,%rdx + shr $21,%rdx + and $0xFF,%edx + movb %dl,9(%rdi) + mov %rcx,%rdx + shr $29,%rdx + and $0xFF,%edx + movb %dl,10(%rdi) + mov %rcx,%rdx + shr $37,%rdx + and $0xFF,%edx + movb %dl,11(%rdi) + mov %rcx,%rdx + shr $45,%rdx + mov %r8,%rcx + shl $6,%rcx + and $0xC0,%ecx + xor %rdx,%rcx + movb %cl,12(%rdi) + mov %r8,%rdx + shr $2,%rdx + and $0xFF,%edx + movb %dl,13(%rdi) + mov %r8,%rdx + shr $10,%rdx + and $0xFF,%edx + movb %dl,14(%rdi) + mov %r8,%rdx + shr $18,%rdx + and $0xFF,%edx + movb %dl,15(%rdi) + mov %r8,%rdx + shr $26,%rdx + and $0xFF,%edx + movb %dl,16(%rdi) + mov %r8,%rdx + shr $34,%rdx + and $0xFF,%edx + movb %dl,17(%rdi) + mov %r8,%rdx + shr $42,%rdx + movb %dl,18(%rdi) + mov %r8,%rdx + shr $50,%rdx + mov %r9,%rcx + shl $1,%rcx + and $0xFE,%ecx + xor %rdx,%rcx + movb %cl,19(%rdi) + mov %r9,%rdx + shr $7,%rdx + and $0xFF,%edx + movb %dl,20(%rdi) + mov %r9,%rdx + shr $15,%rdx + and $0xFF,%edx + movb %dl,21(%rdi) + mov %r9,%rdx + shr $23,%rdx + and $0xFF,%edx + movb %dl,22(%rdi) + mov %r9,%rdx + shr $31,%rdx + and $0xFF,%edx + movb %dl,23(%rdi) + mov %r9,%rdx + shr $39,%rdx + and $0xFF,%edx + movb %dl,24(%rdi) + mov %r9,%rdx + shr $47,%rdx + mov %rsi,%rcx + shl $4,%rcx + and $0xF0,%ecx + xor %rdx,%rcx + movb %cl,25(%rdi) + mov %rsi,%rdx + shr $4,%rdx + and $0xFF,%edx + movb %dl,26(%rdi) + mov %rsi,%rdx + shr $12,%rdx + and $0xFF,%edx + movb %dl,27(%rdi) + mov %rsi,%rdx + shr $20,%rdx + and $0xFF,%edx + movb %dl,28(%rdi) + mov %rsi,%rdx + shr $28,%rdx + and $0xFF,%edx + movb %dl,29(%rdi) + mov %rsi,%rdx + shr $36,%rdx + and $0xFF,%edx + movb %dl,30(%rdi) + mov %rsi,%rsi + shr $44,%rsi + movb %sil,31(%rdi) + movq 0(%rsp),%r11 + movq 8(%rsp),%r12 + leave + ret +ENDPROC(curve25519_sandy2x_fe51_pack) + +.align 32 +ENTRY(curve25519_sandy2x_ladder) + push %rbp + mov %rsp,%rbp + sub $1856,%rsp + and $-32,%rsp + movq %r11,1824(%rsp) + movq %r12,1832(%rsp) + movq %r13,1840(%rsp) + movq %r14,1848(%rsp) + vmovdqa curve25519_sandy2x_v0_0(%rip),%xmm0 + vmovdqa curve25519_sandy2x_v1_0(%rip),%xmm1 + vmovdqu 0(%rdi),%xmm2 + vmovdqa %xmm2,0(%rsp) + vmovdqu 16(%rdi),%xmm2 + vmovdqa %xmm2,16(%rsp) + vmovdqu 32(%rdi),%xmm2 + vmovdqa %xmm2,32(%rsp) + vmovdqu 48(%rdi),%xmm2 + vmovdqa %xmm2,48(%rsp) + vmovdqu 64(%rdi),%xmm2 + vmovdqa %xmm2,64(%rsp) + vmovdqa %xmm1,80(%rsp) + vmovdqa %xmm0,96(%rsp) + vmovdqa %xmm0,112(%rsp) + vmovdqa %xmm0,128(%rsp) + vmovdqa %xmm0,144(%rsp) + vmovdqa %xmm1,%xmm0 + vpxor %xmm1,%xmm1,%xmm1 + vpxor %xmm2,%xmm2,%xmm2 + vpxor %xmm3,%xmm3,%xmm3 + vpxor %xmm4,%xmm4,%xmm4 + vpxor 
%xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + vpxor %xmm8,%xmm8,%xmm8 + vpxor %xmm9,%xmm9,%xmm9 + vmovdqu 0(%rdi),%xmm10 + vmovdqa %xmm10,160(%rsp) + vmovdqu 16(%rdi),%xmm10 + vmovdqa %xmm10,176(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 + vmovdqa %xmm10,192(%rsp) + vmovdqu 32(%rdi),%xmm10 + vmovdqa %xmm10,208(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 + vmovdqa %xmm10,224(%rsp) + vmovdqu 48(%rdi),%xmm10 + vmovdqa %xmm10,240(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 + vmovdqa %xmm10,256(%rsp) + vmovdqu 64(%rdi),%xmm10 + vmovdqa %xmm10,272(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 + vmovdqa %xmm10,288(%rsp) + vmovdqu 8(%rdi),%xmm10 + vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10 + vmovdqa %xmm10,304(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 + vmovdqa %xmm10,320(%rsp) + vmovdqu 24(%rdi),%xmm10 + vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10 + vmovdqa %xmm10,336(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 + vmovdqa %xmm10,352(%rsp) + vmovdqu 40(%rdi),%xmm10 + vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10 + vmovdqa %xmm10,368(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 + vmovdqa %xmm10,384(%rsp) + vmovdqu 56(%rdi),%xmm10 + vpmuludq curve25519_sandy2x_v2_1(%rip),%xmm10,%xmm10 + vmovdqa %xmm10,400(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm10,%xmm10 + vmovdqa %xmm10,416(%rsp) + vmovdqu 0(%rdi),%xmm10 + vmovdqu 64(%rdi),%xmm11 + vblendps $12, %xmm11, %xmm10, %xmm10 + vpshufd $2,%xmm10,%xmm10 + vpmuludq curve25519_sandy2x_v38_1(%rip),%xmm10,%xmm10 + vmovdqa %xmm10,432(%rsp) + movq 0(%rsi),%rdx + movq 8(%rsi),%rcx + movq 16(%rsi),%r8 + movq 24(%rsi),%r9 + shrd $1,%rcx,%rdx + shrd $1,%r8,%rcx + shrd $1,%r9,%r8 + shr $1,%r9 + xorq 0(%rsi),%rdx + xorq 8(%rsi),%rcx + xorq 16(%rsi),%r8 + xorq 24(%rsi),%r9 + leaq 800(%rsp),%rsi + mov $64,%rax + + .align 16 + .Lladder_small_loop: + mov %rdx,%r10 + mov %rcx,%r11 + mov %r8,%r12 + mov %r9,%r13 + shr $1,%rdx + shr $1,%rcx + shr $1,%r8 + shr $1,%r9 + and $1,%r10d + and $1,%r11d + and $1,%r12d + and $1,%r13d + neg %r10 + neg %r11 + neg %r12 + neg %r13 + movl %r10d,0(%rsi) + movl %r11d,256(%rsi) + movl %r12d,512(%rsi) + movl %r13d,768(%rsi) + add $4,%rsi + sub $1,%rax + jne .Lladder_small_loop + mov $255,%rdx + add $760,%rsi + + .align 16 + .Lladder_loop: + sub $1,%rdx + vbroadcastss 0(%rsi),%xmm10 + sub $4,%rsi + vmovdqa 0(%rsp),%xmm11 + vmovdqa 80(%rsp),%xmm12 + vpxor %xmm11,%xmm0,%xmm13 + vpand %xmm10,%xmm13,%xmm13 + vpxor %xmm13,%xmm0,%xmm0 + vpxor %xmm13,%xmm11,%xmm11 + vpxor %xmm12,%xmm1,%xmm13 + vpand %xmm10,%xmm13,%xmm13 + vpxor %xmm13,%xmm1,%xmm1 + vpxor %xmm13,%xmm12,%xmm12 + vmovdqa 16(%rsp),%xmm13 + vmovdqa 96(%rsp),%xmm14 + vpxor %xmm13,%xmm2,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm2,%xmm2 + vpxor %xmm15,%xmm13,%xmm13 + vpxor %xmm14,%xmm3,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm3,%xmm3 + vpxor %xmm15,%xmm14,%xmm14 + vmovdqa %xmm13,0(%rsp) + vmovdqa %xmm14,16(%rsp) + vmovdqa 32(%rsp),%xmm13 + vmovdqa 112(%rsp),%xmm14 + vpxor %xmm13,%xmm4,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm4,%xmm4 + vpxor %xmm15,%xmm13,%xmm13 + vpxor %xmm14,%xmm5,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm5,%xmm5 + vpxor %xmm15,%xmm14,%xmm14 + vmovdqa %xmm13,32(%rsp) + vmovdqa %xmm14,80(%rsp) + vmovdqa 48(%rsp),%xmm13 + vmovdqa 128(%rsp),%xmm14 + vpxor %xmm13,%xmm6,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm6,%xmm6 + 
vpxor %xmm15,%xmm13,%xmm13 + vpxor %xmm14,%xmm7,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm7,%xmm7 + vpxor %xmm15,%xmm14,%xmm14 + vmovdqa %xmm13,48(%rsp) + vmovdqa %xmm14,96(%rsp) + vmovdqa 64(%rsp),%xmm13 + vmovdqa 144(%rsp),%xmm14 + vpxor %xmm13,%xmm8,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm15,%xmm13,%xmm13 + vpxor %xmm14,%xmm9,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm9,%xmm9 + vpxor %xmm15,%xmm14,%xmm14 + vmovdqa %xmm13,64(%rsp) + vmovdqa %xmm14,112(%rsp) + vpaddq curve25519_sandy2x_subc0(%rip),%xmm11,%xmm10 + vpsubq %xmm12,%xmm10,%xmm10 + vpaddq %xmm12,%xmm11,%xmm11 + vpunpckhqdq %xmm10,%xmm11,%xmm12 + vpunpcklqdq %xmm10,%xmm11,%xmm10 + vpaddq %xmm1,%xmm0,%xmm11 + vpaddq curve25519_sandy2x_subc0(%rip),%xmm0,%xmm0 + vpsubq %xmm1,%xmm0,%xmm0 + vpunpckhqdq %xmm11,%xmm0,%xmm1 + vpunpcklqdq %xmm11,%xmm0,%xmm0 + vpmuludq %xmm0,%xmm10,%xmm11 + vpmuludq %xmm1,%xmm10,%xmm13 + vmovdqa %xmm1,128(%rsp) + vpaddq %xmm1,%xmm1,%xmm1 + vpmuludq %xmm0,%xmm12,%xmm14 + vmovdqa %xmm0,144(%rsp) + vpaddq %xmm14,%xmm13,%xmm13 + vpmuludq %xmm1,%xmm12,%xmm0 + vmovdqa %xmm1,448(%rsp) + vpaddq %xmm3,%xmm2,%xmm1 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm2,%xmm2 + vpsubq %xmm3,%xmm2,%xmm2 + vpunpckhqdq %xmm1,%xmm2,%xmm3 + vpunpcklqdq %xmm1,%xmm2,%xmm1 + vpmuludq %xmm1,%xmm10,%xmm2 + vpaddq %xmm2,%xmm0,%xmm0 + vpmuludq %xmm3,%xmm10,%xmm2 + vmovdqa %xmm3,464(%rsp) + vpaddq %xmm3,%xmm3,%xmm3 + vpmuludq %xmm1,%xmm12,%xmm14 + vmovdqa %xmm1,480(%rsp) + vpaddq %xmm14,%xmm2,%xmm2 + vpmuludq %xmm3,%xmm12,%xmm1 + vmovdqa %xmm3,496(%rsp) + vpaddq %xmm5,%xmm4,%xmm3 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm4,%xmm4 + vpsubq %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm3,%xmm4,%xmm5 + vpunpcklqdq %xmm3,%xmm4,%xmm3 + vpmuludq %xmm3,%xmm10,%xmm4 + vpaddq %xmm4,%xmm1,%xmm1 + vpmuludq %xmm5,%xmm10,%xmm4 + vmovdqa %xmm5,512(%rsp) + vpaddq %xmm5,%xmm5,%xmm5 + vpmuludq %xmm3,%xmm12,%xmm14 + vmovdqa %xmm3,528(%rsp) + vpaddq %xmm14,%xmm4,%xmm4 + vpaddq %xmm7,%xmm6,%xmm3 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm6,%xmm6 + vpsubq %xmm7,%xmm6,%xmm6 + vpunpckhqdq %xmm3,%xmm6,%xmm7 + vpunpcklqdq %xmm3,%xmm6,%xmm3 + vpmuludq %xmm3,%xmm10,%xmm6 + vpmuludq %xmm5,%xmm12,%xmm14 + vmovdqa %xmm5,544(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm5,%xmm5 + vmovdqa %xmm5,560(%rsp) + vpaddq %xmm14,%xmm6,%xmm6 + vpmuludq %xmm7,%xmm10,%xmm5 + vmovdqa %xmm7,576(%rsp) + vpaddq %xmm7,%xmm7,%xmm7 + vpmuludq %xmm3,%xmm12,%xmm14 + vmovdqa %xmm3,592(%rsp) + vpaddq %xmm14,%xmm5,%xmm5 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 + vmovdqa %xmm3,608(%rsp) + vpaddq %xmm9,%xmm8,%xmm3 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm8,%xmm8 + vpsubq %xmm9,%xmm8,%xmm8 + vpunpckhqdq %xmm3,%xmm8,%xmm9 + vpunpcklqdq %xmm3,%xmm8,%xmm3 + vmovdqa %xmm3,624(%rsp) + vpmuludq %xmm7,%xmm12,%xmm8 + vmovdqa %xmm7,640(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm7,%xmm7 + vmovdqa %xmm7,656(%rsp) + vpmuludq %xmm3,%xmm10,%xmm7 + vpaddq %xmm7,%xmm8,%xmm8 + vpmuludq %xmm9,%xmm10,%xmm7 + vmovdqa %xmm9,672(%rsp) + vpaddq %xmm9,%xmm9,%xmm9 + vpmuludq %xmm3,%xmm12,%xmm10 + vpaddq %xmm10,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 + vmovdqa %xmm3,688(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm12,%xmm12 + vpmuludq %xmm9,%xmm12,%xmm3 + vmovdqa %xmm9,704(%rsp) + vpaddq %xmm3,%xmm11,%xmm11 + vmovdqa 0(%rsp),%xmm3 + vmovdqa 16(%rsp),%xmm9 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 + vpsubq %xmm9,%xmm10,%xmm10 + vpaddq %xmm9,%xmm3,%xmm3 + vpunpckhqdq %xmm10,%xmm3,%xmm9 + 
vpunpcklqdq %xmm10,%xmm3,%xmm3 + vpmuludq 144(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm0,%xmm0 + vpmuludq 128(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm2,%xmm2 + vpmuludq 480(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm1,%xmm1 + vpmuludq 464(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm4,%xmm4 + vpmuludq 528(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm6,%xmm6 + vpmuludq 512(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm5,%xmm5 + vpmuludq 592(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm8,%xmm8 + vpmuludq 576(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 + vpmuludq 624(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm11,%xmm11 + vpmuludq 672(%rsp),%xmm3,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpmuludq 144(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm2,%xmm2 + vpmuludq 448(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm1,%xmm1 + vpmuludq 480(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm4,%xmm4 + vpmuludq 496(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpmuludq 528(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpmuludq 544(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm8,%xmm8 + vpmuludq 592(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 + vpmuludq 640(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 624(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpmuludq 704(%rsp),%xmm9,%xmm9 + vpaddq %xmm9,%xmm0,%xmm0 + vmovdqa 32(%rsp),%xmm3 + vmovdqa 80(%rsp),%xmm9 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 + vpsubq %xmm9,%xmm10,%xmm10 + vpaddq %xmm9,%xmm3,%xmm3 + vpunpckhqdq %xmm10,%xmm3,%xmm9 + vpunpcklqdq %xmm10,%xmm3,%xmm3 + vpmuludq 144(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm1,%xmm1 + vpmuludq 128(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm4,%xmm4 + vpmuludq 480(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm6,%xmm6 + vpmuludq 464(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm5,%xmm5 + vpmuludq 528(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm8,%xmm8 + vpmuludq 512(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 + vpmuludq 592(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm11,%xmm11 + vpmuludq 576(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm13,%xmm13 + vpmuludq 624(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm0,%xmm0 + vpmuludq 672(%rsp),%xmm3,%xmm3 + vpaddq %xmm3,%xmm2,%xmm2 + vpmuludq 144(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm4,%xmm4 + vpmuludq 448(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpmuludq 480(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpmuludq 496(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm8,%xmm8 + vpmuludq 528(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 + vpmuludq 544(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 592(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpmuludq 640(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm0,%xmm0 + vpmuludq 624(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm2,%xmm2 + vpmuludq 704(%rsp),%xmm9,%xmm9 + vpaddq %xmm9,%xmm1,%xmm1 + vmovdqa 48(%rsp),%xmm3 + vmovdqa 96(%rsp),%xmm9 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 + vpsubq %xmm9,%xmm10,%xmm10 + vpaddq %xmm9,%xmm3,%xmm3 + vpunpckhqdq %xmm10,%xmm3,%xmm9 + vpunpcklqdq %xmm10,%xmm3,%xmm3 + vpmuludq 144(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm6,%xmm6 + vpmuludq 128(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm5,%xmm5 + vpmuludq 480(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm8,%xmm8 + vpmuludq 464(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 + vpmuludq 528(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm11,%xmm11 + vpmuludq 512(%rsp),%xmm3,%xmm10 
+ vpaddq %xmm10,%xmm13,%xmm13 + vpmuludq 592(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm0,%xmm0 + vpmuludq 576(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm2,%xmm2 + vpmuludq 624(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm1,%xmm1 + vpmuludq 672(%rsp),%xmm3,%xmm3 + vpaddq %xmm3,%xmm4,%xmm4 + vpmuludq 144(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpmuludq 448(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm8,%xmm8 + vpmuludq 480(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 + vpmuludq 496(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 528(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpmuludq 544(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm0,%xmm0 + vpmuludq 592(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm2,%xmm2 + vpmuludq 640(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm1,%xmm1 + vpmuludq 624(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm4,%xmm4 + vpmuludq 704(%rsp),%xmm9,%xmm9 + vpaddq %xmm9,%xmm6,%xmm6 + vmovdqa 64(%rsp),%xmm3 + vmovdqa 112(%rsp),%xmm9 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 + vpsubq %xmm9,%xmm10,%xmm10 + vpaddq %xmm9,%xmm3,%xmm3 + vpunpckhqdq %xmm10,%xmm3,%xmm9 + vpunpcklqdq %xmm10,%xmm3,%xmm3 + vpmuludq 144(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm8,%xmm8 + vpmuludq 128(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 + vpmuludq 480(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm11,%xmm11 + vpmuludq 464(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm13,%xmm13 + vpmuludq 528(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm0,%xmm0 + vpmuludq 512(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm2,%xmm2 + vpmuludq 592(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm1,%xmm1 + vpmuludq 576(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm4,%xmm4 + vpmuludq 624(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm6,%xmm6 + vpmuludq 672(%rsp),%xmm3,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpmuludq 144(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 + vpmuludq 448(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 480(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpmuludq 496(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm0,%xmm0 + vpmuludq 528(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm2,%xmm2 + vpmuludq 544(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm1,%xmm1 + vpmuludq 592(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm4,%xmm4 + vpmuludq 640(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpmuludq 624(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpmuludq 704(%rsp),%xmm9,%xmm9 + vpaddq %xmm9,%xmm8,%xmm8 + vpsrlq $25,%xmm4,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4 + vpsrlq $26,%xmm11,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 + vpsrlq $26,%xmm6,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 + vpsrlq $25,%xmm13,%xmm3 + vpaddq %xmm3,%xmm0,%xmm0 + vpand curve25519_sandy2x_m25(%rip),%xmm13,%xmm13 + vpsrlq $25,%xmm5,%xmm3 + vpaddq %xmm3,%xmm8,%xmm8 + vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 + vpsrlq $26,%xmm0,%xmm3 + vpaddq %xmm3,%xmm2,%xmm2 + vpand curve25519_sandy2x_m26(%rip),%xmm0,%xmm0 + vpsrlq $26,%xmm8,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8 + vpsrlq $25,%xmm2,%xmm3 + vpaddq %xmm3,%xmm1,%xmm1 + vpand curve25519_sandy2x_m25(%rip),%xmm2,%xmm2 + vpsrlq $25,%xmm7,%xmm3 + vpsllq $4,%xmm3,%xmm9 + vpaddq %xmm3,%xmm11,%xmm11 + vpsllq $1,%xmm3,%xmm3 + vpaddq %xmm3,%xmm9,%xmm9 + vpaddq %xmm9,%xmm11,%xmm11 + vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7 + vpsrlq $26,%xmm1,%xmm3 + vpaddq 
%xmm3,%xmm4,%xmm4 + vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 + vpsrlq $26,%xmm11,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 + vpsrlq $25,%xmm4,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4 + vpunpcklqdq %xmm13,%xmm11,%xmm3 + vpunpckhqdq %xmm13,%xmm11,%xmm9 + vpaddq curve25519_sandy2x_subc0(%rip),%xmm9,%xmm10 + vpsubq %xmm3,%xmm10,%xmm10 + vpaddq %xmm9,%xmm3,%xmm3 + vpunpckhqdq %xmm3,%xmm10,%xmm9 + vpunpcklqdq %xmm3,%xmm10,%xmm10 + vpmuludq %xmm10,%xmm10,%xmm3 + vpaddq %xmm10,%xmm10,%xmm10 + vpmuludq %xmm9,%xmm10,%xmm11 + vpunpcklqdq %xmm2,%xmm0,%xmm12 + vpunpckhqdq %xmm2,%xmm0,%xmm0 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm0,%xmm2 + vpsubq %xmm12,%xmm2,%xmm2 + vpaddq %xmm0,%xmm12,%xmm12 + vpunpckhqdq %xmm12,%xmm2,%xmm0 + vpunpcklqdq %xmm12,%xmm2,%xmm2 + vpmuludq %xmm2,%xmm10,%xmm12 + vpaddq %xmm9,%xmm9,%xmm13 + vpmuludq %xmm13,%xmm9,%xmm9 + vpaddq %xmm9,%xmm12,%xmm12 + vpmuludq %xmm0,%xmm10,%xmm9 + vpmuludq %xmm2,%xmm13,%xmm14 + vpaddq %xmm14,%xmm9,%xmm9 + vpunpcklqdq %xmm4,%xmm1,%xmm14 + vpunpckhqdq %xmm4,%xmm1,%xmm1 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm1,%xmm4 + vpsubq %xmm14,%xmm4,%xmm4 + vpaddq %xmm1,%xmm14,%xmm14 + vpunpckhqdq %xmm14,%xmm4,%xmm1 + vpunpcklqdq %xmm14,%xmm4,%xmm4 + vmovdqa %xmm1,0(%rsp) + vpaddq %xmm1,%xmm1,%xmm1 + vmovdqa %xmm1,16(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 + vmovdqa %xmm1,32(%rsp) + vpmuludq %xmm4,%xmm10,%xmm1 + vpmuludq %xmm2,%xmm2,%xmm14 + vpaddq %xmm14,%xmm1,%xmm1 + vpmuludq 0(%rsp),%xmm10,%xmm14 + vpmuludq %xmm4,%xmm13,%xmm15 + vpaddq %xmm15,%xmm14,%xmm14 + vpunpcklqdq %xmm5,%xmm6,%xmm15 + vpunpckhqdq %xmm5,%xmm6,%xmm5 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm6 + vpsubq %xmm15,%xmm6,%xmm6 + vpaddq %xmm5,%xmm15,%xmm15 + vpunpckhqdq %xmm15,%xmm6,%xmm5 + vpunpcklqdq %xmm15,%xmm6,%xmm6 + vmovdqa %xmm6,48(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm6,%xmm6 + vmovdqa %xmm6,64(%rsp) + vmovdqa %xmm5,80(%rsp) + vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm5,%xmm5 + vmovdqa %xmm5,96(%rsp) + vpmuludq 48(%rsp),%xmm10,%xmm5 + vpaddq %xmm0,%xmm0,%xmm6 + vpmuludq %xmm6,%xmm0,%xmm0 + vpaddq %xmm0,%xmm5,%xmm5 + vpmuludq 80(%rsp),%xmm10,%xmm0 + vpmuludq %xmm4,%xmm6,%xmm15 + vpaddq %xmm15,%xmm0,%xmm0 + vpmuludq %xmm6,%xmm13,%xmm15 + vpaddq %xmm15,%xmm1,%xmm1 + vpmuludq %xmm6,%xmm2,%xmm15 + vpaddq %xmm15,%xmm14,%xmm14 + vpunpcklqdq %xmm7,%xmm8,%xmm15 + vpunpckhqdq %xmm7,%xmm8,%xmm7 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm8 + vpsubq %xmm15,%xmm8,%xmm8 + vpaddq %xmm7,%xmm15,%xmm15 + vpunpckhqdq %xmm15,%xmm8,%xmm7 + vpunpcklqdq %xmm15,%xmm8,%xmm8 + vmovdqa %xmm8,112(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm8,%xmm8 + vmovdqa %xmm8,448(%rsp) + vpmuludq 112(%rsp),%xmm10,%xmm8 + vpmuludq %xmm7,%xmm10,%xmm10 + vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm7,%xmm15 + vpmuludq %xmm15,%xmm7,%xmm7 + vpaddq %xmm7,%xmm8,%xmm8 + vpmuludq %xmm15,%xmm13,%xmm7 + vpaddq %xmm7,%xmm3,%xmm3 + vpmuludq %xmm15,%xmm2,%xmm7 + vpaddq %xmm7,%xmm11,%xmm11 + vpmuludq 80(%rsp),%xmm13,%xmm7 + vpaddq %xmm7,%xmm7,%xmm7 + vpaddq %xmm7,%xmm8,%xmm8 + vpmuludq 16(%rsp),%xmm13,%xmm7 + vpaddq %xmm7,%xmm5,%xmm5 + vpmuludq 48(%rsp),%xmm13,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 112(%rsp),%xmm13,%xmm7 + vpaddq %xmm7,%xmm10,%xmm10 + vpmuludq %xmm15,%xmm6,%xmm7 + vpaddq %xmm7,%xmm12,%xmm12 + vpmuludq %xmm15,%xmm4,%xmm7 + vpaddq %xmm7,%xmm9,%xmm9 + vpaddq %xmm2,%xmm2,%xmm2 + vpmuludq %xmm4,%xmm2,%xmm7 + vpaddq %xmm7,%xmm5,%xmm5 + vpmuludq 448(%rsp),%xmm2,%xmm7 
+ vpaddq %xmm7,%xmm3,%xmm3 + vpmuludq 448(%rsp),%xmm6,%xmm7 + vpaddq %xmm7,%xmm11,%xmm11 + vpmuludq 0(%rsp),%xmm2,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 48(%rsp),%xmm2,%xmm7 + vpaddq %xmm7,%xmm8,%xmm8 + vpmuludq 80(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpmuludq 96(%rsp),%xmm4,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq %xmm4,%xmm4,%xmm2 + vpaddq %xmm2,%xmm8,%xmm8 + vpaddq %xmm4,%xmm4,%xmm2 + vpmuludq 448(%rsp),%xmm2,%xmm4 + vpaddq %xmm4,%xmm12,%xmm12 + vpmuludq 16(%rsp),%xmm15,%xmm4 + vpaddq %xmm4,%xmm1,%xmm1 + vpmuludq 48(%rsp),%xmm15,%xmm4 + vpaddq %xmm4,%xmm14,%xmm14 + vpmuludq 96(%rsp),%xmm6,%xmm4 + vpaddq %xmm4,%xmm3,%xmm3 + vmovdqa 16(%rsp),%xmm4 + vpmuludq 448(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm9,%xmm9 + vpmuludq 16(%rsp),%xmm6,%xmm4 + vpaddq %xmm4,%xmm8,%xmm8 + vpmuludq 48(%rsp),%xmm6,%xmm4 + vpaddq %xmm4,%xmm10,%xmm10 + vpmuludq 80(%rsp),%xmm15,%xmm4 + vpaddq %xmm4,%xmm4,%xmm4 + vpaddq %xmm4,%xmm5,%xmm5 + vpmuludq 112(%rsp),%xmm15,%xmm4 + vpaddq %xmm4,%xmm0,%xmm0 + vmovdqa 48(%rsp),%xmm4 + vpaddq %xmm4,%xmm4,%xmm4 + vpmuludq 448(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm1,%xmm1 + vmovdqa 80(%rsp),%xmm4 + vpaddq %xmm4,%xmm4,%xmm4 + vpmuludq 448(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm14,%xmm14 + vpmuludq 64(%rsp),%xmm2,%xmm4 + vpaddq %xmm4,%xmm3,%xmm3 + vmovdqa 16(%rsp),%xmm4 + vpmuludq 64(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm11,%xmm11 + vmovdqa 16(%rsp),%xmm4 + vpmuludq 96(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm12,%xmm12 + vmovdqa 48(%rsp),%xmm4 + vpmuludq 96(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm9,%xmm9 + vpmuludq 0(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vmovdqa 32(%rsp),%xmm2 + vpmuludq 0(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm3,%xmm3 + vmovdqa 64(%rsp),%xmm2 + vpmuludq 48(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vmovdqa 96(%rsp),%xmm2 + vpmuludq 80(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm1,%xmm1 + vmovdqa 448(%rsp),%xmm2 + vpmuludq 112(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpsrlq $26,%xmm3,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3 + vpsrlq $25,%xmm14,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14 + vpsrlq $25,%xmm11,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpand curve25519_sandy2x_m25(%rip),%xmm11,%xmm11 + vpsrlq $26,%xmm5,%xmm2 + vpaddq %xmm2,%xmm0,%xmm0 + vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5 + vpsrlq $26,%xmm12,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vpand curve25519_sandy2x_m26(%rip),%xmm12,%xmm12 + vpsrlq $25,%xmm0,%xmm2 + vpaddq %xmm2,%xmm8,%xmm8 + vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0 + vpsrlq $25,%xmm9,%xmm2 + vpaddq %xmm2,%xmm1,%xmm1 + vpand curve25519_sandy2x_m25(%rip),%xmm9,%xmm9 + vpsrlq $26,%xmm8,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8 + vpsrlq $26,%xmm1,%xmm2 + vpaddq %xmm2,%xmm14,%xmm14 + vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 + vpsrlq $25,%xmm10,%xmm2 + vpsllq $4,%xmm2,%xmm4 + vpaddq %xmm2,%xmm3,%xmm3 + vpsllq $1,%xmm2,%xmm2 + vpaddq %xmm2,%xmm4,%xmm4 + vpaddq %xmm4,%xmm3,%xmm3 + vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 + vpsrlq $25,%xmm14,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14 + vpsrlq $26,%xmm3,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3 + vpunpckhqdq %xmm11,%xmm3,%xmm2 + vmovdqa %xmm2,0(%rsp) + vpshufd $0,%xmm3,%xmm2 + vpshufd $0,%xmm11,%xmm3 + vpmuludq 160(%rsp),%xmm2,%xmm4 + vpmuludq 432(%rsp),%xmm3,%xmm6 + vpaddq %xmm6,%xmm4,%xmm4 + vpmuludq 176(%rsp),%xmm2,%xmm6 + vpmuludq 304(%rsp),%xmm3,%xmm7 + 
vpaddq %xmm7,%xmm6,%xmm6 + vpmuludq 208(%rsp),%xmm2,%xmm7 + vpmuludq 336(%rsp),%xmm3,%xmm11 + vpaddq %xmm11,%xmm7,%xmm7 + vpmuludq 240(%rsp),%xmm2,%xmm11 + vpmuludq 368(%rsp),%xmm3,%xmm13 + vpaddq %xmm13,%xmm11,%xmm11 + vpmuludq 272(%rsp),%xmm2,%xmm2 + vpmuludq 400(%rsp),%xmm3,%xmm3 + vpaddq %xmm3,%xmm2,%xmm2 + vpunpckhqdq %xmm9,%xmm12,%xmm3 + vmovdqa %xmm3,16(%rsp) + vpshufd $0,%xmm12,%xmm3 + vpshufd $0,%xmm9,%xmm9 + vpmuludq 288(%rsp),%xmm3,%xmm12 + vpaddq %xmm12,%xmm4,%xmm4 + vpmuludq 416(%rsp),%xmm9,%xmm12 + vpaddq %xmm12,%xmm4,%xmm4 + vpmuludq 160(%rsp),%xmm3,%xmm12 + vpaddq %xmm12,%xmm6,%xmm6 + vpmuludq 432(%rsp),%xmm9,%xmm12 + vpaddq %xmm12,%xmm6,%xmm6 + vpmuludq 176(%rsp),%xmm3,%xmm12 + vpaddq %xmm12,%xmm7,%xmm7 + vpmuludq 304(%rsp),%xmm9,%xmm12 + vpaddq %xmm12,%xmm7,%xmm7 + vpmuludq 208(%rsp),%xmm3,%xmm12 + vpaddq %xmm12,%xmm11,%xmm11 + vpmuludq 336(%rsp),%xmm9,%xmm12 + vpaddq %xmm12,%xmm11,%xmm11 + vpmuludq 240(%rsp),%xmm3,%xmm3 + vpaddq %xmm3,%xmm2,%xmm2 + vpmuludq 368(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm1,%xmm3 + vmovdqa %xmm3,32(%rsp) + vpshufd $0,%xmm1,%xmm1 + vpshufd $0,%xmm14,%xmm3 + vpmuludq 256(%rsp),%xmm1,%xmm9 + vpaddq %xmm9,%xmm4,%xmm4 + vpmuludq 384(%rsp),%xmm3,%xmm9 + vpaddq %xmm9,%xmm4,%xmm4 + vpmuludq 288(%rsp),%xmm1,%xmm9 + vpaddq %xmm9,%xmm6,%xmm6 + vpmuludq 416(%rsp),%xmm3,%xmm9 + vpaddq %xmm9,%xmm6,%xmm6 + vpmuludq 160(%rsp),%xmm1,%xmm9 + vpaddq %xmm9,%xmm7,%xmm7 + vpmuludq 432(%rsp),%xmm3,%xmm9 + vpaddq %xmm9,%xmm7,%xmm7 + vpmuludq 176(%rsp),%xmm1,%xmm9 + vpaddq %xmm9,%xmm11,%xmm11 + vpmuludq 304(%rsp),%xmm3,%xmm9 + vpaddq %xmm9,%xmm11,%xmm11 + vpmuludq 208(%rsp),%xmm1,%xmm1 + vpaddq %xmm1,%xmm2,%xmm2 + vpmuludq 336(%rsp),%xmm3,%xmm1 + vpaddq %xmm1,%xmm2,%xmm2 + vpunpckhqdq %xmm0,%xmm5,%xmm1 + vmovdqa %xmm1,48(%rsp) + vpshufd $0,%xmm5,%xmm1 + vpshufd $0,%xmm0,%xmm0 + vpmuludq 224(%rsp),%xmm1,%xmm3 + vpaddq %xmm3,%xmm4,%xmm4 + vpmuludq 352(%rsp),%xmm0,%xmm3 + vpaddq %xmm3,%xmm4,%xmm4 + vpmuludq 256(%rsp),%xmm1,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpmuludq 384(%rsp),%xmm0,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpmuludq 288(%rsp),%xmm1,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq 416(%rsp),%xmm0,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq 160(%rsp),%xmm1,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 432(%rsp),%xmm0,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 176(%rsp),%xmm1,%xmm1 + vpaddq %xmm1,%xmm2,%xmm2 + vpmuludq 304(%rsp),%xmm0,%xmm0 + vpaddq %xmm0,%xmm2,%xmm2 + vpunpckhqdq %xmm10,%xmm8,%xmm0 + vmovdqa %xmm0,64(%rsp) + vpshufd $0,%xmm8,%xmm0 + vpshufd $0,%xmm10,%xmm1 + vpmuludq 192(%rsp),%xmm0,%xmm3 + vpaddq %xmm3,%xmm4,%xmm4 + vpmuludq 320(%rsp),%xmm1,%xmm3 + vpaddq %xmm3,%xmm4,%xmm4 + vpmuludq 224(%rsp),%xmm0,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpmuludq 352(%rsp),%xmm1,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpmuludq 256(%rsp),%xmm0,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq 384(%rsp),%xmm1,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq 288(%rsp),%xmm0,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 416(%rsp),%xmm1,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 160(%rsp),%xmm0,%xmm0 + vpaddq %xmm0,%xmm2,%xmm2 + vpmuludq 432(%rsp),%xmm1,%xmm0 + vpaddq %xmm0,%xmm2,%xmm2 + vmovdqa %xmm4,80(%rsp) + vmovdqa %xmm6,96(%rsp) + vmovdqa %xmm7,112(%rsp) + vmovdqa %xmm11,448(%rsp) + vmovdqa %xmm2,496(%rsp) + vmovdqa 144(%rsp),%xmm0 + vpmuludq %xmm0,%xmm0,%xmm1 + vpaddq %xmm0,%xmm0,%xmm0 + vmovdqa 128(%rsp),%xmm2 + vpmuludq %xmm2,%xmm0,%xmm3 + vmovdqa 480(%rsp),%xmm4 + vpmuludq %xmm4,%xmm0,%xmm5 + vmovdqa 464(%rsp),%xmm6 + vpmuludq %xmm6,%xmm0,%xmm7 + 
vmovdqa 528(%rsp),%xmm8 + vpmuludq %xmm8,%xmm0,%xmm9 + vpmuludq 512(%rsp),%xmm0,%xmm10 + vpmuludq 592(%rsp),%xmm0,%xmm11 + vpmuludq 576(%rsp),%xmm0,%xmm12 + vpmuludq 624(%rsp),%xmm0,%xmm13 + vmovdqa 672(%rsp),%xmm14 + vpmuludq %xmm14,%xmm0,%xmm0 + vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm14,%xmm15 + vpmuludq %xmm15,%xmm14,%xmm14 + vpaddq %xmm14,%xmm13,%xmm13 + vpaddq %xmm6,%xmm6,%xmm14 + vpmuludq %xmm14,%xmm6,%xmm6 + vpaddq %xmm6,%xmm11,%xmm11 + vpaddq %xmm2,%xmm2,%xmm6 + vpmuludq %xmm6,%xmm2,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpmuludq %xmm15,%xmm6,%xmm2 + vpaddq %xmm2,%xmm1,%xmm1 + vpmuludq %xmm15,%xmm4,%xmm2 + vpaddq %xmm2,%xmm3,%xmm3 + vpmuludq 544(%rsp),%xmm6,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq 592(%rsp),%xmm6,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpmuludq 640(%rsp),%xmm6,%xmm2 + vpaddq %xmm2,%xmm13,%xmm13 + vpmuludq 624(%rsp),%xmm6,%xmm2 + vpaddq %xmm2,%xmm0,%xmm0 + vpmuludq %xmm4,%xmm6,%xmm2 + vpaddq %xmm2,%xmm7,%xmm7 + vpmuludq %xmm14,%xmm6,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vpmuludq %xmm8,%xmm6,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpmuludq %xmm15,%xmm14,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpmuludq %xmm15,%xmm8,%xmm2 + vpaddq %xmm2,%xmm7,%xmm7 + vpmuludq %xmm4,%xmm4,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vpmuludq %xmm14,%xmm4,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpaddq %xmm4,%xmm4,%xmm2 + vpmuludq %xmm8,%xmm2,%xmm4 + vpaddq %xmm4,%xmm11,%xmm11 + vpmuludq 688(%rsp),%xmm2,%xmm4 + vpaddq %xmm4,%xmm1,%xmm1 + vpmuludq 688(%rsp),%xmm14,%xmm4 + vpaddq %xmm4,%xmm3,%xmm3 + vpmuludq 512(%rsp),%xmm2,%xmm4 + vpaddq %xmm4,%xmm12,%xmm12 + vpmuludq 592(%rsp),%xmm2,%xmm4 + vpaddq %xmm4,%xmm13,%xmm13 + vpmuludq 576(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm0,%xmm0 + vpmuludq 656(%rsp),%xmm8,%xmm2 + vpaddq %xmm2,%xmm3,%xmm3 + vpmuludq %xmm8,%xmm14,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpmuludq %xmm8,%xmm8,%xmm2 + vpaddq %xmm2,%xmm13,%xmm13 + vpaddq %xmm8,%xmm8,%xmm2 + vpmuludq 688(%rsp),%xmm2,%xmm4 + vpaddq %xmm4,%xmm5,%xmm5 + vpmuludq 544(%rsp),%xmm15,%xmm4 + vpaddq %xmm4,%xmm9,%xmm9 + vpmuludq 592(%rsp),%xmm15,%xmm4 + vpaddq %xmm4,%xmm10,%xmm10 + vpmuludq 656(%rsp),%xmm14,%xmm4 + vpaddq %xmm4,%xmm1,%xmm1 + vmovdqa 544(%rsp),%xmm4 + vpmuludq 688(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm7,%xmm7 + vpmuludq 544(%rsp),%xmm14,%xmm4 + vpaddq %xmm4,%xmm13,%xmm13 + vpmuludq 592(%rsp),%xmm14,%xmm4 + vpaddq %xmm4,%xmm0,%xmm0 + vpmuludq 640(%rsp),%xmm15,%xmm4 + vpaddq %xmm4,%xmm11,%xmm11 + vpmuludq 624(%rsp),%xmm15,%xmm4 + vpaddq %xmm4,%xmm12,%xmm12 + vmovdqa 592(%rsp),%xmm4 + vpaddq %xmm4,%xmm4,%xmm4 + vpmuludq 688(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm9,%xmm9 + vpmuludq 608(%rsp),%xmm2,%xmm4 + vpaddq %xmm4,%xmm1,%xmm1 + vmovdqa 544(%rsp),%xmm4 + vpmuludq 608(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm3,%xmm3 + vmovdqa 544(%rsp),%xmm4 + vpmuludq 656(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm5,%xmm5 + vmovdqa 592(%rsp),%xmm4 + vpmuludq 656(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm7,%xmm7 + vmovdqa 640(%rsp),%xmm4 + vpmuludq 688(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm10,%xmm10 + vpmuludq 512(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm0,%xmm0 + vmovdqa 560(%rsp),%xmm2 + vpmuludq 512(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm1,%xmm1 + vmovdqa 608(%rsp),%xmm2 + vpmuludq 592(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vmovdqa 656(%rsp),%xmm2 + vpmuludq 576(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vmovdqa 688(%rsp),%xmm2 + vpmuludq 624(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpsrlq $26,%xmm1,%xmm2 + vpaddq %xmm2,%xmm3,%xmm3 + vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 + vpsrlq $25,%xmm10,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + 
vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 + vpsrlq $25,%xmm3,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3 + vpsrlq $26,%xmm11,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 + vpsrlq $26,%xmm5,%xmm2 + vpaddq %xmm2,%xmm7,%xmm7 + vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5 + vpsrlq $25,%xmm12,%xmm2 + vpaddq %xmm2,%xmm13,%xmm13 + vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12 + vpsrlq $25,%xmm7,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7 + vpsrlq $26,%xmm13,%xmm2 + vpaddq %xmm2,%xmm0,%xmm0 + vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13 + vpsrlq $26,%xmm9,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9 + vpsrlq $25,%xmm0,%xmm2 + vpsllq $4,%xmm2,%xmm4 + vpaddq %xmm2,%xmm1,%xmm1 + vpsllq $1,%xmm2,%xmm2 + vpaddq %xmm2,%xmm4,%xmm4 + vpaddq %xmm4,%xmm1,%xmm1 + vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0 + vpsrlq $25,%xmm10,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 + vpsrlq $26,%xmm1,%xmm2 + vpaddq %xmm2,%xmm3,%xmm3 + vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 + vpunpckhqdq %xmm3,%xmm1,%xmm2 + vpunpcklqdq %xmm3,%xmm1,%xmm1 + vmovdqa %xmm1,464(%rsp) + vpaddq curve25519_sandy2x_subc0(%rip),%xmm2,%xmm3 + vpsubq %xmm1,%xmm3,%xmm3 + vpunpckhqdq %xmm3,%xmm2,%xmm1 + vpunpcklqdq %xmm3,%xmm2,%xmm2 + vmovdqa %xmm2,480(%rsp) + vmovdqa %xmm1,512(%rsp) + vpsllq $1,%xmm1,%xmm1 + vmovdqa %xmm1,528(%rsp) + vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm3,%xmm3 + vmovdqa 80(%rsp),%xmm1 + vpunpcklqdq %xmm1,%xmm3,%xmm2 + vpunpckhqdq %xmm1,%xmm3,%xmm1 + vpunpckhqdq %xmm7,%xmm5,%xmm3 + vpunpcklqdq %xmm7,%xmm5,%xmm4 + vmovdqa %xmm4,544(%rsp) + vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm5 + vpsubq %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm5,%xmm3,%xmm4 + vpunpcklqdq %xmm5,%xmm3,%xmm3 + vmovdqa %xmm3,560(%rsp) + vmovdqa %xmm4,576(%rsp) + vpsllq $1,%xmm4,%xmm4 + vmovdqa %xmm4,592(%rsp) + vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm5,%xmm5 + vmovdqa 96(%rsp),%xmm3 + vpunpcklqdq %xmm3,%xmm5,%xmm4 + vpunpckhqdq %xmm3,%xmm5,%xmm3 + vpunpckhqdq %xmm10,%xmm9,%xmm5 + vpunpcklqdq %xmm10,%xmm9,%xmm6 + vmovdqa %xmm6,608(%rsp) + vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm7 + vpsubq %xmm6,%xmm7,%xmm7 + vpunpckhqdq %xmm7,%xmm5,%xmm6 + vpunpcklqdq %xmm7,%xmm5,%xmm5 + vmovdqa %xmm5,624(%rsp) + vmovdqa %xmm6,640(%rsp) + vpsllq $1,%xmm6,%xmm6 + vmovdqa %xmm6,656(%rsp) + vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm7,%xmm7 + vmovdqa 112(%rsp),%xmm5 + vpunpcklqdq %xmm5,%xmm7,%xmm6 + vpunpckhqdq %xmm5,%xmm7,%xmm5 + vpunpckhqdq %xmm12,%xmm11,%xmm7 + vpunpcklqdq %xmm12,%xmm11,%xmm8 + vmovdqa %xmm8,672(%rsp) + vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm9 + vpsubq %xmm8,%xmm9,%xmm9 + vpunpckhqdq %xmm9,%xmm7,%xmm8 + vpunpcklqdq %xmm9,%xmm7,%xmm7 + vmovdqa %xmm7,688(%rsp) + vmovdqa %xmm8,704(%rsp) + vpsllq $1,%xmm8,%xmm8 + vmovdqa %xmm8,720(%rsp) + vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm9,%xmm9 + vmovdqa 448(%rsp),%xmm7 + vpunpcklqdq %xmm7,%xmm9,%xmm8 + vpunpckhqdq %xmm7,%xmm9,%xmm7 + vpunpckhqdq %xmm0,%xmm13,%xmm9 + vpunpcklqdq %xmm0,%xmm13,%xmm0 + vmovdqa %xmm0,448(%rsp) + vpaddq curve25519_sandy2x_subc2(%rip),%xmm9,%xmm10 + vpsubq %xmm0,%xmm10,%xmm10 + vpunpckhqdq %xmm10,%xmm9,%xmm0 + vpunpcklqdq %xmm10,%xmm9,%xmm9 + vmovdqa %xmm9,736(%rsp) + vmovdqa %xmm0,752(%rsp) + vpsllq $1,%xmm0,%xmm0 + vmovdqa %xmm0,768(%rsp) + vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm10,%xmm10 + vmovdqa 
496(%rsp),%xmm0 + vpunpcklqdq %xmm0,%xmm10,%xmm9 + vpunpckhqdq %xmm0,%xmm10,%xmm0 + vpsrlq $26,%xmm2,%xmm10 + vpaddq %xmm10,%xmm1,%xmm1 + vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2 + vpsrlq $25,%xmm5,%xmm10 + vpaddq %xmm10,%xmm8,%xmm8 + vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 + vpsrlq $25,%xmm1,%xmm10 + vpaddq %xmm10,%xmm4,%xmm4 + vpand curve25519_sandy2x_m25(%rip),%xmm1,%xmm1 + vpsrlq $26,%xmm8,%xmm10 + vpaddq %xmm10,%xmm7,%xmm7 + vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8 + vpsrlq $26,%xmm4,%xmm10 + vpaddq %xmm10,%xmm3,%xmm3 + vpand curve25519_sandy2x_m26(%rip),%xmm4,%xmm4 + vpsrlq $25,%xmm7,%xmm10 + vpaddq %xmm10,%xmm9,%xmm9 + vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7 + vpsrlq $25,%xmm3,%xmm10 + vpaddq %xmm10,%xmm6,%xmm6 + vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3 + vpsrlq $26,%xmm9,%xmm10 + vpaddq %xmm10,%xmm0,%xmm0 + vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9 + vpsrlq $26,%xmm6,%xmm10 + vpaddq %xmm10,%xmm5,%xmm5 + vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 + vpsrlq $25,%xmm0,%xmm10 + vpsllq $4,%xmm10,%xmm11 + vpaddq %xmm10,%xmm2,%xmm2 + vpsllq $1,%xmm10,%xmm10 + vpaddq %xmm10,%xmm11,%xmm11 + vpaddq %xmm11,%xmm2,%xmm2 + vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0 + vpsrlq $25,%xmm5,%xmm10 + vpaddq %xmm10,%xmm8,%xmm8 + vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 + vpsrlq $26,%xmm2,%xmm10 + vpaddq %xmm10,%xmm1,%xmm1 + vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2 + vpunpckhqdq %xmm1,%xmm2,%xmm10 + vmovdqa %xmm10,80(%rsp) + vpunpcklqdq %xmm1,%xmm2,%xmm1 + vpunpckhqdq %xmm3,%xmm4,%xmm2 + vmovdqa %xmm2,96(%rsp) + vpunpcklqdq %xmm3,%xmm4,%xmm2 + vpunpckhqdq %xmm5,%xmm6,%xmm3 + vmovdqa %xmm3,112(%rsp) + vpunpcklqdq %xmm5,%xmm6,%xmm3 + vpunpckhqdq %xmm7,%xmm8,%xmm4 + vmovdqa %xmm4,128(%rsp) + vpunpcklqdq %xmm7,%xmm8,%xmm4 + vpunpckhqdq %xmm0,%xmm9,%xmm5 + vmovdqa %xmm5,144(%rsp) + vpunpcklqdq %xmm0,%xmm9,%xmm0 + vmovdqa 464(%rsp),%xmm5 + vpaddq %xmm5,%xmm1,%xmm1 + vpunpcklqdq %xmm1,%xmm5,%xmm6 + vpunpckhqdq %xmm1,%xmm5,%xmm1 + vpmuludq 512(%rsp),%xmm6,%xmm5 + vpmuludq 480(%rsp),%xmm1,%xmm7 + vpaddq %xmm7,%xmm5,%xmm5 + vpmuludq 560(%rsp),%xmm6,%xmm7 + vpmuludq 528(%rsp),%xmm1,%xmm8 + vpaddq %xmm8,%xmm7,%xmm7 + vpmuludq 576(%rsp),%xmm6,%xmm8 + vpmuludq 560(%rsp),%xmm1,%xmm9 + vpaddq %xmm9,%xmm8,%xmm8 + vpmuludq 624(%rsp),%xmm6,%xmm9 + vpmuludq 592(%rsp),%xmm1,%xmm10 + vpaddq %xmm10,%xmm9,%xmm9 + vpmuludq 640(%rsp),%xmm6,%xmm10 + vpmuludq 624(%rsp),%xmm1,%xmm11 + vpaddq %xmm11,%xmm10,%xmm10 + vpmuludq 688(%rsp),%xmm6,%xmm11 + vpmuludq 656(%rsp),%xmm1,%xmm12 + vpaddq %xmm12,%xmm11,%xmm11 + vpmuludq 704(%rsp),%xmm6,%xmm12 + vpmuludq 688(%rsp),%xmm1,%xmm13 + vpaddq %xmm13,%xmm12,%xmm12 + vpmuludq 736(%rsp),%xmm6,%xmm13 + vpmuludq 720(%rsp),%xmm1,%xmm14 + vpaddq %xmm14,%xmm13,%xmm13 + vpmuludq 752(%rsp),%xmm6,%xmm14 + vpmuludq 736(%rsp),%xmm1,%xmm15 + vpaddq %xmm15,%xmm14,%xmm14 + vpmuludq 480(%rsp),%xmm6,%xmm6 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 + vpmuludq 768(%rsp),%xmm1,%xmm1 + vpaddq %xmm1,%xmm6,%xmm6 + vmovdqa 544(%rsp),%xmm1 + vpaddq %xmm1,%xmm2,%xmm2 + vpunpcklqdq %xmm2,%xmm1,%xmm15 + vpunpckhqdq %xmm2,%xmm1,%xmm1 + vpmuludq 480(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm7,%xmm7 + vpmuludq 512(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm8,%xmm8 + vpmuludq 560(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vpmuludq 576(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpmuludq 624(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq 640(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpmuludq 688(%rsp),%xmm15,%xmm2 + vpaddq 
%xmm2,%xmm13,%xmm13 + vpmuludq 704(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm15,%xmm15 + vpmuludq 736(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm6,%xmm6 + vpmuludq 752(%rsp),%xmm15,%xmm15 + vpaddq %xmm15,%xmm5,%xmm5 + vpmuludq 480(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm8,%xmm8 + vpmuludq 528(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vpmuludq 560(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpmuludq 592(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq 624(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpmuludq 656(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm13,%xmm13 + vpmuludq 688(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 + vpmuludq 720(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm6,%xmm6 + vpmuludq 736(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpmuludq 768(%rsp),%xmm1,%xmm1 + vpaddq %xmm1,%xmm7,%xmm7 + vmovdqa 608(%rsp),%xmm1 + vpaddq %xmm1,%xmm3,%xmm3 + vpunpcklqdq %xmm3,%xmm1,%xmm2 + vpunpckhqdq %xmm3,%xmm1,%xmm1 + vpmuludq 480(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm9,%xmm9 + vpmuludq 512(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm10,%xmm10 + vpmuludq 560(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 576(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm12,%xmm12 + vpmuludq 624(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpmuludq 640(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2 + vpmuludq 688(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpmuludq 704(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpmuludq 736(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq 752(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm8,%xmm8 + vpmuludq 480(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpmuludq 528(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq 560(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpmuludq 592(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm13,%xmm13 + vpmuludq 624(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 + vpmuludq 656(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm6,%xmm6 + vpmuludq 688(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpmuludq 720(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm7,%xmm7 + vpmuludq 736(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm8,%xmm8 + vpmuludq 768(%rsp),%xmm1,%xmm1 + vpaddq %xmm1,%xmm9,%xmm9 + vmovdqa 672(%rsp),%xmm1 + vpaddq %xmm1,%xmm4,%xmm4 + vpunpcklqdq %xmm4,%xmm1,%xmm2 + vpunpckhqdq %xmm4,%xmm1,%xmm1 + vpmuludq 480(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 512(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm12,%xmm12 + vpmuludq 560(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpmuludq 576(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2 + vpmuludq 624(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpmuludq 640(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpmuludq 688(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq 704(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm8,%xmm8 + vpmuludq 736(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm9,%xmm9 + vpmuludq 752(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpmuludq 480(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpmuludq 528(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm13,%xmm13 + vpmuludq 560(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 + vpmuludq 592(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm6,%xmm6 + vpmuludq 624(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpmuludq 656(%rsp),%xmm1,%xmm2 + 
vpaddq %xmm2,%xmm7,%xmm7 + vpmuludq 688(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm8,%xmm8 + vpmuludq 720(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vpmuludq 736(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpmuludq 768(%rsp),%xmm1,%xmm1 + vpaddq %xmm1,%xmm11,%xmm11 + vmovdqa 448(%rsp),%xmm1 + vpaddq %xmm1,%xmm0,%xmm0 + vpunpcklqdq %xmm0,%xmm1,%xmm2 + vpunpckhqdq %xmm0,%xmm1,%xmm0 + vpmuludq 480(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm13,%xmm13 + vpmuludq 512(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2 + vpmuludq 560(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm6,%xmm6 + vpmuludq 576(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm5,%xmm5 + vpmuludq 624(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm7,%xmm7 + vpmuludq 640(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm8,%xmm8 + vpmuludq 688(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm9,%xmm9 + vpmuludq 704(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm10,%xmm10 + vpmuludq 736(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm11,%xmm11 + vpmuludq 752(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpmuludq 480(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm0,%xmm0 + vpmuludq 528(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm6,%xmm6 + vpmuludq 560(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm5,%xmm5 + vpmuludq 592(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm7,%xmm7 + vpmuludq 624(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm8,%xmm8 + vpmuludq 656(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm9,%xmm9 + vpmuludq 688(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm10,%xmm10 + vpmuludq 720(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm11,%xmm11 + vpmuludq 736(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm12,%xmm12 + vpmuludq 768(%rsp),%xmm0,%xmm0 + vpaddq %xmm0,%xmm13,%xmm13 + vpsrlq $26,%xmm6,%xmm0 + vpaddq %xmm0,%xmm5,%xmm5 + vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 + vpsrlq $25,%xmm10,%xmm0 + vpaddq %xmm0,%xmm11,%xmm11 + vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 + vpsrlq $25,%xmm5,%xmm0 + vpaddq %xmm0,%xmm7,%xmm7 + vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 + vpsrlq $26,%xmm11,%xmm0 + vpaddq %xmm0,%xmm12,%xmm12 + vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 + vpsrlq $26,%xmm7,%xmm0 + vpaddq %xmm0,%xmm8,%xmm8 + vpand curve25519_sandy2x_m26(%rip),%xmm7,%xmm7 + vpsrlq $25,%xmm12,%xmm0 + vpaddq %xmm0,%xmm13,%xmm13 + vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12 + vpsrlq $25,%xmm8,%xmm0 + vpaddq %xmm0,%xmm9,%xmm9 + vpand curve25519_sandy2x_m25(%rip),%xmm8,%xmm8 + vpsrlq $26,%xmm13,%xmm0 + vpaddq %xmm0,%xmm14,%xmm14 + vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13 + vpsrlq $26,%xmm9,%xmm0 + vpaddq %xmm0,%xmm10,%xmm10 + vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9 + vpsrlq $25,%xmm14,%xmm0 + vpsllq $4,%xmm0,%xmm1 + vpaddq %xmm0,%xmm6,%xmm6 + vpsllq $1,%xmm0,%xmm0 + vpaddq %xmm0,%xmm1,%xmm1 + vpaddq %xmm1,%xmm6,%xmm6 + vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14 + vpsrlq $25,%xmm10,%xmm0 + vpaddq %xmm0,%xmm11,%xmm11 + vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 + vpsrlq $26,%xmm6,%xmm0 + vpaddq %xmm0,%xmm5,%xmm5 + vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 + vpunpckhqdq %xmm5,%xmm6,%xmm1 + vpunpcklqdq %xmm5,%xmm6,%xmm0 + vpunpckhqdq %xmm8,%xmm7,%xmm3 + vpunpcklqdq %xmm8,%xmm7,%xmm2 + vpunpckhqdq %xmm10,%xmm9,%xmm5 + vpunpcklqdq %xmm10,%xmm9,%xmm4 + vpunpckhqdq %xmm12,%xmm11,%xmm7 + vpunpcklqdq %xmm12,%xmm11,%xmm6 + vpunpckhqdq %xmm14,%xmm13,%xmm9 + vpunpcklqdq %xmm14,%xmm13,%xmm8 + cmp $0,%rdx + jne .Lladder_loop + vmovdqu %xmm1,160(%rdi) + vmovdqu %xmm0,80(%rdi) + vmovdqu %xmm3,176(%rdi) + vmovdqu %xmm2,96(%rdi) + vmovdqu %xmm5,192(%rdi) + 
vmovdqu %xmm4,112(%rdi) + vmovdqu %xmm7,208(%rdi) + vmovdqu %xmm6,128(%rdi) + vmovdqu %xmm9,224(%rdi) + vmovdqu %xmm8,144(%rdi) + movq 1824(%rsp),%r11 + movq 1832(%rsp),%r12 + movq 1840(%rsp),%r13 + movq 1848(%rsp),%r14 + leave + ret +ENDPROC(curve25519_sandy2x_ladder) + +.align 32 +ENTRY(curve25519_sandy2x_ladder_base) + push %rbp + mov %rsp,%rbp + sub $1568,%rsp + and $-32,%rsp + movq %r11,1536(%rsp) + movq %r12,1544(%rsp) + movq %r13,1552(%rsp) + vmovdqa curve25519_sandy2x_v0_0(%rip),%xmm0 + vmovdqa curve25519_sandy2x_v1_0(%rip),%xmm1 + vmovdqa curve25519_sandy2x_v9_0(%rip),%xmm2 + vmovdqa %xmm2,0(%rsp) + vmovdqa %xmm0,16(%rsp) + vmovdqa %xmm0,32(%rsp) + vmovdqa %xmm0,48(%rsp) + vmovdqa %xmm0,64(%rsp) + vmovdqa %xmm1,80(%rsp) + vmovdqa %xmm0,96(%rsp) + vmovdqa %xmm0,112(%rsp) + vmovdqa %xmm0,128(%rsp) + vmovdqa %xmm0,144(%rsp) + vmovdqa %xmm1,%xmm0 + vpxor %xmm1,%xmm1,%xmm1 + vpxor %xmm2,%xmm2,%xmm2 + vpxor %xmm3,%xmm3,%xmm3 + vpxor %xmm4,%xmm4,%xmm4 + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + vpxor %xmm8,%xmm8,%xmm8 + vpxor %xmm9,%xmm9,%xmm9 + movq 0(%rsi),%rdx + movq 8(%rsi),%rcx + movq 16(%rsi),%r8 + movq 24(%rsi),%r9 + shrd $1,%rcx,%rdx + shrd $1,%r8,%rcx + shrd $1,%r9,%r8 + shr $1,%r9 + xorq 0(%rsi),%rdx + xorq 8(%rsi),%rcx + xorq 16(%rsi),%r8 + xorq 24(%rsi),%r9 + leaq 512(%rsp),%rsi + mov $64,%rax + + .align 16 + .Lladder_base_small_loop: + mov %rdx,%r10 + mov %rcx,%r11 + mov %r8,%r12 + mov %r9,%r13 + shr $1,%rdx + shr $1,%rcx + shr $1,%r8 + shr $1,%r9 + and $1,%r10d + and $1,%r11d + and $1,%r12d + and $1,%r13d + neg %r10 + neg %r11 + neg %r12 + neg %r13 + movl %r10d,0(%rsi) + movl %r11d,256(%rsi) + movl %r12d,512(%rsi) + movl %r13d,768(%rsi) + add $4,%rsi + sub $1,%rax + jne .Lladder_base_small_loop + mov $255,%rdx + add $760,%rsi + + .align 16 + .Lladder_base_loop: + sub $1,%rdx + vbroadcastss 0(%rsi),%xmm10 + sub $4,%rsi + vmovdqa 0(%rsp),%xmm11 + vmovdqa 80(%rsp),%xmm12 + vpxor %xmm11,%xmm0,%xmm13 + vpand %xmm10,%xmm13,%xmm13 + vpxor %xmm13,%xmm0,%xmm0 + vpxor %xmm13,%xmm11,%xmm11 + vpxor %xmm12,%xmm1,%xmm13 + vpand %xmm10,%xmm13,%xmm13 + vpxor %xmm13,%xmm1,%xmm1 + vpxor %xmm13,%xmm12,%xmm12 + vmovdqa 16(%rsp),%xmm13 + vmovdqa 96(%rsp),%xmm14 + vpxor %xmm13,%xmm2,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm2,%xmm2 + vpxor %xmm15,%xmm13,%xmm13 + vpxor %xmm14,%xmm3,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm3,%xmm3 + vpxor %xmm15,%xmm14,%xmm14 + vmovdqa %xmm13,0(%rsp) + vmovdqa %xmm14,16(%rsp) + vmovdqa 32(%rsp),%xmm13 + vmovdqa 112(%rsp),%xmm14 + vpxor %xmm13,%xmm4,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm4,%xmm4 + vpxor %xmm15,%xmm13,%xmm13 + vpxor %xmm14,%xmm5,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm5,%xmm5 + vpxor %xmm15,%xmm14,%xmm14 + vmovdqa %xmm13,32(%rsp) + vmovdqa %xmm14,80(%rsp) + vmovdqa 48(%rsp),%xmm13 + vmovdqa 128(%rsp),%xmm14 + vpxor %xmm13,%xmm6,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm6,%xmm6 + vpxor %xmm15,%xmm13,%xmm13 + vpxor %xmm14,%xmm7,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm7,%xmm7 + vpxor %xmm15,%xmm14,%xmm14 + vmovdqa %xmm13,48(%rsp) + vmovdqa %xmm14,96(%rsp) + vmovdqa 64(%rsp),%xmm13 + vmovdqa 144(%rsp),%xmm14 + vpxor %xmm13,%xmm8,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm15,%xmm13,%xmm13 + vpxor %xmm14,%xmm9,%xmm15 + vpand %xmm10,%xmm15,%xmm15 + vpxor %xmm15,%xmm9,%xmm9 + vpxor %xmm15,%xmm14,%xmm14 + vmovdqa %xmm13,64(%rsp) + vmovdqa %xmm14,112(%rsp) + vpaddq 
curve25519_sandy2x_subc0(%rip),%xmm11,%xmm10 + vpsubq %xmm12,%xmm10,%xmm10 + vpaddq %xmm12,%xmm11,%xmm11 + vpunpckhqdq %xmm10,%xmm11,%xmm12 + vpunpcklqdq %xmm10,%xmm11,%xmm10 + vpaddq %xmm1,%xmm0,%xmm11 + vpaddq curve25519_sandy2x_subc0(%rip),%xmm0,%xmm0 + vpsubq %xmm1,%xmm0,%xmm0 + vpunpckhqdq %xmm11,%xmm0,%xmm1 + vpunpcklqdq %xmm11,%xmm0,%xmm0 + vpmuludq %xmm0,%xmm10,%xmm11 + vpmuludq %xmm1,%xmm10,%xmm13 + vmovdqa %xmm1,128(%rsp) + vpaddq %xmm1,%xmm1,%xmm1 + vpmuludq %xmm0,%xmm12,%xmm14 + vmovdqa %xmm0,144(%rsp) + vpaddq %xmm14,%xmm13,%xmm13 + vpmuludq %xmm1,%xmm12,%xmm0 + vmovdqa %xmm1,160(%rsp) + vpaddq %xmm3,%xmm2,%xmm1 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm2,%xmm2 + vpsubq %xmm3,%xmm2,%xmm2 + vpunpckhqdq %xmm1,%xmm2,%xmm3 + vpunpcklqdq %xmm1,%xmm2,%xmm1 + vpmuludq %xmm1,%xmm10,%xmm2 + vpaddq %xmm2,%xmm0,%xmm0 + vpmuludq %xmm3,%xmm10,%xmm2 + vmovdqa %xmm3,176(%rsp) + vpaddq %xmm3,%xmm3,%xmm3 + vpmuludq %xmm1,%xmm12,%xmm14 + vmovdqa %xmm1,192(%rsp) + vpaddq %xmm14,%xmm2,%xmm2 + vpmuludq %xmm3,%xmm12,%xmm1 + vmovdqa %xmm3,208(%rsp) + vpaddq %xmm5,%xmm4,%xmm3 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm4,%xmm4 + vpsubq %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm3,%xmm4,%xmm5 + vpunpcklqdq %xmm3,%xmm4,%xmm3 + vpmuludq %xmm3,%xmm10,%xmm4 + vpaddq %xmm4,%xmm1,%xmm1 + vpmuludq %xmm5,%xmm10,%xmm4 + vmovdqa %xmm5,224(%rsp) + vpaddq %xmm5,%xmm5,%xmm5 + vpmuludq %xmm3,%xmm12,%xmm14 + vmovdqa %xmm3,240(%rsp) + vpaddq %xmm14,%xmm4,%xmm4 + vpaddq %xmm7,%xmm6,%xmm3 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm6,%xmm6 + vpsubq %xmm7,%xmm6,%xmm6 + vpunpckhqdq %xmm3,%xmm6,%xmm7 + vpunpcklqdq %xmm3,%xmm6,%xmm3 + vpmuludq %xmm3,%xmm10,%xmm6 + vpmuludq %xmm5,%xmm12,%xmm14 + vmovdqa %xmm5,256(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm5,%xmm5 + vmovdqa %xmm5,272(%rsp) + vpaddq %xmm14,%xmm6,%xmm6 + vpmuludq %xmm7,%xmm10,%xmm5 + vmovdqa %xmm7,288(%rsp) + vpaddq %xmm7,%xmm7,%xmm7 + vpmuludq %xmm3,%xmm12,%xmm14 + vmovdqa %xmm3,304(%rsp) + vpaddq %xmm14,%xmm5,%xmm5 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 + vmovdqa %xmm3,320(%rsp) + vpaddq %xmm9,%xmm8,%xmm3 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm8,%xmm8 + vpsubq %xmm9,%xmm8,%xmm8 + vpunpckhqdq %xmm3,%xmm8,%xmm9 + vpunpcklqdq %xmm3,%xmm8,%xmm3 + vmovdqa %xmm3,336(%rsp) + vpmuludq %xmm7,%xmm12,%xmm8 + vmovdqa %xmm7,352(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm7,%xmm7 + vmovdqa %xmm7,368(%rsp) + vpmuludq %xmm3,%xmm10,%xmm7 + vpaddq %xmm7,%xmm8,%xmm8 + vpmuludq %xmm9,%xmm10,%xmm7 + vmovdqa %xmm9,384(%rsp) + vpaddq %xmm9,%xmm9,%xmm9 + vpmuludq %xmm3,%xmm12,%xmm10 + vpaddq %xmm10,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 + vmovdqa %xmm3,400(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm12,%xmm12 + vpmuludq %xmm9,%xmm12,%xmm3 + vmovdqa %xmm9,416(%rsp) + vpaddq %xmm3,%xmm11,%xmm11 + vmovdqa 0(%rsp),%xmm3 + vmovdqa 16(%rsp),%xmm9 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 + vpsubq %xmm9,%xmm10,%xmm10 + vpaddq %xmm9,%xmm3,%xmm3 + vpunpckhqdq %xmm10,%xmm3,%xmm9 + vpunpcklqdq %xmm10,%xmm3,%xmm3 + vpmuludq 144(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm0,%xmm0 + vpmuludq 128(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm2,%xmm2 + vpmuludq 192(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm1,%xmm1 + vpmuludq 176(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm4,%xmm4 + vpmuludq 240(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm6,%xmm6 + vpmuludq 224(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm5,%xmm5 + vpmuludq 304(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm8,%xmm8 + vpmuludq 288(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm7,%xmm7 + vpmuludq 
curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 + vpmuludq 336(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm11,%xmm11 + vpmuludq 384(%rsp),%xmm3,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpmuludq 144(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm2,%xmm2 + vpmuludq 160(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm1,%xmm1 + vpmuludq 192(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm4,%xmm4 + vpmuludq 208(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpmuludq 240(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpmuludq 256(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm8,%xmm8 + vpmuludq 304(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 + vpmuludq 352(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 336(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpmuludq 416(%rsp),%xmm9,%xmm9 + vpaddq %xmm9,%xmm0,%xmm0 + vmovdqa 32(%rsp),%xmm3 + vmovdqa 80(%rsp),%xmm9 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 + vpsubq %xmm9,%xmm10,%xmm10 + vpaddq %xmm9,%xmm3,%xmm3 + vpunpckhqdq %xmm10,%xmm3,%xmm9 + vpunpcklqdq %xmm10,%xmm3,%xmm3 + vpmuludq 144(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm1,%xmm1 + vpmuludq 128(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm4,%xmm4 + vpmuludq 192(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm6,%xmm6 + vpmuludq 176(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm5,%xmm5 + vpmuludq 240(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm8,%xmm8 + vpmuludq 224(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 + vpmuludq 304(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm11,%xmm11 + vpmuludq 288(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm13,%xmm13 + vpmuludq 336(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm0,%xmm0 + vpmuludq 384(%rsp),%xmm3,%xmm3 + vpaddq %xmm3,%xmm2,%xmm2 + vpmuludq 144(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm4,%xmm4 + vpmuludq 160(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpmuludq 192(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpmuludq 208(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm8,%xmm8 + vpmuludq 240(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 + vpmuludq 256(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 304(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpmuludq 352(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm0,%xmm0 + vpmuludq 336(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm2,%xmm2 + vpmuludq 416(%rsp),%xmm9,%xmm9 + vpaddq %xmm9,%xmm1,%xmm1 + vmovdqa 48(%rsp),%xmm3 + vmovdqa 96(%rsp),%xmm9 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 + vpsubq %xmm9,%xmm10,%xmm10 + vpaddq %xmm9,%xmm3,%xmm3 + vpunpckhqdq %xmm10,%xmm3,%xmm9 + vpunpcklqdq %xmm10,%xmm3,%xmm3 + vpmuludq 144(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm6,%xmm6 + vpmuludq 128(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm5,%xmm5 + vpmuludq 192(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm8,%xmm8 + vpmuludq 176(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 + vpmuludq 240(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm11,%xmm11 + vpmuludq 224(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm13,%xmm13 + vpmuludq 304(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm0,%xmm0 + vpmuludq 288(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm2,%xmm2 + vpmuludq 336(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm1,%xmm1 + vpmuludq 384(%rsp),%xmm3,%xmm3 + vpaddq %xmm3,%xmm4,%xmm4 + vpmuludq 144(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpmuludq 160(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm8,%xmm8 + vpmuludq 192(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 + vpmuludq 
208(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 240(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpmuludq 256(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm0,%xmm0 + vpmuludq 304(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm2,%xmm2 + vpmuludq 352(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm1,%xmm1 + vpmuludq 336(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm4,%xmm4 + vpmuludq 416(%rsp),%xmm9,%xmm9 + vpaddq %xmm9,%xmm6,%xmm6 + vmovdqa 64(%rsp),%xmm3 + vmovdqa 112(%rsp),%xmm9 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm10 + vpsubq %xmm9,%xmm10,%xmm10 + vpaddq %xmm9,%xmm3,%xmm3 + vpunpckhqdq %xmm10,%xmm3,%xmm9 + vpunpcklqdq %xmm10,%xmm3,%xmm3 + vpmuludq 144(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm8,%xmm8 + vpmuludq 128(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm3,%xmm3 + vpmuludq 192(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm11,%xmm11 + vpmuludq 176(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm13,%xmm13 + vpmuludq 240(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm0,%xmm0 + vpmuludq 224(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm2,%xmm2 + vpmuludq 304(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm1,%xmm1 + vpmuludq 288(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm4,%xmm4 + vpmuludq 336(%rsp),%xmm3,%xmm10 + vpaddq %xmm10,%xmm6,%xmm6 + vpmuludq 384(%rsp),%xmm3,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpmuludq 144(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm9,%xmm9 + vpmuludq 160(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 192(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpmuludq 208(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm0,%xmm0 + vpmuludq 240(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm2,%xmm2 + vpmuludq 256(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm1,%xmm1 + vpmuludq 304(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm4,%xmm4 + vpmuludq 352(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpmuludq 336(%rsp),%xmm9,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpmuludq 416(%rsp),%xmm9,%xmm9 + vpaddq %xmm9,%xmm8,%xmm8 + vpsrlq $25,%xmm4,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4 + vpsrlq $26,%xmm11,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 + vpsrlq $26,%xmm6,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 + vpsrlq $25,%xmm13,%xmm3 + vpaddq %xmm3,%xmm0,%xmm0 + vpand curve25519_sandy2x_m25(%rip),%xmm13,%xmm13 + vpsrlq $25,%xmm5,%xmm3 + vpaddq %xmm3,%xmm8,%xmm8 + vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 + vpsrlq $26,%xmm0,%xmm3 + vpaddq %xmm3,%xmm2,%xmm2 + vpand curve25519_sandy2x_m26(%rip),%xmm0,%xmm0 + vpsrlq $26,%xmm8,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8 + vpsrlq $25,%xmm2,%xmm3 + vpaddq %xmm3,%xmm1,%xmm1 + vpand curve25519_sandy2x_m25(%rip),%xmm2,%xmm2 + vpsrlq $25,%xmm7,%xmm3 + vpsllq $4,%xmm3,%xmm9 + vpaddq %xmm3,%xmm11,%xmm11 + vpsllq $1,%xmm3,%xmm3 + vpaddq %xmm3,%xmm9,%xmm9 + vpaddq %xmm9,%xmm11,%xmm11 + vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7 + vpsrlq $26,%xmm1,%xmm3 + vpaddq %xmm3,%xmm4,%xmm4 + vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 + vpsrlq $26,%xmm11,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 + vpsrlq $25,%xmm4,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpand curve25519_sandy2x_m25(%rip),%xmm4,%xmm4 + vpunpcklqdq %xmm13,%xmm11,%xmm3 + vpunpckhqdq %xmm13,%xmm11,%xmm9 + vpaddq curve25519_sandy2x_subc0(%rip),%xmm9,%xmm10 + vpsubq %xmm3,%xmm10,%xmm10 + vpaddq %xmm9,%xmm3,%xmm3 + vpunpckhqdq %xmm3,%xmm10,%xmm9 + vpunpcklqdq %xmm3,%xmm10,%xmm10 + 
vpmuludq %xmm10,%xmm10,%xmm3 + vpaddq %xmm10,%xmm10,%xmm10 + vpmuludq %xmm9,%xmm10,%xmm11 + vpunpcklqdq %xmm2,%xmm0,%xmm12 + vpunpckhqdq %xmm2,%xmm0,%xmm0 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm0,%xmm2 + vpsubq %xmm12,%xmm2,%xmm2 + vpaddq %xmm0,%xmm12,%xmm12 + vpunpckhqdq %xmm12,%xmm2,%xmm0 + vpunpcklqdq %xmm12,%xmm2,%xmm2 + vpmuludq %xmm2,%xmm10,%xmm12 + vpaddq %xmm9,%xmm9,%xmm13 + vpmuludq %xmm13,%xmm9,%xmm9 + vpaddq %xmm9,%xmm12,%xmm12 + vpmuludq %xmm0,%xmm10,%xmm9 + vpmuludq %xmm2,%xmm13,%xmm14 + vpaddq %xmm14,%xmm9,%xmm9 + vpunpcklqdq %xmm4,%xmm1,%xmm14 + vpunpckhqdq %xmm4,%xmm1,%xmm1 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm1,%xmm4 + vpsubq %xmm14,%xmm4,%xmm4 + vpaddq %xmm1,%xmm14,%xmm14 + vpunpckhqdq %xmm14,%xmm4,%xmm1 + vpunpcklqdq %xmm14,%xmm4,%xmm4 + vmovdqa %xmm1,0(%rsp) + vpaddq %xmm1,%xmm1,%xmm1 + vmovdqa %xmm1,16(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 + vmovdqa %xmm1,32(%rsp) + vpmuludq %xmm4,%xmm10,%xmm1 + vpmuludq %xmm2,%xmm2,%xmm14 + vpaddq %xmm14,%xmm1,%xmm1 + vpmuludq 0(%rsp),%xmm10,%xmm14 + vpmuludq %xmm4,%xmm13,%xmm15 + vpaddq %xmm15,%xmm14,%xmm14 + vpunpcklqdq %xmm5,%xmm6,%xmm15 + vpunpckhqdq %xmm5,%xmm6,%xmm5 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm6 + vpsubq %xmm15,%xmm6,%xmm6 + vpaddq %xmm5,%xmm15,%xmm15 + vpunpckhqdq %xmm15,%xmm6,%xmm5 + vpunpcklqdq %xmm15,%xmm6,%xmm6 + vmovdqa %xmm6,48(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm6,%xmm6 + vmovdqa %xmm6,64(%rsp) + vmovdqa %xmm5,80(%rsp) + vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm5,%xmm5 + vmovdqa %xmm5,96(%rsp) + vpmuludq 48(%rsp),%xmm10,%xmm5 + vpaddq %xmm0,%xmm0,%xmm6 + vpmuludq %xmm6,%xmm0,%xmm0 + vpaddq %xmm0,%xmm5,%xmm5 + vpmuludq 80(%rsp),%xmm10,%xmm0 + vpmuludq %xmm4,%xmm6,%xmm15 + vpaddq %xmm15,%xmm0,%xmm0 + vpmuludq %xmm6,%xmm13,%xmm15 + vpaddq %xmm15,%xmm1,%xmm1 + vpmuludq %xmm6,%xmm2,%xmm15 + vpaddq %xmm15,%xmm14,%xmm14 + vpunpcklqdq %xmm7,%xmm8,%xmm15 + vpunpckhqdq %xmm7,%xmm8,%xmm7 + vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm8 + vpsubq %xmm15,%xmm8,%xmm8 + vpaddq %xmm7,%xmm15,%xmm15 + vpunpckhqdq %xmm15,%xmm8,%xmm7 + vpunpcklqdq %xmm15,%xmm8,%xmm8 + vmovdqa %xmm8,112(%rsp) + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm8,%xmm8 + vmovdqa %xmm8,160(%rsp) + vpmuludq 112(%rsp),%xmm10,%xmm8 + vpmuludq %xmm7,%xmm10,%xmm10 + vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm7,%xmm15 + vpmuludq %xmm15,%xmm7,%xmm7 + vpaddq %xmm7,%xmm8,%xmm8 + vpmuludq %xmm15,%xmm13,%xmm7 + vpaddq %xmm7,%xmm3,%xmm3 + vpmuludq %xmm15,%xmm2,%xmm7 + vpaddq %xmm7,%xmm11,%xmm11 + vpmuludq 80(%rsp),%xmm13,%xmm7 + vpaddq %xmm7,%xmm7,%xmm7 + vpaddq %xmm7,%xmm8,%xmm8 + vpmuludq 16(%rsp),%xmm13,%xmm7 + vpaddq %xmm7,%xmm5,%xmm5 + vpmuludq 48(%rsp),%xmm13,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 112(%rsp),%xmm13,%xmm7 + vpaddq %xmm7,%xmm10,%xmm10 + vpmuludq %xmm15,%xmm6,%xmm7 + vpaddq %xmm7,%xmm12,%xmm12 + vpmuludq %xmm15,%xmm4,%xmm7 + vpaddq %xmm7,%xmm9,%xmm9 + vpaddq %xmm2,%xmm2,%xmm2 + vpmuludq %xmm4,%xmm2,%xmm7 + vpaddq %xmm7,%xmm5,%xmm5 + vpmuludq 160(%rsp),%xmm2,%xmm7 + vpaddq %xmm7,%xmm3,%xmm3 + vpmuludq 160(%rsp),%xmm6,%xmm7 + vpaddq %xmm7,%xmm11,%xmm11 + vpmuludq 0(%rsp),%xmm2,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 48(%rsp),%xmm2,%xmm7 + vpaddq %xmm7,%xmm8,%xmm8 + vpmuludq 80(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpmuludq 96(%rsp),%xmm4,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq %xmm4,%xmm4,%xmm2 + vpaddq %xmm2,%xmm8,%xmm8 + vpaddq %xmm4,%xmm4,%xmm2 + vpmuludq 160(%rsp),%xmm2,%xmm4 + vpaddq %xmm4,%xmm12,%xmm12 + vpmuludq 16(%rsp),%xmm15,%xmm4 + vpaddq 
%xmm4,%xmm1,%xmm1 + vpmuludq 48(%rsp),%xmm15,%xmm4 + vpaddq %xmm4,%xmm14,%xmm14 + vpmuludq 96(%rsp),%xmm6,%xmm4 + vpaddq %xmm4,%xmm3,%xmm3 + vmovdqa 16(%rsp),%xmm4 + vpmuludq 160(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm9,%xmm9 + vpmuludq 16(%rsp),%xmm6,%xmm4 + vpaddq %xmm4,%xmm8,%xmm8 + vpmuludq 48(%rsp),%xmm6,%xmm4 + vpaddq %xmm4,%xmm10,%xmm10 + vpmuludq 80(%rsp),%xmm15,%xmm4 + vpaddq %xmm4,%xmm4,%xmm4 + vpaddq %xmm4,%xmm5,%xmm5 + vpmuludq 112(%rsp),%xmm15,%xmm4 + vpaddq %xmm4,%xmm0,%xmm0 + vmovdqa 48(%rsp),%xmm4 + vpaddq %xmm4,%xmm4,%xmm4 + vpmuludq 160(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm1,%xmm1 + vmovdqa 80(%rsp),%xmm4 + vpaddq %xmm4,%xmm4,%xmm4 + vpmuludq 160(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm14,%xmm14 + vpmuludq 64(%rsp),%xmm2,%xmm4 + vpaddq %xmm4,%xmm3,%xmm3 + vmovdqa 16(%rsp),%xmm4 + vpmuludq 64(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm11,%xmm11 + vmovdqa 16(%rsp),%xmm4 + vpmuludq 96(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm12,%xmm12 + vmovdqa 48(%rsp),%xmm4 + vpmuludq 96(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm9,%xmm9 + vpmuludq 0(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vmovdqa 32(%rsp),%xmm2 + vpmuludq 0(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm3,%xmm3 + vmovdqa 64(%rsp),%xmm2 + vpmuludq 48(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vmovdqa 96(%rsp),%xmm2 + vpmuludq 80(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm1,%xmm1 + vmovdqa 160(%rsp),%xmm2 + vpmuludq 112(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpsrlq $26,%xmm3,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3 + vpsrlq $25,%xmm14,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14 + vpsrlq $25,%xmm11,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpand curve25519_sandy2x_m25(%rip),%xmm11,%xmm11 + vpsrlq $26,%xmm5,%xmm2 + vpaddq %xmm2,%xmm0,%xmm0 + vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5 + vpsrlq $26,%xmm12,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vpand curve25519_sandy2x_m26(%rip),%xmm12,%xmm12 + vpsrlq $25,%xmm0,%xmm2 + vpaddq %xmm2,%xmm8,%xmm8 + vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0 + vpsrlq $25,%xmm9,%xmm2 + vpaddq %xmm2,%xmm1,%xmm1 + vpand curve25519_sandy2x_m25(%rip),%xmm9,%xmm9 + vpsrlq $26,%xmm8,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8 + vpsrlq $26,%xmm1,%xmm2 + vpaddq %xmm2,%xmm14,%xmm14 + vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 + vpsrlq $25,%xmm10,%xmm2 + vpsllq $4,%xmm2,%xmm4 + vpaddq %xmm2,%xmm3,%xmm3 + vpsllq $1,%xmm2,%xmm2 + vpaddq %xmm2,%xmm4,%xmm4 + vpaddq %xmm4,%xmm3,%xmm3 + vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 + vpsrlq $25,%xmm14,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14 + vpsrlq $26,%xmm3,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpand curve25519_sandy2x_m26(%rip),%xmm3,%xmm3 + vpunpckhqdq %xmm11,%xmm3,%xmm2 + vmovdqa %xmm2,0(%rsp) + vpunpcklqdq %xmm11,%xmm3,%xmm2 + vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm2,%xmm2 + vmovdqa %xmm2,80(%rsp) + vpunpckhqdq %xmm9,%xmm12,%xmm2 + vmovdqa %xmm2,16(%rsp) + vpunpcklqdq %xmm9,%xmm12,%xmm2 + vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm2,%xmm2 + vmovdqa %xmm2,96(%rsp) + vpunpckhqdq %xmm14,%xmm1,%xmm2 + vmovdqa %xmm2,32(%rsp) + vpunpcklqdq %xmm14,%xmm1,%xmm1 + vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm1,%xmm1 + vmovdqa %xmm1,112(%rsp) + vpunpckhqdq %xmm0,%xmm5,%xmm1 + vmovdqa %xmm1,48(%rsp) + vpunpcklqdq %xmm0,%xmm5,%xmm0 + vpmuludq curve25519_sandy2x_v9_9(%rip),%xmm0,%xmm0 + vmovdqa %xmm0,160(%rsp) + vpunpckhqdq %xmm10,%xmm8,%xmm0 + vmovdqa %xmm0,64(%rsp) + vpunpcklqdq %xmm10,%xmm8,%xmm0 + vpmuludq 
curve25519_sandy2x_v9_9(%rip),%xmm0,%xmm0 + vmovdqa %xmm0,208(%rsp) + vmovdqa 144(%rsp),%xmm0 + vpmuludq %xmm0,%xmm0,%xmm1 + vpaddq %xmm0,%xmm0,%xmm0 + vmovdqa 128(%rsp),%xmm2 + vpmuludq %xmm2,%xmm0,%xmm3 + vmovdqa 192(%rsp),%xmm4 + vpmuludq %xmm4,%xmm0,%xmm5 + vmovdqa 176(%rsp),%xmm6 + vpmuludq %xmm6,%xmm0,%xmm7 + vmovdqa 240(%rsp),%xmm8 + vpmuludq %xmm8,%xmm0,%xmm9 + vpmuludq 224(%rsp),%xmm0,%xmm10 + vpmuludq 304(%rsp),%xmm0,%xmm11 + vpmuludq 288(%rsp),%xmm0,%xmm12 + vpmuludq 336(%rsp),%xmm0,%xmm13 + vmovdqa 384(%rsp),%xmm14 + vpmuludq %xmm14,%xmm0,%xmm0 + vpmuludq curve25519_sandy2x_v38_38(%rip),%xmm14,%xmm15 + vpmuludq %xmm15,%xmm14,%xmm14 + vpaddq %xmm14,%xmm13,%xmm13 + vpaddq %xmm6,%xmm6,%xmm14 + vpmuludq %xmm14,%xmm6,%xmm6 + vpaddq %xmm6,%xmm11,%xmm11 + vpaddq %xmm2,%xmm2,%xmm6 + vpmuludq %xmm6,%xmm2,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpmuludq %xmm15,%xmm6,%xmm2 + vpaddq %xmm2,%xmm1,%xmm1 + vpmuludq %xmm15,%xmm4,%xmm2 + vpaddq %xmm2,%xmm3,%xmm3 + vpmuludq 256(%rsp),%xmm6,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq 304(%rsp),%xmm6,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpmuludq 352(%rsp),%xmm6,%xmm2 + vpaddq %xmm2,%xmm13,%xmm13 + vpmuludq 336(%rsp),%xmm6,%xmm2 + vpaddq %xmm2,%xmm0,%xmm0 + vpmuludq %xmm4,%xmm6,%xmm2 + vpaddq %xmm2,%xmm7,%xmm7 + vpmuludq %xmm14,%xmm6,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vpmuludq %xmm8,%xmm6,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpmuludq %xmm15,%xmm14,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpmuludq %xmm15,%xmm8,%xmm2 + vpaddq %xmm2,%xmm7,%xmm7 + vpmuludq %xmm4,%xmm4,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vpmuludq %xmm14,%xmm4,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpaddq %xmm4,%xmm4,%xmm2 + vpmuludq %xmm8,%xmm2,%xmm4 + vpaddq %xmm4,%xmm11,%xmm11 + vpmuludq 400(%rsp),%xmm2,%xmm4 + vpaddq %xmm4,%xmm1,%xmm1 + vpmuludq 400(%rsp),%xmm14,%xmm4 + vpaddq %xmm4,%xmm3,%xmm3 + vpmuludq 224(%rsp),%xmm2,%xmm4 + vpaddq %xmm4,%xmm12,%xmm12 + vpmuludq 304(%rsp),%xmm2,%xmm4 + vpaddq %xmm4,%xmm13,%xmm13 + vpmuludq 288(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm0,%xmm0 + vpmuludq 368(%rsp),%xmm8,%xmm2 + vpaddq %xmm2,%xmm3,%xmm3 + vpmuludq %xmm8,%xmm14,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpmuludq %xmm8,%xmm8,%xmm2 + vpaddq %xmm2,%xmm13,%xmm13 + vpaddq %xmm8,%xmm8,%xmm2 + vpmuludq 400(%rsp),%xmm2,%xmm4 + vpaddq %xmm4,%xmm5,%xmm5 + vpmuludq 256(%rsp),%xmm15,%xmm4 + vpaddq %xmm4,%xmm9,%xmm9 + vpmuludq 304(%rsp),%xmm15,%xmm4 + vpaddq %xmm4,%xmm10,%xmm10 + vpmuludq 368(%rsp),%xmm14,%xmm4 + vpaddq %xmm4,%xmm1,%xmm1 + vmovdqa 256(%rsp),%xmm4 + vpmuludq 400(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm7,%xmm7 + vpmuludq 256(%rsp),%xmm14,%xmm4 + vpaddq %xmm4,%xmm13,%xmm13 + vpmuludq 304(%rsp),%xmm14,%xmm4 + vpaddq %xmm4,%xmm0,%xmm0 + vpmuludq 352(%rsp),%xmm15,%xmm4 + vpaddq %xmm4,%xmm11,%xmm11 + vpmuludq 336(%rsp),%xmm15,%xmm4 + vpaddq %xmm4,%xmm12,%xmm12 + vmovdqa 304(%rsp),%xmm4 + vpaddq %xmm4,%xmm4,%xmm4 + vpmuludq 400(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm9,%xmm9 + vpmuludq 320(%rsp),%xmm2,%xmm4 + vpaddq %xmm4,%xmm1,%xmm1 + vmovdqa 256(%rsp),%xmm4 + vpmuludq 320(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm3,%xmm3 + vmovdqa 256(%rsp),%xmm4 + vpmuludq 368(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm5,%xmm5 + vmovdqa 304(%rsp),%xmm4 + vpmuludq 368(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm7,%xmm7 + vmovdqa 352(%rsp),%xmm4 + vpmuludq 400(%rsp),%xmm4,%xmm4 + vpaddq %xmm4,%xmm10,%xmm10 + vpmuludq 224(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm0,%xmm0 + vmovdqa 272(%rsp),%xmm2 + vpmuludq 224(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm1,%xmm1 + vmovdqa 320(%rsp),%xmm2 + vpmuludq 304(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vmovdqa 
368(%rsp),%xmm2 + vpmuludq 288(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vmovdqa 400(%rsp),%xmm2 + vpmuludq 336(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpsrlq $26,%xmm1,%xmm2 + vpaddq %xmm2,%xmm3,%xmm3 + vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 + vpsrlq $25,%xmm10,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 + vpsrlq $25,%xmm3,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3 + vpsrlq $26,%xmm11,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 + vpsrlq $26,%xmm5,%xmm2 + vpaddq %xmm2,%xmm7,%xmm7 + vpand curve25519_sandy2x_m26(%rip),%xmm5,%xmm5 + vpsrlq $25,%xmm12,%xmm2 + vpaddq %xmm2,%xmm13,%xmm13 + vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12 + vpsrlq $25,%xmm7,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7 + vpsrlq $26,%xmm13,%xmm2 + vpaddq %xmm2,%xmm0,%xmm0 + vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13 + vpsrlq $26,%xmm9,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9 + vpsrlq $25,%xmm0,%xmm2 + vpsllq $4,%xmm2,%xmm4 + vpaddq %xmm2,%xmm1,%xmm1 + vpsllq $1,%xmm2,%xmm2 + vpaddq %xmm2,%xmm4,%xmm4 + vpaddq %xmm4,%xmm1,%xmm1 + vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0 + vpsrlq $25,%xmm10,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 + vpsrlq $26,%xmm1,%xmm2 + vpaddq %xmm2,%xmm3,%xmm3 + vpand curve25519_sandy2x_m26(%rip),%xmm1,%xmm1 + vpunpckhqdq %xmm3,%xmm1,%xmm2 + vpunpcklqdq %xmm3,%xmm1,%xmm1 + vmovdqa %xmm1,176(%rsp) + vpaddq curve25519_sandy2x_subc0(%rip),%xmm2,%xmm3 + vpsubq %xmm1,%xmm3,%xmm3 + vpunpckhqdq %xmm3,%xmm2,%xmm1 + vpunpcklqdq %xmm3,%xmm2,%xmm2 + vmovdqa %xmm2,192(%rsp) + vmovdqa %xmm1,224(%rsp) + vpsllq $1,%xmm1,%xmm1 + vmovdqa %xmm1,240(%rsp) + vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm3,%xmm3 + vmovdqa 80(%rsp),%xmm1 + vpunpcklqdq %xmm1,%xmm3,%xmm2 + vpunpckhqdq %xmm1,%xmm3,%xmm1 + vpunpckhqdq %xmm7,%xmm5,%xmm3 + vpunpcklqdq %xmm7,%xmm5,%xmm4 + vmovdqa %xmm4,256(%rsp) + vpaddq curve25519_sandy2x_subc2(%rip),%xmm3,%xmm5 + vpsubq %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm5,%xmm3,%xmm4 + vpunpcklqdq %xmm5,%xmm3,%xmm3 + vmovdqa %xmm3,272(%rsp) + vmovdqa %xmm4,288(%rsp) + vpsllq $1,%xmm4,%xmm4 + vmovdqa %xmm4,304(%rsp) + vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm5,%xmm5 + vmovdqa 96(%rsp),%xmm3 + vpunpcklqdq %xmm3,%xmm5,%xmm4 + vpunpckhqdq %xmm3,%xmm5,%xmm3 + vpunpckhqdq %xmm10,%xmm9,%xmm5 + vpunpcklqdq %xmm10,%xmm9,%xmm6 + vmovdqa %xmm6,320(%rsp) + vpaddq curve25519_sandy2x_subc2(%rip),%xmm5,%xmm7 + vpsubq %xmm6,%xmm7,%xmm7 + vpunpckhqdq %xmm7,%xmm5,%xmm6 + vpunpcklqdq %xmm7,%xmm5,%xmm5 + vmovdqa %xmm5,336(%rsp) + vmovdqa %xmm6,352(%rsp) + vpsllq $1,%xmm6,%xmm6 + vmovdqa %xmm6,368(%rsp) + vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm7,%xmm7 + vmovdqa 112(%rsp),%xmm5 + vpunpcklqdq %xmm5,%xmm7,%xmm6 + vpunpckhqdq %xmm5,%xmm7,%xmm5 + vpunpckhqdq %xmm12,%xmm11,%xmm7 + vpunpcklqdq %xmm12,%xmm11,%xmm8 + vmovdqa %xmm8,384(%rsp) + vpaddq curve25519_sandy2x_subc2(%rip),%xmm7,%xmm9 + vpsubq %xmm8,%xmm9,%xmm9 + vpunpckhqdq %xmm9,%xmm7,%xmm8 + vpunpcklqdq %xmm9,%xmm7,%xmm7 + vmovdqa %xmm7,400(%rsp) + vmovdqa %xmm8,416(%rsp) + vpsllq $1,%xmm8,%xmm8 + vmovdqa %xmm8,432(%rsp) + vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm9,%xmm9 + vmovdqa 160(%rsp),%xmm7 + vpunpcklqdq %xmm7,%xmm9,%xmm8 + vpunpckhqdq %xmm7,%xmm9,%xmm7 + vpunpckhqdq %xmm0,%xmm13,%xmm9 + vpunpcklqdq %xmm0,%xmm13,%xmm0 + vmovdqa %xmm0,160(%rsp) + 
vpaddq curve25519_sandy2x_subc2(%rip),%xmm9,%xmm10 + vpsubq %xmm0,%xmm10,%xmm10 + vpunpckhqdq %xmm10,%xmm9,%xmm0 + vpunpcklqdq %xmm10,%xmm9,%xmm9 + vmovdqa %xmm9,448(%rsp) + vmovdqa %xmm0,464(%rsp) + vpsllq $1,%xmm0,%xmm0 + vmovdqa %xmm0,480(%rsp) + vpmuludq curve25519_sandy2x_v121666_121666(%rip),%xmm10,%xmm10 + vmovdqa 208(%rsp),%xmm0 + vpunpcklqdq %xmm0,%xmm10,%xmm9 + vpunpckhqdq %xmm0,%xmm10,%xmm0 + vpsrlq $26,%xmm2,%xmm10 + vpaddq %xmm10,%xmm1,%xmm1 + vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2 + vpsrlq $25,%xmm5,%xmm10 + vpaddq %xmm10,%xmm8,%xmm8 + vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 + vpsrlq $25,%xmm1,%xmm10 + vpaddq %xmm10,%xmm4,%xmm4 + vpand curve25519_sandy2x_m25(%rip),%xmm1,%xmm1 + vpsrlq $26,%xmm8,%xmm10 + vpaddq %xmm10,%xmm7,%xmm7 + vpand curve25519_sandy2x_m26(%rip),%xmm8,%xmm8 + vpsrlq $26,%xmm4,%xmm10 + vpaddq %xmm10,%xmm3,%xmm3 + vpand curve25519_sandy2x_m26(%rip),%xmm4,%xmm4 + vpsrlq $25,%xmm7,%xmm10 + vpaddq %xmm10,%xmm9,%xmm9 + vpand curve25519_sandy2x_m25(%rip),%xmm7,%xmm7 + vpsrlq $25,%xmm3,%xmm10 + vpaddq %xmm10,%xmm6,%xmm6 + vpand curve25519_sandy2x_m25(%rip),%xmm3,%xmm3 + vpsrlq $26,%xmm9,%xmm10 + vpaddq %xmm10,%xmm0,%xmm0 + vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9 + vpsrlq $26,%xmm6,%xmm10 + vpaddq %xmm10,%xmm5,%xmm5 + vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 + vpsrlq $25,%xmm0,%xmm10 + vpsllq $4,%xmm10,%xmm11 + vpaddq %xmm10,%xmm2,%xmm2 + vpsllq $1,%xmm10,%xmm10 + vpaddq %xmm10,%xmm11,%xmm11 + vpaddq %xmm11,%xmm2,%xmm2 + vpand curve25519_sandy2x_m25(%rip),%xmm0,%xmm0 + vpsrlq $25,%xmm5,%xmm10 + vpaddq %xmm10,%xmm8,%xmm8 + vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 + vpsrlq $26,%xmm2,%xmm10 + vpaddq %xmm10,%xmm1,%xmm1 + vpand curve25519_sandy2x_m26(%rip),%xmm2,%xmm2 + vpunpckhqdq %xmm1,%xmm2,%xmm10 + vmovdqa %xmm10,80(%rsp) + vpunpcklqdq %xmm1,%xmm2,%xmm1 + vpunpckhqdq %xmm3,%xmm4,%xmm2 + vmovdqa %xmm2,96(%rsp) + vpunpcklqdq %xmm3,%xmm4,%xmm2 + vpunpckhqdq %xmm5,%xmm6,%xmm3 + vmovdqa %xmm3,112(%rsp) + vpunpcklqdq %xmm5,%xmm6,%xmm3 + vpunpckhqdq %xmm7,%xmm8,%xmm4 + vmovdqa %xmm4,128(%rsp) + vpunpcklqdq %xmm7,%xmm8,%xmm4 + vpunpckhqdq %xmm0,%xmm9,%xmm5 + vmovdqa %xmm5,144(%rsp) + vpunpcklqdq %xmm0,%xmm9,%xmm0 + vmovdqa 176(%rsp),%xmm5 + vpaddq %xmm5,%xmm1,%xmm1 + vpunpcklqdq %xmm1,%xmm5,%xmm6 + vpunpckhqdq %xmm1,%xmm5,%xmm1 + vpmuludq 224(%rsp),%xmm6,%xmm5 + vpmuludq 192(%rsp),%xmm1,%xmm7 + vpaddq %xmm7,%xmm5,%xmm5 + vpmuludq 272(%rsp),%xmm6,%xmm7 + vpmuludq 240(%rsp),%xmm1,%xmm8 + vpaddq %xmm8,%xmm7,%xmm7 + vpmuludq 288(%rsp),%xmm6,%xmm8 + vpmuludq 272(%rsp),%xmm1,%xmm9 + vpaddq %xmm9,%xmm8,%xmm8 + vpmuludq 336(%rsp),%xmm6,%xmm9 + vpmuludq 304(%rsp),%xmm1,%xmm10 + vpaddq %xmm10,%xmm9,%xmm9 + vpmuludq 352(%rsp),%xmm6,%xmm10 + vpmuludq 336(%rsp),%xmm1,%xmm11 + vpaddq %xmm11,%xmm10,%xmm10 + vpmuludq 400(%rsp),%xmm6,%xmm11 + vpmuludq 368(%rsp),%xmm1,%xmm12 + vpaddq %xmm12,%xmm11,%xmm11 + vpmuludq 416(%rsp),%xmm6,%xmm12 + vpmuludq 400(%rsp),%xmm1,%xmm13 + vpaddq %xmm13,%xmm12,%xmm12 + vpmuludq 448(%rsp),%xmm6,%xmm13 + vpmuludq 432(%rsp),%xmm1,%xmm14 + vpaddq %xmm14,%xmm13,%xmm13 + vpmuludq 464(%rsp),%xmm6,%xmm14 + vpmuludq 448(%rsp),%xmm1,%xmm15 + vpaddq %xmm15,%xmm14,%xmm14 + vpmuludq 192(%rsp),%xmm6,%xmm6 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 + vpmuludq 480(%rsp),%xmm1,%xmm1 + vpaddq %xmm1,%xmm6,%xmm6 + vmovdqa 256(%rsp),%xmm1 + vpaddq %xmm1,%xmm2,%xmm2 + vpunpcklqdq %xmm2,%xmm1,%xmm15 + vpunpckhqdq %xmm2,%xmm1,%xmm1 + vpmuludq 192(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm7,%xmm7 + vpmuludq 224(%rsp),%xmm15,%xmm2 + vpaddq 
%xmm2,%xmm8,%xmm8 + vpmuludq 272(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vpmuludq 288(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpmuludq 336(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq 352(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpmuludq 400(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm13,%xmm13 + vpmuludq 416(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm15,%xmm15 + vpmuludq 448(%rsp),%xmm15,%xmm2 + vpaddq %xmm2,%xmm6,%xmm6 + vpmuludq 464(%rsp),%xmm15,%xmm15 + vpaddq %xmm15,%xmm5,%xmm5 + vpmuludq 192(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm8,%xmm8 + vpmuludq 240(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vpmuludq 272(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpmuludq 304(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq 336(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpmuludq 368(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm13,%xmm13 + vpmuludq 400(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 + vpmuludq 432(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm6,%xmm6 + vpmuludq 448(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpmuludq 480(%rsp),%xmm1,%xmm1 + vpaddq %xmm1,%xmm7,%xmm7 + vmovdqa 320(%rsp),%xmm1 + vpaddq %xmm1,%xmm3,%xmm3 + vpunpcklqdq %xmm3,%xmm1,%xmm2 + vpunpckhqdq %xmm3,%xmm1,%xmm1 + vpmuludq 192(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm9,%xmm9 + vpmuludq 224(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm10,%xmm10 + vpmuludq 272(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 288(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm12,%xmm12 + vpmuludq 336(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpmuludq 352(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2 + vpmuludq 400(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpmuludq 416(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpmuludq 448(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq 464(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm8,%xmm8 + vpmuludq 192(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpmuludq 240(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq 272(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpmuludq 304(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm13,%xmm13 + vpmuludq 336(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 + vpmuludq 368(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm6,%xmm6 + vpmuludq 400(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpmuludq 432(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm7,%xmm7 + vpmuludq 448(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm8,%xmm8 + vpmuludq 480(%rsp),%xmm1,%xmm1 + vpaddq %xmm1,%xmm9,%xmm9 + vmovdqa 384(%rsp),%xmm1 + vpaddq %xmm1,%xmm4,%xmm4 + vpunpcklqdq %xmm4,%xmm1,%xmm2 + vpunpckhqdq %xmm4,%xmm1,%xmm1 + vpmuludq 192(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm11,%xmm11 + vpmuludq 224(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm12,%xmm12 + vpmuludq 272(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpmuludq 288(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2 + vpmuludq 336(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm6,%xmm6 + vpmuludq 352(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm5,%xmm5 + vpmuludq 400(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm7,%xmm7 + vpmuludq 416(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm8,%xmm8 + vpmuludq 448(%rsp),%xmm2,%xmm3 + vpaddq %xmm3,%xmm9,%xmm9 + vpmuludq 464(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpmuludq 192(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpmuludq 
240(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm13,%xmm13 + vpmuludq 272(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm1,%xmm1 + vpmuludq 304(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm6,%xmm6 + vpmuludq 336(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm5,%xmm5 + vpmuludq 368(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm7,%xmm7 + vpmuludq 400(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm8,%xmm8 + vpmuludq 432(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm9,%xmm9 + vpmuludq 448(%rsp),%xmm1,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + vpmuludq 480(%rsp),%xmm1,%xmm1 + vpaddq %xmm1,%xmm11,%xmm11 + vmovdqa 160(%rsp),%xmm1 + vpaddq %xmm1,%xmm0,%xmm0 + vpunpcklqdq %xmm0,%xmm1,%xmm2 + vpunpckhqdq %xmm0,%xmm1,%xmm0 + vpmuludq 192(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm13,%xmm13 + vpmuludq 224(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm2,%xmm2 + vpmuludq 272(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm6,%xmm6 + vpmuludq 288(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm5,%xmm5 + vpmuludq 336(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm7,%xmm7 + vpmuludq 352(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm8,%xmm8 + vpmuludq 400(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm9,%xmm9 + vpmuludq 416(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm10,%xmm10 + vpmuludq 448(%rsp),%xmm2,%xmm1 + vpaddq %xmm1,%xmm11,%xmm11 + vpmuludq 464(%rsp),%xmm2,%xmm2 + vpaddq %xmm2,%xmm12,%xmm12 + vpmuludq 192(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm14,%xmm14 + vpmuludq curve25519_sandy2x_v19_19(%rip),%xmm0,%xmm0 + vpmuludq 240(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm6,%xmm6 + vpmuludq 272(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm5,%xmm5 + vpmuludq 304(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm7,%xmm7 + vpmuludq 336(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm8,%xmm8 + vpmuludq 368(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm9,%xmm9 + vpmuludq 400(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm10,%xmm10 + vpmuludq 432(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm11,%xmm11 + vpmuludq 448(%rsp),%xmm0,%xmm1 + vpaddq %xmm1,%xmm12,%xmm12 + vpmuludq 480(%rsp),%xmm0,%xmm0 + vpaddq %xmm0,%xmm13,%xmm13 + vpsrlq $26,%xmm6,%xmm0 + vpaddq %xmm0,%xmm5,%xmm5 + vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 + vpsrlq $25,%xmm10,%xmm0 + vpaddq %xmm0,%xmm11,%xmm11 + vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 + vpsrlq $25,%xmm5,%xmm0 + vpaddq %xmm0,%xmm7,%xmm7 + vpand curve25519_sandy2x_m25(%rip),%xmm5,%xmm5 + vpsrlq $26,%xmm11,%xmm0 + vpaddq %xmm0,%xmm12,%xmm12 + vpand curve25519_sandy2x_m26(%rip),%xmm11,%xmm11 + vpsrlq $26,%xmm7,%xmm0 + vpaddq %xmm0,%xmm8,%xmm8 + vpand curve25519_sandy2x_m26(%rip),%xmm7,%xmm7 + vpsrlq $25,%xmm12,%xmm0 + vpaddq %xmm0,%xmm13,%xmm13 + vpand curve25519_sandy2x_m25(%rip),%xmm12,%xmm12 + vpsrlq $25,%xmm8,%xmm0 + vpaddq %xmm0,%xmm9,%xmm9 + vpand curve25519_sandy2x_m25(%rip),%xmm8,%xmm8 + vpsrlq $26,%xmm13,%xmm0 + vpaddq %xmm0,%xmm14,%xmm14 + vpand curve25519_sandy2x_m26(%rip),%xmm13,%xmm13 + vpsrlq $26,%xmm9,%xmm0 + vpaddq %xmm0,%xmm10,%xmm10 + vpand curve25519_sandy2x_m26(%rip),%xmm9,%xmm9 + vpsrlq $25,%xmm14,%xmm0 + vpsllq $4,%xmm0,%xmm1 + vpaddq %xmm0,%xmm6,%xmm6 + vpsllq $1,%xmm0,%xmm0 + vpaddq %xmm0,%xmm1,%xmm1 + vpaddq %xmm1,%xmm6,%xmm6 + vpand curve25519_sandy2x_m25(%rip),%xmm14,%xmm14 + vpsrlq $25,%xmm10,%xmm0 + vpaddq %xmm0,%xmm11,%xmm11 + vpand curve25519_sandy2x_m25(%rip),%xmm10,%xmm10 + vpsrlq $26,%xmm6,%xmm0 + vpaddq %xmm0,%xmm5,%xmm5 + vpand curve25519_sandy2x_m26(%rip),%xmm6,%xmm6 + vpunpckhqdq %xmm5,%xmm6,%xmm1 + vpunpcklqdq %xmm5,%xmm6,%xmm0 + vpunpckhqdq %xmm8,%xmm7,%xmm3 + vpunpcklqdq %xmm8,%xmm7,%xmm2 + vpunpckhqdq %xmm10,%xmm9,%xmm5 + 
vpunpcklqdq %xmm10,%xmm9,%xmm4 + vpunpckhqdq %xmm12,%xmm11,%xmm7 + vpunpcklqdq %xmm12,%xmm11,%xmm6 + vpunpckhqdq %xmm14,%xmm13,%xmm9 + vpunpcklqdq %xmm14,%xmm13,%xmm8 + cmp $0,%rdx + jne .Lladder_base_loop + vmovdqu %xmm1,80(%rdi) + vmovdqu %xmm0,0(%rdi) + vmovdqu %xmm3,96(%rdi) + vmovdqu %xmm2,16(%rdi) + vmovdqu %xmm5,112(%rdi) + vmovdqu %xmm4,32(%rdi) + vmovdqu %xmm7,128(%rdi) + vmovdqu %xmm6,48(%rdi) + vmovdqu %xmm9,144(%rdi) + vmovdqu %xmm8,64(%rdi) + movq 1536(%rsp),%r11 + movq 1544(%rsp),%r12 + movq 1552(%rsp),%r13 + leave + ret +ENDPROC(curve25519_sandy2x_ladder_base) +#endif /* CONFIG_AS_AVX */ diff --git a/curve25519-sandy2x.c b/curve25519-sandy2x.c new file mode 100644 index 0000000..e8d5d2b --- /dev/null +++ b/curve25519-sandy2x.c @@ -0,0 +1,139 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. + * + * Original author: Tung Chou + */ + +#include +#include + +enum { CURVE25519_POINT_SIZE = 32 }; + +static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE]) +{ + secret[0] &= 248; + secret[31] &= 127; + secret[31] |= 64; +} + +typedef u64 fe[10]; +typedef u64 fe51[5]; +asmlinkage void curve25519_sandy2x_ladder(fe *, const u8 *); +asmlinkage void curve25519_sandy2x_ladder_base(fe *, const u8 *); +asmlinkage void curve25519_sandy2x_fe51_pack(u8 *, const fe51 *); +asmlinkage void curve25519_sandy2x_fe51_mul(fe51 *, const fe51 *, const fe51 *); +asmlinkage void curve25519_sandy2x_fe51_nsquare(fe51 *, const fe51 *, int); + +static inline u32 le24_to_cpupv(const u8 *in) +{ + return le16_to_cpup((__le16 *)in) | ((u32)in[2]) << 16; +} + +static inline void fe_frombytes(fe h, const u8 *s) +{ + u64 h0 = le32_to_cpup((__le32 *)s); + u64 h1 = le24_to_cpupv(s + 4) << 6; + u64 h2 = le24_to_cpupv(s + 7) << 5; + u64 h3 = le24_to_cpupv(s + 10) << 3; + u64 h4 = le24_to_cpupv(s + 13) << 2; + u64 h5 = le32_to_cpup((__le32 *)(s + 16)); + u64 h6 = le24_to_cpupv(s + 20) << 7; + u64 h7 = le24_to_cpupv(s + 23) << 5; + u64 h8 = le24_to_cpupv(s + 26) << 4; + u64 h9 = (le24_to_cpupv(s + 29) & 8388607) << 2; + u64 carry0, carry1, carry2, carry3, carry4, carry5, carry6, carry7, carry8, carry9; + + carry9 = h9 >> 25; h0 += carry9 * 19; h9 &= 0x1FFFFFF; + carry1 = h1 >> 25; h2 += carry1; h1 &= 0x1FFFFFF; + carry3 = h3 >> 25; h4 += carry3; h3 &= 0x1FFFFFF; + carry5 = h5 >> 25; h6 += carry5; h5 &= 0x1FFFFFF; + carry7 = h7 >> 25; h8 += carry7; h7 &= 0x1FFFFFF; + + carry0 = h0 >> 26; h1 += carry0; h0 &= 0x3FFFFFF; + carry2 = h2 >> 26; h3 += carry2; h2 &= 0x3FFFFFF; + carry4 = h4 >> 26; h5 += carry4; h4 &= 0x3FFFFFF; + carry6 = h6 >> 26; h7 += carry6; h6 &= 0x3FFFFFF; + carry8 = h8 >> 26; h9 += carry8; h8 &= 0x3FFFFFF; + + h[0] = h0; + h[1] = h1; + h[2] = h2; + h[3] = h3; + h[4] = h4; + h[5] = h5; + h[6] = h6; + h[7] = h7; + h[8] = h8; + h[9] = h9; +} + +static inline void fe51_invert(fe51 *r, const fe51 *x) +{ + fe51 z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t; + + /* 2 */ curve25519_sandy2x_fe51_nsquare(&z2, x, 1); + /* 4 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2, 1); + /* 8 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 1); + /* 9 */ curve25519_sandy2x_fe51_mul(&z9, (const fe51 *)&t, x); + /* 11 */ curve25519_sandy2x_fe51_mul(&z11, (const fe51 *)&z9, (const fe51 *)&z2); + /* 22 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z11, 1); + /* 2^5 - 2^0 = 31 */ curve25519_sandy2x_fe51_mul(&z2_5_0, (const fe51 *)&t, (const fe51 *)&z9); + + /* 2^10 - 2^5 */ curve25519_sandy2x_fe51_nsquare(&t, 
(const fe51 *)&z2_5_0, 5); + /* 2^10 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_10_0, (const fe51 *)&t, (const fe51 *)&z2_5_0); + + /* 2^20 - 2^10 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_10_0, 10); + /* 2^20 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_20_0, (const fe51 *)&t, (const fe51 *)&z2_10_0); + + /* 2^40 - 2^20 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_20_0, 20); + /* 2^40 - 2^0 */ curve25519_sandy2x_fe51_mul(&t, (const fe51 *)&t, (const fe51 *)&z2_20_0); + + /* 2^50 - 2^10 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 10); + /* 2^50 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_50_0, (const fe51 *)&t, (const fe51 *)&z2_10_0); + + /* 2^100 - 2^50 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_50_0, 50); + /* 2^100 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_100_0, (const fe51 *)&t, (const fe51 *)&z2_50_0); + + /* 2^200 - 2^100 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_100_0, 100); + /* 2^200 - 2^0 */ curve25519_sandy2x_fe51_mul(&t, (const fe51 *)&t, (const fe51 *)&z2_100_0); + + /* 2^250 - 2^50 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 50); + /* 2^250 - 2^0 */ curve25519_sandy2x_fe51_mul(&t, (const fe51 *)&t, (const fe51 *)&z2_50_0); + + /* 2^255 - 2^5 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 5); + /* 2^255 - 21 */ curve25519_sandy2x_fe51_mul(r, (const fe51 *)t, (const fe51 *)&z11); +} + +bool curve25519_sandy2x(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]) +{ + u8 e[32]; + fe var[3]; + fe51 x_51, z_51; + + memcpy(e, secret, 32); + normalize_secret(e); +#define x1 var[0] +#define x2 var[1] +#define z2 var[2] + fe_frombytes(x1, basepoint); + curve25519_sandy2x_ladder(var, e); + z_51[0] = (z2[1] << 26) + z2[0]; + z_51[1] = (z2[3] << 26) + z2[2]; + z_51[2] = (z2[5] << 26) + z2[4]; + z_51[3] = (z2[7] << 26) + z2[6]; + z_51[4] = (z2[9] << 26) + z2[8]; + x_51[0] = (x2[1] << 26) + x2[0]; + x_51[1] = (x2[3] << 26) + x2[2]; + x_51[2] = (x2[5] << 26) + x2[4]; + x_51[3] = (x2[7] << 26) + x2[6]; + x_51[4] = (x2[9] << 26) + x2[8]; +#undef x1 +#undef x2 +#undef z2 + fe51_invert(&z_51, (const fe51 *)&z_51); + curve25519_sandy2x_fe51_mul(&x_51, (const fe51 *)&x_51, (const fe51 *)&z_51); + curve25519_sandy2x_fe51_pack(mypublic, (const fe51 *)&x_51); + + return true; +} diff --git a/curve25519-u128.c b/curve25519-u128.c deleted file mode 100644 index b51d18a..0000000 --- a/curve25519-u128.c +++ /dev/null @@ -1,420 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright (C) 2008 Google Inc. All Rights Reserved. - * Copyright (C) 2015-2018 Jason A. Donenfeld . All Rights Reserved. - * - * Original author: Adam Langley - */ - -#include -#include - -enum { CURVE25519_POINT_SIZE = 32 }; - -typedef u64 limb; -typedef limb felem[5]; -typedef __uint128_t u128; - -static __always_inline void normalize_secret(u8 secret[CURVE25519_POINT_SIZE]) -{ - secret[0] &= 248; - secret[31] &= 127; - secret[31] |= 64; -} - -/* Sum two numbers: output += in */ -static __always_inline void fsum(limb *output, const limb *in) -{ - output[0] += in[0]; - output[1] += in[1]; - output[2] += in[2]; - output[3] += in[3]; - output[4] += in[4]; -} - -/* Find the difference of two numbers: output = in - output - * (note the order of the arguments!) 
- * - * Assumes that out[i] < 2**52 - * On return, out[i] < 2**55 - */ -static __always_inline void fdifference_backwards(felem out, const felem in) -{ - /* 152 is 19 << 3 */ - static const limb two54m152 = (((limb)1) << 54) - 152; - static const limb two54m8 = (((limb)1) << 54) - 8; - - out[0] = in[0] + two54m152 - out[0]; - out[1] = in[1] + two54m8 - out[1]; - out[2] = in[2] + two54m8 - out[2]; - out[3] = in[3] + two54m8 - out[3]; - out[4] = in[4] + two54m8 - out[4]; -} - -/* Multiply a number by a scalar: output = in * scalar */ -static __always_inline void fscalar_product(felem output, const felem in, const limb scalar) -{ - u128 a; - - a = ((u128) in[0]) * scalar; - output[0] = ((limb)a) & 0x7ffffffffffffUL; - - a = ((u128) in[1]) * scalar + ((limb) (a >> 51)); - output[1] = ((limb)a) & 0x7ffffffffffffUL; - - a = ((u128) in[2]) * scalar + ((limb) (a >> 51)); - output[2] = ((limb)a) & 0x7ffffffffffffUL; - - a = ((u128) in[3]) * scalar + ((limb) (a >> 51)); - output[3] = ((limb)a) & 0x7ffffffffffffUL; - - a = ((u128) in[4]) * scalar + ((limb) (a >> 51)); - output[4] = ((limb)a) & 0x7ffffffffffffUL; - - output[0] += (a >> 51) * 19; -} - -/* Multiply two numbers: output = in2 * in - * - * output must be distinct to both inputs. The inputs are reduced coefficient - * form, the output is not. - * - * Assumes that in[i] < 2**55 and likewise for in2. - * On return, output[i] < 2**52 - */ -static __always_inline void fmul(felem output, const felem in2, const felem in) -{ - u128 t[5]; - limb r0, r1, r2, r3, r4, s0, s1, s2, s3, s4, c; - - r0 = in[0]; - r1 = in[1]; - r2 = in[2]; - r3 = in[3]; - r4 = in[4]; - - s0 = in2[0]; - s1 = in2[1]; - s2 = in2[2]; - s3 = in2[3]; - s4 = in2[4]; - - t[0] = ((u128) r0) * s0; - t[1] = ((u128) r0) * s1 + ((u128) r1) * s0; - t[2] = ((u128) r0) * s2 + ((u128) r2) * s0 + ((u128) r1) * s1; - t[3] = ((u128) r0) * s3 + ((u128) r3) * s0 + ((u128) r1) * s2 + ((u128) r2) * s1; - t[4] = ((u128) r0) * s4 + ((u128) r4) * s0 + ((u128) r3) * s1 + ((u128) r1) * s3 + ((u128) r2) * s2; - - r4 *= 19; - r1 *= 19; - r2 *= 19; - r3 *= 19; - - t[0] += ((u128) r4) * s1 + ((u128) r1) * s4 + ((u128) r2) * s3 + ((u128) r3) * s2; - t[1] += ((u128) r4) * s2 + ((u128) r2) * s4 + ((u128) r3) * s3; - t[2] += ((u128) r4) * s3 + ((u128) r3) * s4; - t[3] += ((u128) r4) * s4; - - r0 = (limb)t[0] & 0x7ffffffffffffUL; c = (limb)(t[0] >> 51); - t[1] += c; r1 = (limb)t[1] & 0x7ffffffffffffUL; c = (limb)(t[1] >> 51); - t[2] += c; r2 = (limb)t[2] & 0x7ffffffffffffUL; c = (limb)(t[2] >> 51); - t[3] += c; r3 = (limb)t[3] & 0x7ffffffffffffUL; c = (limb)(t[3] >> 51); - t[4] += c; r4 = (limb)t[4] & 0x7ffffffffffffUL; c = (limb)(t[4] >> 51); - r0 += c * 19; c = r0 >> 51; r0 = r0 & 0x7ffffffffffffUL; - r1 += c; c = r1 >> 51; r1 = r1 & 0x7ffffffffffffUL; - r2 += c; - - output[0] = r0; - output[1] = r1; - output[2] = r2; - output[3] = r3; - output[4] = r4; -} - -static __always_inline void fsquare_times(felem output, const felem in, limb count) -{ - u128 t[5]; - limb r0, r1, r2, r3, r4, c; - limb d0, d1, d2, d4, d419; - - r0 = in[0]; - r1 = in[1]; - r2 = in[2]; - r3 = in[3]; - r4 = in[4]; - - do { - d0 = r0 * 2; - d1 = r1 * 2; - d2 = r2 * 2 * 19; - d419 = r4 * 19; - d4 = d419 * 2; - - t[0] = ((u128) r0) * r0 + ((u128) d4) * r1 + (((u128) d2) * (r3 )); - t[1] = ((u128) d0) * r1 + ((u128) d4) * r2 + (((u128) r3) * (r3 * 19)); - t[2] = ((u128) d0) * r2 + ((u128) r1) * r1 + (((u128) d4) * (r3 )); - t[3] = ((u128) d0) * r3 + ((u128) d1) * r2 + (((u128) r4) * (d419 )); - t[4] = ((u128) d0) * r4 + ((u128) d1) * r3 + 
(((u128) r2) * (r2 )); - - r0 = (limb)t[0] & 0x7ffffffffffffUL; c = (limb)(t[0] >> 51); - t[1] += c; r1 = (limb)t[1] & 0x7ffffffffffffUL; c = (limb)(t[1] >> 51); - t[2] += c; r2 = (limb)t[2] & 0x7ffffffffffffUL; c = (limb)(t[2] >> 51); - t[3] += c; r3 = (limb)t[3] & 0x7ffffffffffffUL; c = (limb)(t[3] >> 51); - t[4] += c; r4 = (limb)t[4] & 0x7ffffffffffffUL; c = (limb)(t[4] >> 51); - r0 += c * 19; c = r0 >> 51; r0 = r0 & 0x7ffffffffffffUL; - r1 += c; c = r1 >> 51; r1 = r1 & 0x7ffffffffffffUL; - r2 += c; - } while (--count); - - output[0] = r0; - output[1] = r1; - output[2] = r2; - output[3] = r3; - output[4] = r4; -} - -/* Load a little-endian 64-bit number */ -static inline limb load_limb(const u8 *in) -{ - return le64_to_cpu(*(__le64 *)in); -} - -static inline void store_limb(u8 *out, limb in) -{ - *(__le64 *)out = cpu_to_le64(in); -} - -/* Take a little-endian, 32-byte number and expand it into polynomial form */ -static inline void fexpand(limb *output, const u8 *in) -{ - output[0] = load_limb(in) & 0x7ffffffffffffUL; - output[1] = (load_limb(in + 6) >> 3) & 0x7ffffffffffffUL; - output[2] = (load_limb(in + 12) >> 6) & 0x7ffffffffffffUL; - output[3] = (load_limb(in + 19) >> 1) & 0x7ffffffffffffUL; - output[4] = (load_limb(in + 24) >> 12) & 0x7ffffffffffffUL; -} - -/* Take a fully reduced polynomial form number and contract it into a - * little-endian, 32-byte array - */ -static void fcontract(u8 *output, const felem input) -{ - u128 t[5]; - - t[0] = input[0]; - t[1] = input[1]; - t[2] = input[2]; - t[3] = input[3]; - t[4] = input[4]; - - t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffffUL; - t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffffUL; - t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffffUL; - t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffffUL; - t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffffUL; - - t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffffUL; - t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffffUL; - t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffffUL; - t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffffUL; - t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffffUL; - - /* now t is between 0 and 2^255-1, properly carried. */ - /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */ - - t[0] += 19; - - t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffffUL; - t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffffUL; - t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffffUL; - t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffffUL; - t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffffUL; - - /* now between 19 and 2^255-1 in both cases, and offset by 19. */ - - t[0] += 0x8000000000000UL - 19; - t[1] += 0x8000000000000UL - 1; - t[2] += 0x8000000000000UL - 1; - t[3] += 0x8000000000000UL - 1; - t[4] += 0x8000000000000UL - 1; - - /* now between 2^255 and 2^256-20, and offset by 2^255. 
*/ - - t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffffUL; - t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffffUL; - t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffffUL; - t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffffUL; - t[4] &= 0x7ffffffffffffUL; - - store_limb(output, t[0] | (t[1] << 51)); - store_limb(output+8, (t[1] >> 13) | (t[2] << 38)); - store_limb(output+16, (t[2] >> 26) | (t[3] << 25)); - store_limb(output+24, (t[3] >> 39) | (t[4] << 12)); -} - -/* Input: Q, Q', Q-Q' - * Output: 2Q, Q+Q' - * - * x2 z3: long form - * x3 z3: long form - * x z: short form, destroyed - * xprime zprime: short form, destroyed - * qmqp: short form, preserved - */ -static void fmonty(limb *x2, limb *z2, /* output 2Q */ - limb *x3, limb *z3, /* output Q + Q' */ - limb *x, limb *z, /* input Q */ - limb *xprime, limb *zprime, /* input Q' */ - - const limb *qmqp /* input Q - Q' */) -{ - limb origx[5], origxprime[5], zzz[5], xx[5], zz[5], xxprime[5], zzprime[5], zzzprime[5]; - - memcpy(origx, x, 5 * sizeof(limb)); - fsum(x, z); - fdifference_backwards(z, origx); // does x - z - - memcpy(origxprime, xprime, sizeof(limb) * 5); - fsum(xprime, zprime); - fdifference_backwards(zprime, origxprime); - fmul(xxprime, xprime, z); - fmul(zzprime, x, zprime); - memcpy(origxprime, xxprime, sizeof(limb) * 5); - fsum(xxprime, zzprime); - fdifference_backwards(zzprime, origxprime); - fsquare_times(x3, xxprime, 1); - fsquare_times(zzzprime, zzprime, 1); - fmul(z3, zzzprime, qmqp); - - fsquare_times(xx, x, 1); - fsquare_times(zz, z, 1); - fmul(x2, xx, zz); - fdifference_backwards(zz, xx); // does zz = xx - zz - fscalar_product(zzz, zz, 121665); - fsum(zzz, xx); - fmul(z2, zz, zzz); -} - -/* Maybe swap the contents of two limb arrays (@a and @b), each @len elements - * long. Perform the swap iff @swap is non-zero. - * - * This function performs the swap without leaking any side-channel - * information. 
- */ -static void swap_conditional(limb a[5], limb b[5], limb iswap) -{ - unsigned int i; - const limb swap = -iswap; - - for (i = 0; i < 5; ++i) { - const limb x = swap & (a[i] ^ b[i]); - - a[i] ^= x; - b[i] ^= x; - } -} - -/* Calculates nQ where Q is the x-coordinate of a point on the curve - * - * resultx/resultz: the x coordinate of the resulting curve point (short form) - * n: a little endian, 32-byte number - * q: a point of the curve (short form) - */ -static void cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) -{ - limb a[5] = {0}, b[5] = {1}, c[5] = {1}, d[5] = {0}; - limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; - limb e[5] = {0}, f[5] = {1}, g[5] = {0}, h[5] = {1}; - limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h; - - unsigned int i, j; - - memcpy(nqpqx, q, sizeof(limb) * 5); - - for (i = 0; i < 32; ++i) { - u8 byte = n[31 - i]; - - for (j = 0; j < 8; ++j) { - const limb bit = byte >> 7; - - swap_conditional(nqx, nqpqx, bit); - swap_conditional(nqz, nqpqz, bit); - fmonty(nqx2, nqz2, - nqpqx2, nqpqz2, - nqx, nqz, - nqpqx, nqpqz, - q); - swap_conditional(nqx2, nqpqx2, bit); - swap_conditional(nqz2, nqpqz2, bit); - - t = nqx; - nqx = nqx2; - nqx2 = t; - t = nqz; - nqz = nqz2; - nqz2 = t; - t = nqpqx; - nqpqx = nqpqx2; - nqpqx2 = t; - t = nqpqz; - nqpqz = nqpqz2; - nqpqz2 = t; - - byte <<= 1; - } - } - - memcpy(resultx, nqx, sizeof(limb) * 5); - memcpy(resultz, nqz, sizeof(limb) * 5); -} - -static void crecip(felem out, const felem z) -{ - felem a, t0, b, c; - - /* 2 */ fsquare_times(a, z, 1); // a = 2 - /* 8 */ fsquare_times(t0, a, 2); - /* 9 */ fmul(b, t0, z); // b = 9 - /* 11 */ fmul(a, b, a); // a = 11 - /* 22 */ fsquare_times(t0, a, 1); - /* 2^5 - 2^0 = 31 */ fmul(b, t0, b); - /* 2^10 - 2^5 */ fsquare_times(t0, b, 5); - /* 2^10 - 2^0 */ fmul(b, t0, b); - /* 2^20 - 2^10 */ fsquare_times(t0, b, 10); - /* 2^20 - 2^0 */ fmul(c, t0, b); - /* 2^40 - 2^20 */ fsquare_times(t0, c, 20); - /* 2^40 - 2^0 */ fmul(t0, t0, c); - /* 2^50 - 2^10 */ fsquare_times(t0, t0, 10); - /* 2^50 - 2^0 */ fmul(b, t0, b); - /* 2^100 - 2^50 */ fsquare_times(t0, b, 50); - /* 2^100 - 2^0 */ fmul(c, t0, b); - /* 2^200 - 2^100 */ fsquare_times(t0, c, 100); - /* 2^200 - 2^0 */ fmul(t0, t0, c); - /* 2^250 - 2^50 */ fsquare_times(t0, t0, 50); - /* 2^250 - 2^0 */ fmul(t0, t0, b); - /* 2^255 - 2^5 */ fsquare_times(t0, t0, 5); - /* 2^255 - 21 */ fmul(out, t0, a); -} - -bool curve25519(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]) -{ - limb bp[5], x[5], z[5], zmone[5]; - u8 e[32]; - - memcpy(e, secret, 32); - normalize_secret(e); - - fexpand(bp, basepoint); - cmult(x, z, e, bp); - crecip(zmone, z); - fmul(z, x, zmone); - fcontract(mypublic, z); - - memzero_explicit(e, sizeof(e)); - memzero_explicit(bp, sizeof(bp)); - memzero_explicit(x, sizeof(x)); - memzero_explicit(z, sizeof(z)); - memzero_explicit(zmone, sizeof(zmone)); - - return true; -} diff --git a/function.h b/function.h deleted file mode 100644 index 6f360e9..0000000 --- a/function.h +++ /dev/null @@ -1,68 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Copyright (C) 2018 Jason A. Donenfeld . All Rights Reserved. 
- */ - -enum { WARMUP = 5000, TRIALS = 10000, IDLE = 1 * 1000 }; - -enum { CURVE25519_POINT_SIZE = 32 }; -bool curve25519(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]); - -struct curve25519_test_vector { - u8 private[CURVE25519_POINT_SIZE]; - u8 public[CURVE25519_POINT_SIZE]; - u8 result[CURVE25519_POINT_SIZE]; - bool valid; -}; - -static const struct curve25519_test_vector curve25519_test_vectors[] __initconst = { - { - .private = { 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d, 0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45, 0xdf, 0x4c, 0x2f, 0x87, 0xeb, 0xc0, 0x99, 0x2a, 0xb1, 0x77, 0xfb, 0xa5, 0x1d, 0xb9, 0x2c, 0x2a }, - .public = { 0xde, 0x9e, 0xdb, 0x7d, 0x7b, 0x7d, 0xc1, 0xb4, 0xd3, 0x5b, 0x61, 0xc2, 0xec, 0xe4, 0x35, 0x37, 0x3f, 0x83, 0x43, 0xc8, 0x5b, 0x78, 0x67, 0x4d, 0xad, 0xfc, 0x7e, 0x14, 0x6f, 0x88, 0x2b, 0x4f }, - .result = { 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 }, - .valid = true - }, - { - .private = { 0x5d, 0xab, 0x08, 0x7e, 0x62, 0x4a, 0x8a, 0x4b, 0x79, 0xe1, 0x7f, 0x8b, 0x83, 0x80, 0x0e, 0xe6, 0x6f, 0x3b, 0xb1, 0x29, 0x26, 0x18, 0xb6, 0xfd, 0x1c, 0x2f, 0x8b, 0x27, 0xff, 0x88, 0xe0, 0xeb }, - .public = { 0x85, 0x20, 0xf0, 0x09, 0x89, 0x30, 0xa7, 0x54, 0x74, 0x8b, 0x7d, 0xdc, 0xb4, 0x3e, 0xf7, 0x5a, 0x0d, 0xbf, 0x3a, 0x0d, 0x26, 0x38, 0x1a, 0xf4, 0xeb, 0xa4, 0xa9, 0x8e, 0xaa, 0x9b, 0x4e, 0x6a }, - .result = { 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 }, - .valid = true - }, - { - .private = { 1 }, - .public = { 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, - .result = { 0x3c, 0x77, 0x77, 0xca, 0xf9, 0x97, 0xb2, 0x64, 0x41, 0x60, 0x77, 0x66, 0x5b, 0x4e, 0x22, 0x9d, 0xb, 0x95, 0x48, 0xdc, 0xc, 0xd8, 0x19, 0x98, 0xdd, 0xcd, 0xc5, 0xc8, 0x53, 0x3c, 0x79, 0x7f }, - .valid = true - }, - { - .private = { 1 }, - .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, - .result = { 0xb3, 0x2d, 0x13, 0x62, 0xc2, 0x48, 0xd6, 0x2f, 0xe6, 0x26, 0x19, 0xcf, 0xf0, 0x4d, 0xd4, 0x3d, 0xb7, 0x3f, 0xfc, 0x1b, 0x63, 0x8, 0xed, 0xe3, 0xb, 0x78, 0xd8, 0x73, 0x80, 0xf1, 0xe8, 0x34 }, - .valid = true - }, - { - .private = { 0xa5, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d, 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd, 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18, 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0xc4 }, - .public = { 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb, 0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c, 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b, 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c }, - .result = { 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90, 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f, 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7, 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 }, - .valid = true - }, - { - .private = { 1, 2, 3, 4 }, - .public = { 0 }, - .result = { 0 }, - .valid = false - }, - { - .private = { 2, 4, 6, 8 }, - .public = { 0xe0, 0xeb, 0x7a, 0x7c, 0x3b, 0x41, 0xb8, 0xae, 0x16, 
0x56, 0xe3, 0xfa, 0xf1, 0x9f, 0xc4, 0x6a, 0xda, 0x09, 0x8d, 0xeb, 0x9c, 0x32, 0xb1, 0xfd, 0x86, 0x62, 0x05, 0x16, 0x5f, 0x49, 0xb8 }, - .result = { 0 }, - .valid = false - } -}; - -u8 dummy_out[CURVE25519_POINT_SIZE]; - -static __always_inline int function(void) -{ - return curve25519(dummy_out, curve25519_test_vectors[0].private, curve25519_test_vectors[0].public); -} diff --git a/main.c b/main.c index d538f48..6584eeb 100644 --- a/main.c +++ b/main.c @@ -6,34 +6,113 @@ #include #include #include -#include "function.h" +#include +#include +#include +#include static unsigned long stamp = 0; module_param(stamp, ulong, 0); int dummy; + +enum { CURVE25519_POINT_SIZE = 32 }; +u8 dummy_out[CURVE25519_POINT_SIZE]; +#include "test_vectors.h" + +#define declare_it(name) \ +bool curve25519_ ## name(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE]); \ +static __always_inline int name(void) \ +{ \ + return curve25519_ ## name(dummy_out, curve25519_test_vectors[0].private, curve25519_test_vectors[0].public); \ +} + +#define do_it(name) do { \ + for (i = 0; i < WARMUP; ++i) \ + ret |= name(); \ + start_ ## name = get_cycles(); \ + for (i = 0; i < TRIALS; ++i) \ + ret |= name(); \ + end_ ## name = get_cycles(); \ +} while (0) + +#define test_it(name, before, after) do { \ + memset(out, __LINE__, CURVE25519_POINT_SIZE); \ + before; \ + ret = curve25519_ ## name(out, curve25519_test_vectors[i].private, curve25519_test_vectors[i].public); \ + after; \ + if (memcmp(out, curve25519_test_vectors[i].result, CURVE25519_POINT_SIZE)) { \ + pr_err(#name " self-test %zu: FAIL\n", i + 1); \ + return false; \ + } \ +} while (0) + +#define report_it(name) do { \ + pr_err("%lu: %7s: %llu cycles per call\n", stamp, #name, (end_ ## name - start_ ## name) / TRIALS); \ +} while (0) + + +declare_it(donna64) +declare_it(hacl64) +declare_it(sandy2x) +declare_it(amd64) +declare_it(fiat32) +declare_it(donna32) + +static bool verify(void) +{ + int ret; + size_t i = 0; + u8 out[CURVE25519_POINT_SIZE]; + + for (i = 0; i < ARRAY_SIZE(curve25519_test_vectors); ++i) { + test_it(donna64, {}, {}); + test_it(hacl64, {}, {}); + test_it(sandy2x, kernel_fpu_begin(), kernel_fpu_end()); + test_it(amd64, {}, {}); + test_it(fiat32, {}, {}); + test_it(donna32, {}, {}); + } + return true; +} + static int __init mod_init(void) { + enum { WARMUP = 5000, TRIALS = 10000, IDLE = 1 * 1000 }; int ret = 0, i; - cycles_t start, end; + cycles_t start_donna64, end_donna64; + cycles_t start_hacl64, end_hacl64; + cycles_t start_sandy2x, end_sandy2x; + cycles_t start_amd64, end_amd64; + cycles_t start_fiat32, end_fiat32; + cycles_t start_donna32, end_donna32; unsigned long flags; DEFINE_SPINLOCK(lock); + + if (!verify()) + return -EBFONT; msleep(IDLE); spin_lock_irqsave(&lock, flags); - - for (i = 0; i < WARMUP; ++i) - ret |= function(); - start = get_cycles(); - for (i = 0; i < TRIALS; ++i) - ret |= function(); - end = get_cycles(); + do_it(donna64); + do_it(hacl64); + kernel_fpu_begin(); + do_it(sandy2x); + kernel_fpu_end(); + do_it(amd64); + do_it(fiat32); + do_it(donna32); spin_unlock_irqrestore(&lock, flags); - pr_err("%lu: %llu cycles per call\n", stamp, (end - start) / TRIALS); + report_it(donna64); + report_it(hacl64); + report_it(sandy2x); + report_it(amd64); + report_it(fiat32); + report_it(donna32); /* Don't let compiler be too clever. 
*/ dummy = ret; diff --git a/test_vectors.h b/test_vectors.h new file mode 100644 index 0000000..91b24ee --- /dev/null +++ b/test_vectors.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Copyright (C) 2018 Jason A. Donenfeld . All Rights Reserved. + */ + +struct curve25519_test_vector { + u8 private[CURVE25519_POINT_SIZE]; + u8 public[CURVE25519_POINT_SIZE]; + u8 result[CURVE25519_POINT_SIZE]; +}; + +static const struct curve25519_test_vector curve25519_test_vectors[] __initconst = { + { + .private = { 0x77, 0x07, 0x6d, 0x0a, 0x73, 0x18, 0xa5, 0x7d, 0x3c, 0x16, 0xc1, 0x72, 0x51, 0xb2, 0x66, 0x45, 0xdf, 0x4c, 0x2f, 0x87, 0xeb, 0xc0, 0x99, 0x2a, 0xb1, 0x77, 0xfb, 0xa5, 0x1d, 0xb9, 0x2c, 0x2a }, + .public = { 0xde, 0x9e, 0xdb, 0x7d, 0x7b, 0x7d, 0xc1, 0xb4, 0xd3, 0x5b, 0x61, 0xc2, 0xec, 0xe4, 0x35, 0x37, 0x3f, 0x83, 0x43, 0xc8, 0x5b, 0x78, 0x67, 0x4d, 0xad, 0xfc, 0x7e, 0x14, 0x6f, 0x88, 0x2b, 0x4f }, + .result = { 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 } + }, + { + .private = { 0x5d, 0xab, 0x08, 0x7e, 0x62, 0x4a, 0x8a, 0x4b, 0x79, 0xe1, 0x7f, 0x8b, 0x83, 0x80, 0x0e, 0xe6, 0x6f, 0x3b, 0xb1, 0x29, 0x26, 0x18, 0xb6, 0xfd, 0x1c, 0x2f, 0x8b, 0x27, 0xff, 0x88, 0xe0, 0xeb }, + .public = { 0x85, 0x20, 0xf0, 0x09, 0x89, 0x30, 0xa7, 0x54, 0x74, 0x8b, 0x7d, 0xdc, 0xb4, 0x3e, 0xf7, 0x5a, 0x0d, 0xbf, 0x3a, 0x0d, 0x26, 0x38, 0x1a, 0xf4, 0xeb, 0xa4, 0xa9, 0x8e, 0xaa, 0x9b, 0x4e, 0x6a }, + .result = { 0x4a, 0x5d, 0x9d, 0x5b, 0xa4, 0xce, 0x2d, 0xe1, 0x72, 0x8e, 0x3b, 0xf4, 0x80, 0x35, 0x0f, 0x25, 0xe0, 0x7e, 0x21, 0xc9, 0x47, 0xd1, 0x9e, 0x33, 0x76, 0xf0, 0x9b, 0x3c, 0x1e, 0x16, 0x17, 0x42 } + }, + { + .private = { 1 }, + .public = { 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + .result = { 0x3c, 0x77, 0x77, 0xca, 0xf9, 0x97, 0xb2, 0x64, 0x41, 0x60, 0x77, 0x66, 0x5b, 0x4e, 0x22, 0x9d, 0xb, 0x95, 0x48, 0xdc, 0xc, 0xd8, 0x19, 0x98, 0xdd, 0xcd, 0xc5, 0xc8, 0x53, 0x3c, 0x79, 0x7f } + }, + { + .private = { 1 }, + .public = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, + .result = { 0xb3, 0x2d, 0x13, 0x62, 0xc2, 0x48, 0xd6, 0x2f, 0xe6, 0x26, 0x19, 0xcf, 0xf0, 0x4d, 0xd4, 0x3d, 0xb7, 0x3f, 0xfc, 0x1b, 0x63, 0x8, 0xed, 0xe3, 0xb, 0x78, 0xd8, 0x73, 0x80, 0xf1, 0xe8, 0x34 } + }, + { + .private = { 0xa5, 0x46, 0xe3, 0x6b, 0xf0, 0x52, 0x7c, 0x9d, 0x3b, 0x16, 0x15, 0x4b, 0x82, 0x46, 0x5e, 0xdd, 0x62, 0x14, 0x4c, 0x0a, 0xc1, 0xfc, 0x5a, 0x18, 0x50, 0x6a, 0x22, 0x44, 0xba, 0x44, 0x9a, 0xc4 }, + .public = { 0xe6, 0xdb, 0x68, 0x67, 0x58, 0x30, 0x30, 0xdb, 0x35, 0x94, 0xc1, 0xa4, 0x24, 0xb1, 0x5f, 0x7c, 0x72, 0x66, 0x24, 0xec, 0x26, 0xb3, 0x35, 0x3b, 0x10, 0xa9, 0x03, 0xa6, 0xd0, 0xab, 0x1c, 0x4c }, + .result = { 0xc3, 0xda, 0x55, 0x37, 0x9d, 0xe9, 0xc6, 0x90, 0x8e, 0x94, 0xea, 0x4d, 0xf2, 0x8d, 0x08, 0x4f, 0x32, 0xec, 0xcf, 0x03, 0x49, 0x1c, 0x71, 0xf7, 0x54, 0xb4, 0x07, 0x55, 0x77, 0xa2, 0x85, 0x52 } + }, + { + .private = { 1, 2, 3, 4 }, + .public = { 0 }, + .result = { 0 } + }, + { + .private = { 2, 4, 6, 8 }, + .public = { 0xe0, 0xeb, 0x7a, 0x7c, 0x3b, 0x41, 0xb8, 0xae, 0x16, 0x56, 0xe3, 0xfa, 0xf1, 0x9f, 0xc4, 0x6a, 0xda, 0x09, 0x8d, 0xeb, 0x9c, 0x32, 0xb1, 0xfd, 
0x86, 0x62, 0x05, 0x16, 0x5f, 0x49, 0xb8 }, + .result = { 0 } + } +};
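
Note on the field representation: the removed fexpand() above (and the 64-bit code that replaces it) stores a field element as five 51-bit limbs. The (byte offset, right shift) pairs it uses, (0,0), (6,3), (12,6), (19,1) and (24,12), are exactly those for which limb k begins at bit 51*k of the little-endian input, i.e. 51*k = 8*offset + shift. Below is a self-contained userspace check of that layout; it is an illustration only, not part of the patch, and it assumes a little-endian host.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t load_le64(const uint8_t *in)
{
	uint64_t v;

	memcpy(&v, in, 8);	/* assumes a little-endian host */
	return v;
}

int main(void)
{
	static const struct { int off, shift; } layout[5] = {
		{ 0, 0 }, { 6, 3 }, { 12, 6 }, { 19, 1 }, { 24, 12 }
	};
	uint8_t in[32];
	uint64_t limbs[5];
	int i, bit;

	/* Arbitrary test pattern; the limbs only cover bits 0..254. */
	for (i = 0; i < 32; ++i)
		in[i] = (uint8_t)(i * 37 + 1);

	for (i = 0; i < 5; ++i) {
		assert(51 * i == 8 * layout[i].off + layout[i].shift);
		limbs[i] = (load_le64(in + layout[i].off) >> layout[i].shift) &
			   0x7ffffffffffffULL;
	}

	/* Bit j of limb i must equal bit 51*i + j of the input bytes. */
	for (i = 0; i < 5; ++i) {
		for (bit = 0; bit < 51; ++bit) {
			int pos = 51 * i + bit;

			assert(((limbs[i] >> bit) & 1) ==
			       ((in[pos / 8] >> (pos % 8)) & 1));
		}
	}
	printf("radix-2^51 limb layout matches the byte-level bits\n");
	return 0;
}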
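
Note on the constant-time swap: cmult() in the removed code above never branches on a key bit. Instead, swap_conditional() stretches the bit into an all-zeros or all-ones mask and XOR-folds it into both operands, so the instruction stream and memory accesses are identical whichever value the bit has. A minimal userspace sketch of that primitive follows; the main() driver and its printf are illustrative only, not part of the patch.

#include <stdint.h>
#include <stdio.h>

typedef uint64_t limb;

/* Swap a and b iff iswap is 1, with no data-dependent branch or load. */
static void swap_conditional(limb a[5], limb b[5], limb iswap)
{
	const limb swap = -iswap;	/* 0 -> all zeros, 1 -> all ones */
	unsigned int i;

	for (i = 0; i < 5; ++i) {
		const limb x = swap & (a[i] ^ b[i]);

		a[i] ^= x;
		b[i] ^= x;
	}
}

int main(void)
{
	limb a[5] = { 1, 2, 3, 4, 5 };
	limb b[5] = { 6, 7, 8, 9, 10 };

	swap_conditional(a, b, 1);	/* contents are exchanged */
	swap_conditional(a, b, 0);	/* contents are left alone */
	printf("a[0] = %llu, b[0] = %llu\n",
	       (unsigned long long)a[0], (unsigned long long)b[0]);
	return 0;
}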
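
Note on the measurement pattern: do_it()/report_it() in main.c warm each implementation up, read the cycle counter, run TRIALS calls, read it again, report the per-call average, and OR the return values into a dummy so the compiler cannot discard the calls. The same pattern can be mimicked in userspace for quick comparisons. This is a hedged sketch assuming an x86_64 host, with __rdtsc() from <x86intrin.h> standing in for get_cycles(); bench_one() and nop_fn() are hypothetical names, not from the patch.

#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>

enum { WARMUP = 5000, TRIALS = 10000 };

/* Time one implementation the way main.c does, minus the spinlock and
 * interrupt masking that the kernel module relies on for stable numbers.
 */
static void bench_one(const char *name, int (*fn)(void))
{
	uint64_t start, end;
	int ret = 0, i;

	for (i = 0; i < WARMUP; ++i)	/* warm caches and branch predictors */
		ret |= fn();
	start = __rdtsc();
	for (i = 0; i < TRIALS; ++i)
		ret |= fn();
	end = __rdtsc();

	/* Consuming ret keeps the compiler from eliding the calls, the same
	 * trick as the dummy variable in main.c.
	 */
	printf("%7s: %llu cycles per call (ret %d)\n",
	       name, (unsigned long long)((end - start) / TRIALS), ret);
}

static int nop_fn(void)	/* stand-in for a curve25519_* implementation */
{
	return 0;
}

int main(void)
{
	bench_one("nop", nop_fn);
	return 0;
}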