From b65de973614c0d4b529fb2461aace171c0aa5f89 Mon Sep 17 00:00:00 2001 From: "Jason A. Donenfeld" Date: Thu, 22 Jun 2017 22:50:50 +0200 Subject: curve25519: keep certain sandy2x functions in C We can let the compiler optimize how it sees fit. --- src/crypto/curve25519-avx-x86_64.S | 227 ------------------------------------- src/crypto/curve25519.c | 89 ++++++++++++++- 2 files changed, 84 insertions(+), 232 deletions(-) (limited to 'src/crypto') diff --git a/src/crypto/curve25519-avx-x86_64.S b/src/crypto/curve25519-avx-x86_64.S index 81d0c74..bd636b5 100644 --- a/src/crypto/curve25519-avx-x86_64.S +++ b/src/crypto/curve25519-avx-x86_64.S @@ -3257,230 +3257,3 @@ ENTRY(curve25519_sandy2x_ladder_base) add %r11,%rsp ret ENDPROC(curve25519_sandy2x_ladder_base) - -ENTRY(curve25519_sandy2x_fe_frombytes) - pushq %r14 - pushq %r13 - pushq %r12 - pushq %rbp - pushq %rbx - movzbl 5(%rsi), %r9d - movzbl 6(%rsi), %eax - movzbl 4(%rsi), %edx - movzbl 11(%rsi), %r10d - movzbl 7(%rsi), %ecx - movzbl 20(%rsi), %r14d - movzbl 23(%rsi), %r11d - salq $8, %r9 - movzbl 27(%rsi), %r12d - movl (%rsi), %r8d - salq $16, %rax - movzbl 29(%rsi), %r13d - movl 16(%rsi), %ebx - orq %rax, %r9 - movzbl 9(%rsi), %eax - salq $8, %r10 - orq %rdx, %r9 - movzbl 8(%rsi), %edx - salq $8, %r12 - salq $6, %r9 - salq $16, %rax - salq $8, %rdx - orq %rax, %rdx - movzbl 12(%rsi), %eax - orq %rcx, %rdx - movzbl 10(%rsi), %ecx - salq $5, %rdx - salq $16, %rax - orq %rax, %r10 - movzbl 14(%rsi), %eax - orq %rcx, %r10 - movzbl 13(%rsi), %ecx - salq $3, %r10 - salq $8, %rax - movq %rax, %rbp - movzbl 15(%rsi), %eax - salq $16, %rax - orq %rax, %rbp - movzbl 22(%rsi), %eax - orq %rcx, %rbp - movzbl 21(%rsi), %ecx - salq $16, %rax - salq $8, %rcx - orq %rax, %rcx - movzbl 24(%rsi), %eax - orq %r14, %rcx - salq $7, %rcx - movq %rcx, %r14 - movzbl 25(%rsi), %ecx - salq $8, %rax - salq $16, %rcx - orq %rcx, %rax - movzbl 28(%rsi), %ecx - orq %r11, %rax - movzbl 26(%rsi), %r11d - salq $5, %rax - salq $16, %rcx - orq 
%rcx, %r12 - movzbl 30(%rsi), %ecx - orq %r11, %r12 - movzbl 31(%rsi), %r11d - movq %rax, %rsi - salq $4, %r12 - shrq $25, %rsi - andl $33554431, %eax - addq %r12, %rsi - movl %r8d, %r12d - shrq $26, %r8 - salq $8, %rcx - andl $67108863, %r12d - salq $16, %r11 - orq %rcx, %r11 - movq %r9, %rcx - andl $33554431, %r9d - shrq $25, %rcx - orq %r13, %r11 - movq %r10, %r13 - addq %rcx, %rdx - movl %ebx, %ecx - andl $33554431, %r13d - addq %r8, %r9 - shrq $25, %rcx - movq %rdx, %r8 - shrq $26, %rdx - addq %r14, %rcx - shrq $25, %r10 - movl %r12d, %r14d - addq %r13, %rdx - andl $8388607, %r11d - movq %r14, (%rdi) - movq %r9, 8(%rdi) - andl $33554431, %ebx - movq %rdx, 24(%rdi) - leaq (%r10,%rbp,4), %rdx - andl $67108863, %r8d - movq %rbx, 40(%rdi) - movq %r8, 16(%rdi) - movq %rdx, 32(%rdi) - movq %rcx, %rdx - shrq $26, %rcx - addq %rax, %rcx - movq %rsi, %rax - shrq $26, %rsi - andl $67108863, %eax - andl $67108863, %edx - movq %rcx, 56(%rdi) - movq %rax, 64(%rdi) - leaq (%rsi,%r11,4), %rax - movq %rdx, 48(%rdi) - popq %rbx - movq %rax, 72(%rdi) - popq %rbp - popq %r12 - popq %r13 - popq %r14 - ret -ENDPROC(curve25519_sandy2x_fe_frombytes) - -ENTRY(curve25519_sandy2x_fe51_invert) - pushq %rbp - pushq %rbx - movq %rdi, %rbp - movl $1, %edx - movq %rsi, %rbx - subq $440, %rsp - movq %rsp, %rdi - call curve25519_sandy2x_fe51_nsquare - leaq 384(%rsp), %rdi - movq %rsp, %rsi - movl $1, %edx - call curve25519_sandy2x_fe51_nsquare - leaq 384(%rsp), %rsi - movl $1, %edx - movq %rsi, %rdi - call curve25519_sandy2x_fe51_nsquare - leaq 384(%rsp), %rsi - leaq 48(%rsp), %rdi - movq %rbx, %rdx - call curve25519_sandy2x_fe51_mul - leaq 48(%rsp), %rsi - leaq 96(%rsp), %rdi - movq %rsp, %rdx - call curve25519_sandy2x_fe51_mul - leaq 96(%rsp), %rsi - leaq 384(%rsp), %rdi - movl $1, %edx - call curve25519_sandy2x_fe51_nsquare - leaq 48(%rsp), %rdx - leaq 384(%rsp), %rsi - leaq 144(%rsp), %rdi - call curve25519_sandy2x_fe51_mul - leaq 144(%rsp), %rsi - leaq 384(%rsp), %rdi - movl $5, %edx - 
call curve25519_sandy2x_fe51_nsquare - leaq 144(%rsp), %rdx - leaq 384(%rsp), %rsi - leaq 192(%rsp), %rdi - call curve25519_sandy2x_fe51_mul - leaq 192(%rsp), %rsi - leaq 384(%rsp), %rdi - movl $10, %edx - call curve25519_sandy2x_fe51_nsquare - leaq 192(%rsp), %rdx - leaq 384(%rsp), %rsi - leaq 240(%rsp), %rdi - call curve25519_sandy2x_fe51_mul - leaq 240(%rsp), %rsi - leaq 384(%rsp), %rdi - movl $20, %edx - call curve25519_sandy2x_fe51_nsquare - leaq 384(%rsp), %rsi - leaq 240(%rsp), %rdx - movq %rsi, %rdi - call curve25519_sandy2x_fe51_mul - leaq 384(%rsp), %rsi - movl $10, %edx - movq %rsi, %rdi - call curve25519_sandy2x_fe51_nsquare - leaq 192(%rsp), %rdx - leaq 384(%rsp), %rsi - leaq 288(%rsp), %rdi - call curve25519_sandy2x_fe51_mul - leaq 288(%rsp), %rsi - leaq 384(%rsp), %rdi - movl $50, %edx - call curve25519_sandy2x_fe51_nsquare - leaq 288(%rsp), %rdx - leaq 384(%rsp), %rsi - leaq 336(%rsp), %rdi - call curve25519_sandy2x_fe51_mul - leaq 336(%rsp), %rsi - leaq 384(%rsp), %rdi - movl $100, %edx - call curve25519_sandy2x_fe51_nsquare - leaq 384(%rsp), %rsi - leaq 336(%rsp), %rdx - movq %rsi, %rdi - call curve25519_sandy2x_fe51_mul - leaq 384(%rsp), %rsi - movl $50, %edx - movq %rsi, %rdi - call curve25519_sandy2x_fe51_nsquare - leaq 384(%rsp), %rsi - leaq 288(%rsp), %rdx - movq %rsi, %rdi - call curve25519_sandy2x_fe51_mul - leaq 384(%rsp), %rsi - movl $5, %edx - movq %rsi, %rdi - call curve25519_sandy2x_fe51_nsquare - leaq 96(%rsp), %rdx - leaq 384(%rsp), %rsi - movq %rbp, %rdi - call curve25519_sandy2x_fe51_mul - addq $440, %rsp - popq %rbx - popq %rbp - ret -ENDPROC(curve25519_sandy2x_fe51_invert) diff --git a/src/crypto/curve25519.c b/src/crypto/curve25519.c index 780df70..1d79ab6 100644 --- a/src/crypto/curve25519.c +++ b/src/crypto/curve25519.c @@ -39,10 +39,89 @@ typedef u64 fe[10]; typedef u64 fe51[5]; asmlinkage void curve25519_sandy2x_ladder(fe *, const u8 *); asmlinkage void curve25519_sandy2x_ladder_base(fe *, const u8 *); -asmlinkage void 
curve25519_sandy2x_fe_frombytes(fe, const u8 *);
 asmlinkage void curve25519_sandy2x_fe51_pack(u8 *, const fe51 *);
 asmlinkage void curve25519_sandy2x_fe51_mul(fe51 *, const fe51 *, const fe51 *);
-asmlinkage void curve25519_sandy2x_fe51_invert(fe51 *, const fe51 *);
+asmlinkage void curve25519_sandy2x_fe51_nsquare(fe51 *, const fe51 *, int);
+
+/* Read the three bytes at @in as a little-endian 24-bit integer. */
+static inline u32 le24_to_cpupv(const u8 *in)
+{
+	return le16_to_cpup((const __le16 *)in) | ((u32)in[2] << 16);
+}
+
+/* Unpack the 32 little-endian bytes at @s into ten limbs of alternating 26/25 bits in @h; bit 255 is dropped. */
+static inline void fe_frombytes(fe h, const u8 *s)
+{
+	u64 h0 = le32_to_cpup((const __le32 *)s);
+	u64 h1 = le24_to_cpupv(s + 4) << 6;
+	u64 h2 = le24_to_cpupv(s + 7) << 5;
+	u64 h3 = le24_to_cpupv(s + 10) << 3;
+	u64 h4 = le24_to_cpupv(s + 13) << 2;
+	u64 h5 = le32_to_cpup((const __le32 *)(s + 16));
+	u64 h6 = le24_to_cpupv(s + 20) << 7;
+	u64 h7 = le24_to_cpupv(s + 23) << 5;
+	u64 h8 = le24_to_cpupv(s + 26) << 4;
+	u64 h9 = (le24_to_cpupv(s + 29) & 0x7FFFFF) << 2;
+
+	/* Trim the odd limbs to 25 bits, carrying upwards; overflow out of h9 re-enters h0 multiplied by 19. */
+	h0 += (h9 >> 25) * 19; h9 &= 0x1FFFFFF;
+	h2 += h1 >> 25; h1 &= 0x1FFFFFF;
+	h4 += h3 >> 25; h3 &= 0x1FFFFFF;
+	h6 += h5 >> 25; h5 &= 0x1FFFFFF;
+	h8 += h7 >> 25; h7 &= 0x1FFFFFF;
+
+	/* Then trim the even limbs to 26 bits, carrying into the odd limb above each one. */
+	h1 += h0 >> 26; h0 &= 0x3FFFFFF;
+	h3 += h2 >> 26; h2 &= 0x3FFFFFF;
+	h5 += h4 >> 26; h4 &= 0x3FFFFFF;
+	h7 += h6 >> 26; h6 &= 0x3FFFFFF;
+	h9 += h8 >> 26; h8 &= 0x3FFFFFF;
+
+	h[0] = h0; h[1] = h1;
+	h[2] = h2; h[3] = h3;
+	h[4] = h4; h[5] = h5;
+	h[6] = h6; h[7] = h7;
+	h[8] = h8; h[9] = h9;
+}
+
+/* Set *r = x^(2^255 - 21), the inverse of x modulo 2^255 - 19; r may alias x, since x is last read before r is written. */
+static inline void fe51_invert(fe51 *r, const fe51 *x)
+{
+	fe51 z2, z9, z11, z2_5_0, z2_10_0, z2_20_0, z2_50_0, z2_100_0, t;
+
+	/* Each step is tagged with the exponent of x it yields; the last nsquare argument is the number of squarings. */
+	/* 2 */ curve25519_sandy2x_fe51_nsquare(&z2, x, 1);
+	/* 4 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2, 1);
+	/* 8 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 1);
+	/* 9 */ curve25519_sandy2x_fe51_mul(&z9, (const fe51 *)&t, x);
+	/* 11 */ curve25519_sandy2x_fe51_mul(&z11, (const fe51 *)&z9, (const fe51 *)&z2);
+	/* 22 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z11, 1);
+	/* 2^5 - 2^0 = 31 */ curve25519_sandy2x_fe51_mul(&z2_5_0, (const fe51 *)&t, (const fe51 *)&z9);
+
+	/* 2^10 - 2^5 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_5_0, 5);
+	/* 2^10 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_10_0, (const fe51 *)&t, (const fe51 *)&z2_5_0);
+
+	/* 2^20 - 2^10 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_10_0, 10);
+	/* 2^20 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_20_0, (const fe51 *)&t, (const fe51 *)&z2_10_0);
+
+	/* 2^40 - 2^20 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_20_0, 20);
+	/* 2^40 - 2^0 */ curve25519_sandy2x_fe51_mul(&t, (const fe51 *)&t, (const fe51 *)&z2_20_0);
+
+	/* 2^50 - 2^10 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 10);
+	/* 2^50 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_50_0, (const fe51 *)&t, (const fe51 *)&z2_10_0);
+
+	/* 2^100 - 2^50 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_50_0, 50);
+	/* 2^100 - 2^0 */ curve25519_sandy2x_fe51_mul(&z2_100_0, (const fe51 *)&t, (const fe51 *)&z2_50_0);
+
+	/* 2^200 - 2^100 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&z2_100_0, 100);
+	/* 2^200 - 2^0 */ curve25519_sandy2x_fe51_mul(&t, (const fe51 *)&t, (const fe51 *)&z2_100_0);
+
+	/* 2^250 - 2^50 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 50);
+	/* 2^250 - 2^0 */ curve25519_sandy2x_fe51_mul(&t, (const fe51 *)&t, (const fe51 *)&z2_50_0);
+
+	/* 2^255 - 2^5 */ curve25519_sandy2x_fe51_nsquare(&t, (const fe51 *)&t, 5);
+	/* 2^255 - 21 */ curve25519_sandy2x_fe51_mul(r, (const fe51 *)&t, (const fe51 *)&z11);
+}
 
 static void curve25519_sandy2x(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secret[CURVE25519_POINT_SIZE], const u8 basepoint[CURVE25519_POINT_SIZE])
 {
@@ -54,7 +133,7 @@ static void curve25519_sandy2x(u8
mypublic[CURVE25519_POINT_SIZE], const u8 secr #define x1 var[0] #define x2 var[1] #define z2 var[2] - curve25519_sandy2x_fe_frombytes(x1, basepoint); + fe_frombytes(x1, basepoint); curve25519_sandy2x_ladder(var, e); z_51[0] = (z2[1] << 26) + z2[0]; z_51[1] = (z2[3] << 26) + z2[2]; @@ -69,7 +148,7 @@ static void curve25519_sandy2x(u8 mypublic[CURVE25519_POINT_SIZE], const u8 secr #undef x1 #undef x2 #undef z2 - curve25519_sandy2x_fe51_invert(&z_51, (const fe51 *)&z_51); + fe51_invert(&z_51, (const fe51 *)&z_51); curve25519_sandy2x_fe51_mul(&x_51, (const fe51 *)&x_51, (const fe51 *)&z_51); curve25519_sandy2x_fe51_pack(mypublic, (const fe51 *)&x_51); @@ -101,7 +180,7 @@ static void curve25519_sandy2x_base(u8 pub[CURVE25519_POINT_SIZE], const u8 secr x_51[4] = (x2[9] << 26) + x2[8]; #undef x2 #undef z2 - curve25519_sandy2x_fe51_invert(&z_51, (const fe51 *)&z_51); + fe51_invert(&z_51, (const fe51 *)&z_51); curve25519_sandy2x_fe51_mul(&x_51, (const fe51 *)&x_51, (const fe51 *)&z_51); curve25519_sandy2x_fe51_pack(pub, (const fe51 *)&x_51); -- cgit v1.2.3-59-g8ed1b