From 5700b6174988bd8b682c7301a11060ae76983027 Mon Sep 17 00:00:00 2001
From: Samuel Neves
Date: Wed, 22 Nov 2017 16:58:04 +0100
Subject: poly1305-x86_64: unclobber %rbp

OpenSSL's Poly1305 kernels use %rbp as a scratch register. However, the
kernel expects %rbp to be a valid frame pointer at any given time in
order to do proper unwinding. Thus we need to alter the code in order
to preserve it.

The most straightforward way to accomplish this is to replace $d3 in
poly1305-x86_64.pl -- formerly %r10 -- with %rdi, and to replace %rbp
with %r10. Because %rdi, a pointer to the context structure, does not
change and is not used by poly1305_iteration, it is safe to use it
here, and the overhead of saving and restoring it should be minimal.

Signed-off-by: Samuel Neves
---
 src/crypto/poly1305-x86_64.S | 276 +++++++++++++++++++++++--------------------
 1 file changed, 145 insertions(+), 131 deletions(-)

diff --git a/src/crypto/poly1305-x86_64.S b/src/crypto/poly1305-x86_64.S
index c9dd1bd..bff1d0e 100644
--- a/src/crypto/poly1305-x86_64.S
+++ b/src/crypto/poly1305-x86_64.S
@@ -85,11 +85,11 @@ ENTRY(poly1305_blocks_x86_64)
 	jz	.Lno_data
 
 	pushq	%rbx
-	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+	pushq	%rdi
 
 .Lblocks_body:
 
@@ -100,7 +100,7 @@ ENTRY(poly1305_blocks_x86_64)
 
 	movq	0(%rdi),%r14
 	movq	8(%rdi),%rbx
-	movq	16(%rdi),%rbp
+	movq	16(%rdi),%r10
 
 	movq	%r13,%r12
 	shrq	$2,%r13
@@ -110,14 +110,15 @@ ENTRY(poly1305_blocks_x86_64)
 .align	32
 .Loop:
+
 	addq	0(%rsi),%r14
 	adcq	8(%rsi),%rbx
 	leaq	16(%rsi),%rsi
-	adcq	%rcx,%rbp
+	adcq	%rcx,%r10
 
 	mulq	%r14
 	movq	%rax,%r9
 	movq	%r11,%rax
-	movq	%rdx,%r10
+	movq	%rdx,%rdi
 
 	mulq	%r14
 	movq	%rax,%r14
@@ -127,47 +128,48 @@ ENTRY(poly1305_blocks_x86_64)
 	mulq	%rbx
 	addq	%rax,%r9
 	movq	%r13,%rax
-	adcq	%rdx,%r10
+	adcq	%rdx,%rdi
 
 	mulq	%rbx
-	movq	%rbp,%rbx
+	movq	%r10,%rbx
 	addq	%rax,%r14
 	adcq	%rdx,%r8
 
 	imulq	%r13,%rbx
 	addq	%rbx,%r9
 	movq	%r8,%rbx
-	adcq	$0,%r10
+	adcq	$0,%rdi
 
-	imulq	%r11,%rbp
+	imulq	%r11,%r10
 	addq	%r9,%rbx
 	movq	$-4,%rax
-	adcq	%rbp,%r10
+	adcq	%r10,%rdi
 
-	andq	%r10,%rax
-	movq	%r10,%rbp
-	shrq	$2,%r10
-	andq	$3,%rbp
-	addq	%r10,%rax
+	andq	%rdi,%rax
+	movq	%rdi,%r10
+	shrq	$2,%rdi
+	andq	$3,%r10
+	addq	%rdi,%rax
 	addq	%rax,%r14
 	adcq	$0,%rbx
-	adcq	$0,%rbp
+	adcq	$0,%r10
+
 	movq	%r12,%rax
 	decq	%r15
 	jnz	.Loop
 
+	movq	0(%rsp),%rdi
+
 	movq	%r14,0(%rdi)
 	movq	%rbx,8(%rdi)
-	movq	%rbp,16(%rdi)
+	movq	%r10,16(%rdi)
 
-	movq	0(%rsp),%r15
-	movq	8(%rsp),%r14
-	movq	16(%rsp),%r13
-	movq	24(%rsp),%r12
-	movq	32(%rsp),%rbp
+	movq	8(%rsp),%r15
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
 	movq	40(%rsp),%rbx
 	leaq	48(%rsp),%rsp
-
 .Lno_data:
 .Lblocks_epilogue:
 	ret
@@ -201,7 +203,7 @@ ENDPROC(poly1305_emit_x86_64)
 	mulq	%r14
 	movq	%rax,%r9
 	movq	%r11,%rax
-	movq	%rdx,%r10
+	movq	%rdx,%rdi
 
 	mulq	%r14
 	movq	%rax,%r14
@@ -211,42 +213,44 @@ ENDPROC(poly1305_emit_x86_64)
 	mulq	%rbx
 	addq	%rax,%r9
 	movq	%r13,%rax
-	adcq	%rdx,%r10
+	adcq	%rdx,%rdi
 
 	mulq	%rbx
-	movq	%rbp,%rbx
+	movq	%r10,%rbx
 	addq	%rax,%r14
 	adcq	%rdx,%r8
 
 	imulq	%r13,%rbx
 	addq	%rbx,%r9
 	movq	%r8,%rbx
-	adcq	$0,%r10
+	adcq	$0,%rdi
 
-	imulq	%r11,%rbp
+	imulq	%r11,%r10
 	addq	%r9,%rbx
 	movq	$-4,%rax
-	adcq	%rbp,%r10
+	adcq	%r10,%rdi
 
-	andq	%r10,%rax
-	movq	%r10,%rbp
-	shrq	$2,%r10
-	andq	$3,%rbp
-	addq	%r10,%rax
+	andq	%rdi,%rax
+	movq	%rdi,%r10
+	shrq	$2,%rdi
+	andq	$3,%r10
+	addq	%rdi,%rax
 	addq	%rax,%r14
 	adcq	$0,%rbx
-	adcq	$0,%rbp
+	adcq	$0,%r10
 .endm
 
 .macro __poly1305_init_avx
 	movq	%r11,%r14
 	movq	%r12,%rbx
-	xorq	%rbp,%rbp
+	xorq	%r10,%r10
 
 	leaq	48+64(%rdi),%rdi
 	movq	%r12,%rax
+	movq	%rdi,0(%rsp)
 	__poly1305_block
+	movq	0(%rsp),%rdi
 
 	movl	$0x3ffffff,%eax
 	movl	$0x3ffffff,%edx
@@ -304,7 +308,7 @@ ENDPROC(poly1305_emit_x86_64)
 	movl	%edx,36(%rdi)
 
 	shrq	$26,%r9
-	movq	%rbp,%rax
+	movq	%r10,%rax
 	shlq	$24,%rax
 	orq	%rax,%r8
 	movl	%r8d,48(%rdi)
@@ -315,7 +319,9 @@ ENDPROC(poly1305_emit_x86_64)
 	movl	%r9d,68(%rdi)
 
 	movq	%r12,%rax
+	movq	%rdi,0(%rsp)
 	__poly1305_block
+	movq	0(%rsp),%rdi
 
 	movl	$0x3ffffff,%eax
 	movq	%r14,%r8
@@ -347,7 +353,7 @@ ENDPROC(poly1305_emit_x86_64)
 	shrq	$26,%r8
 	movl	%edx,44(%rdi)
 
-	movq	%rbp,%rax
+	movq	%r10,%rax
 	shlq	$24,%rax
 	orq	%rax,%r8
 	movl	%r8d,60(%rdi)
@@ -355,7 +361,9 @@ ENDPROC(poly1305_emit_x86_64)
 	movl	%r8d,76(%rdi)
 
 	movq	%r12,%rax
+	movq	%rdi,0(%rsp)
 	__poly1305_block
+	movq	0(%rsp),%rdi
 
 	movl	$0x3ffffff,%eax
 	movq	%r14,%r8
@@ -387,7 +395,7 @@ ENDPROC(poly1305_emit_x86_64)
 	shrq	$26,%r8
 	movl	%edx,40(%rdi)
 
-	movq	%rbp,%rax
+	movq	%r10,%rax
 	shlq	$24,%rax
 	orq	%rax,%r8
 	movl	%r8d,56(%rdi)
@@ -420,11 +428,11 @@ ENTRY(poly1305_blocks_avx)
 	jz	.Leven_avx
 
 	pushq	%rbx
-	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+	pushq	%rdi
 
 .Lblocks_avx_body:
 
@@ -432,7 +440,7 @@ ENTRY(poly1305_blocks_avx)
 
 	movq	0(%rdi),%r8
 	movq	8(%rdi),%r9
-	movl	16(%rdi),%ebp
+	movl	16(%rdi),%r10d
 	movq	24(%rdi),%r11
 	movq	32(%rdi),%r13
 
@@ -452,21 +460,21 @@ ENTRY(poly1305_blocks_avx)
 
 	addq	%r12,%r14
 	adcq	%r9,%rbx
-	movq	%rbp,%r8
+	movq	%r10,%r8
 	shlq	$40,%r8
-	shrq	$24,%rbp
+	shrq	$24,%r10
 	addq	%r8,%rbx
-	adcq	$0,%rbp
+	adcq	$0,%r10
 
 	movq	$-4,%r9
-	movq	%rbp,%r8
-	andq	%rbp,%r9
+	movq	%r10,%r8
+	andq	%r10,%r9
 	shrq	$2,%r8
-	andq	$3,%rbp
+	andq	$3,%r10
 	addq	%r9,%r8
 	addq	%r8,%r14
 	adcq	$0,%rbx
-	adcq	$0,%rbp
+	adcq	$0,%r10
 
 	movq	%r13,%r12
 	movq	%r13,%rax
@@ -476,9 +484,11 @@ ENTRY(poly1305_blocks_avx)
 	addq	0(%rsi),%r14
 	adcq	8(%rsi),%rbx
 	leaq	16(%rsi),%rsi
-	adcq	%rcx,%rbp
+	adcq	%rcx,%r10
 
+	movq	%rdi,0(%rsp)
 	__poly1305_block
+	movq	0(%rsp),%rdi
 
 	testq	%rcx,%rcx
 	jz	.Lstore_base2_64_avx
@@ -495,11 +505,11 @@ ENTRY(poly1305_blocks_avx)
 	andq	$0x3ffffff,%rdx
 	shrq	$14,%rbx
 	orq	%r11,%r14
-	shlq	$24,%rbp
+	shlq	$24,%r10
 	andq	$0x3ffffff,%r14
 	shrq	$40,%r12
 	andq	$0x3ffffff,%rbx
-	orq	%r12,%rbp
+	orq	%r12,%r10
 
 	subq	$16,%r15
 	jz	.Lstore_base2_26_avx
@@ -508,14 +518,14 @@ ENTRY(poly1305_blocks_avx)
 	vmovd	%eax,%xmm0
 	vmovd	%edx,%xmm1
 	vmovd	%r14d,%xmm2
 	vmovd	%ebx,%xmm3
-	vmovd	%ebp,%xmm4
+	vmovd	%r10d,%xmm4
 	jmp	.Lproceed_avx
 
 .align	32
 .Lstore_base2_64_avx:
 	movq	%r14,0(%rdi)
 	movq	%rbx,8(%rdi)
-	movq	%rbp,16(%rdi)
+	movq	%r10,16(%rdi)
 	jmp	.Ldone_avx
 .align	16
@@ -524,14 +534,13 @@ ENTRY(poly1305_blocks_avx)
 	movl	%edx,4(%rdi)
 	movl	%r14d,8(%rdi)
 	movl	%ebx,12(%rdi)
-	movl	%ebp,16(%rdi)
+	movl	%r10d,16(%rdi)
 
 .align	16
 .Ldone_avx:
-	movq	0(%rsp),%r15
-	movq	8(%rsp),%r14
-	movq	16(%rsp),%r13
-	movq	24(%rsp),%r12
-	movq	32(%rsp),%rbp
+	movq	8(%rsp),%r15
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
 	movq	40(%rsp),%rbx
 	leaq	48(%rsp),%rsp
@@ -543,11 +552,11 @@ ENTRY(poly1305_blocks_avx)
 .Lbase2_64_avx:
 
 	pushq	%rbx
-	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+	pushq	%rdi
 
 .Lbase2_64_avx_body:
 
@@ -558,7 +567,7 @@ ENTRY(poly1305_blocks_avx)
 
 	movq	0(%rdi),%r14
 	movq	8(%rdi),%rbx
-	movl	16(%rdi),%ebp
+	movl	16(%rdi),%r10d
 
 	movq	%r13,%r12
 	movq	%r13,%rax
@@ -571,10 +580,12 @@ ENTRY(poly1305_blocks_avx)
 	addq	0(%rsi),%r14
 	adcq	8(%rsi),%rbx
 	leaq	16(%rsi),%rsi
-	adcq	%rcx,%rbp
+	adcq	%rcx,%r10
 	subq	$16,%r15
 
+	movq	%rdi,0(%rsp)
 	__poly1305_block
+	movq	0(%rsp),%rdi
 
 .Linit_avx:
 
@@ -589,17 +600,17 @@ ENTRY(poly1305_blocks_avx)
 	andq	$0x3ffffff,%rdx
 	shrq	$14,%rbx
 	orq	%r8,%r14
-	shlq	$24,%rbp
+	shlq	$24,%r10
 	andq	$0x3ffffff,%r14
 	shrq	$40,%r9
 	andq	$0x3ffffff,%rbx
-	orq	%r9,%rbp
+	orq	%r9,%r10
 
 	vmovd	%eax,%xmm0
 	vmovd	%edx,%xmm1
 	vmovd	%r14d,%xmm2
 	vmovd	%ebx,%xmm3
-	vmovd	%ebp,%xmm4
+	vmovd	%r10d,%xmm4
 	movl	$1,20(%rdi)
 
 	__poly1305_init_avx
@@ -607,11 +618,10 @@ ENTRY(poly1305_blocks_avx)
 .Lproceed_avx:
 	movq	%r15,%rdx
 
-	movq	0(%rsp),%r15
-	movq	8(%rsp),%r14
-	movq	16(%rsp),%r13
-	movq	24(%rsp),%r12
-	movq	32(%rsp),%rbp
+	movq	8(%rsp),%r15
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
 	movq	40(%rsp),%rbx
 	leaq	48(%rsp),%rax
 	leaq	48(%rsp),%rsp
@@ -1224,11 +1234,11 @@ ENTRY(poly1305_blocks_avx2)
 	jz	.Leven_avx2
 
 	pushq	%rbx
-	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+	pushq	%rdi
 
 .Lblocks_avx2_body:
 
@@ -1236,7 +1246,7 @@ ENTRY(poly1305_blocks_avx2)
 
 	movq	0(%rdi),%r8
 	movq	8(%rdi),%r9
-	movl	16(%rdi),%ebp
+	movl	16(%rdi),%r10d
 	movq	24(%rdi),%r11
 	movq	32(%rdi),%r13
 
@@ -1256,21 +1266,21 @@ ENTRY(poly1305_blocks_avx2)
 
 	addq	%r12,%r14
 	adcq	%r9,%rbx
-	movq	%rbp,%r8
+	movq	%r10,%r8
 	shlq	$40,%r8
-	shrq	$24,%rbp
+	shrq	$24,%r10
 	addq	%r8,%rbx
-	adcq	$0,%rbp
+	adcq	$0,%r10
 
 	movq	$-4,%r9
-	movq	%rbp,%r8
-	andq	%rbp,%r9
+	movq	%r10,%r8
+	andq	%r10,%r9
 	shrq	$2,%r8
-	andq	$3,%rbp
+	andq	$3,%r10
 	addq	%r9,%r8
 	addq	%r8,%r14
 	adcq	$0,%rbx
-	adcq	$0,%rbp
+	adcq	$0,%r10
 
 	movq	%r13,%r12
 	movq	%r13,%rax
@@ -1281,10 +1291,12 @@ ENTRY(poly1305_blocks_avx2)
 	addq	0(%rsi),%r14
 	adcq	8(%rsi),%rbx
 	leaq	16(%rsi),%rsi
-	adcq	%rcx,%rbp
+	adcq	%rcx,%r10
 	subq	$16,%r15
 
+	movq	%rdi,0(%rsp)
 	__poly1305_block
+	movq	0(%rsp),%rdi
 	movq	%r12,%rax
 
 	testq	$63,%r15
@@ -1305,11 +1317,11 @@ ENTRY(poly1305_blocks_avx2)
 	andq	$0x3ffffff,%rdx
 	shrq	$14,%rbx
 	orq	%r11,%r14
-	shlq	$24,%rbp
+	shlq	$24,%r10
 	andq	$0x3ffffff,%r14
 	shrq	$40,%r12
 	andq	$0x3ffffff,%rbx
-	orq	%r12,%rbp
+	orq	%r12,%r10
 
 	testq	%r15,%r15
 	jz	.Lstore_base2_26_avx2
@@ -1318,14 +1330,14 @@ ENTRY(poly1305_blocks_avx2)
 	vmovd	%eax,%xmm0
 	vmovd	%edx,%xmm1
 	vmovd	%r14d,%xmm2
 	vmovd	%ebx,%xmm3
-	vmovd	%ebp,%xmm4
+	vmovd	%r10d,%xmm4
 	jmp	.Lproceed_avx2
 
 .align	32
 .Lstore_base2_64_avx2:
 	movq	%r14,0(%rdi)
 	movq	%rbx,8(%rdi)
-	movq	%rbp,16(%rdi)
+	movq	%r10,16(%rdi)
 	jmp	.Ldone_avx2
 .align	16
@@ -1334,14 +1346,13 @@ ENTRY(poly1305_blocks_avx2)
 	movl	%edx,4(%rdi)
 	movl	%r14d,8(%rdi)
 	movl	%ebx,12(%rdi)
-	movl	%ebp,16(%rdi)
+	movl	%r10d,16(%rdi)
 
 .align	16
 .Ldone_avx2:
-	movq	0(%rsp),%r15
-	movq	8(%rsp),%r14
-	movq	16(%rsp),%r13
-	movq	24(%rsp),%r12
-	movq	32(%rsp),%rbp
+	movq	8(%rsp),%r15
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
 	movq	40(%rsp),%rbx
 	leaq	48(%rsp),%rsp
@@ -1355,11 +1366,11 @@ ENTRY(poly1305_blocks_avx2)
 
 	pushq	%rbx
-	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+	pushq	%rdi
 
 .Lbase2_64_avx2_body:
 
@@ -1370,7 +1381,7 @@ ENTRY(poly1305_blocks_avx2)
 
 	movq	0(%rdi),%r14
 	movq	8(%rdi),%rbx
-	movl	16(%rdi),%ebp
+	movl	16(%rdi),%r10d
 
 	movq	%r13,%r12
 	movq	%r13,%rax
@@ -1384,10 +1395,12 @@ ENTRY(poly1305_blocks_avx2)
 	addq	0(%rsi),%r14
 	adcq	8(%rsi),%rbx
 	leaq	16(%rsi),%rsi
-	adcq	%rcx,%rbp
+	adcq	%rcx,%r10
 	subq	$16,%r15
 
+	movq	%rdi,0(%rsp)
 	__poly1305_block
+	movq	0(%rsp),%rdi
 	movq	%r12,%rax
 
 	testq	$63,%r15
@@ -1406,17 +1419,17 @@ ENTRY(poly1305_blocks_avx2)
 	andq	$0x3ffffff,%rdx
 	shrq	$14,%rbx
 	orq	%r8,%r14
-	shlq	$24,%rbp
+	shlq	$24,%r10
 	andq	$0x3ffffff,%r14
 	shrq	$40,%r9
 	andq	$0x3ffffff,%rbx
-	orq	%r9,%rbp
+	orq	%r9,%r10
 
 	vmovd	%eax,%xmm0
 	vmovd	%edx,%xmm1
 	vmovd	%r14d,%xmm2
 	vmovd	%ebx,%xmm3
-	vmovd	%ebp,%xmm4
+	vmovd	%r10d,%xmm4
 	movl	$1,20(%rdi)
 
 	__poly1305_init_avx
@@ -1424,11 +1437,10 @@ ENTRY(poly1305_blocks_avx2)
 .Lproceed_avx2:
 	movq	%r15,%rdx
 
-	movq	0(%rsp),%r15
-	movq	8(%rsp),%r14
-	movq	16(%rsp),%r13
-	movq	24(%rsp),%r12
-	movq	32(%rsp),%rbp
+	movq	8(%rsp),%r15
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
 	movq	40(%rsp),%rbx
 	leaq	48(%rsp),%rax
 	leaq	48(%rsp),%rsp
@@ -1796,11 +1808,11 @@ ENTRY(poly1305_blocks_avx512)
 	jz	.Leven_avx2_512
 
 	pushq	%rbx
-	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+	pushq	%rdi
 
 .Lblocks_avx2_body_512:
 
@@ -1808,7 +1820,7 @@ ENTRY(poly1305_blocks_avx512)
 
 	movq	0(%rdi),%r8
 	movq	8(%rdi),%r9
-	movl	16(%rdi),%ebp
+	movl	16(%rdi),%r10d
 	movq	24(%rdi),%r11
 	movq	32(%rdi),%r13
 
@@ -1828,21 +1840,21 @@ ENTRY(poly1305_blocks_avx512)
 
 	addq	%r12,%r14
 	adcq	%r9,%rbx
-	movq	%rbp,%r8
+	movq	%r10,%r8
 	shlq	$40,%r8
-	shrq	$24,%rbp
+	shrq	$24,%r10
 	addq	%r8,%rbx
-	adcq	$0,%rbp
+	adcq	$0,%r10
 
 	movq	$-4,%r9
-	movq	%rbp,%r8
-	andq	%rbp,%r9
+	movq	%r10,%r8
+	andq	%r10,%r9
 	shrq	$2,%r8
-	andq	$3,%rbp
+	andq	$3,%r10
 	addq	%r9,%r8
 	addq	%r8,%r14
 	adcq	$0,%rbx
-	adcq	$0,%rbp
+	adcq	$0,%r10
 
 	movq	%r13,%r12
 	movq	%r13,%rax
@@ -1853,10 +1865,12 @@ ENTRY(poly1305_blocks_avx512)
 	addq	0(%rsi),%r14
 	adcq	8(%rsi),%rbx
 	leaq	16(%rsi),%rsi
-	adcq	%rcx,%rbp
+	adcq	%rcx,%r10
 	subq	$16,%r15
 
+	movq	%rdi,0(%rsp)
 	__poly1305_block
+	movq	0(%rsp),%rdi
 	movq	%r12,%rax
 
 	testq	$63,%r15
@@ -1877,11 +1891,11 @@ ENTRY(poly1305_blocks_avx512)
 	andq	$0x3ffffff,%rdx
 	shrq	$14,%rbx
 	orq	%r11,%r14
-	shlq	$24,%rbp
+	shlq	$24,%r10
 	andq	$0x3ffffff,%r14
 	shrq	$40,%r12
 	andq	$0x3ffffff,%rbx
-	orq	%r12,%rbp
+	orq	%r12,%r10
 
 	testq	%r15,%r15
 	jz	.Lstore_base2_26_avx2_512
@@ -1890,14 +1904,14 @@ ENTRY(poly1305_blocks_avx512)
 	vmovd	%eax,%xmm0
 	vmovd	%edx,%xmm1
 	vmovd	%r14d,%xmm2
 	vmovd	%ebx,%xmm3
-	vmovd	%ebp,%xmm4
+	vmovd	%r10d,%xmm4
 	jmp	.Lproceed_avx2_512
 
 .align	32
 .Lstore_base2_64_avx2_512:
 	movq	%r14,0(%rdi)
 	movq	%rbx,8(%rdi)
-	movq	%rbp,16(%rdi)
+	movq	%r10,16(%rdi)
 	jmp	.Ldone_avx2_512
 .align	16
@@ -1906,14 +1920,13 @@ ENTRY(poly1305_blocks_avx512)
 	movl	%edx,4(%rdi)
 	movl	%r14d,8(%rdi)
 	movl	%ebx,12(%rdi)
-	movl	%ebp,16(%rdi)
+	movl	%r10d,16(%rdi)
 
 .align	16
 .Ldone_avx2_512:
-	movq	0(%rsp),%r15
-	movq	8(%rsp),%r14
-	movq	16(%rsp),%r13
-	movq	24(%rsp),%r12
-	movq	32(%rsp),%rbp
+	movq	8(%rsp),%r15
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
 	movq	40(%rsp),%rbx
 	leaq	48(%rsp),%rsp
@@ -1926,11 +1939,11 @@ ENTRY(poly1305_blocks_avx512)
 .Lbase2_64_avx2_512:
 
 	pushq	%rbx
-	pushq	%rbp
 	pushq	%r12
 	pushq	%r13
 	pushq	%r14
 	pushq	%r15
+	pushq	%rdi
 
 .Lbase2_64_avx2_body_512:
 
@@ -1941,7 +1954,7 @@ ENTRY(poly1305_blocks_avx512)
 
 	movq	0(%rdi),%r14
 	movq	8(%rdi),%rbx
-	movl	16(%rdi),%ebp
+	movl	16(%rdi),%r10d
 
 	movq	%r13,%r12
 	movq	%r13,%rax
@@ -1955,10 +1968,12 @@ ENTRY(poly1305_blocks_avx512)
 	addq	0(%rsi),%r14
 	adcq	8(%rsi),%rbx
 	leaq	16(%rsi),%rsi
-	adcq	%rcx,%rbp
+	adcq	%rcx,%r10
 	subq	$16,%r15
 
+	movq	%rdi,0(%rsp)
 	__poly1305_block
+	movq	0(%rsp),%rdi
 	movq	%r12,%rax
 
 	testq	$63,%r15
@@ -1977,17 +1992,17 @@ ENTRY(poly1305_blocks_avx512)
 	andq	$0x3ffffff,%rdx
 	shrq	$14,%rbx
 	orq	%r8,%r14
-	shlq	$24,%rbp
+	shlq	$24,%r10
 	andq	$0x3ffffff,%r14
 	shrq	$40,%r9
 	andq	$0x3ffffff,%rbx
-	orq	%r9,%rbp
+	orq	%r9,%r10
 
 	vmovd	%eax,%xmm0
 	vmovd	%edx,%xmm1
 	vmovd	%r14d,%xmm2
 	vmovd	%ebx,%xmm3
-	vmovd	%ebp,%xmm4
+	vmovd	%r10d,%xmm4
 	movl	$1,20(%rdi)
 
 	__poly1305_init_avx
@@ -1995,11 +2010,10 @@ ENTRY(poly1305_blocks_avx512)
 .Lproceed_avx2_512:
 	movq	%r15,%rdx
 
-	movq	0(%rsp),%r15
-	movq	8(%rsp),%r14
-	movq	16(%rsp),%r13
-	movq	24(%rsp),%r12
-	movq	32(%rsp),%rbp
+	movq	8(%rsp),%r15
+	movq	16(%rsp),%r14
+	movq	24(%rsp),%r13
+	movq	32(%rsp),%r12
 	movq	40(%rsp),%rbx
 	leaq	48(%rsp),%rax
 	leaq	48(%rsp),%rsp
--
cgit v1.2.3-59-g8ed1b
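
Every __poly1305_block call site in the diff follows the same pattern. As a
minimal sketch only -- not literal patch context; the elided code and the
register roles are as described in the commit message above:

	pushq	%rdi		# saved in the prologue instead of %rbp; pushed last, so it sits at 0(%rsp)
	...
	movq	%rdi,0(%rsp)	# spill ctx: the block clobbers %rdi, which now serves as d3 (formerly %r10)
	__poly1305_block	# inside, %r10 carries the accumulator limb that used to live in %rbp
	movq	0(%rsp),%rdi	# reload ctx before 0(%rdi)/8(%rdi)/16(%rdi) are accessed again

With this arrangement the routines never write %rbp at all, so frame-pointer
unwinding keeps working, at the cost of one spill and one reload per block
invocation.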