diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2014-06-19 07:51:45 -1000 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-06-19 07:51:45 -1000 |
commit | 92b944170d67fec75ec20e1362fd8ecff078e7d1 (patch) | |
tree | 51fa38b11a60b830fac3e2c24c10a61b132e1951 /arch/arm64/crypto/ghash-ce-core.S | |
parent | Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-next (diff) | |
parent | arm64: mm: remove broken &= operator from pmd_mknotpresent (diff) | |
download | linux-dev-92b944170d67fec75ec20e1362fd8ecff078e7d1.tar.xz linux-dev-92b944170d67fec75ec20e1362fd8ecff078e7d1.zip |
Merge tag 'arm64-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux
Pull arm64 fixes from Catalin Marinas:
"These are primarily bug fixes with a performance improvement patch for
the GHASH crypto algorithm (which went in during this merging window)
and dts/defconfig/Kconfig updates.
- ftrace_return_addr() macro fix for arm (introduced earlier via the
arm64 tree)
- stack alignment exception entry code fix
- GHASH crypto algorithm fix and performance improvement
- CMA buffer limited to 32-bit (until a better way to describe the
system topology in DT)
- UAPI sigcontext.h build fix
- __kernel_old_{gid,uid}_t definitions fix (affecting 32-bit LTP)
- ptrace fixes (kernel fault and 32-bit arm core dump)
- pte_mknotpresent() fix
- dts updates (APM SoC)
- defconfig and Kconfig update"
* tag 'arm64-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux:
arm64: mm: remove broken &= operator from pmd_mknotpresent
arm64: fix build error in sigcontext.h
arm64: dts: Add more serial port nodes in APM X-Gene device tree
arm64/dma: Removing ARCH_HAS_DMA_GET_REQUIRED_MASK macro
arm64: ptrace: fix empty registers set in prstatus of aarch32 process core
arm64: uid16: fix __kernel_old_{gid,uid}_t definitions
arm64: ptrace: change fs when passing kernel pointer to regset code
arm64: Limit the CMA buffer to 32-bit if ZONE_DMA
arm/ftrace: fix ftrace_return_addr() to ftrace_return_address()
arm64/crypto: improve performance of GHASH algorithm
arm64/crypto: fix data corruption bug in GHASH algorithm
arm64: defconfig update for LTP
arm64: ftrace: Fix comment typo 'CONFIG_FUNCTION_GRAPH_FP_TEST'
arm64: add ARCH_HAS_OPP to allow enabling OPP library
arm64: restore alphabetic order in Kconfig
arm64: Bug fix in stack alignment exception
Diffstat (limited to 'arch/arm64/crypto/ghash-ce-core.S')
-rw-r--r-- | arch/arm64/crypto/ghash-ce-core.S | 92 |
1 files changed, 38 insertions, 54 deletions
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S index b9e6eaf41c9b..dc457015884e 100644 --- a/arch/arm64/crypto/ghash-ce-core.S +++ b/arch/arm64/crypto/ghash-ce-core.S @@ -3,14 +3,6 @@ * * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> * - * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S - * - * Copyright (c) 2009 Intel Corp. - * Author: Huang Ying <ying.huang@intel.com> - * Vinodh Gopal - * Erdinc Ozturk - * Deniz Karakoyunlu - * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 as published * by the Free Software Foundation. @@ -19,13 +11,15 @@ #include <linux/linkage.h> #include <asm/assembler.h> - DATA .req v0 - SHASH .req v1 - IN1 .req v2 + SHASH .req v0 + SHASH2 .req v1 T1 .req v2 T2 .req v3 - T3 .req v4 - VZR .req v5 + MASK .req v4 + XL .req v5 + XM .req v6 + XH .req v7 + IN1 .req v7 .text .arch armv8-a+crypto @@ -35,61 +29,51 @@ * struct ghash_key const *k, const char *head) */ ENTRY(pmull_ghash_update) - ld1 {DATA.16b}, [x1] ld1 {SHASH.16b}, [x3] - eor VZR.16b, VZR.16b, VZR.16b + ld1 {XL.16b}, [x1] + movi MASK.16b, #0xe1 + ext SHASH2.16b, SHASH.16b, SHASH.16b, #8 + shl MASK.2d, MASK.2d, #57 + eor SHASH2.16b, SHASH2.16b, SHASH.16b /* do the head block first, if supplied */ cbz x4, 0f - ld1 {IN1.2d}, [x4] + ld1 {T1.2d}, [x4] b 1f -0: ld1 {IN1.2d}, [x2], #16 +0: ld1 {T1.2d}, [x2], #16 sub w0, w0, #1 -1: ext IN1.16b, IN1.16b, IN1.16b, #8 -CPU_LE( rev64 IN1.16b, IN1.16b ) - eor DATA.16b, DATA.16b, IN1.16b - /* multiply DATA by SHASH in GF(2^128) */ - ext T2.16b, DATA.16b, DATA.16b, #8 - ext T3.16b, SHASH.16b, SHASH.16b, #8 - eor T2.16b, T2.16b, DATA.16b - eor T3.16b, T3.16b, SHASH.16b +1: /* multiply XL by SHASH in GF(2^128) */ +CPU_LE( rev64 T1.16b, T1.16b ) - pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1 - pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0 - pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0) - eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0) - eor T2.16b, T2.16b, DATA.16b + ext T2.16b, XL.16b, XL.16b, #8 + ext IN1.16b, T1.16b, T1.16b, #8 + eor T1.16b, T1.16b, T2.16b + eor XL.16b, XL.16b, IN1.16b - ext T3.16b, VZR.16b, T2.16b, #8 - ext T2.16b, T2.16b, VZR.16b, #8 - eor DATA.16b, DATA.16b, T3.16b - eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of - // carry-less multiplication + pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1 + eor T1.16b, T1.16b, XL.16b + pmull XL.1q, SHASH.1d, XL.1d // a0 * b0 + pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0) - /* first phase of the reduction */ - shl T3.2d, DATA.2d, #1 - eor T3.16b, T3.16b, DATA.16b - shl T3.2d, T3.2d, #5 - eor T3.16b, T3.16b, DATA.16b - shl T3.2d, T3.2d, #57 - ext T2.16b, VZR.16b, T3.16b, #8 - ext T3.16b, T3.16b, VZR.16b, #8 - eor DATA.16b, DATA.16b, T2.16b - eor T1.16b, T1.16b, T3.16b + ext T1.16b, XL.16b, XH.16b, #8 + eor T2.16b, XL.16b, XH.16b + eor XM.16b, XM.16b, T1.16b + eor XM.16b, XM.16b, T2.16b + pmull T2.1q, XL.1d, MASK.1d - /* second phase of the reduction */ - ushr T2.2d, DATA.2d, #5 - eor T2.16b, T2.16b, DATA.16b - ushr T2.2d, T2.2d, #1 - eor T2.16b, T2.16b, DATA.16b - ushr T2.2d, T2.2d, #1 - eor T1.16b, T1.16b, T2.16b - eor DATA.16b, DATA.16b, T1.16b + mov XH.d[0], XM.d[1] + mov XM.d[1], XL.d[0] + + eor XL.16b, XM.16b, T2.16b + ext T2.16b, XL.16b, XL.16b, #8 + pmull XL.1q, XL.1d, MASK.1d + eor T2.16b, T2.16b, XH.16b + eor XL.16b, XL.16b, T2.16b cbnz w0, 0b - st1 {DATA.16b}, [x1] + st1 {XL.16b}, [x1] ret ENDPROC(pmull_ghash_update) |