aboutsummaryrefslogtreecommitdiffstats
path: root/arch/arm64/crypto/ghash-ce-core.S
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-06-19 07:51:45 -1000
committerLinus Torvalds <torvalds@linux-foundation.org>2014-06-19 07:51:45 -1000
commit92b944170d67fec75ec20e1362fd8ecff078e7d1 (patch)
tree51fa38b11a60b830fac3e2c24c10a61b132e1951 /arch/arm64/crypto/ghash-ce-core.S
parentMerge git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-next (diff)
parentarm64: mm: remove broken &= operator from pmd_mknotpresent (diff)
downloadlinux-dev-92b944170d67fec75ec20e1362fd8ecff078e7d1.tar.xz
linux-dev-92b944170d67fec75ec20e1362fd8ecff078e7d1.zip
Merge tag 'arm64-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux
Pull arm64 fixes from Catalin Marinas: "These are primarily bug fixes with a performance improvement patch for the GHASH crypto algorithm (which went in during this merging window) and dts/defconfig/Kconfig updates. - ftrace_return_addr() macro fix for arm (introduced earlier via the arm64 tree) - stack alignment exception entry code fix - GHASH crypto algorithm fix and performance improvement - CMA buffer limited to 32-bit (until a better way to describe the system topology in DT) - UAPI sigcontext.h build fix - __kernel_old_{gid,uid}_t definitions fix (affecting 32-bit LTP) - ptrace fixes (kernel fault and 32-bit arm core dump) - pte_mknotpresent() fix - dts updates (APM SoC) - defconfig and Kconfig update" * tag 'arm64-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux: arm64: mm: remove broken &= operator from pmd_mknotpresent arm64: fix build error in sigcontext.h arm64: dts: Add more serial port nodes in APM X-Gene device tree arm64/dma: Removing ARCH_HAS_DMA_GET_REQUIRED_MASK macro arm64: ptrace: fix empty registers set in prstatus of aarch32 process core arm64: uid16: fix __kernel_old_{gid,uid}_t definitions arm64: ptrace: change fs when passing kernel pointer to regset code arm64: Limit the CMA buffer to 32-bit if ZONE_DMA arm/ftrace: fix ftrace_return_addr() to ftrace_return_address() arm64/crypto: improve performance of GHASH algorithm arm64/crypto: fix data corruption bug in GHASH algorithm arm64: defconfig update for LTP arm64: ftrace: Fix comment typo 'CONFIG_FUNCTION_GRAPH_FP_TEST' arm64: add ARCH_HAS_OPP to allow enabling OPP library arm64: restore alphabetic order in Kconfig arm64: Bug fix in stack alignment exception
Diffstat (limited to 'arch/arm64/crypto/ghash-ce-core.S')
-rw-r--r--arch/arm64/crypto/ghash-ce-core.S92
1 files changed, 38 insertions, 54 deletions
diff --git a/arch/arm64/crypto/ghash-ce-core.S b/arch/arm64/crypto/ghash-ce-core.S
index b9e6eaf41c9b..dc457015884e 100644
--- a/arch/arm64/crypto/ghash-ce-core.S
+++ b/arch/arm64/crypto/ghash-ce-core.S
@@ -3,14 +3,6 @@
*
* Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
- * Based on arch/x86/crypto/ghash-pmullni-intel_asm.S
- *
- * Copyright (c) 2009 Intel Corp.
- * Author: Huang Ying <ying.huang@intel.com>
- * Vinodh Gopal
- * Erdinc Ozturk
- * Deniz Karakoyunlu
- *
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation.
@@ -19,13 +11,15 @@
#include <linux/linkage.h>
#include <asm/assembler.h>
- DATA .req v0
- SHASH .req v1
- IN1 .req v2
+ SHASH .req v0
+ SHASH2 .req v1
T1 .req v2
T2 .req v3
- T3 .req v4
- VZR .req v5
+ MASK .req v4
+ XL .req v5
+ XM .req v6
+ XH .req v7
+ IN1 .req v7
.text
.arch armv8-a+crypto
@@ -35,61 +29,51 @@
* struct ghash_key const *k, const char *head)
*/
ENTRY(pmull_ghash_update)
- ld1 {DATA.16b}, [x1]
ld1 {SHASH.16b}, [x3]
- eor VZR.16b, VZR.16b, VZR.16b
+ ld1 {XL.16b}, [x1]
+ movi MASK.16b, #0xe1
+ ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
+ shl MASK.2d, MASK.2d, #57
+ eor SHASH2.16b, SHASH2.16b, SHASH.16b
/* do the head block first, if supplied */
cbz x4, 0f
- ld1 {IN1.2d}, [x4]
+ ld1 {T1.2d}, [x4]
b 1f
-0: ld1 {IN1.2d}, [x2], #16
+0: ld1 {T1.2d}, [x2], #16
sub w0, w0, #1
-1: ext IN1.16b, IN1.16b, IN1.16b, #8
-CPU_LE( rev64 IN1.16b, IN1.16b )
- eor DATA.16b, DATA.16b, IN1.16b
- /* multiply DATA by SHASH in GF(2^128) */
- ext T2.16b, DATA.16b, DATA.16b, #8
- ext T3.16b, SHASH.16b, SHASH.16b, #8
- eor T2.16b, T2.16b, DATA.16b
- eor T3.16b, T3.16b, SHASH.16b
+1: /* multiply XL by SHASH in GF(2^128) */
+CPU_LE( rev64 T1.16b, T1.16b )
- pmull2 T1.1q, SHASH.2d, DATA.2d // a1 * b1
- pmull DATA.1q, SHASH.1d, DATA.1d // a0 * b0
- pmull T2.1q, T2.1d, T3.1d // (a1 + a0)(b1 + b0)
- eor T2.16b, T2.16b, T1.16b // (a0 * b1) + (a1 * b0)
- eor T2.16b, T2.16b, DATA.16b
+ ext T2.16b, XL.16b, XL.16b, #8
+ ext IN1.16b, T1.16b, T1.16b, #8
+ eor T1.16b, T1.16b, T2.16b
+ eor XL.16b, XL.16b, IN1.16b
- ext T3.16b, VZR.16b, T2.16b, #8
- ext T2.16b, T2.16b, VZR.16b, #8
- eor DATA.16b, DATA.16b, T3.16b
- eor T1.16b, T1.16b, T2.16b // <T1:DATA> is result of
- // carry-less multiplication
+ pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
+ eor T1.16b, T1.16b, XL.16b
+ pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
+ pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
- /* first phase of the reduction */
- shl T3.2d, DATA.2d, #1
- eor T3.16b, T3.16b, DATA.16b
- shl T3.2d, T3.2d, #5
- eor T3.16b, T3.16b, DATA.16b
- shl T3.2d, T3.2d, #57
- ext T2.16b, VZR.16b, T3.16b, #8
- ext T3.16b, T3.16b, VZR.16b, #8
- eor DATA.16b, DATA.16b, T2.16b
- eor T1.16b, T1.16b, T3.16b
+ ext T1.16b, XL.16b, XH.16b, #8
+ eor T2.16b, XL.16b, XH.16b
+ eor XM.16b, XM.16b, T1.16b
+ eor XM.16b, XM.16b, T2.16b
+ pmull T2.1q, XL.1d, MASK.1d
- /* second phase of the reduction */
- ushr T2.2d, DATA.2d, #5
- eor T2.16b, T2.16b, DATA.16b
- ushr T2.2d, T2.2d, #1
- eor T2.16b, T2.16b, DATA.16b
- ushr T2.2d, T2.2d, #1
- eor T1.16b, T1.16b, T2.16b
- eor DATA.16b, DATA.16b, T1.16b
+ mov XH.d[0], XM.d[1]
+ mov XM.d[1], XL.d[0]
+
+ eor XL.16b, XM.16b, T2.16b
+ ext T2.16b, XL.16b, XL.16b, #8
+ pmull XL.1q, XL.1d, MASK.1d
+ eor T2.16b, T2.16b, XH.16b
+ eor XL.16b, XL.16b, T2.16b
cbnz w0, 0b
- st1 {DATA.16b}, [x1]
+ st1 {XL.16b}, [x1]
ret
ENDPROC(pmull_ghash_update)