aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86/crypto/blowfish-x86_64-asm_64.S
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@mbnet.fi>2011-09-02 01:45:22 +0300
committerHerbert Xu <herbert@gondor.apana.org.au>2011-09-22 21:25:26 +1000
commit64b94ceae8c16cd1b2800cac83112d3815be5250 (patch)
treec7e3384659522cac32dc85a34e4ed722346a0f91 /arch/x86/crypto/blowfish-x86_64-asm_64.S
parentcrypto: tcrypt - add ctr(blowfish) speed test (diff)
downloadlinux-dev-64b94ceae8c16cd1b2800cac83112d3815be5250.tar.xz
linux-dev-64b94ceae8c16cd1b2800cac83112d3815be5250.zip
crypto: blowfish - add x86_64 assembly implementation
Patch adds an x86_64 assembly implementation of blowfish. Two sets of assembler functions are provided. The first set is the regular 'one block at a time' encrypt/decrypt functions. The second is the 'four blocks at a time' functions, which gain a performance increase on out-of-order CPUs. Performance of the 4-way functions should be equal to that of the 1-way functions on in-order CPUs. Summary of the tcrypt benchmarks: Blowfish assembler vs blowfish C (256bit 8kb block ECB) encrypt: 2.2x speed decrypt: 2.3x speed Blowfish assembler vs blowfish C (256bit 8kb block CBC) encrypt: 1.12x speed decrypt: 2.5x speed Blowfish assembler vs blowfish C (256bit 8kb block CTR) encrypt: 2.5x speed Full output: http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-blowfish-asm-x86_64.txt http://koti.mbnet.fi/axh/kernel/crypto/tcrypt-speed-blowfish-c-x86_64.txt Tests were run on: vendor_id : AuthenticAMD cpu family : 16 model : 10 model name : AMD Phenom(tm) II X6 1055T Processor stepping : 0 Signed-off-by: Jussi Kivilinna <jussi.kivilinna@mbnet.fi> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Diffstat (limited to 'arch/x86/crypto/blowfish-x86_64-asm_64.S')
-rw-r--r--arch/x86/crypto/blowfish-x86_64-asm_64.S392
1 files changed, 392 insertions, 0 deletions
diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S
new file mode 100644
index 000000000000..44eb23ab9676
--- /dev/null
+++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S
@@ -0,0 +1,392 @@
+/*
+ * Blowfish Cipher Algorithm (x86_64)
+ *
+ * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
+ * USA
+ *
+ */
+
+.file "blowfish-x86_64-asm_64.S"
+.text
+
+/*
+ * Byte offsets into the crypto context addressed through CTX.
+ *
+ * As laid out by these offsets, the context begins with 16 + 2 subkeys
+ * of 4 bytes each (p), followed by four tables of 256 4-byte entries
+ * (s0..s3).  NOTE(review): this has to match the C-side context
+ * structure; confirm against its definition when changing either.
+ */
+#define p 0
+#define s0 ((16 + 2) * 4)
+#define s1 ((16 + 2 + (1 * 256)) * 4)
+#define s2 ((16 + 2 + (2 * 256)) * 4)
+#define s3 ((16 + 2 + (3 * 256)) * 4)
+
+/*
+ * Register aliases.
+ *
+ * Each block register RXn comes with its 32-bit (d), low-byte (bl) and
+ * high-byte (bh) views; each key-stream register RKn comes with its
+ * 32-bit (d) view.  The suffixed names are produced by token pasting
+ * in F(), so they must keep exactly this spelling.
+ */
+
+/* context pointer and input/output pointer */
+#define CTX %rdi
+#define RIO %rsi
+
+/* block 0 */
+#define RX0 %rax
+#define RX0d %eax
+#define RX0bl %al
+#define RX0bh %ah
+
+/* block 1 */
+#define RX1 %rbx
+#define RX1d %ebx
+#define RX1bl %bl
+#define RX1bh %bh
+
+/* block 2 */
+#define RX2 %rcx
+#define RX2d %ecx
+#define RX2bl %cl
+#define RX2bh %ch
+
+/* block 3 */
+#define RX3 %rdx
+#define RX3d %edx
+#define RX3bl %dl
+#define RX3bh %dh
+
+/* table-index temporaries; note that RT1 is the same register as RIO */
+#define RT0 %rbp
+#define RT0d %ebp
+#define RT1 %rsi
+#define RT1d %esi
+
+/* per-block F() accumulators */
+#define RK0 %r8
+#define RK0d %r8d
+#define RK1 %r9
+#define RK1d %r9d
+#define RK2 %r10
+#define RK2d %r10d
+#define RK3 %r11
+#define RK3d %r11d
+
+/* preloaded round key shared by the four blocks in the 4-way code */
+#define RKEY %r12
+
+/***********************************************************************
+ * 1-way blowfish
+ ***********************************************************************/
+/*
+ * F(x, k): one half-round on the 64-bit block register x.
+ *
+ * Steps, in order:
+ *  - rotate x right by 16 so that bits 24..31 / 16..23 of the low dword
+ *    become readable through the bh / bl byte views, copy them to
+ *    RT0 / RT1, then rotate back;
+ *  - k = s0[bits 24..31] + s1[bits 16..23]   (32-bit arithmetic);
+ *  - copy bits 8..15 / 0..7 of the low dword to RT0 / RT1;
+ *  - rotate x by 32, i.e. swap its two dword halves;
+ *  - k = (k ^ s2[bits 8..15]) + s3[bits 0..7];
+ *  - x ^= k.  k was written with 32-bit operations, so its upper half
+ *    is zero and only the (new) low dword of x changes.
+ *
+ * x must be a name that has bl/bh views (RX0..RX3) and k a name that
+ * has a d view (RK0..RK3), because the suffixes are pasted on with ##.
+ *
+ * Clobbers RT0, RT1 and flags.  RT1 is the same register as RIO, so
+ * RIO does not survive this macro.
+ */
+#define F(x, k) \
+ rorq $16, x; \
+ movzbl x ## bh, RT0d; \
+ movzbl x ## bl, RT1d; \
+ rolq $16, x; \
+ movl s0(CTX,RT0,4), k ## d; \
+ addl s1(CTX,RT1,4), k ## d; \
+ movzbl x ## bh, RT0d; \
+ movzbl x ## bl, RT1d; \
+ rolq $32, x; \
+ xorl s2(CTX,RT0,4), k ## d; \
+ addl s3(CTX,RT1,4), k ## d; \
+ xorq k, x;
+
+/*
+ * 1-way round helpers, all operating on RX0.
+ *
+ * add_roundkey_enc(n): 64-bit XOR of the context memory at subkey n
+ * into RX0, i.e. subkey n goes into the low dword and subkey n+1 into
+ * the high dword in a single instruction.
+ */
+#define add_roundkey_enc(n) \
+ xorq p+4*(n)(CTX), RX0;
+
+/* round_enc(n): apply subkeys n and n+1, then run two F() half-rounds. */
+#define round_enc(n) \
+ add_roundkey_enc(n); \
+ \
+ F(RX0, RK0); \
+ F(RX0, RK0);
+
+/*
+ * round_final_enc(n): same instruction as add_roundkey_enc(n).  Not
+ * referenced by the routines in this file; kept for compatibility.
+ */
+#define round_final_enc(n) \
+ xorq p+4*(n)(CTX), RX0;
+
+/*
+ * add_roundkey_dec(n): load subkeys n-1 and n as one 64-bit value,
+ * swap the two dwords so that subkey n lands in the low dword and
+ * subkey n-1 in the high dword, and XOR the result into RX0.
+ * Clobbers RT0.
+ *
+ * The argument is parenthesised so that an expression argument keeps
+ * its meaning, matching preload_roundkey_dec().
+ */
+#define add_roundkey_dec(n) \
+ movq p+4*((n)-1)(CTX), RT0; \
+ rorq $32, RT0; \
+ xorq RT0, RX0;
+
+/* round_dec(n): apply subkeys n and n-1, then run two F() half-rounds. */
+#define round_dec(n) \
+ add_roundkey_dec(n); \
+ \
+ F(RX0, RK0); \
+ F(RX0, RK0);
+
+/*
+ * read_block(): load the 8 bytes at (RIO) into RX0, swap the two dword
+ * halves and byte-reverse the whole register.  Net effect: the first
+ * four bytes in memory, read as a big-endian word, end up in the low
+ * dword of RX0 and the last four bytes in the high dword.
+ */
+#define read_block() \
+ movq (RIO), RX0; \
+ rorq $32, RX0; \
+ bswapq RX0;
+
+/*
+ * write_block(): byte-reverse RX0 and store it to (RIO).  There is no
+ * half swap here, so the high dword of RX0 is written first (as a
+ * big-endian word), followed by the low dword.  RX0 is left
+ * byte-reversed.
+ */
+#define write_block() \
+ bswapq RX0; \
+ movq RX0, (RIO);
+
+/*
+ * xor_block(): like write_block(), but XORs the byte-reversed RX0 into
+ * the 8 bytes at (RIO) instead of overwriting them.
+ */
+#define xor_block() \
+ bswapq RX0; \
+ xorq RX0, (RIO);
+
+.align 8
+.global __blowfish_enc_blk
+.type __blowfish_enc_blk,@function;
+
+__blowfish_enc_blk:
+ // Encrypt the 8-byte block at src.  The result is stored to dst, or
+ // XORed into dst when the low byte of the xor argument is non-zero.
+ //
+ // input:
+ // %rdi: ctx, CTX
+ // %rsi: dst
+ // %rdx: src
+ // %rcx: bool xor
+ //
+ // Callee-saved %rbp and %rbx are preserved; %rax, %rsi and %r8 are
+ // clobbered.
+ pushq %rbp;
+ pushq %rbx;
+
+ // dst and the xor flag are parked on the stack: F() uses %rsi (RT1)
+ // as a temporary, so RIO can only hold src until the block is read.
+ pushq %rsi;
+ pushq %rcx;
+ movq %rdx, RIO;
+
+ read_block();
+
+ round_enc(0);
+ round_enc(2);
+ round_enc(4);
+ round_enc(6);
+ round_enc(8);
+ round_enc(10);
+ round_enc(12);
+ round_enc(14);
+ add_roundkey_enc(16);
+
+ // xor flag -> %rbp (free again after the last F()), dst -> RIO
+ popq %rbp;
+ popq RIO;
+
+ test %bpl, %bpl;
+ jnz .L__enc_xor;
+
+ write_block();
+
+.L__enc_ret:
+ popq %rbx;
+ popq %rbp;
+
+ ret;
+
+.L__enc_xor:
+ xor_block();
+
+ jmp .L__enc_ret;
+
+.size __blowfish_enc_blk,.-__blowfish_enc_blk
+
+.align 8
+.global blowfish_dec_blk
+.type blowfish_dec_blk,@function;
+
+blowfish_dec_blk:
+ // Decrypt the 8-byte block at src and store the result to dst.
+ //
+ // input:
+ // %rdi: ctx, CTX
+ // %rsi: dst
+ // %rdx: src
+ //
+ // Callee-saved %rbp and %rbx are preserved; %rax, %rsi and %r8 are
+ // clobbered.
+ pushq %rbp;
+ pushq %rbx;
+
+ // dst is parked on the stack: F() uses %rsi (RT1) as a temporary,
+ // so RIO can only hold src until the block is read.
+ pushq %rsi;
+ movq %rdx, RIO;
+
+ read_block();
+
+ // subkeys are applied in descending order, two per round
+ round_dec(17);
+ round_dec(15);
+ round_dec(13);
+ round_dec(11);
+ round_dec(9);
+ round_dec(7);
+ round_dec(5);
+ round_dec(3);
+ add_roundkey_dec(1);
+
+ popq RIO;
+ write_block();
+
+ popq %rbx;
+ popq %rbp;
+
+ ret;
+
+.size blowfish_dec_blk,.-blowfish_dec_blk
+
+/**********************************************************************
+ 4-way blowfish, four blocks parallel
+ **********************************************************************/
+/*
+ * 4-way round helpers.  The round key is kept in RKEY and shared by
+ * the four block registers RX0..RX3.  The caller has to preload RKEY
+ * (preload_roundkey_enc / preload_roundkey_dec) before the first
+ * round.
+ */
+
+/* XOR the round key currently held in RKEY into all four blocks. */
+#define add_preloaded_roundkey4() \
+ xorq RKEY, RX0; \
+ xorq RKEY, RX1; \
+ xorq RKEY, RX2; \
+ xorq RKEY, RX3;
+
+/* One F() half-round on each of the four blocks. */
+#define F4() \
+ F(RX0, RK0); \
+ F(RX1, RK1); \
+ F(RX2, RK2); \
+ F(RX3, RK3);
+
+/* RKEY = subkeys n (low dword) and n+1 (high dword), one 64-bit load. */
+#define preload_roundkey_enc(n) \
+ movq p+4*(n)(CTX), RKEY;
+
+/*
+ * Apply the key already in RKEY (expected to be the one for round n;
+ * this is not checked) and immediately fetch the key for round n+2.
+ */
+#define add_roundkey_enc4(n) \
+ add_preloaded_roundkey4(); \
+ preload_roundkey_enc((n) + 2);
+
+/* Key addition followed by two F() half-rounds on every block. */
+#define round_enc4(n) \
+ add_roundkey_enc4(n); \
+ \
+ F4(); \
+ \
+ F4();
+
+/*
+ * RKEY = subkeys n-1 and n loaded as one 64-bit value, with the two
+ * dwords swapped so that subkey n is in the low dword.
+ */
+#define preload_roundkey_dec(n) \
+ movq p+4*((n)-1)(CTX), RKEY; \
+ rorq $32, RKEY;
+
+/*
+ * Apply the key already in RKEY (expected to be the one for round n;
+ * this is not checked) and immediately fetch the key for round n-2.
+ */
+#define add_roundkey_dec4(n) \
+ add_preloaded_roundkey4(); \
+ preload_roundkey_dec((n) - 2);
+
+/* Key addition followed by two F() half-rounds on every block. */
+#define round_dec4(n) \
+ add_roundkey_dec4(n); \
+ \
+ F4(); \
+ \
+ F4();
+
+/*
+ * Per-block building blocks for the 4-way load/store macros below.
+ * mem is a complete memory operand, reg one of RX0..RX3.
+ */
+
+/* reg = 8 bytes at mem, dword halves swapped, then byte-reversed */
+#define block_load(mem, reg) \
+ movq mem, reg; \
+ rorq $32, reg; \
+ bswapq reg;
+
+/* byte-reverse reg and store it to mem */
+#define block_store(reg, mem) \
+ bswapq reg; \
+ movq reg, mem;
+
+/* byte-reverse reg and XOR it into mem */
+#define block_xor(reg, mem) \
+ bswapq reg; \
+ xorq reg, mem;
+
+/* Load four consecutive 8-byte blocks from RIO into RX0..RX3. */
+#define read_block4() \
+ block_load((RIO), RX0); \
+ \
+ block_load(8(RIO), RX1); \
+ \
+ block_load(16(RIO), RX2); \
+ \
+ block_load(24(RIO), RX3);
+
+/* Store RX0..RX3 as four consecutive 8-byte blocks at RIO. */
+#define write_block4() \
+ block_store(RX0, (RIO)); \
+ \
+ block_store(RX1, 8(RIO)); \
+ \
+ block_store(RX2, 16(RIO)); \
+ \
+ block_store(RX3, 24(RIO));
+
+/* XOR RX0..RX3 into the four consecutive 8-byte blocks at RIO. */
+#define xor_block4() \
+ block_xor(RX0, (RIO)); \
+ \
+ block_xor(RX1, 8(RIO)); \
+ \
+ block_xor(RX2, 16(RIO)); \
+ \
+ block_xor(RX3, 24(RIO));
+
+.align 8
+.global __blowfish_enc_blk_4way
+.type __blowfish_enc_blk_4way,@function;
+
+__blowfish_enc_blk_4way:
+ // Encrypt the four consecutive 8-byte blocks at src.  The results
+ // are stored to dst, or XORed into dst when the low byte of the xor
+ // argument is non-zero.
+ //
+ // input:
+ // %rdi: ctx, CTX
+ // %rsi: dst
+ // %rdx: src
+ // %rcx: bool xor
+ //
+ // Callee-saved %rbp, %rbx and %r12 (RKEY) are preserved.
+ pushq %rbp;
+ pushq %rbx;
+ pushq RKEY;
+ preload_roundkey_enc(0);
+
+ // dst and the xor flag are parked on the stack: %rsi doubles as
+ // RT1 inside F(), and %rcx / %rdx become block registers RX2 / RX3.
+ pushq %rsi;
+ pushq %rcx;
+ movq %rdx, RIO;
+
+ read_block4();
+
+ round_enc4(0);
+ round_enc4(2);
+ round_enc4(4);
+ round_enc4(6);
+ round_enc4(8);
+ round_enc4(10);
+ round_enc4(12);
+ round_enc4(14);
+ // round_enc4(14) preloaded subkeys 16/17; apply them
+ add_preloaded_roundkey4();
+
+ // xor flag -> %rbp (free again after the last F()), dst -> RIO
+ popq %rbp;
+ popq RIO;
+
+ test %bpl, %bpl;
+ jnz .L__enc_xor4;
+
+ write_block4();
+
+.L__enc_ret4:
+ popq RKEY;
+ popq %rbx;
+ popq %rbp;
+
+ ret;
+
+.L__enc_xor4:
+ xor_block4();
+
+ jmp .L__enc_ret4;
+
+.size __blowfish_enc_blk_4way,.-__blowfish_enc_blk_4way
+
+.align 8
+.global blowfish_dec_blk_4way
+.type blowfish_dec_blk_4way,@function;
+
+blowfish_dec_blk_4way:
+ // Decrypt the four consecutive 8-byte blocks at src and store the
+ // results to dst.
+ //
+ // input:
+ // %rdi: ctx, CTX
+ // %rsi: dst
+ // %rdx: src
+ //
+ // Callee-saved %rbp, %rbx and %r12 (RKEY) are preserved.
+ pushq %rbp;
+ pushq %rbx;
+ pushq RKEY;
+ preload_roundkey_dec(17);
+
+ // dst is parked on the stack: %rsi doubles as RT1 inside F().
+ pushq %rsi;
+ movq %rdx, RIO;
+
+ read_block4();
+
+ // subkeys are applied in descending order, two per round
+ round_dec4(17);
+ round_dec4(15);
+ round_dec4(13);
+ round_dec4(11);
+ round_dec4(9);
+ round_dec4(7);
+ round_dec4(5);
+ round_dec4(3);
+ // round_dec4(3) preloaded subkeys 1/0; apply them
+ add_preloaded_roundkey4();
+
+ popq RIO;
+ write_block4();
+
+ popq RKEY;
+ popq %rbx;
+ popq %rbp;
+
+ ret;
+
+.size blowfish_dec_blk_4way,.-blowfish_dec_blk_4way
+