/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
 * Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 * Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
 */
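
/*
 * SSSE3 and AVX-512 implementations of the BLAKE2s compression function
 * for x86-64.
 */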

#include <linux/linkage.h>

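/* BLAKE2s IV, eight 32-bit words packed four per .octa (little endian). */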
.section .rodata.cst32.BLAKE2S_IV, "aM", @progbits, 32
.align 32
IV:	.octa 0xA54FF53A3C6EF372BB67AE856A09E667
	.octa 0x5BE0CD191F83D9AB9B05688C510E527F
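/*
 * pshufb masks rotating each 32-bit lane: ROT16 rotates by 16 bits,
 * ROR328 rotates right by 8 bits.
 */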
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ROR328, "aM", @progbits, 16
.align 16
ROR328:	.octa 0x0C0F0E0D080B0A090407060500030201
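/*
 * Message schedule, one 16-byte row per round.  The indices are not in
 * the reference sigma order; they are arranged to match the word gathers
 * performed by the vectorized column and diagonal steps below.
 */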
.section .rodata.cst64.BLAKE2S_SIGMA, "aM", @progbits, 160
.align 64
SIGMA:
.byte  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.byte 14,  4,  9, 13, 10,  8, 15,  6,  5,  1,  0, 11,  3, 12,  2,  7
.byte 11, 12,  5, 15,  8,  0,  2, 13,  9, 10,  3,  7,  4, 14,  6,  1
.byte  7,  3, 13, 11,  9,  1, 12, 14, 15,  2,  5,  4,  8,  6, 10,  0
.byte  9,  5,  2, 10,  0,  7,  4, 15,  3, 14, 11,  6, 13,  1, 12,  8
.byte  2,  6,  0,  8, 12, 10, 11,  3,  1,  4,  7, 15,  9, 13,  5, 14
.byte 12,  1, 14,  4,  5, 15, 13, 10,  8,  0,  6,  9, 11,  7,  3,  2
.byte 13,  7, 12,  3, 11, 14,  1,  9,  2,  5, 15,  8, 10,  0,  4,  6
.byte  6, 14, 11,  0, 15,  9,  3,  8, 10, 12, 13,  1,  5,  2,  7,  4
.byte 10,  8,  7,  1,  2,  4,  6,  5, 13, 15,  9,  3,  0, 11, 14, 12
#ifdef CONFIG_AS_AVX512
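/*
 * Dword-index version of the schedule for vpermi2d, one 64-byte row per
 * round.  The AVX-512 loop re-permutes the message registers in place
 * each round, so these indices are relative to the previous round's
 * permuted order rather than to the original block.
 */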
.section .rodata.cst64.BLAKE2S_SIGMA2, "aM", @progbits, 640
.align 64
SIGMA2:
.long  0,  2,  4,  6,  1,  3,  5,  7, 14,  8, 10, 12, 15,  9, 11, 13
.long  8,  2, 13, 15, 10,  9, 12,  3,  6,  4,  0, 14,  5, 11,  1,  7
.long 11, 13,  8,  6,  5, 10, 14,  3,  2,  4, 12, 15,  1,  0,  7,  9
.long 11, 10,  7,  0,  8, 15,  1, 13,  3,  6,  2, 12,  4, 14,  9,  5
.long  4, 10,  9, 14, 15,  0, 11,  8,  1,  7,  3, 13,  2,  5,  6, 12
.long  2, 11,  4, 15, 14,  3, 10,  8, 13,  6,  5,  7,  0, 12,  1,  9
.long  4,  8, 15,  9, 14, 11, 13,  5,  3,  2,  1, 12,  6, 10,  7,  0
.long  6, 13,  0, 14, 12,  2,  1, 11, 15,  4,  5,  8,  7,  9,  3, 10
.long 15,  5,  4, 13, 10,  7,  3, 11, 12,  2,  0,  6,  9,  8,  1, 14
.long  8,  7, 14, 11, 13, 15,  0, 12, 10,  4,  5,  6,  3,  2,  1,  9
#endif /* CONFIG_AS_AVX512 */

.text
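/*
 * blake2s_compress_ssse3(state, block, nblocks, inc)
 *
 *   %rdi: struct blake2s_state (h[8] at 0x00, t[2]/f[2] at 0x20)
 *   %rsi: message data, %rdx: number of 64-byte blocks
 *   %rcx: amount to increment the 64-bit block counter t per block
 */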
SYM_FUNC_START(blake2s_compress_ssse3)
	testq		%rdx,%rdx
	je		.Lendofloop
	movdqu		(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqa		ROT16(%rip),%xmm12
	movdqa		ROR328(%rip),%xmm13
	movdqu		0x20(%rdi),%xmm14
	movq		%rcx,%xmm15
	leaq		SIGMA+0xa0(%rip),%r8
	jmp		.Lbeginofloop
	.align		32
.Lbeginofloop:
	movdqa		%xmm0,%xmm10
	movdqa		%xmm1,%xmm11
	paddq		%xmm15,%xmm14
	movdqa		IV(%rip),%xmm2
	movdqa		%xmm14,%xmm3
	pxor		IV+0x10(%rip),%xmm3
	leaq		SIGMA(%rip),%rcx
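	/*
	 * One full round per iteration: gather four message words per
	 * half-step via the SIGMA byte indices in (%rcx), run the column
	 * step (rotates by 16/12, then 8/7), diagonalize with pshufd,
	 * run the diagonal step, then undiagonalize.  %r8 marks the end
	 * of SIGMA after ten rounds.
	 */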
.Lroundloop:
	movzbl		(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0x1(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x2(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x3(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	punpckldq	%xmm5,%xmm4
	punpckldq	%xmm7,%xmm6
	punpcklqdq	%xmm6,%xmm4
	paddd		%xmm4,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0x4(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0x5(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x6(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0x7(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	punpckldq	%xmm6,%xmm5
	punpckldq	%xmm4,%xmm7
	punpcklqdq	%xmm7,%xmm5
	paddd		%xmm5,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	pshufd		$0x93,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x39,%xmm2,%xmm2
	movzbl		0x8(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	movzbl		0x9(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xa(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xb(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	punpckldq	%xmm7,%xmm6
	punpckldq	%xmm5,%xmm4
	punpcklqdq	%xmm4,%xmm6
	paddd		%xmm6,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm12,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0xc,%xmm1
	pslld		$0x14,%xmm8
	por		%xmm8,%xmm1
	movzbl		0xc(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm7
	movzbl		0xd(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm4
	movzbl		0xe(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm5
	movzbl		0xf(%rcx),%eax
	movd		(%rsi,%rax,4),%xmm6
	punpckldq	%xmm4,%xmm7
	punpckldq	%xmm6,%xmm5
	punpcklqdq	%xmm5,%xmm7
	paddd		%xmm7,%xmm0
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm13,%xmm3
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm8
	psrld		$0x7,%xmm1
	pslld		$0x19,%xmm8
	por		%xmm8,%xmm1
	pshufd		$0x39,%xmm0,%xmm0
	pshufd		$0x4e,%xmm3,%xmm3
	pshufd		$0x93,%xmm2,%xmm2
	addq		$0x10,%rcx
	cmpq		%r8,%rcx
	jnz		.Lroundloop
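	/* Feed-forward: h ^= v[0..7] ^ v[8..15]. */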
	pxor		%xmm2,%xmm0
	pxor		%xmm3,%xmm1
	pxor		%xmm10,%xmm0
	pxor		%xmm11,%xmm1
	addq		$0x40,%rsi
	decq		%rdx
	jnz		.Lbeginofloop
	movdqu		%xmm0,(%rdi)
	movdqu		%xmm1,0x10(%rdi)
	movdqu		%xmm14,0x20(%rdi)
.Lendofloop:
	RET
SYM_FUNC_END(blake2s_compress_ssse3)

#ifdef CONFIG_AS_AVX512
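/*
 * blake2s_compress_avx512(state, block, nblocks, inc)
 *
 * Same arguments as the SSSE3 version.  The message block is held in
 * %ymm6/%ymm7 and re-permuted each round with vpermi2d and SIGMA2, and
 * the G rotations use vprord, so the ROT16/ROR328 masks are not needed.
 */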
SYM_FUNC_START(blake2s_compress_avx512)
	vmovdqu		(%rdi),%xmm0
	vmovdqu		0x10(%rdi),%xmm1
	vmovdqu		0x20(%rdi),%xmm4
	vmovq		%rcx,%xmm5
	vmovdqa		IV(%rip),%xmm14
	vmovdqa		IV+16(%rip),%xmm15
	jmp		.Lblake2s_compress_avx512_mainloop
.align 32
.Lblake2s_compress_avx512_mainloop:
	vmovdqa		%xmm0,%xmm10
	vmovdqa		%xmm1,%xmm11
	vpaddq		%xmm5,%xmm4,%xmm4
	vmovdqa		%xmm14,%xmm2
	vpxor		%xmm15,%xmm4,%xmm3
	vmovdqu		(%rsi),%ymm6
	vmovdqu		0x20(%rsi),%ymm7
	addq		$0x40,%rsi
	leaq		SIGMA2(%rip),%rax
	movb		$0xa,%cl
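	/*
	 * Ten rounds: load one SIGMA2 row, permute the message words with
	 * vpermi2d, then perform the column step, diagonalization,
	 * diagonal step and undiagonalization with vprord rotations by
	 * 16, 12, 8 and 7.
	 */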
.Lblake2s_compress_avx512_roundloop:
	addq		$0x40,%rax
	vmovdqa		-0x40(%rax),%ymm8
	vmovdqa		-0x20(%rax),%ymm9
	vpermi2d	%ymm7,%ymm6,%ymm8
	vpermi2d	%ymm7,%ymm6,%ymm9
	vmovdqa		%ymm8,%ymm6
	vmovdqa		%ymm9,%ymm7
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm8,%xmm8
	vpaddd		%xmm8,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vpshufd		$0x93,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x39,%xmm2,%xmm2
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x10,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0xc,%xmm1,%xmm1
	vextracti128	$0x1,%ymm9,%xmm9
	vpaddd		%xmm9,%xmm0,%xmm0
	vpaddd		%xmm1,%xmm0,%xmm0
	vpxor		%xmm0,%xmm3,%xmm3
	vprord		$0x8,%xmm3,%xmm3
	vpaddd		%xmm3,%xmm2,%xmm2
	vpxor		%xmm2,%xmm1,%xmm1
	vprord		$0x7,%xmm1,%xmm1
	vpshufd		$0x39,%xmm0,%xmm0
	vpshufd		$0x4e,%xmm3,%xmm3
	vpshufd		$0x93,%xmm2,%xmm2
	decb		%cl
	jne		.Lblake2s_compress_avx512_roundloop
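	/* Feed-forward: h ^= v[0..7] ^ v[8..15]. */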
	vpxor		%xmm10,%xmm0,%xmm0
	vpxor		%xmm11,%xmm1,%xmm1
	vpxor		%xmm2,%xmm0,%xmm0
	vpxor		%xmm3,%xmm1,%xmm1
	decq		%rdx
	jne		.Lblake2s_compress_avx512_mainloop
	vmovdqu		%xmm0,(%rdi)
	vmovdqu		%xmm1,0x10(%rdi)
	vmovdqu		%xmm4,0x20(%rdi)
	vzeroupper
	RET
SYM_FUNC_END(blake2s_compress_avx512)
#endif /* CONFIG_AS_AVX512 */