/* SPDX-License-Identifier: GPL-2.0 OR MIT */
/*
* Copyright (C) 2015-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (C) 2017-2019 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
*/
#include <linux/linkage.h>
.section .rodata.cst32.iv, "aM", @progbits, 32
.align 32
.Liv:
.octa 0xA54FF53A3C6EF372BB67AE856A09E667
.octa 0x5BE0CD191F83D9AB9B05688C510E527F
.section .rodata.cst16.ror16, "aM", @progbits, 16
.align 16
.Lror16:
.octa 0x0D0C0F0E09080B0A0504070601000302
.section .rodata.cst16.ror8, "aM", @progbits, 16
.align 16
.Lror8:
.octa 0x0C0F0E0D080B0A090407060500030201
.section .rodata.cst64.sigma, "aM", @progbits, 160
.align 64
.Lsigma:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7
.byte 11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1
.byte 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0
.byte 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8
.byte 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14
.byte 12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2
.byte 13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6
.byte 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4
.byte 10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12
.section .rodata.cst64.sigma2, "aM", @progbits, 160
.align 64
.Lsigma2:
.byte 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13
.byte 8, 2, 13, 15, 10, 9, 12, 3, 6, 4, 0, 14, 5, 11, 1, 7
.byte 11, 13, 8, 6, 5, 10, 14, 3, 2, 4, 12, 15, 1, 0, 7, 9
.byte 11, 10, 7, 0, 8, 15, 1, 13, 3, 6, 2, 12, 4, 14, 9, 5
.byte 4, 10, 9, 14, 15, 0, 11, 8, 1, 7, 3, 13, 2, 5, 6, 12
.byte 2, 11, 4, 15, 14, 3, 10, 8, 13, 6, 5, 7, 0, 12, 1, 9
.byte 4, 8, 15, 9, 14, 11, 13, 5, 3, 2, 1, 12, 6, 10, 7, 0
.byte 6, 13, 0, 14, 12, 2, 1, 11, 15, 4, 5, 8, 7, 9, 3, 10
.byte 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
.byte 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
#define CTX %rdi
#define DATA %rsi
#define NBLOCKS %rdx
#define INC %ecx
.text
//
// void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
// const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_ctx are used:
// u32 h[8]; (inout)
// u32 t[2]; (inout)
// u32 f[2]; (in)
//
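//
// The loads and stores below imply this context layout: h at byte offset 0,
// t at offset 32, f at offset 40; only h and t are written back.
//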
SYM_FUNC_START(blake2s_compress_ssse3)
movdqu (CTX),%xmm0 // Load h[0..3]
movdqu 16(CTX),%xmm1 // Load h[4..7]
movdqa .Lror16(%rip),%xmm12
movdqa .Lror8(%rip),%xmm13
movdqu 32(CTX),%xmm14 // Load t and f
movd INC,%xmm15 // Load inc
leaq .Lsigma+160(%rip),%r8
jmp .Lssse3_mainloop
.align 32
.Lssse3_mainloop:
// Main loop: each iteration processes one 64-byte block.
movdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3]
movdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7]
paddq %xmm15,%xmm14 // t += inc (64-bit addition)
movdqa .Liv(%rip),%xmm2 // v[8..11] = iv[0..3]
movdqa %xmm14,%xmm3
pxor .Liv+16(%rip),%xmm3 // v[12..15] = iv[4..7] ^ [t, f]
leaq .Lsigma(%rip),%rcx
.Lssse3_roundloop:
// Round loop: each iteration does 1 round (of 10 rounds total).
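// Column step: apply G to the four columns in parallel, with
// a = %xmm0 (v[0..3]), b = %xmm1 (v[4..7]), c = %xmm2 (v[8..11]),
// d = %xmm3 (v[12..15]). First gather m[s[0..3]], one word per column,
// into %xmm4.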
movzbl (%rcx),%eax
movd (DATA,%rax,4),%xmm4
movzbl 1(%rcx),%eax
movd (DATA,%rax,4),%xmm5
movzbl 2(%rcx),%eax
movd (DATA,%rax,4),%xmm6
movzbl 3(%rcx),%eax
movd (DATA,%rax,4),%xmm7
punpckldq %xmm5,%xmm4
punpckldq %xmm7,%xmm6
punpcklqdq %xmm6,%xmm4
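// a += m; a += b; d = ror32(d ^ a, 16); c += d; b = ror32(b ^ c, 12)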
paddd %xmm4,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm12,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $12,%xmm1
pslld $20,%xmm8
por %xmm8,%xmm1
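// Gather m[s[4..7]] into %xmm5 for the second half of G.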
movzbl 4(%rcx),%eax
movd (DATA,%rax,4),%xmm5
movzbl 5(%rcx),%eax
movd (DATA,%rax,4),%xmm6
movzbl 6(%rcx),%eax
movd (DATA,%rax,4),%xmm7
movzbl 7(%rcx),%eax
movd (DATA,%rax,4),%xmm4
punpckldq %xmm6,%xmm5
punpckldq %xmm4,%xmm7
punpcklqdq %xmm7,%xmm5
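// a += m; a += b; d = ror32(d ^ a, 8); c += d; b = ror32(b ^ c, 7)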
paddd %xmm5,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm13,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $7,%xmm1
pslld $25,%xmm8
por %xmm8,%xmm1
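// Diagonalize: rotate the rows so the next four G's act on the diagonals.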
pshufd $0x93,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x39,%xmm2,%xmm2
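// Diagonal step: gather m[s[8..11]] into %xmm6.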
movzbl 8(%rcx),%eax
movd (DATA,%rax,4),%xmm6
movzbl 9(%rcx),%eax
movd (DATA,%rax,4),%xmm7
movzbl 10(%rcx),%eax
movd (DATA,%rax,4),%xmm4
movzbl 11(%rcx),%eax
movd (DATA,%rax,4),%xmm5
punpckldq %xmm7,%xmm6
punpckldq %xmm5,%xmm4
punpcklqdq %xmm4,%xmm6
paddd %xmm6,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm12,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $12,%xmm1
pslld $20,%xmm8
por %xmm8,%xmm1
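// Gather m[s[12..15]] into %xmm7 for the second half of G.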
movzbl 12(%rcx),%eax
movd (DATA,%rax,4),%xmm7
movzbl 13(%rcx),%eax
movd (DATA,%rax,4),%xmm4
movzbl 14(%rcx),%eax
movd (DATA,%rax,4),%xmm5
movzbl 15(%rcx),%eax
movd (DATA,%rax,4),%xmm6
punpckldq %xmm4,%xmm7
punpckldq %xmm6,%xmm5
punpcklqdq %xmm5,%xmm7
paddd %xmm7,%xmm0
paddd %xmm1,%xmm0
pxor %xmm0,%xmm3
pshufb %xmm13,%xmm3
paddd %xmm3,%xmm2
pxor %xmm2,%xmm1
movdqa %xmm1,%xmm8
psrld $7,%xmm1
pslld $25,%xmm8
por %xmm8,%xmm1
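// Undiagonalize: rotate the rows back into column order.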
pshufd $0x39,%xmm0,%xmm0
pshufd $0x4e,%xmm3,%xmm3
pshufd $0x93,%xmm2,%xmm2
addq $16,%rcx
cmpq %r8,%rcx
jnz .Lssse3_roundloop
// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
pxor %xmm2,%xmm0
pxor %xmm3,%xmm1
pxor %xmm10,%xmm0
pxor %xmm11,%xmm1
addq $64,DATA
decq NBLOCKS
jnz .Lssse3_mainloop
movdqu %xmm0,(CTX) // Store new h[0..3]
movdqu %xmm1,16(CTX) // Store new h[4..7]
movq %xmm14,32(CTX) // Store new t (f is unchanged)
RET
SYM_FUNC_END(blake2s_compress_ssse3)
//
// void blake2s_compress_avx512(struct blake2s_ctx *ctx,
// const u8 *data, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_ctx are used:
// u32 h[8]; (inout)
// u32 t[2]; (inout)
// u32 f[2]; (in)
//
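//
// The same context layout as for blake2s_compress_ssse3 above is assumed
// (h at byte offset 0, t at 32, f at 40).
//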
SYM_FUNC_START(blake2s_compress_avx512)
vmovdqu (CTX),%xmm0 // Load h[0..3]
vmovdqu 16(CTX),%xmm1 // Load h[4..7]
vmovdqu 32(CTX),%xmm4 // Load t and f
vmovd INC,%xmm5 // Load inc
vmovdqa .Liv(%rip),%xmm14 // Load iv[0..3]
vmovdqa .Liv+16(%rip),%xmm15 // Load iv[4..7]
jmp .Lavx512_mainloop
.align 32
.Lavx512_mainloop:
// Main loop: each iteration processes one 64-byte block.
vmovdqa %xmm0,%xmm10 // Save h[0..3] and let v[0..3] = h[0..3]
vmovdqa %xmm1,%xmm11 // Save h[4..7] and let v[4..7] = h[4..7]
vpaddq %xmm5,%xmm4,%xmm4 // t += inc (64-bit addition)
vmovdqa %xmm14,%xmm2 // v[8..11] = iv[0..3]
vpxor %xmm15,%xmm4,%xmm3 // v[12..15] = iv[4..7] ^ [t, f]
vmovdqu (DATA),%ymm6 // Load first 8 data words
vmovdqu 32(DATA),%ymm7 // Load second 8 data words
addq $64,DATA
leaq .Lsigma2(%rip),%rax
movb $10,%cl // Set num rounds remaining
.Lavx512_roundloop:
// Round loop: each iteration does 1 round (of 10 rounds total).
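// .Lsigma2 encodes each round's message schedule relative to the previous
// round's ordering, which is why the permuted words are written back to
// %ymm6/%ymm7 below.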
vpmovzxbd (%rax),%ymm8
vpmovzxbd 8(%rax),%ymm9
addq $16,%rax
vpermi2d %ymm7,%ymm6,%ymm8
vpermi2d %ymm7,%ymm6,%ymm9
vmovdqa %ymm8,%ymm6
vmovdqa %ymm9,%ymm7
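// G first half on the columns with m[s[0..3]] (low lane of %ymm8).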
vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $16,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $12,%xmm1,%xmm1
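// G second half on the columns with m[s[4..7]] (high lane of %ymm8).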
vextracti128 $1,%ymm8,%xmm8
vpaddd %xmm8,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $7,%xmm1,%xmm1
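// Diagonalize: rotate the rows so the next G pass acts on the diagonals.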
vpshufd $0x93,%xmm0,%xmm0
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x39,%xmm2,%xmm2
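// G on the diagonals, first with m[s[8..11]] (low lane of %ymm9), then with
// m[s[12..15]] (high lane).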
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $16,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $12,%xmm1,%xmm1
vextracti128 $1,%ymm9,%xmm9
vpaddd %xmm9,%xmm0,%xmm0
vpaddd %xmm1,%xmm0,%xmm0
vpxor %xmm0,%xmm3,%xmm3
vprord $8,%xmm3,%xmm3
vpaddd %xmm3,%xmm2,%xmm2
vpxor %xmm2,%xmm1,%xmm1
vprord $7,%xmm1,%xmm1
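// Undiagonalize: rotate the rows back into column order.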
vpshufd $0x39,%xmm0,%xmm0
vpshufd $0x4e,%xmm3,%xmm3
vpshufd $0x93,%xmm2,%xmm2
decb %cl
jne .Lavx512_roundloop
// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
vpternlogd $0x96,%xmm10,%xmm2,%xmm0 // %xmm0 ^= %xmm2 ^ %xmm10 (imm 0x96 = three-way XOR)
vpternlogd $0x96,%xmm11,%xmm3,%xmm1 // %xmm1 ^= %xmm3 ^ %xmm11
decq NBLOCKS
jne .Lavx512_mainloop
vmovdqu %xmm0,(CTX) // Store new h[0..3]
vmovdqu %xmm1,16(CTX) // Store new h[4..7]
vmovq %xmm4,32(CTX) // Store new t (f is unchanged)
vzeroupper
RET
SYM_FUNC_END(blake2s_compress_avx512)