/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX-512VL functions
 *
 * Copyright (C) 2018 Martin Willi
 */

#include <linux/linkage.h>
.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001
.section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004
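# CTR2BL and CTR8BL are per-lane increments for the block counter: the
# two-block function adds 0 and 1 to the counter word of its two 128-bit
# lanes, and the eight-block function adds 0..7 across the eight 32-bit
# lanes holding the broadcast counter word, so every lane produces the
# keystream for a consecutive block.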
.text

ENTRY(chacha20_2block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes

	# This function encrypts two ChaCha20 blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.
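	# For reference, one ChaCha20 quarter-round on words (a, b, c, d) is
	# (illustrative sketch only):
	#
	#	a += b; d ^= a; d = rol32(d, 16);
	#	c += d; b ^= c; b = rol32(b, 12);
	#	a += b; d ^= a; d = rol32(d, 8);
	#	c += d; b ^= c; b = rol32(b, 7);
	#
	# Each vprold below performs one of these 32-bit rotates in a single
	# instruction, which AVX-512VL provides natively.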
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vpaddd		CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa		%ymm0,%ymm8
	vmovdqa		%ymm1,%ymm9
	vmovdqa		%ymm2,%ymm10
	vmovdqa		%ymm3,%ymm11
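	# ymm8..ymm11 hold the input state s0..s3 for the feed-forward
	# addition after the rounds. Each pass through the instructions below
	# is one ChaCha double round: a column round, then (after the vpshufd
	# shuffles) a diagonal round; ChaCha20 runs ten such double rounds.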
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3
	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1
	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3
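	# The three shuffles rotate rows 1-3 of the 4x4 state so that each
	# diagonal now sits in a column, letting the same column quarter-round
	# sequence below implement the diagonal round.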
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxord		%ymm0,%ymm3,%ymm3
	vprold		$8,%ymm3,%ymm3
	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxord		%ymm2,%ymm1,%ymm1
	vprold		$7,%ymm1,%ymm1
	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3
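	# The inverse shuffles restore the normal column layout so the next
	# double round, or the final state addition, sees the words in their
	# original order.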
	# o0 = i0 ^ (x0 + s0)
	vpaddd		%ymm8,%ymm0,%ymm7
	vpxord		0x00(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd		%ymm9,%ymm1,%ymm7
	vpxord		0x10(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd		%ymm10,%ymm2,%ymm7
	vpxord		0x20(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd		%ymm11,%ymm3,%ymm7
	vpxord		0x30(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3
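	# Each 256-bit row holds block 0 in its low 128 bits and block 1 in
	# its high 128 bits; the low halves were just consumed, and the high
	# halves extracted into xmm0..xmm3 form the second keystream block.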
	# xor and write second block
	vmovdqa		%xmm0,%xmm7
	vpxord		0x40(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x40(%rsi)

	vmovdqa		%xmm1,%xmm7
	vpxord		0x50(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x50(%rsi)

	vmovdqa		%xmm2,%xmm7
	vpxord		0x60(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x60(%rsi)

	vmovdqa		%xmm3,%xmm7
	vpxord		0x70(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x70(%rsi)
	# xor remaining bytes from partial register into output
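	# %r9 is expected to hold the byte offset of the partial block and %k1
	# a byte mask for the remaining length, so the masked load, xor and
	# masked store below only touch the valid tail bytes; {z} zeroes the
	# unused lanes of xmm1.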
	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord		%xmm7,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}
ENDPROC(chacha20_2block_xor_avx512vl)
ENTRY(chacha20_8block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes

	# This function encrypts eight consecutive ChaCha20 blocks by loading
	# the state matrix in AVX registers eight times. Compared to AVX2, this
	# mostly benefits from the new rotate instructions in VL and the
	# additional registers.
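	# Layout sketch (illustration only): after the broadcasts below,
	# register ymmN holds state word N of all eight blocks, one block per
	# 32-bit lane, roughly
	#
	#	for (n = 0; n < 16; n++)
	#		for (lane = 0; lane < 8; lane++)
	#			x[n][lane] = s[n];
	#
	# so a single vpaddd/vpxord/vprold advances the same word of eight
	# blocks at once.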
	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15
	# x12 += counter values 0-7
	vpaddd		CTR8BL(%rip),%ymm12,%ymm12
	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm1,%ymm17
	vmovdqa64	%ymm2,%ymm18
	vmovdqa64	%ymm3,%ymm19
	vmovdqa64	%ymm4,%ymm20
	vmovdqa64	%ymm5,%ymm21
	vmovdqa64	%ymm6,%ymm22
	vmovdqa64	%ymm7,%ymm23
	vmovdqa64	%ymm8,%ymm24
	vmovdqa64	%ymm9,%ymm25
	vmovdqa64	%ymm10,%ymm26
	vmovdqa64	%ymm11,%ymm27
	vmovdqa64	%ymm12,%ymm28
	vmovdqa64	%ymm13,%ymm29
	vmovdqa64	%ymm14,%ymm30
	vmovdqa64	%ymm15,%ymm31
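	# ymm16..ymm31 keep a copy of the replicated input state; it is added
	# back to the working registers after the rounds (the usual ChaCha
	# feed-forward) before the keystream is xored with the input.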
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd		%ymm0,%ymm4,%ymm0
	vpxord		%ymm0,%ymm12,%ymm12
	vprold		$16,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd		%ymm1,%ymm5,%ymm1
	vpxord		%ymm1,%ymm13,%ymm13
	vprold		$16,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd		%ymm2,%ymm6,%ymm2
	vpxord		%ymm2,%ymm14,%ymm14
	vprold		$16,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd		%ymm3,%ymm7,%ymm3
	vpxord		%ymm3,%ymm15,%ymm15
	vprold		$16,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxord		%ymm8,%ymm4,%ymm4
	vprold		$12,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxord		%ymm9,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxord		%ymm10,%ymm6,%ymm6
	vprold		$12,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxord		%ymm11,%ymm7,%ymm7
	vprold		$12,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd		%ymm0,%ymm4,%ymm0
	vpxord		%ymm0,%ymm12,%ymm12
	vprold		$8,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd		%ymm1,%ymm5,%ymm1
	vpxord		%ymm1,%ymm13,%ymm13
	vprold		$8,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd		%ymm2,%ymm6,%ymm2
	vpxord		%ymm2,%ymm14,%ymm14
	vprold		$8,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd		%ymm3,%ymm7,%ymm3
	vpxord		%ymm3,%ymm15,%ymm15
	vprold		$8,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxord		%ymm8,%ymm4,%ymm4
	vprold		$7,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxord		%ymm9,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxord		%ymm10,%ymm6,%ymm6
	vprold		$7,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxord		%ymm11,%ymm7,%ymm7
	vprold		$7,%ymm7,%ymm7
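	# End of the column round. The quarter-rounds that follow operate on
	# the diagonals (x0,x5,x10,x15), (x1,x6,x11,x12), (x2,x7,x8,x13) and
	# (x3,x4,x9,x14).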
	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd		%ymm0,%ymm5,%ymm0
	vpxord		%ymm0,%ymm15,%ymm15
	vprold		$16,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd		%ymm1,%ymm6,%ymm1
	vpxord		%ymm1,%ymm12,%ymm12
	vprold		$16,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd		%ymm2,%ymm7,%ymm2
	vpxord		%ymm2,%ymm13,%ymm13
	vprold		$16,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd		%ymm3,%ymm4,%ymm3
	vpxord		%ymm3,%ymm14,%ymm14
	vprold		$16,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxord		%ymm10,%ymm5,%ymm5
	vprold		$12,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxord		%ymm11,%ymm6,%ymm6
	vprold		$12,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxord		%ymm8,%ymm7,%ymm7
	vprold		$12,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxord		%ymm9,%ymm4,%ymm4
	vprold		$12,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd		%ymm0,%ymm5,%ymm0
	vpxord		%ymm0,%ymm15,%ymm15
	vprold		$8,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd		%ymm1,%ymm6,%ymm1
	vpxord		%ymm1,%ymm12,%ymm12
	vprold		$8,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd		%ymm2,%ymm7,%ymm2
	vpxord		%ymm2,%ymm13,%ymm13
	vprold		$8,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd		%ymm3,%ymm4,%ymm3
	vpxord		%ymm3,%ymm14,%ymm14
	vprold		$8,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxord		%ymm10,%ymm5,%ymm5
	vprold		$7,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxord		%ymm11,%ymm6,%ymm6
	vprold		$7,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxord		%ymm8,%ymm7,%ymm7
	vprold		$7,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxord		%ymm9,%ymm4,%ymm4
	vprold		$7,%ymm4,%ymm4
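	# Unlike the two-block variant, no shuffles are needed between the
	# column and diagonal rounds: every state word has its own register,
	# so the diagonal round simply uses different register operands.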
	# x0..15[0-7] += s[0..15]
	vpaddd		%ymm16,%ymm0,%ymm0
	vpaddd		%ymm17,%ymm1,%ymm1
	vpaddd		%ymm18,%ymm2,%ymm2
	vpaddd		%ymm19,%ymm3,%ymm3
	vpaddd		%ymm20,%ymm4,%ymm4
	vpaddd		%ymm21,%ymm5,%ymm5
	vpaddd		%ymm22,%ymm6,%ymm6
	vpaddd		%ymm23,%ymm7,%ymm7
	vpaddd		%ymm24,%ymm8,%ymm8
	vpaddd		%ymm25,%ymm9,%ymm9
	vpaddd		%ymm26,%ymm10,%ymm10
	vpaddd		%ymm27,%ymm11,%ymm11
	vpaddd		%ymm28,%ymm12,%ymm12
	vpaddd		%ymm29,%ymm13,%ymm13
	vpaddd		%ymm30,%ymm14,%ymm14
	vpaddd		%ymm31,%ymm15,%ymm15
	# interleave 32-bit words in state n, n+1
	vpunpckldq	%ymm1,%ymm0,%ymm16
	vpunpckhdq	%ymm1,%ymm0,%ymm17
	vpunpckldq	%ymm3,%ymm2,%ymm18
	vpunpckhdq	%ymm3,%ymm2,%ymm19
	vpunpckldq	%ymm5,%ymm4,%ymm20
	vpunpckhdq	%ymm5,%ymm4,%ymm21
	vpunpckldq	%ymm7,%ymm6,%ymm22
	vpunpckhdq	%ymm7,%ymm6,%ymm23
	vpunpckldq	%ymm9,%ymm8,%ymm24
	vpunpckhdq	%ymm9,%ymm8,%ymm25
	vpunpckldq	%ymm11,%ymm10,%ymm26
	vpunpckhdq	%ymm11,%ymm10,%ymm27
	vpunpckldq	%ymm13,%ymm12,%ymm28
	vpunpckhdq	%ymm13,%ymm12,%ymm29
	vpunpckldq	%ymm15,%ymm14,%ymm30
	vpunpckhdq	%ymm15,%ymm14,%ymm31
	# interleave 64-bit words in state n, n+2
	vpunpcklqdq	%ymm18,%ymm16,%ymm0
	vpunpcklqdq	%ymm19,%ymm17,%ymm1
	vpunpckhqdq	%ymm18,%ymm16,%ymm2
	vpunpckhqdq	%ymm19,%ymm17,%ymm3
	vpunpcklqdq	%ymm22,%ymm20,%ymm4
	vpunpcklqdq	%ymm23,%ymm21,%ymm5
	vpunpckhqdq	%ymm22,%ymm20,%ymm6
	vpunpckhqdq	%ymm23,%ymm21,%ymm7
	vpunpcklqdq	%ymm26,%ymm24,%ymm8
	vpunpcklqdq	%ymm27,%ymm25,%ymm9
	vpunpckhqdq	%ymm26,%ymm24,%ymm10
	vpunpckhqdq	%ymm27,%ymm25,%ymm11
	vpunpcklqdq	%ymm30,%ymm28,%ymm12
	vpunpcklqdq	%ymm31,%ymm29,%ymm13
	vpunpckhqdq	%ymm30,%ymm28,%ymm14
	vpunpckhqdq	%ymm31,%ymm29,%ymm15
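	# Together with the 128-bit vperm2i128 step below, these unpack stages
	# transpose the data from one-state-word-of-eight-blocks per register
	# to contiguous 32-byte chunks of individual blocks, so each chunk can
	# be xored against the input with a single vpxord.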
	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa64	%ymm0,%ymm16
	vperm2i128	$0x20,%ymm4,%ymm0,%ymm0
	vpxord		0x0000(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0000(%rsi)
	vmovdqa64	%ymm16,%ymm0
	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	vpxord		0x0020(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vperm2i128	$0x20,%ymm6,%ymm2,%ymm0
	vpxord		0x0040(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm2,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	vpxord		0x0060(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	vpxord		0x0080(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	vpxord		0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vperm2i128	$0x20,%ymm7,%ymm3,%ymm0
	vpxord		0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm3,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	vpxord		0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
	# xor remaining blocks, write to output
	vmovdqa64	%ymm4,%ymm0
	vpxord		0x0100(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0100(%rsi)

	vmovdqa64	%ymm12,%ymm0
	vpxord		0x0120(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0120(%rsi)

	vmovdqa64	%ymm6,%ymm0
	vpxord		0x0140(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0140(%rsi)

	vmovdqa64	%ymm14,%ymm0
	vpxord		0x0160(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0160(%rsi)

	vmovdqa64	%ymm5,%ymm0
	vpxord		0x0180(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0180(%rsi)

	vmovdqa64	%ymm13,%ymm0
	vpxord		0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01a0(%rsi)

	vmovdqa64	%ymm7,%ymm0
	vpxord		0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01c0(%rsi)

	vmovdqa64	%ymm15,%ymm0
	vpxord		0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01e0(%rsi)
	# xor remaining bytes from partial register into output
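	# Same masked-tail technique as in the two-block function above, but
	# on 32-byte ymm granularity; %r9 and %k1 are expected to hold the
	# partial-block offset and the byte mask for the remaining length.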
	vmovdqu8	(%rdx,%r9),%ymm1{%k1}{z}
	vpxord		%ymm0,%ymm1,%ymm1
	vmovdqu8	%ymm1,(%rsi,%r9){%k1}
ENDPROC(chacha20_8block_xor_avx512vl)