/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX-512VL functions
 *
 * Copyright (C) 2018 Martin Willi
 */

#include <linux/linkage.h>
.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.section	.rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004
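
# The constants above supply the block-counter increments. CTR2BL and
# CTR4BL are 128-bit values added (as four 32-bit words each) to a
# counter row broadcast across two 128-bit lanes, bumping the low word,
# the ChaCha20 block counter, of those lanes by 0/1 and 2/3 respectively.
# CTR8BL packs the eight 32-bit increments 0..7 used by the eight-block
# function's lane-per-block layout.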
.text

ENTRY(chacha20_2block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes

	# This function encrypts two ChaCha20 blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.
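	#
	# For reference, one ChaCha20 quarter-round on a single column
	# (a, b, c, d) as specified in RFC7539; each vpaddd/vpxord/vprold
	# triple below performs one of these lines on four columns at once:
	#
	#	a += b; d ^= a; d = rol32(d, 16);
	#	c += d; b ^= c; b = rol32(b, 12);
	#	a += b; d ^= a; d = rol32(d, 8);
	#	c += d; b ^= c; b = rol32(b, 7);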

	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vpaddd	CTR2BL(%rip),%ymm3,%ymm3
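	# The counter row in %ymm3 was broadcast to both 128-bit lanes; the
	# CTR2BL add leaves lane 0 unchanged and increments the block counter
	# of lane 1, so the two lanes now hold two consecutive blocks. The
	# initial state is saved in %ymm8-%ymm11 for the final state addition.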

	vmovdqa	%ymm0,%ymm8
	vmovdqa	%ymm1,%ymm9
	vmovdqa	%ymm2,%ymm10
	vmovdqa	%ymm3,%ymm11

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm3,%ymm3
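
	# The three shuffles rotate rows 1-3 left by one, two and three words
	# respectively, so that each diagonal of the ChaCha matrix lines up in
	# a column and the next quarter-round pass operates on the diagonals.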

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm3,%ymm3
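
	# The inverse shuffles move the words back into column order,
	# completing one double round (a column round plus a diagonal round).
	# ChaCha20 iterates ten such double rounds for 20 rounds in total.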

	# o0 = i0 ^ (x0 + s0)
	vpaddd	%ymm8,%ymm0,%ymm7
	vpxord	0x00(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd	%ymm9,%ymm1,%ymm7
	vpxord	0x10(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd	%ymm10,%ymm2,%ymm7
	vpxord	0x20(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd	%ymm11,%ymm3,%ymm7
	vpxord	0x30(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa	%xmm0,%xmm7
	vpxord	0x40(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x40(%rsi)

	vmovdqa	%xmm1,%xmm7
	vpxord	0x50(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x50(%rsi)

	vmovdqa	%xmm2,%xmm7
	vpxord	0x60(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x60(%rsi)

	vmovdqa	%xmm3,%xmm7
	vpxord	0x70(%rdx),%xmm7,%xmm6
	vmovdqu	%xmm6,0x70(%rsi)

	# xor remaining bytes from partial register into output
	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord	%xmm7,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}
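
	# Tail sketch: assuming %k1 was built earlier from the remaining
	# length L (0 < L < 16) as the byte mask (1 << L) - 1 and %r9 holds
	# the 16-byte-aligned offset of the partial block, the {%k1}{z} load
	# above fetches only the L valid input bytes (zeroing the rest), the
	# vpxord applies the keystream in %xmm7, and the masked store writes
	# back exactly those L bytes.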

ENDPROC(chacha20_2block_xor_avx512vl)

ENTRY(chacha20_4block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes

	# This function encrypts four ChaCha20 blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, sequentially
	# to the operations on the four words of the other two matrices. The
	# required word shuffling has a rather high latency; interleaving the
	# arithmetic of a second matrix-pair hides that latency, so the extra
	# blocks cost little additional time.
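	#
	# Register layout used below: %ymm0-%ymm3 hold the four state rows of
	# blocks 0/1 (one block per 128-bit lane) and %ymm4-%ymm7 hold the
	# same rows for blocks 2/3, giving two independent dependency chains
	# to interleave.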

	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vmovdqa	%ymm0,%ymm4
	vmovdqa	%ymm1,%ymm5
	vmovdqa	%ymm2,%ymm6
	vmovdqa	%ymm3,%ymm7

	vpaddd	CTR2BL(%rip),%ymm3,%ymm3
	vpaddd	CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa	%ymm0,%ymm11
	vmovdqa	%ymm1,%ymm12
	vmovdqa	%ymm2,%ymm13
	vmovdqa	%ymm3,%ymm14
	vmovdqa	%ymm7,%ymm15
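
	# Blocks 0/1 now carry block counters +0/+1 (in %ymm3) and blocks 2/3
	# carry +2/+3 (in %ymm7); the initial state, counters included, is
	# kept in %ymm11-%ymm15 for the final state addition.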

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm1,%ymm1
	vpshufd	$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	vpshufd	$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm3,%ymm3
	vpshufd	$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$16,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$16,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$12,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd	%ymm1,%ymm0,%ymm0
	vpxord	%ymm0,%ymm3,%ymm3
	vprold	$8,%ymm3,%ymm3

	vpaddd	%ymm5,%ymm4,%ymm4
	vpxord	%ymm4,%ymm7,%ymm7
	vprold	$8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd	%ymm3,%ymm2,%ymm2
	vpxord	%ymm2,%ymm1,%ymm1
	vprold	$7,%ymm1,%ymm1

	vpaddd	%ymm7,%ymm6,%ymm6
	vpxord	%ymm6,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd	$0x93,%ymm1,%ymm1
	vpshufd	$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd	$0x4e,%ymm2,%ymm2
	vpshufd	$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd	$0x39,%ymm3,%ymm3
	vpshufd	$0x39,%ymm7,%ymm7

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd	%ymm11,%ymm0,%ymm10
	vpxord	0x00(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd	%ymm12,%ymm1,%ymm10
	vpxord	0x10(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd	%ymm13,%ymm2,%ymm10
	vpxord	0x20(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd	%ymm14,%ymm3,%ymm10
	vpxord	0x30(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa	%xmm0,%xmm10
	vpxord	0x40(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x40(%rsi)

	vmovdqa	%xmm1,%xmm10
	vpxord	0x50(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x50(%rsi)

	vmovdqa	%xmm2,%xmm10
	vpxord	0x60(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x60(%rsi)

	vmovdqa	%xmm3,%xmm10
	vpxord	0x70(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd	%ymm11,%ymm4,%ymm10
	vpxord	0x80(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd	%ymm12,%ymm5,%ymm10
	vpxord	0x90(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd	%ymm13,%ymm6,%ymm10
	vpxord	0xa0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd	%ymm15,%ymm7,%ymm10
	vpxord	0xb0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa	%xmm4,%xmm10
	vpxord	0xc0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xc0(%rsi)

	vmovdqa	%xmm5,%xmm10
	vpxord	0xd0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xd0(%rsi)

	vmovdqa	%xmm6,%xmm10
	vpxord	0xe0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xe0(%rsi)

	vmovdqa	%xmm7,%xmm10
	vpxord	0xf0(%rdx),%xmm10,%xmm9
	vmovdqu	%xmm9,0xf0(%rsi)

	# xor remaining bytes from partial register into output
	vmovdqu8	(%rdx,%r9),%xmm1{%k1}{z}
	vpxord	%xmm10,%xmm1,%xmm1
	vmovdqu8	%xmm1,(%rsi,%r9){%k1}

ENDPROC(chacha20_4block_xor_avx512vl)

ENTRY(chacha20_8block_xor_avx512vl)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes

	# This function encrypts eight consecutive ChaCha20 blocks by loading
	# the state matrix in AVX registers eight times. Compared to AVX2, this
	# mostly benefits from the new rotate instructions in VL and the
	# additional registers.
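	#
	# Unlike the two- and four-block variants, the state here is held
	# word-sliced: each %ymm register carries one of the 16 state words
	# for all eight blocks (one block per 32-bit lane), so quarter-rounds
	# need no word shuffling at all and every rotate is a single vprold.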

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15

	# x12 += counter values 0-7
	vpaddd	CTR8BL(%rip),%ymm12,%ymm12

	vmovdqa64	%ymm0,%ymm16
	vmovdqa64	%ymm1,%ymm17
	vmovdqa64	%ymm2,%ymm18
	vmovdqa64	%ymm3,%ymm19
	vmovdqa64	%ymm4,%ymm20
	vmovdqa64	%ymm5,%ymm21
	vmovdqa64	%ymm6,%ymm22
	vmovdqa64	%ymm7,%ymm23
	vmovdqa64	%ymm8,%ymm24
	vmovdqa64	%ymm9,%ymm25
	vmovdqa64	%ymm10,%ymm26
	vmovdqa64	%ymm11,%ymm27
	vmovdqa64	%ymm12,%ymm28
	vmovdqa64	%ymm13,%ymm29
	vmovdqa64	%ymm14,%ymm30
	vmovdqa64	%ymm15,%ymm31
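
	# AVX-512VL permits EVEX access to %ymm16-%ymm31, so a complete copy
	# of the initial state stays resident in registers for the final
	# state addition rather than being spilled to memory.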

	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd	%ymm0,%ymm4,%ymm0
	vpxord	%ymm0,%ymm12,%ymm12
	vprold	$16,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd	%ymm1,%ymm5,%ymm1
	vpxord	%ymm1,%ymm13,%ymm13
	vprold	$16,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd	%ymm2,%ymm6,%ymm2
	vpxord	%ymm2,%ymm14,%ymm14
	vprold	$16,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd	%ymm3,%ymm7,%ymm3
	vpxord	%ymm3,%ymm15,%ymm15
	vprold	$16,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxord	%ymm8,%ymm4,%ymm4
	vprold	$12,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxord	%ymm9,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxord	%ymm10,%ymm6,%ymm6
	vprold	$12,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxord	%ymm11,%ymm7,%ymm7
	vprold	$12,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd	%ymm0,%ymm4,%ymm0
	vpxord	%ymm0,%ymm12,%ymm12
	vprold	$8,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd	%ymm1,%ymm5,%ymm1
	vpxord	%ymm1,%ymm13,%ymm13
	vprold	$8,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd	%ymm2,%ymm6,%ymm2
	vpxord	%ymm2,%ymm14,%ymm14
	vprold	$8,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd	%ymm3,%ymm7,%ymm3
	vpxord	%ymm3,%ymm15,%ymm15
	vprold	$8,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd	%ymm12,%ymm8,%ymm8
	vpxord	%ymm8,%ymm4,%ymm4
	vprold	$7,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd	%ymm13,%ymm9,%ymm9
	vpxord	%ymm9,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd	%ymm14,%ymm10,%ymm10
	vpxord	%ymm10,%ymm6,%ymm6
	vprold	$7,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd	%ymm15,%ymm11,%ymm11
	vpxord	%ymm11,%ymm7,%ymm7
	vprold	$7,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd	%ymm0,%ymm5,%ymm0
	vpxord	%ymm0,%ymm15,%ymm15
	vprold	$16,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd	%ymm1,%ymm6,%ymm1
	vpxord	%ymm1,%ymm12,%ymm12
	vprold	$16,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd	%ymm2,%ymm7,%ymm2
	vpxord	%ymm2,%ymm13,%ymm13
	vprold	$16,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd	%ymm3,%ymm4,%ymm3
	vpxord	%ymm3,%ymm14,%ymm14
	vprold	$16,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxord	%ymm10,%ymm5,%ymm5
	vprold	$12,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxord	%ymm11,%ymm6,%ymm6
	vprold	$12,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxord	%ymm8,%ymm7,%ymm7
	vprold	$12,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxord	%ymm9,%ymm4,%ymm4
	vprold	$12,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd	%ymm0,%ymm5,%ymm0
	vpxord	%ymm0,%ymm15,%ymm15
	vprold	$8,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd	%ymm1,%ymm6,%ymm1
	vpxord	%ymm1,%ymm12,%ymm12
	vprold	$8,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd	%ymm2,%ymm7,%ymm2
	vpxord	%ymm2,%ymm13,%ymm13
	vprold	$8,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd	%ymm3,%ymm4,%ymm3
	vpxord	%ymm3,%ymm14,%ymm14
	vprold	$8,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd	%ymm15,%ymm10,%ymm10
	vpxord	%ymm10,%ymm5,%ymm5
	vprold	$7,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd	%ymm12,%ymm11,%ymm11
	vpxord	%ymm11,%ymm6,%ymm6
	vprold	$7,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd	%ymm13,%ymm8,%ymm8
	vpxord	%ymm8,%ymm7,%ymm7
	vprold	$7,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd	%ymm14,%ymm9,%ymm9
	vpxord	%ymm9,%ymm4,%ymm4
	vprold	$7,%ymm4,%ymm4

	# x0..15[0-7] += s[0..15]
	vpaddd	%ymm16,%ymm0,%ymm0
	vpaddd	%ymm17,%ymm1,%ymm1
	vpaddd	%ymm18,%ymm2,%ymm2
	vpaddd	%ymm19,%ymm3,%ymm3
	vpaddd	%ymm20,%ymm4,%ymm4
	vpaddd	%ymm21,%ymm5,%ymm5
	vpaddd	%ymm22,%ymm6,%ymm6
	vpaddd	%ymm23,%ymm7,%ymm7
	vpaddd	%ymm24,%ymm8,%ymm8
	vpaddd	%ymm25,%ymm9,%ymm9
	vpaddd	%ymm26,%ymm10,%ymm10
	vpaddd	%ymm27,%ymm11,%ymm11
	vpaddd	%ymm28,%ymm12,%ymm12
	vpaddd	%ymm29,%ymm13,%ymm13
	vpaddd	%ymm30,%ymm14,%ymm14
	vpaddd	%ymm31,%ymm15,%ymm15
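
	# This is the feed-forward step of RFC7539: each keystream block is
	# the working state plus the initial state, computed here for all
	# eight blocks at once from the copies parked in %ymm16-%ymm31.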

	# interleave 32-bit words in state n, n+1
	vpunpckldq	%ymm1,%ymm0,%ymm16
	vpunpckhdq	%ymm1,%ymm0,%ymm17
	vpunpckldq	%ymm3,%ymm2,%ymm18
	vpunpckhdq	%ymm3,%ymm2,%ymm19
	vpunpckldq	%ymm5,%ymm4,%ymm20
	vpunpckhdq	%ymm5,%ymm4,%ymm21
	vpunpckldq	%ymm7,%ymm6,%ymm22
	vpunpckhdq	%ymm7,%ymm6,%ymm23
	vpunpckldq	%ymm9,%ymm8,%ymm24
	vpunpckhdq	%ymm9,%ymm8,%ymm25
	vpunpckldq	%ymm11,%ymm10,%ymm26
	vpunpckhdq	%ymm11,%ymm10,%ymm27
	vpunpckldq	%ymm13,%ymm12,%ymm28
	vpunpckhdq	%ymm13,%ymm12,%ymm29
	vpunpckldq	%ymm15,%ymm14,%ymm30
	vpunpckhdq	%ymm15,%ymm14,%ymm31
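
	# The 32-bit, 64-bit and (further below) 128-bit interleave stages
	# together form an 8x8 dword transpose that turns the word-sliced
	# registers back into block-contiguous keystream. After this first
	# stage, for example:
	#
	#	ymm16 = { x0[0], x1[0], x0[1], x1[1], x0[4], x1[4], x0[5], x1[5] }
	#
	# where xN[b] denotes state word N of block b.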

	# interleave 64-bit words in state n, n+2
	vpunpcklqdq	%ymm18,%ymm16,%ymm0
	vpunpcklqdq	%ymm19,%ymm17,%ymm1
	vpunpckhqdq	%ymm18,%ymm16,%ymm2
	vpunpckhqdq	%ymm19,%ymm17,%ymm3
	vpunpcklqdq	%ymm22,%ymm20,%ymm4
	vpunpcklqdq	%ymm23,%ymm21,%ymm5
	vpunpckhqdq	%ymm22,%ymm20,%ymm6
	vpunpckhqdq	%ymm23,%ymm21,%ymm7
	vpunpcklqdq	%ymm26,%ymm24,%ymm8
	vpunpcklqdq	%ymm27,%ymm25,%ymm9
	vpunpckhqdq	%ymm26,%ymm24,%ymm10
	vpunpckhqdq	%ymm27,%ymm25,%ymm11
	vpunpcklqdq	%ymm30,%ymm28,%ymm12
	vpunpcklqdq	%ymm31,%ymm29,%ymm13
	vpunpckhqdq	%ymm30,%ymm28,%ymm14
	vpunpckhqdq	%ymm31,%ymm29,%ymm15

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa64	%ymm0,%ymm16
	vperm2i128	$0x20,%ymm4,%ymm0,%ymm0
	vpxord	0x0000(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0000(%rsi)
	vmovdqa64	%ymm16,%ymm0
	vperm2i128	$0x31,%ymm4,%ymm0,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	vpxord	0x0020(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vperm2i128	$0x20,%ymm6,%ymm2,%ymm0
	vpxord	0x0040(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm2,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	vpxord	0x0060(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	vpxord	0x0080(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	vpxord	0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vperm2i128	$0x20,%ymm7,%ymm3,%ymm0
	vpxord	0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm3,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	vpxord	0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15
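
	# vperm2i128 with $0x20 gathers the low 128-bit halves of its two
	# sources and $0x31 the high halves: the low halves complete blocks
	# 0-3, xor'd and stored above, while the high halves (blocks 4-7)
	# stay in registers for the writes below.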

	# xor remaining blocks, write to output
	vmovdqa64	%ymm4,%ymm0
	vpxord	0x0100(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0100(%rsi)

	vmovdqa64	%ymm12,%ymm0
	vpxord	0x0120(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0120(%rsi)

	vmovdqa64	%ymm6,%ymm0
	vpxord	0x0140(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0140(%rsi)

	vmovdqa64	%ymm14,%ymm0
	vpxord	0x0160(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0160(%rsi)

	vmovdqa64	%ymm5,%ymm0
	vpxord	0x0180(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x0180(%rsi)

	vmovdqa64	%ymm13,%ymm0
	vpxord	0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01a0(%rsi)

	vmovdqa64	%ymm7,%ymm0
	vpxord	0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01c0(%rsi)

	vmovdqa64	%ymm15,%ymm0
	vpxord	0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu64	%ymm0,0x01e0(%rsi)

	# xor remaining bytes from partial register into output
	vmovdqu8	(%rdx,%r9),%ymm1{%k1}{z}
	vpxord	%ymm0,%ymm1,%ymm1
	vmovdqu8	%ymm1,(%rsi,%r9){%k1}

ENDPROC(chacha20_8block_xor_avx512vl)