/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX-512VL functions
 *
 * Copyright (C) 2018 Martin Willi
 */

#include <linux/linkage.h>

.section        .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL: .octa 0x00000000000000000000000000000000
        .octa 0x00000000000000000000000000000001

.section        .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL: .octa 0x00000000000000000000000000000002
        .octa 0x00000000000000000000000000000003

.section        .rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL: .octa 0x00000003000000020000000100000000
        .octa 0x00000007000000060000000500000004
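
# The CTR*BL constants above hold per-block counter offsets. Viewed as packed
# 32-bit words, CTR2BL adds {0,0,0,0, 1,0,0,0} and CTR4BL adds {2,0,0,0,
# 3,0,0,0}, so in the two-blocks-per-register layouts each 128-bit lane's x12
# is bumped by that lane's block index; CTR8BL adds 0..7 to the eight copies
# of x12 in the 8-block layout. In C-like terms (illustrative only):
#
#       x12_of_block[n] += n;   /* n = block index within this call */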

.text

ENTRY(chacha20_2block_xor_avx512vl)
        # %rdi: Input state matrix, s
        # %rsi: up to 2 data blocks output, o
        # %rdx: up to 2 data blocks input, i
        # %rcx: input/output length in bytes

        # This function encrypts two ChaCha20 blocks by loading the state
        # matrix twice across four AVX registers. It performs matrix operations
        # on four words in each matrix in parallel, but requires shuffling to
        # rearrange the words after each round.
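        #
        # A rough C-style sketch of one iteration of the loop below, using
        # rotl32() as in the per-instruction comments (illustrative only),
        # with x0..x3 being the four state rows held in %ymm0..%ymm3:
        #
        #       x0 += x1; x3 = rotl32(x3 ^ x0, 16);     /* column round */
        #       x2 += x3; x1 = rotl32(x1 ^ x2, 12);
        #       x0 += x1; x3 = rotl32(x3 ^ x0,  8);
        #       x2 += x3; x1 = rotl32(x1 ^ x2,  7);
        #       /* shuffle rows 1-3 so diagonals become columns, repeat the
        #          four steps above (diagonal round), then shuffle back */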

        vzeroupper

        # x0..3[0-1] = s0..3
        vbroadcasti128  0x00(%rdi),%ymm0
        vbroadcasti128  0x10(%rdi),%ymm1
        vbroadcasti128  0x20(%rdi),%ymm2
        vbroadcasti128  0x30(%rdi),%ymm3

        vpaddd          CTR2BL(%rip),%ymm3,%ymm3

        vmovdqa         %ymm0,%ymm8
        vmovdqa         %ymm1,%ymm9
        vmovdqa         %ymm2,%ymm10
        vmovdqa         %ymm3,%ymm11

        mov             $10,%rax

.Ldoubleround:

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd          %ymm1,%ymm0,%ymm0
        vpxord          %ymm0,%ymm3,%ymm3
        vprold          $16,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd          %ymm3,%ymm2,%ymm2
        vpxord          %ymm2,%ymm1,%ymm1
        vprold          $12,%ymm1,%ymm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd          %ymm1,%ymm0,%ymm0
        vpxord          %ymm0,%ymm3,%ymm3
        vprold          $8,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd          %ymm3,%ymm2,%ymm2
        vpxord          %ymm2,%ymm1,%ymm1
        vprold          $7,%ymm1,%ymm1

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        vpshufd         $0x39,%ymm1,%ymm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd         $0x4e,%ymm2,%ymm2
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        vpshufd         $0x93,%ymm3,%ymm3

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd          %ymm1,%ymm0,%ymm0
        vpxord          %ymm0,%ymm3,%ymm3
        vprold          $16,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd          %ymm3,%ymm2,%ymm2
        vpxord          %ymm2,%ymm1,%ymm1
        vprold          $12,%ymm1,%ymm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd          %ymm1,%ymm0,%ymm0
        vpxord          %ymm0,%ymm3,%ymm3
        vprold          $8,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd          %ymm3,%ymm2,%ymm2
        vpxord          %ymm2,%ymm1,%ymm1
        vprold          $7,%ymm1,%ymm1

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        vpshufd         $0x93,%ymm1,%ymm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd         $0x4e,%ymm2,%ymm2
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        vpshufd         $0x39,%ymm3,%ymm3

        dec             %rax
        jnz             .Ldoubleround

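        #
        # Output stage, informally: add the saved initial state back into the
        # working state (the ChaCha feed-forward), then xor the keystream into
        # the input 16 bytes at a time, branching to .Lxorpart2 as soon as
        # fewer than 16 bytes remain. In C-like terms (illustrative only):
        #
        #       x[i] += s[i];                   /* feed-forward         */
        #       out[j] = in[j] ^ keystream[j];  /* full 16-byte chunks  */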
        # o0 = i0 ^ (x0 + s0)
        vpaddd          %ymm8,%ymm0,%ymm7
        cmp             $0x10,%rcx
        jl              .Lxorpart2
        vpxord          0x00(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x00(%rsi)
        vextracti128    $1,%ymm7,%xmm0
        # o1 = i1 ^ (x1 + s1)
        vpaddd          %ymm9,%ymm1,%ymm7
        cmp             $0x20,%rcx
        jl              .Lxorpart2
        vpxord          0x10(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x10(%rsi)
        vextracti128    $1,%ymm7,%xmm1
        # o2 = i2 ^ (x2 + s2)
        vpaddd          %ymm10,%ymm2,%ymm7
        cmp             $0x30,%rcx
        jl              .Lxorpart2
        vpxord          0x20(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x20(%rsi)
        vextracti128    $1,%ymm7,%xmm2
        # o3 = i3 ^ (x3 + s3)
        vpaddd          %ymm11,%ymm3,%ymm7
        cmp             $0x40,%rcx
        jl              .Lxorpart2
        vpxord          0x30(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x30(%rsi)
        vextracti128    $1,%ymm7,%xmm3

        # xor and write second block
        vmovdqa         %xmm0,%xmm7
        cmp             $0x50,%rcx
        jl              .Lxorpart2
        vpxord          0x40(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x40(%rsi)

        vmovdqa         %xmm1,%xmm7
        cmp             $0x60,%rcx
        jl              .Lxorpart2
        vpxord          0x50(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x50(%rsi)

        vmovdqa         %xmm2,%xmm7
        cmp             $0x70,%rcx
        jl              .Lxorpart2
        vpxord          0x60(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x60(%rsi)

        vmovdqa         %xmm3,%xmm7
        cmp             $0x80,%rcx
        jl              .Lxorpart2
        vpxord          0x70(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x70(%rsi)

.Ldone2:
        vzeroupper
        ret

.Lxorpart2:
        # xor remaining bytes from partial register into output
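        #
        # Roughly, in C-like terms (illustrative only), with len = %rcx on
        # entry:
        #
        #       tail   = len & 0xf;             /* bytes past last full 16 */
        #       offset = len & ~0xf;            /* start of partial chunk  */
        #       k1     = ((u64)1 << tail) - 1;  /* one mask bit per byte   */
        #
        # so the masked vmovdqu8 below loads, xors and stores only the tail
        # bytes, leaving the rest of the 16-byte chunk untouched.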
        mov             %rcx,%rax
        and             $0xf,%rcx
        jz              .Ldone2
        mov             %rax,%r9
        and             $~0xf,%r9

        mov             $1,%rax
        shld            %cl,%rax,%rax
        sub             $1,%rax
        kmovq           %rax,%k1

        vmovdqu8        (%rdx,%r9),%xmm1{%k1}{z}
        vpxord          %xmm7,%xmm1,%xmm1
        vmovdqu8        %xmm1,(%rsi,%r9){%k1}

        jmp             .Ldone2

ENDPROC(chacha20_2block_xor_avx512vl)

ENTRY(chacha20_4block_xor_avx512vl)
        # %rdi: Input state matrix, s
        # %rsi: up to 4 data blocks output, o
        # %rdx: up to 4 data blocks input, i
        # %rcx: input/output length in bytes

        # This function encrypts four ChaCha20 blocks by loading the state
        # matrix four times across eight AVX registers. It performs matrix
        # operations on four words in two matrices in parallel, sequentially
        # with respect to the operations on the four words of the other two
        # matrices. Since the required word shuffling has a rather high
        # latency, we can do the arithmetic on two matrix-pairs without much
        # slowdown.
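        #
        # Register layout used below (an informal summary of the loads that
        # follow): %ymm0-%ymm3 hold rows 0-3 of blocks 0 and 1 (one block per
        # 128-bit lane), %ymm4-%ymm7 the same rows for blocks 2 and 3, and
        # %ymm11-%ymm14 plus %ymm15 keep copies of the initial rows (including
        # the per-block counters) for the final feed-forward additions.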

        vzeroupper

        # x0..3[0-3] = s0..3
        vbroadcasti128  0x00(%rdi),%ymm0
        vbroadcasti128  0x10(%rdi),%ymm1
        vbroadcasti128  0x20(%rdi),%ymm2
        vbroadcasti128  0x30(%rdi),%ymm3

        vmovdqa         %ymm0,%ymm4
        vmovdqa         %ymm1,%ymm5
        vmovdqa         %ymm2,%ymm6
        vmovdqa         %ymm3,%ymm7

        vpaddd          CTR2BL(%rip),%ymm3,%ymm3
        vpaddd          CTR4BL(%rip),%ymm7,%ymm7

        vmovdqa         %ymm0,%ymm11
        vmovdqa         %ymm1,%ymm12
        vmovdqa         %ymm2,%ymm13
        vmovdqa         %ymm3,%ymm14
        vmovdqa         %ymm7,%ymm15

        mov             $10,%rax

.Ldoubleround4:

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd          %ymm1,%ymm0,%ymm0
        vpxord          %ymm0,%ymm3,%ymm3
        vprold          $16,%ymm3,%ymm3

        vpaddd          %ymm5,%ymm4,%ymm4
        vpxord          %ymm4,%ymm7,%ymm7
        vprold          $16,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd          %ymm3,%ymm2,%ymm2
        vpxord          %ymm2,%ymm1,%ymm1
        vprold          $12,%ymm1,%ymm1

        vpaddd          %ymm7,%ymm6,%ymm6
        vpxord          %ymm6,%ymm5,%ymm5
        vprold          $12,%ymm5,%ymm5

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd          %ymm1,%ymm0,%ymm0
        vpxord          %ymm0,%ymm3,%ymm3
        vprold          $8,%ymm3,%ymm3

        vpaddd          %ymm5,%ymm4,%ymm4
        vpxord          %ymm4,%ymm7,%ymm7
        vprold          $8,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd          %ymm3,%ymm2,%ymm2
        vpxord          %ymm2,%ymm1,%ymm1
        vprold          $7,%ymm1,%ymm1

        vpaddd          %ymm7,%ymm6,%ymm6
        vpxord          %ymm6,%ymm5,%ymm5
        vprold          $7,%ymm5,%ymm5

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        vpshufd         $0x39,%ymm1,%ymm1
        vpshufd         $0x39,%ymm5,%ymm5
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd         $0x4e,%ymm2,%ymm2
        vpshufd         $0x4e,%ymm6,%ymm6
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        vpshufd         $0x93,%ymm3,%ymm3
        vpshufd         $0x93,%ymm7,%ymm7

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd          %ymm1,%ymm0,%ymm0
        vpxord          %ymm0,%ymm3,%ymm3
        vprold          $16,%ymm3,%ymm3

        vpaddd          %ymm5,%ymm4,%ymm4
        vpxord          %ymm4,%ymm7,%ymm7
        vprold          $16,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd          %ymm3,%ymm2,%ymm2
        vpxord          %ymm2,%ymm1,%ymm1
        vprold          $12,%ymm1,%ymm1

        vpaddd          %ymm7,%ymm6,%ymm6
        vpxord          %ymm6,%ymm5,%ymm5
        vprold          $12,%ymm5,%ymm5

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd          %ymm1,%ymm0,%ymm0
        vpxord          %ymm0,%ymm3,%ymm3
        vprold          $8,%ymm3,%ymm3

        vpaddd          %ymm5,%ymm4,%ymm4
        vpxord          %ymm4,%ymm7,%ymm7
        vprold          $8,%ymm7,%ymm7

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd          %ymm3,%ymm2,%ymm2
        vpxord          %ymm2,%ymm1,%ymm1
        vprold          $7,%ymm1,%ymm1

        vpaddd          %ymm7,%ymm6,%ymm6
        vpxord          %ymm6,%ymm5,%ymm5
        vprold          $7,%ymm5,%ymm5

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        vpshufd         $0x93,%ymm1,%ymm1
        vpshufd         $0x93,%ymm5,%ymm5
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd         $0x4e,%ymm2,%ymm2
        vpshufd         $0x4e,%ymm6,%ymm6
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        vpshufd         $0x39,%ymm3,%ymm3
        vpshufd         $0x39,%ymm7,%ymm7

        dec             %rax
        jnz             .Ldoubleround4

        # o0 = i0 ^ (x0 + s0), first block
        vpaddd          %ymm11,%ymm0,%ymm10
        cmp             $0x10,%rcx
        jl              .Lxorpart4
        vpxord          0x00(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0x00(%rsi)
        vextracti128    $1,%ymm10,%xmm0
        # o1 = i1 ^ (x1 + s1), first block
        vpaddd          %ymm12,%ymm1,%ymm10
        cmp             $0x20,%rcx
        jl              .Lxorpart4
        vpxord          0x10(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0x10(%rsi)
        vextracti128    $1,%ymm10,%xmm1
        # o2 = i2 ^ (x2 + s2), first block
        vpaddd          %ymm13,%ymm2,%ymm10
        cmp             $0x30,%rcx
        jl              .Lxorpart4
        vpxord          0x20(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0x20(%rsi)
        vextracti128    $1,%ymm10,%xmm2
        # o3 = i3 ^ (x3 + s3), first block
        vpaddd          %ymm14,%ymm3,%ymm10
        cmp             $0x40,%rcx
        jl              .Lxorpart4
        vpxord          0x30(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0x30(%rsi)
        vextracti128    $1,%ymm10,%xmm3

        # xor and write second block
        vmovdqa         %xmm0,%xmm10
        cmp             $0x50,%rcx
        jl              .Lxorpart4
        vpxord          0x40(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0x40(%rsi)

        vmovdqa         %xmm1,%xmm10
        cmp             $0x60,%rcx
        jl              .Lxorpart4
        vpxord          0x50(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0x50(%rsi)

        vmovdqa         %xmm2,%xmm10
        cmp             $0x70,%rcx
        jl              .Lxorpart4
        vpxord          0x60(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0x60(%rsi)

        vmovdqa         %xmm3,%xmm10
        cmp             $0x80,%rcx
        jl              .Lxorpart4
        vpxord          0x70(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0x70(%rsi)

        # o0 = i0 ^ (x0 + s0), third block
        vpaddd          %ymm11,%ymm4,%ymm10
        cmp             $0x90,%rcx
        jl              .Lxorpart4
        vpxord          0x80(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0x80(%rsi)
        vextracti128    $1,%ymm10,%xmm4
        # o1 = i1 ^ (x1 + s1), third block
        vpaddd          %ymm12,%ymm5,%ymm10
        cmp             $0xa0,%rcx
        jl              .Lxorpart4
        vpxord          0x90(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0x90(%rsi)
        vextracti128    $1,%ymm10,%xmm5
        # o2 = i2 ^ (x2 + s2), third block
        vpaddd          %ymm13,%ymm6,%ymm10
        cmp             $0xb0,%rcx
        jl              .Lxorpart4
        vpxord          0xa0(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0xa0(%rsi)
        vextracti128    $1,%ymm10,%xmm6
        # o3 = i3 ^ (x3 + s3), third block
        vpaddd          %ymm15,%ymm7,%ymm10
        cmp             $0xc0,%rcx
        jl              .Lxorpart4
        vpxord          0xb0(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0xb0(%rsi)
        vextracti128    $1,%ymm10,%xmm7

        # xor and write fourth block
        vmovdqa         %xmm4,%xmm10
        cmp             $0xd0,%rcx
        jl              .Lxorpart4
        vpxord          0xc0(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0xc0(%rsi)

        vmovdqa         %xmm5,%xmm10
        cmp             $0xe0,%rcx
        jl              .Lxorpart4
        vpxord          0xd0(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0xd0(%rsi)

        vmovdqa         %xmm6,%xmm10
        cmp             $0xf0,%rcx
        jl              .Lxorpart4
        vpxord          0xe0(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0xe0(%rsi)

        vmovdqa         %xmm7,%xmm10
        cmp             $0x100,%rcx
        jl              .Lxorpart4
        vpxord          0xf0(%rdx),%xmm10,%xmm9
        vmovdqu         %xmm9,0xf0(%rsi)

.Ldone4:
        vzeroupper
        ret

.Lxorpart4:
        # xor remaining bytes from partial register into output
        mov             %rcx,%rax
        and             $0xf,%rcx
        jz              .Ldone4
        mov             %rax,%r9
        and             $~0xf,%r9

        mov             $1,%rax
        shld            %cl,%rax,%rax
        sub             $1,%rax
        kmovq           %rax,%k1

        vmovdqu8        (%rdx,%r9),%xmm1{%k1}{z}
        vpxord          %xmm10,%xmm1,%xmm1
        vmovdqu8        %xmm1,(%rsi,%r9){%k1}

        jmp             .Ldone4

ENDPROC(chacha20_4block_xor_avx512vl)

ENTRY(chacha20_8block_xor_avx512vl)
        # %rdi: Input state matrix, s
        # %rsi: up to 8 data blocks output, o
        # %rdx: up to 8 data blocks input, i
        # %rcx: input/output length in bytes

        # This function encrypts eight consecutive ChaCha20 blocks by loading
        # the state matrix into AVX registers eight times. Compared to AVX2,
        # this implementation mostly benefits from the new rotate instructions
        # in AVX-512VL and the additional registers.
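        #
        # Layout sketch (informal): each of %ymm0-%ymm15 holds one of the 16
        # state words, broadcast across eight 32-bit lanes, one lane per
        # block; the counter word x12 then gets 0..7 added so that each lane
        # counts its own block, and %ymm16-%ymm31 keep the per-lane initial
        # state for the final additions. In C-like terms (illustrative only):
        #
        #       x[i][0..7] = s[i];      /* one register per state word  */
        #       x[12][n]  += n;         /* n = 0..7, block index        */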

        vzeroupper

        # x0..15[0-7] = s[0..15]
        vpbroadcastd    0x00(%rdi),%ymm0
        vpbroadcastd    0x04(%rdi),%ymm1
        vpbroadcastd    0x08(%rdi),%ymm2
        vpbroadcastd    0x0c(%rdi),%ymm3
        vpbroadcastd    0x10(%rdi),%ymm4
        vpbroadcastd    0x14(%rdi),%ymm5
        vpbroadcastd    0x18(%rdi),%ymm6
        vpbroadcastd    0x1c(%rdi),%ymm7
        vpbroadcastd    0x20(%rdi),%ymm8
        vpbroadcastd    0x24(%rdi),%ymm9
        vpbroadcastd    0x28(%rdi),%ymm10
        vpbroadcastd    0x2c(%rdi),%ymm11
        vpbroadcastd    0x30(%rdi),%ymm12
        vpbroadcastd    0x34(%rdi),%ymm13
        vpbroadcastd    0x38(%rdi),%ymm14
        vpbroadcastd    0x3c(%rdi),%ymm15

        # x12 += counter values 0-7
        vpaddd          CTR8BL(%rip),%ymm12,%ymm12

        vmovdqa64       %ymm0,%ymm16
        vmovdqa64       %ymm1,%ymm17
        vmovdqa64       %ymm2,%ymm18
        vmovdqa64       %ymm3,%ymm19
        vmovdqa64       %ymm4,%ymm20
        vmovdqa64       %ymm5,%ymm21
        vmovdqa64       %ymm6,%ymm22
        vmovdqa64       %ymm7,%ymm23
        vmovdqa64       %ymm8,%ymm24
        vmovdqa64       %ymm9,%ymm25
        vmovdqa64       %ymm10,%ymm26
        vmovdqa64       %ymm11,%ymm27
        vmovdqa64       %ymm12,%ymm28
        vmovdqa64       %ymm13,%ymm29
        vmovdqa64       %ymm14,%ymm30
        vmovdqa64       %ymm15,%ymm31

        mov             $10,%eax

.Ldoubleround8:
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        vpaddd          %ymm0,%ymm4,%ymm0
        vpxord          %ymm0,%ymm12,%ymm12
        vprold          $16,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        vpaddd          %ymm1,%ymm5,%ymm1
        vpxord          %ymm1,%ymm13,%ymm13
        vprold          $16,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        vpaddd          %ymm2,%ymm6,%ymm2
        vpxord          %ymm2,%ymm14,%ymm14
        vprold          $16,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        vpaddd          %ymm3,%ymm7,%ymm3
        vpxord          %ymm3,%ymm15,%ymm15
        vprold          $16,%ymm15,%ymm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        vpaddd          %ymm12,%ymm8,%ymm8
        vpxord          %ymm8,%ymm4,%ymm4
        vprold          $12,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        vpaddd          %ymm13,%ymm9,%ymm9
        vpxord          %ymm9,%ymm5,%ymm5
        vprold          $12,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        vpaddd          %ymm14,%ymm10,%ymm10
        vpxord          %ymm10,%ymm6,%ymm6
        vprold          $12,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        vpaddd          %ymm15,%ymm11,%ymm11
        vpxord          %ymm11,%ymm7,%ymm7
        vprold          $12,%ymm7,%ymm7

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        vpaddd          %ymm0,%ymm4,%ymm0
        vpxord          %ymm0,%ymm12,%ymm12
        vprold          $8,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        vpaddd          %ymm1,%ymm5,%ymm1
        vpxord          %ymm1,%ymm13,%ymm13
        vprold          $8,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        vpaddd          %ymm2,%ymm6,%ymm2
        vpxord          %ymm2,%ymm14,%ymm14
        vprold          $8,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        vpaddd          %ymm3,%ymm7,%ymm3
        vpxord          %ymm3,%ymm15,%ymm15
        vprold          $8,%ymm15,%ymm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        vpaddd          %ymm12,%ymm8,%ymm8
        vpxord          %ymm8,%ymm4,%ymm4
        vprold          $7,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        vpaddd          %ymm13,%ymm9,%ymm9
        vpxord          %ymm9,%ymm5,%ymm5
        vprold          $7,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        vpaddd          %ymm14,%ymm10,%ymm10
        vpxord          %ymm10,%ymm6,%ymm6
        vprold          $7,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        vpaddd          %ymm15,%ymm11,%ymm11
        vpxord          %ymm11,%ymm7,%ymm7
        vprold          $7,%ymm7,%ymm7

        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        vpaddd          %ymm0,%ymm5,%ymm0
        vpxord          %ymm0,%ymm15,%ymm15
        vprold          $16,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        vpaddd          %ymm1,%ymm6,%ymm1
        vpxord          %ymm1,%ymm12,%ymm12
        vprold          $16,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        vpaddd          %ymm2,%ymm7,%ymm2
        vpxord          %ymm2,%ymm13,%ymm13
        vprold          $16,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        vpaddd          %ymm3,%ymm4,%ymm3
        vpxord          %ymm3,%ymm14,%ymm14
        vprold          $16,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        vpaddd          %ymm15,%ymm10,%ymm10
        vpxord          %ymm10,%ymm5,%ymm5
        vprold          $12,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        vpaddd          %ymm12,%ymm11,%ymm11
        vpxord          %ymm11,%ymm6,%ymm6
        vprold          $12,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        vpaddd          %ymm13,%ymm8,%ymm8
        vpxord          %ymm8,%ymm7,%ymm7
        vprold          $12,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        vpaddd          %ymm14,%ymm9,%ymm9
        vpxord          %ymm9,%ymm4,%ymm4
        vprold          $12,%ymm4,%ymm4

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        vpaddd          %ymm0,%ymm5,%ymm0
        vpxord          %ymm0,%ymm15,%ymm15
        vprold          $8,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        vpaddd          %ymm1,%ymm6,%ymm1
        vpxord          %ymm1,%ymm12,%ymm12
        vprold          $8,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        vpaddd          %ymm2,%ymm7,%ymm2
        vpxord          %ymm2,%ymm13,%ymm13
        vprold          $8,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        vpaddd          %ymm3,%ymm4,%ymm3
        vpxord          %ymm3,%ymm14,%ymm14
        vprold          $8,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        vpaddd          %ymm15,%ymm10,%ymm10
        vpxord          %ymm10,%ymm5,%ymm5
        vprold          $7,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        vpaddd          %ymm12,%ymm11,%ymm11
        vpxord          %ymm11,%ymm6,%ymm6
        vprold          $7,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        vpaddd          %ymm13,%ymm8,%ymm8
        vpxord          %ymm8,%ymm7,%ymm7
        vprold          $7,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        vpaddd          %ymm14,%ymm9,%ymm9
        vpxord          %ymm9,%ymm4,%ymm4
        vprold          $7,%ymm4,%ymm4

        dec             %eax
        jnz             .Ldoubleround8

        # x0..15[0-7] += s[0..15]
        vpaddd          %ymm16,%ymm0,%ymm0
        vpaddd          %ymm17,%ymm1,%ymm1
        vpaddd          %ymm18,%ymm2,%ymm2
        vpaddd          %ymm19,%ymm3,%ymm3
        vpaddd          %ymm20,%ymm4,%ymm4
        vpaddd          %ymm21,%ymm5,%ymm5
        vpaddd          %ymm22,%ymm6,%ymm6
        vpaddd          %ymm23,%ymm7,%ymm7
        vpaddd          %ymm24,%ymm8,%ymm8
        vpaddd          %ymm25,%ymm9,%ymm9
        vpaddd          %ymm26,%ymm10,%ymm10
        vpaddd          %ymm27,%ymm11,%ymm11
        vpaddd          %ymm28,%ymm12,%ymm12
        vpaddd          %ymm29,%ymm13,%ymm13
        vpaddd          %ymm30,%ymm14,%ymm14
        vpaddd          %ymm31,%ymm15,%ymm15

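        # The three interleave passes below (32-bit, 64-bit, then 128-bit
        # granularity) effectively transpose the one-word-per-register layout
        # above back into eight contiguous 64-byte keystream blocks, which are
        # then xored against the input 32 bytes at a time.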
        # interleave 32-bit words in state n, n+1
        vpunpckldq      %ymm1,%ymm0,%ymm16
        vpunpckhdq      %ymm1,%ymm0,%ymm17
        vpunpckldq      %ymm3,%ymm2,%ymm18
        vpunpckhdq      %ymm3,%ymm2,%ymm19
        vpunpckldq      %ymm5,%ymm4,%ymm20
        vpunpckhdq      %ymm5,%ymm4,%ymm21
        vpunpckldq      %ymm7,%ymm6,%ymm22
        vpunpckhdq      %ymm7,%ymm6,%ymm23
        vpunpckldq      %ymm9,%ymm8,%ymm24
        vpunpckhdq      %ymm9,%ymm8,%ymm25
        vpunpckldq      %ymm11,%ymm10,%ymm26
        vpunpckhdq      %ymm11,%ymm10,%ymm27
        vpunpckldq      %ymm13,%ymm12,%ymm28
        vpunpckhdq      %ymm13,%ymm12,%ymm29
        vpunpckldq      %ymm15,%ymm14,%ymm30
        vpunpckhdq      %ymm15,%ymm14,%ymm31

        # interleave 64-bit words in state n, n+2
        vpunpcklqdq     %ymm18,%ymm16,%ymm0
        vpunpcklqdq     %ymm19,%ymm17,%ymm1
        vpunpckhqdq     %ymm18,%ymm16,%ymm2
        vpunpckhqdq     %ymm19,%ymm17,%ymm3
        vpunpcklqdq     %ymm22,%ymm20,%ymm4
        vpunpcklqdq     %ymm23,%ymm21,%ymm5
        vpunpckhqdq     %ymm22,%ymm20,%ymm6
        vpunpckhqdq     %ymm23,%ymm21,%ymm7
        vpunpcklqdq     %ymm26,%ymm24,%ymm8
        vpunpcklqdq     %ymm27,%ymm25,%ymm9
        vpunpckhqdq     %ymm26,%ymm24,%ymm10
        vpunpckhqdq     %ymm27,%ymm25,%ymm11
        vpunpcklqdq     %ymm30,%ymm28,%ymm12
        vpunpcklqdq     %ymm31,%ymm29,%ymm13
        vpunpckhqdq     %ymm30,%ymm28,%ymm14
        vpunpckhqdq     %ymm31,%ymm29,%ymm15

        # interleave 128-bit words in state n, n+4
        # xor/write first four blocks
        vmovdqa64       %ymm0,%ymm16
        vperm2i128      $0x20,%ymm4,%ymm0,%ymm0
        cmp             $0x0020,%rcx
        jl              .Lxorpart8
        vpxord          0x0000(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0000(%rsi)
        vmovdqa64       %ymm16,%ymm0
        vperm2i128      $0x31,%ymm4,%ymm0,%ymm4

        vperm2i128      $0x20,%ymm12,%ymm8,%ymm0
        cmp             $0x0040,%rcx
        jl              .Lxorpart8
        vpxord          0x0020(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0020(%rsi)
        vperm2i128      $0x31,%ymm12,%ymm8,%ymm12

        vperm2i128      $0x20,%ymm6,%ymm2,%ymm0
        cmp             $0x0060,%rcx
        jl              .Lxorpart8
        vpxord          0x0040(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0040(%rsi)
        vperm2i128      $0x31,%ymm6,%ymm2,%ymm6

        vperm2i128      $0x20,%ymm14,%ymm10,%ymm0
        cmp             $0x0080,%rcx
        jl              .Lxorpart8
        vpxord          0x0060(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0060(%rsi)
        vperm2i128      $0x31,%ymm14,%ymm10,%ymm14

        vperm2i128      $0x20,%ymm5,%ymm1,%ymm0
        cmp             $0x00a0,%rcx
        jl              .Lxorpart8
        vpxord          0x0080(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0080(%rsi)
        vperm2i128      $0x31,%ymm5,%ymm1,%ymm5

        vperm2i128      $0x20,%ymm13,%ymm9,%ymm0
        cmp             $0x00c0,%rcx
        jl              .Lxorpart8
        vpxord          0x00a0(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x00a0(%rsi)
        vperm2i128      $0x31,%ymm13,%ymm9,%ymm13

        vperm2i128      $0x20,%ymm7,%ymm3,%ymm0
        cmp             $0x00e0,%rcx
        jl              .Lxorpart8
        vpxord          0x00c0(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x00c0(%rsi)
        vperm2i128      $0x31,%ymm7,%ymm3,%ymm7

        vperm2i128      $0x20,%ymm15,%ymm11,%ymm0
        cmp             $0x0100,%rcx
        jl              .Lxorpart8
        vpxord          0x00e0(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x00e0(%rsi)
        vperm2i128      $0x31,%ymm15,%ymm11,%ymm15

        # xor remaining blocks, write to output
        vmovdqa64       %ymm4,%ymm0
        cmp             $0x0120,%rcx
        jl              .Lxorpart8
        vpxord          0x0100(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0100(%rsi)

        vmovdqa64       %ymm12,%ymm0
        cmp             $0x0140,%rcx
        jl              .Lxorpart8
        vpxord          0x0120(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0120(%rsi)

        vmovdqa64       %ymm6,%ymm0
        cmp             $0x0160,%rcx
        jl              .Lxorpart8
        vpxord          0x0140(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0140(%rsi)

        vmovdqa64       %ymm14,%ymm0
        cmp             $0x0180,%rcx
        jl              .Lxorpart8
        vpxord          0x0160(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0160(%rsi)

        vmovdqa64       %ymm5,%ymm0
        cmp             $0x01a0,%rcx
        jl              .Lxorpart8
        vpxord          0x0180(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0180(%rsi)

        vmovdqa64       %ymm13,%ymm0
        cmp             $0x01c0,%rcx
        jl              .Lxorpart8
        vpxord          0x01a0(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x01a0(%rsi)

        vmovdqa64       %ymm7,%ymm0
        cmp             $0x01e0,%rcx
        jl              .Lxorpart8
        vpxord          0x01c0(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x01c0(%rsi)

        vmovdqa64       %ymm15,%ymm0
        cmp             $0x0200,%rcx
        jl              .Lxorpart8
        vpxord          0x01e0(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x01e0(%rsi)

.Ldone8:
        vzeroupper
        ret

.Lxorpart8:
        # xor remaining bytes from partial register into output
        mov             %rcx,%rax
        and             $0x1f,%rcx
        jz              .Ldone8
        mov             %rax,%r9
        and             $~0x1f,%r9

        mov             $1,%rax
        shld            %cl,%rax,%rax
        sub             $1,%rax
        kmovq           %rax,%k1

        vmovdqu8        (%rdx,%r9),%ymm1{%k1}{z}
        vpxord          %ymm0,%ymm1,%ymm1
        vmovdqu8        %ymm1,(%rsi,%r9){%k1}

        jmp             .Ldone8

ENDPROC(chacha20_8block_xor_avx512vl)