/* SPDX-License-Identifier: GPL-2.0+ */
/*
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 AVX-512VL functions
 *
 * Copyright (C) 2018 Martin Willi
 */

#include <linux/linkage.h>

.section        .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL: .octa 0x00000000000000000000000000000000
        .octa 0x00000000000000000000000000000001

.section        .rodata.cst32.CTR8BL, "aM", @progbits, 32
.align 32
CTR8BL: .octa 0x00000003000000020000000100000000
        .octa 0x00000007000000060000000500000004

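# CTR2BL and CTR8BL are per-lane increments for the ChaCha block counter
# (state word 12): CTR2BL leaves the first block's counter unchanged and
# adds 1 to the second block's, while CTR8BL adds 0..7 to the eight
# broadcast copies of the counter used by the 8-block variant.
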
.text

ENTRY(chacha20_2block_xor_avx512vl)
        # %rdi: Input state matrix, s
        # %rsi: up to 2 data blocks output, o
        # %rdx: up to 2 data blocks input, i
        # %rcx: input/output length in bytes

        # This function encrypts two ChaCha20 blocks by loading the state
        # matrix twice across four AVX registers. It performs matrix operations
        # on four words in each matrix in parallel, but requires shuffling to
        # rearrange the words after each round.
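        #
        # For reference, each vpaddd/vpxord/vprold triple below is one step of
        # the RFC 7539 quarter-round, roughly (rol32 = rotate left):
        #
        #       a += b; d ^= a; d = rol32(d, 16);
        #       c += d; b ^= c; b = rol32(b, 12);
        #       a += b; d ^= a; d = rol32(d, 8);
        #       c += d; b ^= c; b = rol32(b, 7);
        #
        # where (a, b, c, d) are the state rows held in %ymm0..%ymm3, one row
        # per register and replicated for both blocks, so four quarter-rounds
        # run in parallel per block.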

        vzeroupper

        # x0..3[0-1] = s0..3
        vbroadcasti128  0x00(%rdi),%ymm0
        vbroadcasti128  0x10(%rdi),%ymm1
        vbroadcasti128  0x20(%rdi),%ymm2
        vbroadcasti128  0x30(%rdi),%ymm3

        vpaddd          CTR2BL(%rip),%ymm3,%ymm3

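        # save the initial state in %ymm8..%ymm11; it is added back in (x + s)
        # when the output is produced below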
        vmovdqa         %ymm0,%ymm8
        vmovdqa         %ymm1,%ymm9
        vmovdqa         %ymm2,%ymm10
        vmovdqa         %ymm3,%ymm11

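        # 20 rounds of ChaCha20, as 10 iterations of the double round below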
        mov             $10,%rax

.Ldoubleround:

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd          %ymm1,%ymm0,%ymm0
        vpxord          %ymm0,%ymm3,%ymm3
        vprold          $16,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd          %ymm3,%ymm2,%ymm2
        vpxord          %ymm2,%ymm1,%ymm1
        vprold          $12,%ymm1,%ymm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd          %ymm1,%ymm0,%ymm0
        vpxord          %ymm0,%ymm3,%ymm3
        vprold          $8,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd          %ymm3,%ymm2,%ymm2
        vpxord          %ymm2,%ymm1,%ymm1
        vprold          $7,%ymm1,%ymm1

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        vpshufd         $0x39,%ymm1,%ymm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd         $0x4e,%ymm2,%ymm2
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        vpshufd         $0x93,%ymm3,%ymm3

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        vpaddd          %ymm1,%ymm0,%ymm0
        vpxord          %ymm0,%ymm3,%ymm3
        vprold          $16,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        vpaddd          %ymm3,%ymm2,%ymm2
        vpxord          %ymm2,%ymm1,%ymm1
        vprold          $12,%ymm1,%ymm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        vpaddd          %ymm1,%ymm0,%ymm0
        vpxord          %ymm0,%ymm3,%ymm3
        vprold          $8,%ymm3,%ymm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        vpaddd          %ymm3,%ymm2,%ymm2
        vpxord          %ymm2,%ymm1,%ymm1
        vprold          $7,%ymm1,%ymm1

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        vpshufd         $0x93,%ymm1,%ymm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        vpshufd         $0x4e,%ymm2,%ymm2
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        vpshufd         $0x39,%ymm3,%ymm3

        dec             %rax
        jnz             .Ldoubleround

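        # Add back the saved initial state and XOR the keystream against the
        # input 16 bytes at a time. Each chunk is only stored if %rcx covers
        # it in full; otherwise we jump to .Lxorpart2, which handles the final
        # partial chunk with a byte mask.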
        # o0 = i0 ^ (x0 + s0)
        vpaddd          %ymm8,%ymm0,%ymm7
        cmp             $0x10,%rcx
        jl              .Lxorpart2
        vpxord          0x00(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x00(%rsi)
        vextracti128    $1,%ymm7,%xmm0
        # o1 = i1 ^ (x1 + s1)
        vpaddd          %ymm9,%ymm1,%ymm7
        cmp             $0x20,%rcx
        jl              .Lxorpart2
        vpxord          0x10(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x10(%rsi)
        vextracti128    $1,%ymm7,%xmm1
        # o2 = i2 ^ (x2 + s2)
        vpaddd          %ymm10,%ymm2,%ymm7
        cmp             $0x30,%rcx
        jl              .Lxorpart2
        vpxord          0x20(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x20(%rsi)
        vextracti128    $1,%ymm7,%xmm2
        # o3 = i3 ^ (x3 + s3)
        vpaddd          %ymm11,%ymm3,%ymm7
        cmp             $0x40,%rcx
        jl              .Lxorpart2
        vpxord          0x30(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x30(%rsi)
        vextracti128    $1,%ymm7,%xmm3

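        # %xmm0..%xmm3 now hold the high 128-bit lanes extracted above,
        # i.e. the keystream of the second block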
        # xor and write second block
        vmovdqa         %xmm0,%xmm7
        cmp             $0x50,%rcx
        jl              .Lxorpart2
        vpxord          0x40(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x40(%rsi)

        vmovdqa         %xmm1,%xmm7
        cmp             $0x60,%rcx
        jl              .Lxorpart2
        vpxord          0x50(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x50(%rsi)

        vmovdqa         %xmm2,%xmm7
        cmp             $0x70,%rcx
        jl              .Lxorpart2
        vpxord          0x60(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x60(%rsi)

        vmovdqa         %xmm3,%xmm7
        cmp             $0x80,%rcx
        jl              .Lxorpart2
        vpxord          0x70(%rdx),%xmm7,%xmm6
        vmovdqu         %xmm6,0x70(%rsi)

.Ldone2:
        vzeroupper
        ret

.Lxorpart2:
        # xor remaining bytes from partial register into output
        mov             %rcx,%rax
        and             $0xf,%rcx
        jz              .Ldone2
        mov             %rax,%r9
        and             $~0xf,%r9

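        # build a %k1 byte mask with the low (%rcx & 0xf) bits set,
        # i.e. (1 << cl) - 1, for the masked load/xor/store of the tail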
        mov             $1,%rax
        shld            %cl,%rax,%rax
        sub             $1,%rax
        kmovq           %rax,%k1

        vmovdqu8        (%rdx,%r9),%xmm1{%k1}{z}
        vpxord          %xmm7,%xmm1,%xmm1
        vmovdqu8        %xmm1,(%rsi,%r9){%k1}

        jmp             .Ldone2

ENDPROC(chacha20_2block_xor_avx512vl)

ENTRY(chacha20_8block_xor_avx512vl)
        # %rdi: Input state matrix, s
        # %rsi: up to 8 data blocks output, o
        # %rdx: up to 8 data blocks input, i
        # %rcx: input/output length in bytes

        # This function encrypts eight consecutive ChaCha20 blocks by loading
        # the state matrix in AVX registers eight times. Compared to AVX2, this
        # mostly benefits from the new rotate instructions in VL and the
        # additional registers.
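        #
        # Data layout: %ymm0..%ymm15 hold state words 0..15, with lane j of
        # each register belonging to block j. A quarter-round therefore
        # becomes plain whole-register vpaddd/vpxord/vprold sequences, with
        # no shuffling needed between the column and diagonal rounds.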

        vzeroupper

        # x0..15[0-7] = s[0..15]
        vpbroadcastd    0x00(%rdi),%ymm0
        vpbroadcastd    0x04(%rdi),%ymm1
        vpbroadcastd    0x08(%rdi),%ymm2
        vpbroadcastd    0x0c(%rdi),%ymm3
        vpbroadcastd    0x10(%rdi),%ymm4
        vpbroadcastd    0x14(%rdi),%ymm5
        vpbroadcastd    0x18(%rdi),%ymm6
        vpbroadcastd    0x1c(%rdi),%ymm7
        vpbroadcastd    0x20(%rdi),%ymm8
        vpbroadcastd    0x24(%rdi),%ymm9
        vpbroadcastd    0x28(%rdi),%ymm10
        vpbroadcastd    0x2c(%rdi),%ymm11
        vpbroadcastd    0x30(%rdi),%ymm12
        vpbroadcastd    0x34(%rdi),%ymm13
        vpbroadcastd    0x38(%rdi),%ymm14
        vpbroadcastd    0x3c(%rdi),%ymm15

        # x12 += counter values 0-7
        vpaddd          CTR8BL(%rip),%ymm12,%ymm12

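        # save the initial state in %ymm16..%ymm31; it is added back after the
        # rounds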
        vmovdqa64       %ymm0,%ymm16
        vmovdqa64       %ymm1,%ymm17
        vmovdqa64       %ymm2,%ymm18
        vmovdqa64       %ymm3,%ymm19
        vmovdqa64       %ymm4,%ymm20
        vmovdqa64       %ymm5,%ymm21
        vmovdqa64       %ymm6,%ymm22
        vmovdqa64       %ymm7,%ymm23
        vmovdqa64       %ymm8,%ymm24
        vmovdqa64       %ymm9,%ymm25
        vmovdqa64       %ymm10,%ymm26
        vmovdqa64       %ymm11,%ymm27
        vmovdqa64       %ymm12,%ymm28
        vmovdqa64       %ymm13,%ymm29
        vmovdqa64       %ymm14,%ymm30
        vmovdqa64       %ymm15,%ymm31

        mov             $10,%eax

.Ldoubleround8:
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        vpaddd          %ymm0,%ymm4,%ymm0
        vpxord          %ymm0,%ymm12,%ymm12
        vprold          $16,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        vpaddd          %ymm1,%ymm5,%ymm1
        vpxord          %ymm1,%ymm13,%ymm13
        vprold          $16,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        vpaddd          %ymm2,%ymm6,%ymm2
        vpxord          %ymm2,%ymm14,%ymm14
        vprold          $16,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        vpaddd          %ymm3,%ymm7,%ymm3
        vpxord          %ymm3,%ymm15,%ymm15
        vprold          $16,%ymm15,%ymm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        vpaddd          %ymm12,%ymm8,%ymm8
        vpxord          %ymm8,%ymm4,%ymm4
        vprold          $12,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        vpaddd          %ymm13,%ymm9,%ymm9
        vpxord          %ymm9,%ymm5,%ymm5
        vprold          $12,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        vpaddd          %ymm14,%ymm10,%ymm10
        vpxord          %ymm10,%ymm6,%ymm6
        vprold          $12,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        vpaddd          %ymm15,%ymm11,%ymm11
        vpxord          %ymm11,%ymm7,%ymm7
        vprold          $12,%ymm7,%ymm7

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        vpaddd          %ymm0,%ymm4,%ymm0
        vpxord          %ymm0,%ymm12,%ymm12
        vprold          $8,%ymm12,%ymm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        vpaddd          %ymm1,%ymm5,%ymm1
        vpxord          %ymm1,%ymm13,%ymm13
        vprold          $8,%ymm13,%ymm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        vpaddd          %ymm2,%ymm6,%ymm2
        vpxord          %ymm2,%ymm14,%ymm14
        vprold          $8,%ymm14,%ymm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        vpaddd          %ymm3,%ymm7,%ymm3
        vpxord          %ymm3,%ymm15,%ymm15
        vprold          $8,%ymm15,%ymm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        vpaddd          %ymm12,%ymm8,%ymm8
        vpxord          %ymm8,%ymm4,%ymm4
        vprold          $7,%ymm4,%ymm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        vpaddd          %ymm13,%ymm9,%ymm9
        vpxord          %ymm9,%ymm5,%ymm5
        vprold          $7,%ymm5,%ymm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        vpaddd          %ymm14,%ymm10,%ymm10
        vpxord          %ymm10,%ymm6,%ymm6
        vprold          $7,%ymm6,%ymm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        vpaddd          %ymm15,%ymm11,%ymm11
        vpxord          %ymm11,%ymm7,%ymm7
        vprold          $7,%ymm7,%ymm7

        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        vpaddd          %ymm0,%ymm5,%ymm0
        vpxord          %ymm0,%ymm15,%ymm15
        vprold          $16,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        vpaddd          %ymm1,%ymm6,%ymm1
        vpxord          %ymm1,%ymm12,%ymm12
        vprold          $16,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        vpaddd          %ymm2,%ymm7,%ymm2
        vpxord          %ymm2,%ymm13,%ymm13
        vprold          $16,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        vpaddd          %ymm3,%ymm4,%ymm3
        vpxord          %ymm3,%ymm14,%ymm14
        vprold          $16,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        vpaddd          %ymm15,%ymm10,%ymm10
        vpxord          %ymm10,%ymm5,%ymm5
        vprold          $12,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        vpaddd          %ymm12,%ymm11,%ymm11
        vpxord          %ymm11,%ymm6,%ymm6
        vprold          $12,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        vpaddd          %ymm13,%ymm8,%ymm8
        vpxord          %ymm8,%ymm7,%ymm7
        vprold          $12,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        vpaddd          %ymm14,%ymm9,%ymm9
        vpxord          %ymm9,%ymm4,%ymm4
        vprold          $12,%ymm4,%ymm4

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        vpaddd          %ymm0,%ymm5,%ymm0
        vpxord          %ymm0,%ymm15,%ymm15
        vprold          $8,%ymm15,%ymm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        vpaddd          %ymm1,%ymm6,%ymm1
        vpxord          %ymm1,%ymm12,%ymm12
        vprold          $8,%ymm12,%ymm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        vpaddd          %ymm2,%ymm7,%ymm2
        vpxord          %ymm2,%ymm13,%ymm13
        vprold          $8,%ymm13,%ymm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        vpaddd          %ymm3,%ymm4,%ymm3
        vpxord          %ymm3,%ymm14,%ymm14
        vprold          $8,%ymm14,%ymm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        vpaddd          %ymm15,%ymm10,%ymm10
        vpxord          %ymm10,%ymm5,%ymm5
        vprold          $7,%ymm5,%ymm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        vpaddd          %ymm12,%ymm11,%ymm11
        vpxord          %ymm11,%ymm6,%ymm6
        vprold          $7,%ymm6,%ymm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        vpaddd          %ymm13,%ymm8,%ymm8
        vpxord          %ymm8,%ymm7,%ymm7
        vprold          $7,%ymm7,%ymm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        vpaddd          %ymm14,%ymm9,%ymm9
        vpxord          %ymm9,%ymm4,%ymm4
        vprold          $7,%ymm4,%ymm4

        dec             %eax
        jnz             .Ldoubleround8

        # x0..15[0-7] += s[0..15]
        vpaddd          %ymm16,%ymm0,%ymm0
        vpaddd          %ymm17,%ymm1,%ymm1
        vpaddd          %ymm18,%ymm2,%ymm2
        vpaddd          %ymm19,%ymm3,%ymm3
        vpaddd          %ymm20,%ymm4,%ymm4
        vpaddd          %ymm21,%ymm5,%ymm5
        vpaddd          %ymm22,%ymm6,%ymm6
        vpaddd          %ymm23,%ymm7,%ymm7
        vpaddd          %ymm24,%ymm8,%ymm8
        vpaddd          %ymm25,%ymm9,%ymm9
        vpaddd          %ymm26,%ymm10,%ymm10
        vpaddd          %ymm27,%ymm11,%ymm11
        vpaddd          %ymm28,%ymm12,%ymm12
        vpaddd          %ymm29,%ymm13,%ymm13
        vpaddd          %ymm30,%ymm14,%ymm14
        vpaddd          %ymm31,%ymm15,%ymm15

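        # The state is still "transposed": register n holds word n of all
        # eight blocks. The interleave steps below rebuild eight consecutive
        # 64-byte keystream blocks so they can be XORed against the input with
        # full-width loads and stores.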
        # interleave 32-bit words in state n, n+1
        vpunpckldq      %ymm1,%ymm0,%ymm16
        vpunpckhdq      %ymm1,%ymm0,%ymm17
        vpunpckldq      %ymm3,%ymm2,%ymm18
        vpunpckhdq      %ymm3,%ymm2,%ymm19
        vpunpckldq      %ymm5,%ymm4,%ymm20
        vpunpckhdq      %ymm5,%ymm4,%ymm21
        vpunpckldq      %ymm7,%ymm6,%ymm22
        vpunpckhdq      %ymm7,%ymm6,%ymm23
        vpunpckldq      %ymm9,%ymm8,%ymm24
        vpunpckhdq      %ymm9,%ymm8,%ymm25
        vpunpckldq      %ymm11,%ymm10,%ymm26
        vpunpckhdq      %ymm11,%ymm10,%ymm27
        vpunpckldq      %ymm13,%ymm12,%ymm28
        vpunpckhdq      %ymm13,%ymm12,%ymm29
        vpunpckldq      %ymm15,%ymm14,%ymm30
        vpunpckhdq      %ymm15,%ymm14,%ymm31

        # interleave 64-bit words in state n, n+2
        vpunpcklqdq     %ymm18,%ymm16,%ymm0
        vpunpcklqdq     %ymm19,%ymm17,%ymm1
        vpunpckhqdq     %ymm18,%ymm16,%ymm2
        vpunpckhqdq     %ymm19,%ymm17,%ymm3
        vpunpcklqdq     %ymm22,%ymm20,%ymm4
        vpunpcklqdq     %ymm23,%ymm21,%ymm5
        vpunpckhqdq     %ymm22,%ymm20,%ymm6
        vpunpckhqdq     %ymm23,%ymm21,%ymm7
        vpunpcklqdq     %ymm26,%ymm24,%ymm8
        vpunpcklqdq     %ymm27,%ymm25,%ymm9
        vpunpckhqdq     %ymm26,%ymm24,%ymm10
        vpunpckhqdq     %ymm27,%ymm25,%ymm11
        vpunpcklqdq     %ymm30,%ymm28,%ymm12
        vpunpcklqdq     %ymm31,%ymm29,%ymm13
        vpunpckhqdq     %ymm30,%ymm28,%ymm14
        vpunpckhqdq     %ymm31,%ymm29,%ymm15

        # interleave 128-bit words in state n, n+4
        # xor/write first four blocks
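        # The final interleave level is folded into the output: each
        # vperm2i128 $0x20 combines the low 128-bit lanes for the chunk
        # written now, while the matching $0x31 keeps the high lanes for the
        # remaining blocks written further below.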
        vmovdqa64       %ymm0,%ymm16
        vperm2i128      $0x20,%ymm4,%ymm0,%ymm0
        cmp             $0x0020,%rcx
        jl              .Lxorpart8
        vpxord          0x0000(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0000(%rsi)
        vmovdqa64       %ymm16,%ymm0
        vperm2i128      $0x31,%ymm4,%ymm0,%ymm4

        vperm2i128      $0x20,%ymm12,%ymm8,%ymm0
        cmp             $0x0040,%rcx
        jl              .Lxorpart8
        vpxord          0x0020(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0020(%rsi)
        vperm2i128      $0x31,%ymm12,%ymm8,%ymm12

        vperm2i128      $0x20,%ymm6,%ymm2,%ymm0
        cmp             $0x0060,%rcx
        jl              .Lxorpart8
        vpxord          0x0040(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0040(%rsi)
        vperm2i128      $0x31,%ymm6,%ymm2,%ymm6

        vperm2i128      $0x20,%ymm14,%ymm10,%ymm0
        cmp             $0x0080,%rcx
        jl              .Lxorpart8
        vpxord          0x0060(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0060(%rsi)
        vperm2i128      $0x31,%ymm14,%ymm10,%ymm14

        vperm2i128      $0x20,%ymm5,%ymm1,%ymm0
        cmp             $0x00a0,%rcx
        jl              .Lxorpart8
        vpxord          0x0080(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0080(%rsi)
        vperm2i128      $0x31,%ymm5,%ymm1,%ymm5

        vperm2i128      $0x20,%ymm13,%ymm9,%ymm0
        cmp             $0x00c0,%rcx
        jl              .Lxorpart8
        vpxord          0x00a0(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x00a0(%rsi)
        vperm2i128      $0x31,%ymm13,%ymm9,%ymm13

        vperm2i128      $0x20,%ymm7,%ymm3,%ymm0
        cmp             $0x00e0,%rcx
        jl              .Lxorpart8
        vpxord          0x00c0(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x00c0(%rsi)
        vperm2i128      $0x31,%ymm7,%ymm3,%ymm7

        vperm2i128      $0x20,%ymm15,%ymm11,%ymm0
        cmp             $0x0100,%rcx
        jl              .Lxorpart8
        vpxord          0x00e0(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x00e0(%rsi)
        vperm2i128      $0x31,%ymm15,%ymm11,%ymm15

        # xor remaining blocks, write to output
        vmovdqa64       %ymm4,%ymm0
        cmp             $0x0120,%rcx
        jl              .Lxorpart8
        vpxord          0x0100(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0100(%rsi)

        vmovdqa64       %ymm12,%ymm0
        cmp             $0x0140,%rcx
        jl              .Lxorpart8
        vpxord          0x0120(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0120(%rsi)

        vmovdqa64       %ymm6,%ymm0
        cmp             $0x0160,%rcx
        jl              .Lxorpart8
        vpxord          0x0140(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0140(%rsi)

        vmovdqa64       %ymm14,%ymm0
        cmp             $0x0180,%rcx
        jl              .Lxorpart8
        vpxord          0x0160(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0160(%rsi)

        vmovdqa64       %ymm5,%ymm0
        cmp             $0x01a0,%rcx
        jl              .Lxorpart8
        vpxord          0x0180(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x0180(%rsi)

        vmovdqa64       %ymm13,%ymm0
        cmp             $0x01c0,%rcx
        jl              .Lxorpart8
        vpxord          0x01a0(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x01a0(%rsi)

        vmovdqa64       %ymm7,%ymm0
        cmp             $0x01e0,%rcx
        jl              .Lxorpart8
        vpxord          0x01c0(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x01c0(%rsi)

        vmovdqa64       %ymm15,%ymm0
        cmp             $0x0200,%rcx
        jl              .Lxorpart8
        vpxord          0x01e0(%rdx),%ymm0,%ymm0
        vmovdqu64       %ymm0,0x01e0(%rsi)

.Ldone8:
        vzeroupper
        ret

.Lxorpart8:
        # xor remaining bytes from partial register into output
        mov             %rcx,%rax
        and             $0x1f,%rcx
        jz              .Ldone8
        mov             %rax,%r9
        and             $~0x1f,%r9

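        # same masking scheme as .Lxorpart2, but with 32-byte granularity:
        # %k1 gets the low (%rcx & 0x1f) bits set for the tail of the chunk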
        mov             $1,%rax
        shld            %cl,%rax,%rax
        sub             $1,%rax
        kmovq           %rax,%k1

        vmovdqu8        (%rdx,%r9),%ymm1{%k1}{z}
        vpxord          %ymm0,%ymm1,%ymm1
        vmovdqu8        %ymm1,(%rsi,%r9){%k1}

        jmp             .Ldone8

ENDPROC(chacha20_8block_xor_avx512vl)