crypto: x86/chacha20 - refactor to allow varying number of rounds
arch/x86/crypto/chacha-avx2-x86_64.S (uclinux-h8/linux.git)
1 /*
2  * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
3  *
4  * Copyright (C) 2015 Martin Willi
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  */
11
12 #include <linux/linkage.h>
13
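# Summary of the constants below, added for clarity:
#  ROT8/ROT16 - vpshufb masks that rotate each 32-bit word left by 8/16 bits
#  CTRINC     - per-lane block counter increments 0..7 for the 8-block routine
#  CTR2BL     - adds 0/1 to the block counter in the low/high 128-bit lane
#  CTR4BL     - adds 2/3 to the block counter for the second block pair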
14 .section        .rodata.cst32.ROT8, "aM", @progbits, 32
15 .align 32
16 ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003
17         .octa 0x0e0d0c0f0a09080b0605040702010003
18
19 .section        .rodata.cst32.ROT16, "aM", @progbits, 32
20 .align 32
21 ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
22         .octa 0x0d0c0f0e09080b0a0504070601000302
23
24 .section        .rodata.cst32.CTRINC, "aM", @progbits, 32
25 .align 32
26 CTRINC: .octa 0x00000003000000020000000100000000
27         .octa 0x00000007000000060000000500000004
28
29 .section        .rodata.cst32.CTR2BL, "aM", @progbits, 32
30 .align 32
31 CTR2BL: .octa 0x00000000000000000000000000000000
32         .octa 0x00000000000000000000000000000001
33
34 .section        .rodata.cst32.CTR4BL, "aM", @progbits, 32
35 .align 32
36 CTR4BL: .octa 0x00000000000000000000000000000002
37         .octa 0x00000000000000000000000000000003
38
39 .text
40
41 ENTRY(chacha_2block_xor_avx2)
42         # %rdi: Input state matrix, s
43         # %rsi: up to 2 data blocks output, o
44         # %rdx: up to 2 data blocks input, i
45         # %rcx: input/output length in bytes
46         # %r8d: nrounds
47
48         # This function encrypts two ChaCha blocks by loading the state
49         # matrix twice across four AVX registers. It performs matrix operations
50         # on four words in each matrix in parallel, but requires shuffling to
51         # rearrange the words after each round.
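        #
        # A rough sketch of the C prototype this routine is assumed to match
        # (derived from the register comments above; the actual declaration
        # lives in the C glue code, not in this file):
        #
        #   void chacha_2block_xor_avx2(u32 *state, u8 *dst, const u8 *src,
        #                               unsigned int len, int nrounds);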
52
53         vzeroupper
54
55         # x0..3[0-1] = s0..3
56         vbroadcasti128  0x00(%rdi),%ymm0
57         vbroadcasti128  0x10(%rdi),%ymm1
58         vbroadcasti128  0x20(%rdi),%ymm2
59         vbroadcasti128  0x30(%rdi),%ymm3
60
61         vpaddd          CTR2BL(%rip),%ymm3,%ymm3
62
63         vmovdqa         %ymm0,%ymm8
64         vmovdqa         %ymm1,%ymm9
65         vmovdqa         %ymm2,%ymm10
66         vmovdqa         %ymm3,%ymm11
67
68         vmovdqa         ROT8(%rip),%ymm4
69         vmovdqa         ROT16(%rip),%ymm5
70
71         mov             %rcx,%rax
72
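        # Each iteration of .Ldoubleround below is one ChaCha double round:
        # a column round of quarter-rounds followed by a diagonal round, the
        # diagonals being formed by the vpshufd word rotations. Roughly, each
        # quarter-round computes:
        #   a += b; d ^= a; d = rotl32(d, 16);
        #   c += d; b ^= c; b = rotl32(b, 12);
        #   a += b; d ^= a; d = rotl32(d,  8);
        #   c += d; b ^= c; b = rotl32(b,  7);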
73 .Ldoubleround:
74
75         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
76         vpaddd          %ymm1,%ymm0,%ymm0
77         vpxor           %ymm0,%ymm3,%ymm3
78         vpshufb         %ymm5,%ymm3,%ymm3
79
80         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
81         vpaddd          %ymm3,%ymm2,%ymm2
82         vpxor           %ymm2,%ymm1,%ymm1
83         vmovdqa         %ymm1,%ymm6
84         vpslld          $12,%ymm6,%ymm6
85         vpsrld          $20,%ymm1,%ymm1
86         vpor            %ymm6,%ymm1,%ymm1
87
88         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
89         vpaddd          %ymm1,%ymm0,%ymm0
90         vpxor           %ymm0,%ymm3,%ymm3
91         vpshufb         %ymm4,%ymm3,%ymm3
92
93         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
94         vpaddd          %ymm3,%ymm2,%ymm2
95         vpxor           %ymm2,%ymm1,%ymm1
96         vmovdqa         %ymm1,%ymm7
97         vpslld          $7,%ymm7,%ymm7
98         vpsrld          $25,%ymm1,%ymm1
99         vpor            %ymm7,%ymm1,%ymm1
100
101         # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
102         vpshufd         $0x39,%ymm1,%ymm1
103         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
104         vpshufd         $0x4e,%ymm2,%ymm2
105         # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
106         vpshufd         $0x93,%ymm3,%ymm3
107
108         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
109         vpaddd          %ymm1,%ymm0,%ymm0
110         vpxor           %ymm0,%ymm3,%ymm3
111         vpshufb         %ymm5,%ymm3,%ymm3
112
113         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
114         vpaddd          %ymm3,%ymm2,%ymm2
115         vpxor           %ymm2,%ymm1,%ymm1
116         vmovdqa         %ymm1,%ymm6
117         vpslld          $12,%ymm6,%ymm6
118         vpsrld          $20,%ymm1,%ymm1
119         vpor            %ymm6,%ymm1,%ymm1
120
121         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
122         vpaddd          %ymm1,%ymm0,%ymm0
123         vpxor           %ymm0,%ymm3,%ymm3
124         vpshufb         %ymm4,%ymm3,%ymm3
125
126         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
127         vpaddd          %ymm3,%ymm2,%ymm2
128         vpxor           %ymm2,%ymm1,%ymm1
129         vmovdqa         %ymm1,%ymm7
130         vpslld          $7,%ymm7,%ymm7
131         vpsrld          $25,%ymm1,%ymm1
132         vpor            %ymm7,%ymm1,%ymm1
133
134         # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
135         vpshufd         $0x93,%ymm1,%ymm1
136         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
137         vpshufd         $0x4e,%ymm2,%ymm2
138         # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
139         vpshufd         $0x39,%ymm3,%ymm3
140
141         sub             $2,%r8d
142         jnz             .Ldoubleround
143
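        # XOR and write the keystream in 16-byte chunks while at least a full
        # chunk of input remains; the first chunk that would overrun the
        # requested length falls through to .Lxorpart2 with its keystream
        # still held in %xmm7.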
144         # o0 = i0 ^ (x0 + s0)
145         vpaddd          %ymm8,%ymm0,%ymm7
146         cmp             $0x10,%rax
147         jl              .Lxorpart2
148         vpxor           0x00(%rdx),%xmm7,%xmm6
149         vmovdqu         %xmm6,0x00(%rsi)
150         vextracti128    $1,%ymm7,%xmm0
151         # o1 = i1 ^ (x1 + s1)
152         vpaddd          %ymm9,%ymm1,%ymm7
153         cmp             $0x20,%rax
154         jl              .Lxorpart2
155         vpxor           0x10(%rdx),%xmm7,%xmm6
156         vmovdqu         %xmm6,0x10(%rsi)
157         vextracti128    $1,%ymm7,%xmm1
158         # o2 = i2 ^ (x2 + s2)
159         vpaddd          %ymm10,%ymm2,%ymm7
160         cmp             $0x30,%rax
161         jl              .Lxorpart2
162         vpxor           0x20(%rdx),%xmm7,%xmm6
163         vmovdqu         %xmm6,0x20(%rsi)
164         vextracti128    $1,%ymm7,%xmm2
165         # o3 = i3 ^ (x3 + s3)
166         vpaddd          %ymm11,%ymm3,%ymm7
167         cmp             $0x40,%rax
168         jl              .Lxorpart2
169         vpxor           0x30(%rdx),%xmm7,%xmm6
170         vmovdqu         %xmm6,0x30(%rsi)
171         vextracti128    $1,%ymm7,%xmm3
172
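        # The high 128-bit lanes extracted above into %xmm0-%xmm3 hold the
        # keystream for the second block.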
173         # xor and write second block
174         vmovdqa         %xmm0,%xmm7
175         cmp             $0x50,%rax
176         jl              .Lxorpart2
177         vpxor           0x40(%rdx),%xmm7,%xmm6
178         vmovdqu         %xmm6,0x40(%rsi)
179
180         vmovdqa         %xmm1,%xmm7
181         cmp             $0x60,%rax
182         jl              .Lxorpart2
183         vpxor           0x50(%rdx),%xmm7,%xmm6
184         vmovdqu         %xmm6,0x50(%rsi)
185
186         vmovdqa         %xmm2,%xmm7
187         cmp             $0x70,%rax
188         jl              .Lxorpart2
189         vpxor           0x60(%rdx),%xmm7,%xmm6
190         vmovdqu         %xmm6,0x60(%rsi)
191
192         vmovdqa         %xmm3,%xmm7
193         cmp             $0x80,%rax
194         jl              .Lxorpart2
195         vpxor           0x70(%rdx),%xmm7,%xmm6
196         vmovdqu         %xmm6,0x70(%rsi)
197
198 .Ldone2:
199         vzeroupper
200         ret
201
202 .Lxorpart2:
203         # xor remaining bytes from partial register into output
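        # Roughly (C-like pseudocode; 'keystream' is the 16-byte chunk left
        # in %xmm7 by the code above):
        #   partial = len & 15;  offset = len & ~15;
        #   memcpy(buf, src + offset, partial);   /* buf: aligned stack area */
        #   buf ^= keystream;                     /* full 16-byte vector XOR */
        #   memcpy(dst + offset, buf, partial);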
204         mov             %rax,%r9
205         and             $0x0f,%r9
206         jz              .Ldone2
207         and             $~0x0f,%rax
208
209         mov             %rsi,%r11
210
211         lea             8(%rsp),%r10
212         sub             $0x10,%rsp
213         and             $~31,%rsp
214
215         lea             (%rdx,%rax),%rsi
216         mov             %rsp,%rdi
217         mov             %r9,%rcx
218         rep movsb
219
220         vpxor           0x00(%rsp),%xmm7,%xmm7
221         vmovdqa         %xmm7,0x00(%rsp)
222
223         mov             %rsp,%rsi
224         lea             (%r11,%rax),%rdi
225         mov             %r9,%rcx
226         rep movsb
227
228         lea             -8(%r10),%rsp
229         jmp             .Ldone2
230
231 ENDPROC(chacha_2block_xor_avx2)
232
233 ENTRY(chacha_4block_xor_avx2)
234         # %rdi: Input state matrix, s
235         # %rsi: up to 4 data blocks output, o
236         # %rdx: up to 4 data blocks input, i
237         # %rcx: input/output length in bytes
238         # %r8d: nrounds
239
240         # This function encrypts four ChaCha blocks by loading the state
241         # matrix four times across eight AVX registers. It performs matrix
242         # operations on four words in two matrices in parallel, interleaved
243         # with the operations on the four words of the other two matrices.
244         # Since the required word shuffling has a rather high latency, we can
245         # do the arithmetic on two matrix-pairs without much slowdown.
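        #
        # Register layout during the rounds (as set up below): the first block
        # pair lives in %ymm0-3 with block counters +0/+1 (CTR2BL), the second
        # pair in %ymm4-7 with counters +2/+3 (CTR4BL); %ymm11-15 keep copies
        # of the per-block initial state for the final additions, %ymm8/%ymm9
        # hold the ROT8/ROT16 shuffle masks and %ymm10 is scratch. The calling
        # convention matches chacha_2block_xor_avx2 above.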
246
247         vzeroupper
248
249         # x0..3[0-3] = s0..3
250         vbroadcasti128  0x00(%rdi),%ymm0
251         vbroadcasti128  0x10(%rdi),%ymm1
252         vbroadcasti128  0x20(%rdi),%ymm2
253         vbroadcasti128  0x30(%rdi),%ymm3
254
255         vmovdqa         %ymm0,%ymm4
256         vmovdqa         %ymm1,%ymm5
257         vmovdqa         %ymm2,%ymm6
258         vmovdqa         %ymm3,%ymm7
259
260         vpaddd          CTR2BL(%rip),%ymm3,%ymm3
261         vpaddd          CTR4BL(%rip),%ymm7,%ymm7
262
263         vmovdqa         %ymm0,%ymm11
264         vmovdqa         %ymm1,%ymm12
265         vmovdqa         %ymm2,%ymm13
266         vmovdqa         %ymm3,%ymm14
267         vmovdqa         %ymm7,%ymm15
268
269         vmovdqa         ROT8(%rip),%ymm8
270         vmovdqa         ROT16(%rip),%ymm9
271
272         mov             %rcx,%rax
273
274 .Ldoubleround4:
275
276         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
277         vpaddd          %ymm1,%ymm0,%ymm0
278         vpxor           %ymm0,%ymm3,%ymm3
279         vpshufb         %ymm9,%ymm3,%ymm3
280
281         vpaddd          %ymm5,%ymm4,%ymm4
282         vpxor           %ymm4,%ymm7,%ymm7
283         vpshufb         %ymm9,%ymm7,%ymm7
284
285         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
286         vpaddd          %ymm3,%ymm2,%ymm2
287         vpxor           %ymm2,%ymm1,%ymm1
288         vmovdqa         %ymm1,%ymm10
289         vpslld          $12,%ymm10,%ymm10
290         vpsrld          $20,%ymm1,%ymm1
291         vpor            %ymm10,%ymm1,%ymm1
292
293         vpaddd          %ymm7,%ymm6,%ymm6
294         vpxor           %ymm6,%ymm5,%ymm5
295         vmovdqa         %ymm5,%ymm10
296         vpslld          $12,%ymm10,%ymm10
297         vpsrld          $20,%ymm5,%ymm5
298         vpor            %ymm10,%ymm5,%ymm5
299
300         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
301         vpaddd          %ymm1,%ymm0,%ymm0
302         vpxor           %ymm0,%ymm3,%ymm3
303         vpshufb         %ymm8,%ymm3,%ymm3
304
305         vpaddd          %ymm5,%ymm4,%ymm4
306         vpxor           %ymm4,%ymm7,%ymm7
307         vpshufb         %ymm8,%ymm7,%ymm7
308
309         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
310         vpaddd          %ymm3,%ymm2,%ymm2
311         vpxor           %ymm2,%ymm1,%ymm1
312         vmovdqa         %ymm1,%ymm10
313         vpslld          $7,%ymm10,%ymm10
314         vpsrld          $25,%ymm1,%ymm1
315         vpor            %ymm10,%ymm1,%ymm1
316
317         vpaddd          %ymm7,%ymm6,%ymm6
318         vpxor           %ymm6,%ymm5,%ymm5
319         vmovdqa         %ymm5,%ymm10
320         vpslld          $7,%ymm10,%ymm10
321         vpsrld          $25,%ymm5,%ymm5
322         vpor            %ymm10,%ymm5,%ymm5
323
324         # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
325         vpshufd         $0x39,%ymm1,%ymm1
326         vpshufd         $0x39,%ymm5,%ymm5
327         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
328         vpshufd         $0x4e,%ymm2,%ymm2
329         vpshufd         $0x4e,%ymm6,%ymm6
330         # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
331         vpshufd         $0x93,%ymm3,%ymm3
332         vpshufd         $0x93,%ymm7,%ymm7
333
334         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
335         vpaddd          %ymm1,%ymm0,%ymm0
336         vpxor           %ymm0,%ymm3,%ymm3
337         vpshufb         %ymm9,%ymm3,%ymm3
338
339         vpaddd          %ymm5,%ymm4,%ymm4
340         vpxor           %ymm4,%ymm7,%ymm7
341         vpshufb         %ymm9,%ymm7,%ymm7
342
343         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
344         vpaddd          %ymm3,%ymm2,%ymm2
345         vpxor           %ymm2,%ymm1,%ymm1
346         vmovdqa         %ymm1,%ymm10
347         vpslld          $12,%ymm10,%ymm10
348         vpsrld          $20,%ymm1,%ymm1
349         vpor            %ymm10,%ymm1,%ymm1
350
351         vpaddd          %ymm7,%ymm6,%ymm6
352         vpxor           %ymm6,%ymm5,%ymm5
353         vmovdqa         %ymm5,%ymm10
354         vpslld          $12,%ymm10,%ymm10
355         vpsrld          $20,%ymm5,%ymm5
356         vpor            %ymm10,%ymm5,%ymm5
357
358         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
359         vpaddd          %ymm1,%ymm0,%ymm0
360         vpxor           %ymm0,%ymm3,%ymm3
361         vpshufb         %ymm8,%ymm3,%ymm3
362
363         vpaddd          %ymm5,%ymm4,%ymm4
364         vpxor           %ymm4,%ymm7,%ymm7
365         vpshufb         %ymm8,%ymm7,%ymm7
366
367         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
368         vpaddd          %ymm3,%ymm2,%ymm2
369         vpxor           %ymm2,%ymm1,%ymm1
370         vmovdqa         %ymm1,%ymm10
371         vpslld          $7,%ymm10,%ymm10
372         vpsrld          $25,%ymm1,%ymm1
373         vpor            %ymm10,%ymm1,%ymm1
374
375         vpaddd          %ymm7,%ymm6,%ymm6
376         vpxor           %ymm6,%ymm5,%ymm5
377         vmovdqa         %ymm5,%ymm10
378         vpslld          $7,%ymm10,%ymm10
379         vpsrld          $25,%ymm5,%ymm5
380         vpor            %ymm10,%ymm5,%ymm5
381
382         # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
383         vpshufd         $0x93,%ymm1,%ymm1
384         vpshufd         $0x93,%ymm5,%ymm5
385         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
386         vpshufd         $0x4e,%ymm2,%ymm2
387         vpshufd         $0x4e,%ymm6,%ymm6
388         # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
389         vpshufd         $0x39,%ymm3,%ymm3
390         vpshufd         $0x39,%ymm7,%ymm7
391
392         sub             $2,%r8d
393         jnz             .Ldoubleround4
394
395         # o0 = i0 ^ (x0 + s0), first block
396         vpaddd          %ymm11,%ymm0,%ymm10
397         cmp             $0x10,%rax
398         jl              .Lxorpart4
399         vpxor           0x00(%rdx),%xmm10,%xmm9
400         vmovdqu         %xmm9,0x00(%rsi)
401         vextracti128    $1,%ymm10,%xmm0
402         # o1 = i1 ^ (x1 + s1), first block
403         vpaddd          %ymm12,%ymm1,%ymm10
404         cmp             $0x20,%rax
405         jl              .Lxorpart4
406         vpxor           0x10(%rdx),%xmm10,%xmm9
407         vmovdqu         %xmm9,0x10(%rsi)
408         vextracti128    $1,%ymm10,%xmm1
409         # o2 = i2 ^ (x2 + s2), first block
410         vpaddd          %ymm13,%ymm2,%ymm10
411         cmp             $0x30,%rax
412         jl              .Lxorpart4
413         vpxor           0x20(%rdx),%xmm10,%xmm9
414         vmovdqu         %xmm9,0x20(%rsi)
415         vextracti128    $1,%ymm10,%xmm2
416         # o3 = i3 ^ (x3 + s3), first block
417         vpaddd          %ymm14,%ymm3,%ymm10
418         cmp             $0x40,%rax
419         jl              .Lxorpart4
420         vpxor           0x30(%rdx),%xmm10,%xmm9
421         vmovdqu         %xmm9,0x30(%rsi)
422         vextracti128    $1,%ymm10,%xmm3
423
424         # xor and write second block
425         vmovdqa         %xmm0,%xmm10
426         cmp             $0x50,%rax
427         jl              .Lxorpart4
428         vpxor           0x40(%rdx),%xmm10,%xmm9
429         vmovdqu         %xmm9,0x40(%rsi)
430
431         vmovdqa         %xmm1,%xmm10
432         cmp             $0x60,%rax
433         jl              .Lxorpart4
434         vpxor           0x50(%rdx),%xmm10,%xmm9
435         vmovdqu         %xmm9,0x50(%rsi)
436
437         vmovdqa         %xmm2,%xmm10
438         cmp             $0x70,%rax
439         jl              .Lxorpart4
440         vpxor           0x60(%rdx),%xmm10,%xmm9
441         vmovdqu         %xmm9,0x60(%rsi)
442
443         vmovdqa         %xmm3,%xmm10
444         cmp             $0x80,%rax
445         jl              .Lxorpart4
446         vpxor           0x70(%rdx),%xmm10,%xmm9
447         vmovdqu         %xmm9,0x70(%rsi)
448
449         # o0 = i0 ^ (x0 + s0), third block
450         vpaddd          %ymm11,%ymm4,%ymm10
451         cmp             $0x90,%rax
452         jl              .Lxorpart4
453         vpxor           0x80(%rdx),%xmm10,%xmm9
454         vmovdqu         %xmm9,0x80(%rsi)
455         vextracti128    $1,%ymm10,%xmm4
456         # o1 = i1 ^ (x1 + s1), third block
457         vpaddd          %ymm12,%ymm5,%ymm10
458         cmp             $0xa0,%rax
459         jl              .Lxorpart4
460         vpxor           0x90(%rdx),%xmm10,%xmm9
461         vmovdqu         %xmm9,0x90(%rsi)
462         vextracti128    $1,%ymm10,%xmm5
463         # o2 = i2 ^ (x2 + s2), third block
464         vpaddd          %ymm13,%ymm6,%ymm10
465         cmp             $0xb0,%rax
466         jl              .Lxorpart4
467         vpxor           0xa0(%rdx),%xmm10,%xmm9
468         vmovdqu         %xmm9,0xa0(%rsi)
469         vextracti128    $1,%ymm10,%xmm6
470         # o3 = i3 ^ (x3 + s3), third block
471         vpaddd          %ymm15,%ymm7,%ymm10
472         cmp             $0xc0,%rax
473         jl              .Lxorpart4
474         vpxor           0xb0(%rdx),%xmm10,%xmm9
475         vmovdqu         %xmm9,0xb0(%rsi)
476         vextracti128    $1,%ymm10,%xmm7
477
478         # xor and write fourth block
479         vmovdqa         %xmm4,%xmm10
480         cmp             $0xd0,%rax
481         jl              .Lxorpart4
482         vpxor           0xc0(%rdx),%xmm10,%xmm9
483         vmovdqu         %xmm9,0xc0(%rsi)
484
485         vmovdqa         %xmm5,%xmm10
486         cmp             $0xe0,%rax
487         jl              .Lxorpart4
488         vpxor           0xd0(%rdx),%xmm10,%xmm9
489         vmovdqu         %xmm9,0xd0(%rsi)
490
491         vmovdqa         %xmm6,%xmm10
492         cmp             $0xf0,%rax
493         jl              .Lxorpart4
494         vpxor           0xe0(%rdx),%xmm10,%xmm9
495         vmovdqu         %xmm9,0xe0(%rsi)
496
497         vmovdqa         %xmm7,%xmm10
498         cmp             $0x100,%rax
499         jl              .Lxorpart4
500         vpxor           0xf0(%rdx),%xmm10,%xmm9
501         vmovdqu         %xmm9,0xf0(%rsi)
502
503 .Ldone4:
504         vzeroupper
505         ret
506
507 .Lxorpart4:
508         # xor remaining bytes from partial register into output
509         mov             %rax,%r9
510         and             $0x0f,%r9
511         jz              .Ldone4
512         and             $~0x0f,%rax
513
514         mov             %rsi,%r11
515
516         lea             8(%rsp),%r10
517         sub             $0x10,%rsp
518         and             $~31,%rsp
519
520         lea             (%rdx,%rax),%rsi
521         mov             %rsp,%rdi
522         mov             %r9,%rcx
523         rep movsb
524
525         vpxor           0x00(%rsp),%xmm10,%xmm10
526         vmovdqa         %xmm10,0x00(%rsp)
527
528         mov             %rsp,%rsi
529         lea             (%r11,%rax),%rdi
530         mov             %r9,%rcx
531         rep movsb
532
533         lea             -8(%r10),%rsp
534         jmp             .Ldone4
535
536 ENDPROC(chacha_4block_xor_avx2)
537
538 ENTRY(chacha_8block_xor_avx2)
539         # %rdi: Input state matrix, s
540         # %rsi: up to 8 data blocks output, o
541         # %rdx: up to 8 data blocks input, i
542         # %rcx: input/output length in bytes
543         # %r8d: nrounds
544
545         # This function encrypts eight consecutive ChaCha blocks by loading
546         # the state matrix in AVX registers eight times. As we need some
547         # scratch registers, we save the first four registers on the stack. The
548         # algorithm performs each operation on the corresponding word of each
549         # state matrix, hence requires no word shuffling. For the final XOR
550         # step we transpose the matrix by interleaving 32-, 64- and then
551         # 128-bit words, which allows the XOR to be done in AVX registers.
552         # The 8/16-bit word rotations use the slightly better performing byte
553         # shuffling, while the 7/12-bit rotations use traditional shift+OR.
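        #
        # Register/stack layout during the rounds: the broadcast state words
        # x0..x3 live in the 32-byte stack slots at 0x00/0x20/0x40/0x60(%rsp),
        # x4..x15 stay in %ymm4-%ymm15, %ymm0 is scratch, and %ymm1/%ymm2/%ymm3
        # hold CTRINC, ROT8 and ROT16 respectively. The calling convention
        # matches the two- and four-block routines above.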
554
555         vzeroupper
556         # 4 * 32 byte stack, 32-byte aligned
557         lea             8(%rsp),%r10
558         and             $~31, %rsp
559         sub             $0x80, %rsp
560         mov             %rcx,%rax
561
562         # x0..15[0-7] = s[0..15]
563         vpbroadcastd    0x00(%rdi),%ymm0
564         vpbroadcastd    0x04(%rdi),%ymm1
565         vpbroadcastd    0x08(%rdi),%ymm2
566         vpbroadcastd    0x0c(%rdi),%ymm3
567         vpbroadcastd    0x10(%rdi),%ymm4
568         vpbroadcastd    0x14(%rdi),%ymm5
569         vpbroadcastd    0x18(%rdi),%ymm6
570         vpbroadcastd    0x1c(%rdi),%ymm7
571         vpbroadcastd    0x20(%rdi),%ymm8
572         vpbroadcastd    0x24(%rdi),%ymm9
573         vpbroadcastd    0x28(%rdi),%ymm10
574         vpbroadcastd    0x2c(%rdi),%ymm11
575         vpbroadcastd    0x30(%rdi),%ymm12
576         vpbroadcastd    0x34(%rdi),%ymm13
577         vpbroadcastd    0x38(%rdi),%ymm14
578         vpbroadcastd    0x3c(%rdi),%ymm15
579         # x0..3 on stack
580         vmovdqa         %ymm0,0x00(%rsp)
581         vmovdqa         %ymm1,0x20(%rsp)
582         vmovdqa         %ymm2,0x40(%rsp)
583         vmovdqa         %ymm3,0x60(%rsp)
584
585         vmovdqa         CTRINC(%rip),%ymm1
586         vmovdqa         ROT8(%rip),%ymm2
587         vmovdqa         ROT16(%rip),%ymm3
588
589         # x12 += counter values 0-7
590         vpaddd          %ymm1,%ymm12,%ymm12
591
592 .Ldoubleround8:
593         # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
594         vpaddd          0x00(%rsp),%ymm4,%ymm0
595         vmovdqa         %ymm0,0x00(%rsp)
596         vpxor           %ymm0,%ymm12,%ymm12
597         vpshufb         %ymm3,%ymm12,%ymm12
598         # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
599         vpaddd          0x20(%rsp),%ymm5,%ymm0
600         vmovdqa         %ymm0,0x20(%rsp)
601         vpxor           %ymm0,%ymm13,%ymm13
602         vpshufb         %ymm3,%ymm13,%ymm13
603         # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
604         vpaddd          0x40(%rsp),%ymm6,%ymm0
605         vmovdqa         %ymm0,0x40(%rsp)
606         vpxor           %ymm0,%ymm14,%ymm14
607         vpshufb         %ymm3,%ymm14,%ymm14
608         # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
609         vpaddd          0x60(%rsp),%ymm7,%ymm0
610         vmovdqa         %ymm0,0x60(%rsp)
611         vpxor           %ymm0,%ymm15,%ymm15
612         vpshufb         %ymm3,%ymm15,%ymm15
613
614         # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
615         vpaddd          %ymm12,%ymm8,%ymm8
616         vpxor           %ymm8,%ymm4,%ymm4
617         vpslld          $12,%ymm4,%ymm0
618         vpsrld          $20,%ymm4,%ymm4
619         vpor            %ymm0,%ymm4,%ymm4
620         # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
621         vpaddd          %ymm13,%ymm9,%ymm9
622         vpxor           %ymm9,%ymm5,%ymm5
623         vpslld          $12,%ymm5,%ymm0
624         vpsrld          $20,%ymm5,%ymm5
625         vpor            %ymm0,%ymm5,%ymm5
626         # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
627         vpaddd          %ymm14,%ymm10,%ymm10
628         vpxor           %ymm10,%ymm6,%ymm6
629         vpslld          $12,%ymm6,%ymm0
630         vpsrld          $20,%ymm6,%ymm6
631         vpor            %ymm0,%ymm6,%ymm6
632         # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
633         vpaddd          %ymm15,%ymm11,%ymm11
634         vpxor           %ymm11,%ymm7,%ymm7
635         vpslld          $12,%ymm7,%ymm0
636         vpsrld          $20,%ymm7,%ymm7
637         vpor            %ymm0,%ymm7,%ymm7
638
639         # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
640         vpaddd          0x00(%rsp),%ymm4,%ymm0
641         vmovdqa         %ymm0,0x00(%rsp)
642         vpxor           %ymm0,%ymm12,%ymm12
643         vpshufb         %ymm2,%ymm12,%ymm12
644         # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
645         vpaddd          0x20(%rsp),%ymm5,%ymm0
646         vmovdqa         %ymm0,0x20(%rsp)
647         vpxor           %ymm0,%ymm13,%ymm13
648         vpshufb         %ymm2,%ymm13,%ymm13
649         # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
650         vpaddd          0x40(%rsp),%ymm6,%ymm0
651         vmovdqa         %ymm0,0x40(%rsp)
652         vpxor           %ymm0,%ymm14,%ymm14
653         vpshufb         %ymm2,%ymm14,%ymm14
654         # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
655         vpaddd          0x60(%rsp),%ymm7,%ymm0
656         vmovdqa         %ymm0,0x60(%rsp)
657         vpxor           %ymm0,%ymm15,%ymm15
658         vpshufb         %ymm2,%ymm15,%ymm15
659
660         # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
661         vpaddd          %ymm12,%ymm8,%ymm8
662         vpxor           %ymm8,%ymm4,%ymm4
663         vpslld          $7,%ymm4,%ymm0
664         vpsrld          $25,%ymm4,%ymm4
665         vpor            %ymm0,%ymm4,%ymm4
666         # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
667         vpaddd          %ymm13,%ymm9,%ymm9
668         vpxor           %ymm9,%ymm5,%ymm5
669         vpslld          $7,%ymm5,%ymm0
670         vpsrld          $25,%ymm5,%ymm5
671         vpor            %ymm0,%ymm5,%ymm5
672         # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
673         vpaddd          %ymm14,%ymm10,%ymm10
674         vpxor           %ymm10,%ymm6,%ymm6
675         vpslld          $7,%ymm6,%ymm0
676         vpsrld          $25,%ymm6,%ymm6
677         vpor            %ymm0,%ymm6,%ymm6
678         # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
679         vpaddd          %ymm15,%ymm11,%ymm11
680         vpxor           %ymm11,%ymm7,%ymm7
681         vpslld          $7,%ymm7,%ymm0
682         vpsrld          $25,%ymm7,%ymm7
683         vpor            %ymm0,%ymm7,%ymm7
684
685         # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
686         vpaddd          0x00(%rsp),%ymm5,%ymm0
687         vmovdqa         %ymm0,0x00(%rsp)
688         vpxor           %ymm0,%ymm15,%ymm15
689         vpshufb         %ymm3,%ymm15,%ymm15
690         # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
691         vpaddd          0x20(%rsp),%ymm6,%ymm0
692         vmovdqa         %ymm0,0x20(%rsp)
693         vpxor           %ymm0,%ymm12,%ymm12
694         vpshufb         %ymm3,%ymm12,%ymm12
695         # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
696         vpaddd          0x40(%rsp),%ymm7,%ymm0
697         vmovdqa         %ymm0,0x40(%rsp)
698         vpxor           %ymm0,%ymm13,%ymm13
699         vpshufb         %ymm3,%ymm13,%ymm13
700         # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
701         vpaddd          0x60(%rsp),%ymm4,%ymm0
702         vmovdqa         %ymm0,0x60(%rsp)
703         vpxor           %ymm0,%ymm14,%ymm14
704         vpshufb         %ymm3,%ymm14,%ymm14
705
706         # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
707         vpaddd          %ymm15,%ymm10,%ymm10
708         vpxor           %ymm10,%ymm5,%ymm5
709         vpslld          $12,%ymm5,%ymm0
710         vpsrld          $20,%ymm5,%ymm5
711         vpor            %ymm0,%ymm5,%ymm5
712         # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
713         vpaddd          %ymm12,%ymm11,%ymm11
714         vpxor           %ymm11,%ymm6,%ymm6
715         vpslld          $12,%ymm6,%ymm0
716         vpsrld          $20,%ymm6,%ymm6
717         vpor            %ymm0,%ymm6,%ymm6
718         # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
719         vpaddd          %ymm13,%ymm8,%ymm8
720         vpxor           %ymm8,%ymm7,%ymm7
721         vpslld          $12,%ymm7,%ymm0
722         vpsrld          $20,%ymm7,%ymm7
723         vpor            %ymm0,%ymm7,%ymm7
724         # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
725         vpaddd          %ymm14,%ymm9,%ymm9
726         vpxor           %ymm9,%ymm4,%ymm4
727         vpslld          $12,%ymm4,%ymm0
728         vpsrld          $20,%ymm4,%ymm4
729         vpor            %ymm0,%ymm4,%ymm4
730
731         # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
732         vpaddd          0x00(%rsp),%ymm5,%ymm0
733         vmovdqa         %ymm0,0x00(%rsp)
734         vpxor           %ymm0,%ymm15,%ymm15
735         vpshufb         %ymm2,%ymm15,%ymm15
736         # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
737         vpaddd          0x20(%rsp),%ymm6,%ymm0
738         vmovdqa         %ymm0,0x20(%rsp)
739         vpxor           %ymm0,%ymm12,%ymm12
740         vpshufb         %ymm2,%ymm12,%ymm12
741         # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
742         vpaddd          0x40(%rsp),%ymm7,%ymm0
743         vmovdqa         %ymm0,0x40(%rsp)
744         vpxor           %ymm0,%ymm13,%ymm13
745         vpshufb         %ymm2,%ymm13,%ymm13
746         # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
747         vpaddd          0x60(%rsp),%ymm4,%ymm0
748         vmovdqa         %ymm0,0x60(%rsp)
749         vpxor           %ymm0,%ymm14,%ymm14
750         vpshufb         %ymm2,%ymm14,%ymm14
751
752         # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
753         vpaddd          %ymm15,%ymm10,%ymm10
754         vpxor           %ymm10,%ymm5,%ymm5
755         vpslld          $7,%ymm5,%ymm0
756         vpsrld          $25,%ymm5,%ymm5
757         vpor            %ymm0,%ymm5,%ymm5
758         # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
759         vpaddd          %ymm12,%ymm11,%ymm11
760         vpxor           %ymm11,%ymm6,%ymm6
761         vpslld          $7,%ymm6,%ymm0
762         vpsrld          $25,%ymm6,%ymm6
763         vpor            %ymm0,%ymm6,%ymm6
764         # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
765         vpaddd          %ymm13,%ymm8,%ymm8
766         vpxor           %ymm8,%ymm7,%ymm7
767         vpslld          $7,%ymm7,%ymm0
768         vpsrld          $25,%ymm7,%ymm7
769         vpor            %ymm0,%ymm7,%ymm7
770         # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
771         vpaddd          %ymm14,%ymm9,%ymm9
772         vpxor           %ymm9,%ymm4,%ymm4
773         vpslld          $7,%ymm4,%ymm0
774         vpsrld          $25,%ymm4,%ymm4
775         vpor            %ymm0,%ymm4,%ymm4
776
777         sub             $2,%r8d
778         jnz             .Ldoubleround8
779
780         # x0..15[0-7] += s[0..15]
781         vpbroadcastd    0x00(%rdi),%ymm0
782         vpaddd          0x00(%rsp),%ymm0,%ymm0
783         vmovdqa         %ymm0,0x00(%rsp)
784         vpbroadcastd    0x04(%rdi),%ymm0
785         vpaddd          0x20(%rsp),%ymm0,%ymm0
786         vmovdqa         %ymm0,0x20(%rsp)
787         vpbroadcastd    0x08(%rdi),%ymm0
788         vpaddd          0x40(%rsp),%ymm0,%ymm0
789         vmovdqa         %ymm0,0x40(%rsp)
790         vpbroadcastd    0x0c(%rdi),%ymm0
791         vpaddd          0x60(%rsp),%ymm0,%ymm0
792         vmovdqa         %ymm0,0x60(%rsp)
793         vpbroadcastd    0x10(%rdi),%ymm0
794         vpaddd          %ymm0,%ymm4,%ymm4
795         vpbroadcastd    0x14(%rdi),%ymm0
796         vpaddd          %ymm0,%ymm5,%ymm5
797         vpbroadcastd    0x18(%rdi),%ymm0
798         vpaddd          %ymm0,%ymm6,%ymm6
799         vpbroadcastd    0x1c(%rdi),%ymm0
800         vpaddd          %ymm0,%ymm7,%ymm7
801         vpbroadcastd    0x20(%rdi),%ymm0
802         vpaddd          %ymm0,%ymm8,%ymm8
803         vpbroadcastd    0x24(%rdi),%ymm0
804         vpaddd          %ymm0,%ymm9,%ymm9
805         vpbroadcastd    0x28(%rdi),%ymm0
806         vpaddd          %ymm0,%ymm10,%ymm10
807         vpbroadcastd    0x2c(%rdi),%ymm0
808         vpaddd          %ymm0,%ymm11,%ymm11
809         vpbroadcastd    0x30(%rdi),%ymm0
810         vpaddd          %ymm0,%ymm12,%ymm12
811         vpbroadcastd    0x34(%rdi),%ymm0
812         vpaddd          %ymm0,%ymm13,%ymm13
813         vpbroadcastd    0x38(%rdi),%ymm0
814         vpaddd          %ymm0,%ymm14,%ymm14
815         vpbroadcastd    0x3c(%rdi),%ymm0
816         vpaddd          %ymm0,%ymm15,%ymm15
817
818         # x12 += counter values 0-7
819         vpaddd          %ymm1,%ymm12,%ymm12
820
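        # The three interleave passes below (32-, 64-, then 128-bit) transpose
        # the per-word registers so that each resulting 256-bit value holds 32
        # contiguous keystream bytes, allowing the XOR with the input to be
        # done 32 bytes at a time; any chunk that would overrun the requested
        # length falls through to .Lxorpart8 with its keystream in %ymm0.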
821         # interleave 32-bit words in state n, n+1
822         vmovdqa         0x00(%rsp),%ymm0
823         vmovdqa         0x20(%rsp),%ymm1
824         vpunpckldq      %ymm1,%ymm0,%ymm2
825         vpunpckhdq      %ymm1,%ymm0,%ymm1
826         vmovdqa         %ymm2,0x00(%rsp)
827         vmovdqa         %ymm1,0x20(%rsp)
828         vmovdqa         0x40(%rsp),%ymm0
829         vmovdqa         0x60(%rsp),%ymm1
830         vpunpckldq      %ymm1,%ymm0,%ymm2
831         vpunpckhdq      %ymm1,%ymm0,%ymm1
832         vmovdqa         %ymm2,0x40(%rsp)
833         vmovdqa         %ymm1,0x60(%rsp)
834         vmovdqa         %ymm4,%ymm0
835         vpunpckldq      %ymm5,%ymm0,%ymm4
836         vpunpckhdq      %ymm5,%ymm0,%ymm5
837         vmovdqa         %ymm6,%ymm0
838         vpunpckldq      %ymm7,%ymm0,%ymm6
839         vpunpckhdq      %ymm7,%ymm0,%ymm7
840         vmovdqa         %ymm8,%ymm0
841         vpunpckldq      %ymm9,%ymm0,%ymm8
842         vpunpckhdq      %ymm9,%ymm0,%ymm9
843         vmovdqa         %ymm10,%ymm0
844         vpunpckldq      %ymm11,%ymm0,%ymm10
845         vpunpckhdq      %ymm11,%ymm0,%ymm11
846         vmovdqa         %ymm12,%ymm0
847         vpunpckldq      %ymm13,%ymm0,%ymm12
848         vpunpckhdq      %ymm13,%ymm0,%ymm13
849         vmovdqa         %ymm14,%ymm0
850         vpunpckldq      %ymm15,%ymm0,%ymm14
851         vpunpckhdq      %ymm15,%ymm0,%ymm15
852
853         # interleave 64-bit words in state n, n+2
854         vmovdqa         0x00(%rsp),%ymm0
855         vmovdqa         0x40(%rsp),%ymm2
856         vpunpcklqdq     %ymm2,%ymm0,%ymm1
857         vpunpckhqdq     %ymm2,%ymm0,%ymm2
858         vmovdqa         %ymm1,0x00(%rsp)
859         vmovdqa         %ymm2,0x40(%rsp)
860         vmovdqa         0x20(%rsp),%ymm0
861         vmovdqa         0x60(%rsp),%ymm2
862         vpunpcklqdq     %ymm2,%ymm0,%ymm1
863         vpunpckhqdq     %ymm2,%ymm0,%ymm2
864         vmovdqa         %ymm1,0x20(%rsp)
865         vmovdqa         %ymm2,0x60(%rsp)
866         vmovdqa         %ymm4,%ymm0
867         vpunpcklqdq     %ymm6,%ymm0,%ymm4
868         vpunpckhqdq     %ymm6,%ymm0,%ymm6
869         vmovdqa         %ymm5,%ymm0
870         vpunpcklqdq     %ymm7,%ymm0,%ymm5
871         vpunpckhqdq     %ymm7,%ymm0,%ymm7
872         vmovdqa         %ymm8,%ymm0
873         vpunpcklqdq     %ymm10,%ymm0,%ymm8
874         vpunpckhqdq     %ymm10,%ymm0,%ymm10
875         vmovdqa         %ymm9,%ymm0
876         vpunpcklqdq     %ymm11,%ymm0,%ymm9
877         vpunpckhqdq     %ymm11,%ymm0,%ymm11
878         vmovdqa         %ymm12,%ymm0
879         vpunpcklqdq     %ymm14,%ymm0,%ymm12
880         vpunpckhqdq     %ymm14,%ymm0,%ymm14
881         vmovdqa         %ymm13,%ymm0
882         vpunpcklqdq     %ymm15,%ymm0,%ymm13
883         vpunpckhqdq     %ymm15,%ymm0,%ymm15
884
885         # interleave 128-bit words in state n, n+4
886         # xor/write first four blocks
887         vmovdqa         0x00(%rsp),%ymm1
888         vperm2i128      $0x20,%ymm4,%ymm1,%ymm0
889         cmp             $0x0020,%rax
890         jl              .Lxorpart8
891         vpxor           0x0000(%rdx),%ymm0,%ymm0
892         vmovdqu         %ymm0,0x0000(%rsi)
893         vperm2i128      $0x31,%ymm4,%ymm1,%ymm4
894
895         vperm2i128      $0x20,%ymm12,%ymm8,%ymm0
896         cmp             $0x0040,%rax
897         jl              .Lxorpart8
898         vpxor           0x0020(%rdx),%ymm0,%ymm0
899         vmovdqu         %ymm0,0x0020(%rsi)
900         vperm2i128      $0x31,%ymm12,%ymm8,%ymm12
901
902         vmovdqa         0x40(%rsp),%ymm1
903         vperm2i128      $0x20,%ymm6,%ymm1,%ymm0
904         cmp             $0x0060,%rax
905         jl              .Lxorpart8
906         vpxor           0x0040(%rdx),%ymm0,%ymm0
907         vmovdqu         %ymm0,0x0040(%rsi)
908         vperm2i128      $0x31,%ymm6,%ymm1,%ymm6
909
910         vperm2i128      $0x20,%ymm14,%ymm10,%ymm0
911         cmp             $0x0080,%rax
912         jl              .Lxorpart8
913         vpxor           0x0060(%rdx),%ymm0,%ymm0
914         vmovdqu         %ymm0,0x0060(%rsi)
915         vperm2i128      $0x31,%ymm14,%ymm10,%ymm14
916
917         vmovdqa         0x20(%rsp),%ymm1
918         vperm2i128      $0x20,%ymm5,%ymm1,%ymm0
919         cmp             $0x00a0,%rax
920         jl              .Lxorpart8
921         vpxor           0x0080(%rdx),%ymm0,%ymm0
922         vmovdqu         %ymm0,0x0080(%rsi)
923         vperm2i128      $0x31,%ymm5,%ymm1,%ymm5
924
925         vperm2i128      $0x20,%ymm13,%ymm9,%ymm0
926         cmp             $0x00c0,%rax
927         jl              .Lxorpart8
928         vpxor           0x00a0(%rdx),%ymm0,%ymm0
929         vmovdqu         %ymm0,0x00a0(%rsi)
930         vperm2i128      $0x31,%ymm13,%ymm9,%ymm13
931
932         vmovdqa         0x60(%rsp),%ymm1
933         vperm2i128      $0x20,%ymm7,%ymm1,%ymm0
934         cmp             $0x00e0,%rax
935         jl              .Lxorpart8
936         vpxor           0x00c0(%rdx),%ymm0,%ymm0
937         vmovdqu         %ymm0,0x00c0(%rsi)
938         vperm2i128      $0x31,%ymm7,%ymm1,%ymm7
939
940         vperm2i128      $0x20,%ymm15,%ymm11,%ymm0
941         cmp             $0x0100,%rax
942         jl              .Lxorpart8
943         vpxor           0x00e0(%rdx),%ymm0,%ymm0
944         vmovdqu         %ymm0,0x00e0(%rsi)
945         vperm2i128      $0x31,%ymm15,%ymm11,%ymm15
946
947         # xor remaining blocks, write to output
948         vmovdqa         %ymm4,%ymm0
949         cmp             $0x0120,%rax
950         jl              .Lxorpart8
951         vpxor           0x0100(%rdx),%ymm0,%ymm0
952         vmovdqu         %ymm0,0x0100(%rsi)
953
954         vmovdqa         %ymm12,%ymm0
955         cmp             $0x0140,%rax
956         jl              .Lxorpart8
957         vpxor           0x0120(%rdx),%ymm0,%ymm0
958         vmovdqu         %ymm0,0x0120(%rsi)
959
960         vmovdqa         %ymm6,%ymm0
961         cmp             $0x0160,%rax
962         jl              .Lxorpart8
963         vpxor           0x0140(%rdx),%ymm0,%ymm0
964         vmovdqu         %ymm0,0x0140(%rsi)
965
966         vmovdqa         %ymm14,%ymm0
967         cmp             $0x0180,%rax
968         jl              .Lxorpart8
969         vpxor           0x0160(%rdx),%ymm0,%ymm0
970         vmovdqu         %ymm0,0x0160(%rsi)
971
972         vmovdqa         %ymm5,%ymm0
973         cmp             $0x01a0,%rax
974         jl              .Lxorpart8
975         vpxor           0x0180(%rdx),%ymm0,%ymm0
976         vmovdqu         %ymm0,0x0180(%rsi)
977
978         vmovdqa         %ymm13,%ymm0
979         cmp             $0x01c0,%rax
980         jl              .Lxorpart8
981         vpxor           0x01a0(%rdx),%ymm0,%ymm0
982         vmovdqu         %ymm0,0x01a0(%rsi)
983
984         vmovdqa         %ymm7,%ymm0
985         cmp             $0x01e0,%rax
986         jl              .Lxorpart8
987         vpxor           0x01c0(%rdx),%ymm0,%ymm0
988         vmovdqu         %ymm0,0x01c0(%rsi)
989
990         vmovdqa         %ymm15,%ymm0
991         cmp             $0x0200,%rax
992         jl              .Lxorpart8
993         vpxor           0x01e0(%rdx),%ymm0,%ymm0
994         vmovdqu         %ymm0,0x01e0(%rsi)
995
996 .Ldone8:
997         vzeroupper
998         lea             -8(%r10),%rsp
999         ret
1000
1001 .Lxorpart8:
1002         # xor remaining bytes from partial register into output
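        # Same idea as .Lxorpart2 above, but at 32-byte granularity: the tail
        # bytes are staged through the already 32-byte-aligned stack area,
        # XORed with the keystream left in %ymm0, then copied to the output.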
1003         mov             %rax,%r9
1004         and             $0x1f,%r9
1005         jz              .Ldone8
1006         and             $~0x1f,%rax
1007
1008         mov             %rsi,%r11
1009
1010         lea             (%rdx,%rax),%rsi
1011         mov             %rsp,%rdi
1012         mov             %r9,%rcx
1013         rep movsb
1014
1015         vpxor           0x00(%rsp),%ymm0,%ymm0
1016         vmovdqa         %ymm0,0x00(%rsp)
1017
1018         mov             %rsp,%rsi
1019         lea             (%r11,%rax),%rdi
1020         mov             %r9,%rcx
1021         rep movsb
1022
1023         jmp             .Ldone8
1024
1025 ENDPROC(chacha_8block_xor_avx2)