crypto: x86/chacha20 - add XChaCha20 support
arch/x86/crypto/chacha-ssse3-x86_64.S

/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.section        .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:   .octa 0x0e0d0c0f0a09080b0605040702010003
.section        .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:  .octa 0x0d0c0f0e09080b0a0504070601000302
.section        .rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC: .octa 0x00000003000000020000000100000000
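# ROT8 and ROT16 are pshufb masks that rotate each 32-bit lane of an XMM
# register left by 8 and 16 bits, respectively.  CTRINC holds the per-lane
# block counter increments (0, 1, 2, 3) used by chacha_4block_xor_ssse3.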

.text

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
 * function performs matrix operations on four words in parallel, but requires
 * shuffling to rearrange the words after each round.  8/16-bit word rotation
 * is done with the slightly better performing SSSE3 byte shuffling, while
 * 7/12-bit word rotation uses traditional shift+OR.
 *
 * The round count is given in %r8d.
 *
 * Clobbers: %r8d, %xmm4-%xmm7
 */
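
/*
 * For reference, a compact C-style sketch of the double round applied below
 * to the four state rows x0 = %xmm0 .. x3 = %xmm3 (each row holding four
 * 32-bit words).  Illustrative pseudocode only, not part of the build;
 * rotl32() and shuffle32()/MASK() follow the notation used in the comments
 * below:
 *
 *	x0 += x1; x3 = rotl32(x3 ^ x0, 16);
 *	x2 += x3; x1 = rotl32(x1 ^ x2, 12);
 *	x0 += x1; x3 = rotl32(x3 ^ x0, 8);
 *	x2 += x3; x1 = rotl32(x1 ^ x2, 7);
 *	x1 = shuffle32(x1, MASK(0, 3, 2, 1));
 *	x2 = shuffle32(x2, MASK(1, 0, 3, 2));
 *	x3 = shuffle32(x3, MASK(2, 1, 0, 3));
 *	x0 += x1; x3 = rotl32(x3 ^ x0, 16);
 *	x2 += x3; x1 = rotl32(x1 ^ x2, 12);
 *	x0 += x1; x3 = rotl32(x3 ^ x0, 8);
 *	x2 += x3; x1 = rotl32(x1 ^ x2, 7);
 *	x1 = shuffle32(x1, MASK(2, 1, 0, 3));
 *	x2 = shuffle32(x2, MASK(1, 0, 3, 2));
 *	x3 = shuffle32(x3, MASK(0, 3, 2, 1));
 */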
chacha_permute:

        movdqa          ROT8(%rip),%xmm4
        movdqa          ROT16(%rip),%xmm5

.Ldoubleround:
        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm5,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm6
        pslld           $12,%xmm6
        psrld           $20,%xmm1
        por             %xmm6,%xmm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm4,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm7
        pslld           $7,%xmm7
        psrld           $25,%xmm1
        por             %xmm7,%xmm1

        # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
        pshufd          $0x39,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd          $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
        pshufd          $0x93,%xmm3,%xmm3

        # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm5,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm6
        pslld           $12,%xmm6
        psrld           $20,%xmm1
        por             %xmm6,%xmm1

        # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
        paddd           %xmm1,%xmm0
        pxor            %xmm0,%xmm3
        pshufb          %xmm4,%xmm3

        # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
        paddd           %xmm3,%xmm2
        pxor            %xmm2,%xmm1
        movdqa          %xmm1,%xmm7
        pslld           $7,%xmm7
        psrld           $25,%xmm1
        por             %xmm7,%xmm1

        # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
        pshufd          $0x93,%xmm1,%xmm1
        # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
        pshufd          $0x4e,%xmm2,%xmm2
        # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
        pshufd          $0x39,%xmm3,%xmm3

        sub             $2,%r8d
        jnz             .Ldoubleround

        ret
ENDPROC(chacha_permute)

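/*
 * From the C side this is typically declared along the lines of (see the
 * glue code; the exact prototype there may differ slightly):
 *
 *	asmlinkage void chacha_block_xor_ssse3(u32 *state, u8 *dst,
 *					       const u8 *src,
 *					       unsigned int len, int nrounds);
 *
 * It generates one 64-byte keystream block from *state and XORs min(len, 64)
 * bytes of src into dst; a length below 64 takes the .Lxorpart tail path
 * below.
 */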
ENTRY(chacha_block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: up to 1 data block output, o
        # %rdx: up to 1 data block input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds
        FRAME_BEGIN

        # x0..3 = s0..3
        movdqa          0x00(%rdi),%xmm0
        movdqa          0x10(%rdi),%xmm1
        movdqa          0x20(%rdi),%xmm2
        movdqa          0x30(%rdi),%xmm3
        movdqa          %xmm0,%xmm8
        movdqa          %xmm1,%xmm9
        movdqa          %xmm2,%xmm10
        movdqa          %xmm3,%xmm11

        mov             %rcx,%rax
        call            chacha_permute

        # o0 = i0 ^ (x0 + s0)
        paddd           %xmm8,%xmm0
        cmp             $0x10,%rax
        jl              .Lxorpart
        movdqu          0x00(%rdx),%xmm4
        pxor            %xmm4,%xmm0
        movdqu          %xmm0,0x00(%rsi)
        # o1 = i1 ^ (x1 + s1)
        paddd           %xmm9,%xmm1
        movdqa          %xmm1,%xmm0
        cmp             $0x20,%rax
        jl              .Lxorpart
        movdqu          0x10(%rdx),%xmm0
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x10(%rsi)
        # o2 = i2 ^ (x2 + s2)
        paddd           %xmm10,%xmm2
        movdqa          %xmm2,%xmm0
        cmp             $0x30,%rax
        jl              .Lxorpart
        movdqu          0x20(%rdx),%xmm0
        pxor            %xmm2,%xmm0
        movdqu          %xmm0,0x20(%rsi)
        # o3 = i3 ^ (x3 + s3)
        paddd           %xmm11,%xmm3
        movdqa          %xmm3,%xmm0
        cmp             $0x40,%rax
        jl              .Lxorpart
        movdqu          0x30(%rdx),%xmm0
        pxor            %xmm3,%xmm0
        movdqu          %xmm0,0x30(%rsi)

.Ldone:
        FRAME_END
        ret

.Lxorpart:
        # xor remaining bytes from partial register into output
        mov             %rax,%r9
        and             $0x0f,%r9
        jz              .Ldone
        and             $~0x0f,%rax

        mov             %rsi,%r11

        lea             8(%rsp),%r10
        sub             $0x10,%rsp
        and             $~31,%rsp

        lea             (%rdx,%rax),%rsi
        mov             %rsp,%rdi
        mov             %r9,%rcx
        rep movsb

        pxor            0x00(%rsp),%xmm0
        movdqa          %xmm0,0x00(%rsp)

        mov             %rsp,%rsi
        lea             (%r11,%rax),%rdi
        mov             %r9,%rcx
        rep movsb

        lea             -8(%r10),%rsp
        jmp             .Ldone
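
        # The tail handling above corresponds roughly to this C sketch
        # (illustrative only; "keystream" stands for the 16 bytes left in
        # %xmm0 for the partially consumed 16-byte chunk):
        #
        #       size_t part = len & 0x0f;       /* leftover bytes */
        #       size_t off  = len & ~0x0f;      /* offset of that chunk */
        #       u8 buf[16] __aligned(32);
        #
        #       if (part) {
        #               memcpy(buf, src + off, part);
        #               for (i = 0; i < part; i++)
        #                       buf[i] ^= keystream[i];
        #               memcpy(dst + off, buf, part);
        #       }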

ENDPROC(chacha_block_xor_ssse3)

ENTRY(hchacha_block_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: output (8 32-bit words)
        # %edx: nrounds
        FRAME_BEGIN

        movdqa          0x00(%rdi),%xmm0
        movdqa          0x10(%rdi),%xmm1
        movdqa          0x20(%rdi),%xmm2
        movdqa          0x30(%rdi),%xmm3

        mov             %edx,%r8d
        call            chacha_permute

        movdqu          %xmm0,0x00(%rsi)
        movdqu          %xmm3,0x10(%rsi)

        FRAME_END
        ret
ENDPROC(hchacha_block_ssse3)
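
/*
 * hchacha_block_ssse3() above stores only rows 0 and 3 of the permuted state
 * (words 0..3 and 12..15) and skips the feed-forward addition, i.e. it is the
 * HChaCha core.  For XChaCha20 the glue code uses this output as the subkey
 * for a regular ChaCha20 invocation, roughly (illustrative pseudocode, not
 * the exact glue-code API):
 *
 *	u32 subkey[8];
 *
 *	hchacha_block_ssse3(state, subkey, nrounds);
 *	// build a new ChaCha state from subkey plus the last 8 bytes of the
 *	// 24-byte XChaCha nonce, then encrypt with chacha_block_xor_ssse3()
 *	// or chacha_4block_xor_ssse3() as usual
 */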

ENTRY(chacha_4block_xor_ssse3)
        # %rdi: Input state matrix, s
        # %rsi: up to 4 data blocks output, o
        # %rdx: up to 4 data blocks input, i
        # %rcx: input/output length in bytes
        # %r8d: nrounds

        # This function encrypts four consecutive ChaCha blocks by loading
        # the state matrix into SSE registers four times. As we need some
        # scratch registers, we save the first four registers on the stack.
        # The algorithm performs each operation on the corresponding word of
        # each state matrix, hence requires no word shuffling. For the final
        # XOR step we transpose the matrix by interleaving 32- and then 64-bit
        # words, which allows us to do the XOR in SSE registers. 8/16-bit word
        # rotation is done with the slightly better performing SSSE3 byte
        # shuffling, while 7/12-bit word rotation uses traditional shift+OR.

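        # A rough C-style sketch of the data layout used below (illustrative
        # only, not part of the build): lane b of vector i holds word i of
        # block b.
        #
        #       u32 x[16][4];
        #
        #       for (i = 0; i < 16; i++)
        #               x[i][0] = x[i][1] = x[i][2] = x[i][3] = s[i];
        #       for (b = 0; b < 4; b++)
        #               x[12][b] += b;          /* CTRINC */
        #       for (i = 0; i < nrounds; i += 2)
        #               doubleround(x);         /* quarter-rounds act on the
        #                                        * 4-lane vectors x[i] */
        #       for (i = 0; i < 16; i++)
        #               for (b = 0; b < 4; b++)
        #                       x[i][b] += s[i] + (i == 12 ? b : 0);
        #       /* transpose into four consecutive 64-byte blocks, then XOR
        #        * with the input and store */
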
        lea             8(%rsp),%r10
        sub             $0x80,%rsp
        and             $~63,%rsp
        mov             %rcx,%rax

        # x0..15[0-3] = s0..3[0..3]
        movq            0x00(%rdi),%xmm1
        pshufd          $0x00,%xmm1,%xmm0
        pshufd          $0x55,%xmm1,%xmm1
        movq            0x08(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        movq            0x10(%rdi),%xmm5
        pshufd          $0x00,%xmm5,%xmm4
        pshufd          $0x55,%xmm5,%xmm5
        movq            0x18(%rdi),%xmm7
        pshufd          $0x00,%xmm7,%xmm6
        pshufd          $0x55,%xmm7,%xmm7
        movq            0x20(%rdi),%xmm9
        pshufd          $0x00,%xmm9,%xmm8
        pshufd          $0x55,%xmm9,%xmm9
        movq            0x28(%rdi),%xmm11
        pshufd          $0x00,%xmm11,%xmm10
        pshufd          $0x55,%xmm11,%xmm11
        movq            0x30(%rdi),%xmm13
        pshufd          $0x00,%xmm13,%xmm12
        pshufd          $0x55,%xmm13,%xmm13
        movq            0x38(%rdi),%xmm15
        pshufd          $0x00,%xmm15,%xmm14
        pshufd          $0x55,%xmm15,%xmm15
        # x0..3 on stack
        movdqa          %xmm0,0x00(%rsp)
        movdqa          %xmm1,0x10(%rsp)
        movdqa          %xmm2,0x20(%rsp)
        movdqa          %xmm3,0x30(%rsp)

        movdqa          CTRINC(%rip),%xmm1
        movdqa          ROT8(%rip),%xmm2
        movdqa          ROT16(%rip),%xmm3

        # x12 += counter values 0-3
        paddd           %xmm1,%xmm12
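        # (each lane of %xmm12 now holds the block counter for one of the
        #  four interleaved blocks: n, n+1, n+2, n+3)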

.Ldoubleround4:
        # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm3,%xmm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm3,%xmm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm3,%xmm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm3,%xmm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
        paddd           %xmm12,%xmm8
        pxor            %xmm8,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm4
        por             %xmm0,%xmm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
        paddd           %xmm13,%xmm9
        pxor            %xmm9,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm5
        por             %xmm0,%xmm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
        paddd           %xmm14,%xmm10
        pxor            %xmm10,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm6
        por             %xmm0,%xmm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
        paddd           %xmm15,%xmm11
        pxor            %xmm11,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm7
        por             %xmm0,%xmm7

        # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm2,%xmm12
        # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm2,%xmm13
        # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm2,%xmm14
        # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm2,%xmm15

        # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
        paddd           %xmm12,%xmm8
        pxor            %xmm8,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm4
        por             %xmm0,%xmm4
        # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
        paddd           %xmm13,%xmm9
        pxor            %xmm9,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm5
        por             %xmm0,%xmm5
        # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
        paddd           %xmm14,%xmm10
        pxor            %xmm10,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm6
        por             %xmm0,%xmm6
        # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
        paddd           %xmm15,%xmm11
        pxor            %xmm11,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm7
        por             %xmm0,%xmm7

        # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm3,%xmm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm3,%xmm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm3,%xmm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm3,%xmm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
        paddd           %xmm15,%xmm10
        pxor            %xmm10,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm5
        por             %xmm0,%xmm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
        paddd           %xmm12,%xmm11
        pxor            %xmm11,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm6
        por             %xmm0,%xmm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
        paddd           %xmm13,%xmm8
        pxor            %xmm8,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm7
        por             %xmm0,%xmm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
        paddd           %xmm14,%xmm9
        pxor            %xmm9,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $12,%xmm0
        psrld           $20,%xmm4
        por             %xmm0,%xmm4

        # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
        movdqa          0x00(%rsp),%xmm0
        paddd           %xmm5,%xmm0
        movdqa          %xmm0,0x00(%rsp)
        pxor            %xmm0,%xmm15
        pshufb          %xmm2,%xmm15
        # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
        movdqa          0x10(%rsp),%xmm0
        paddd           %xmm6,%xmm0
        movdqa          %xmm0,0x10(%rsp)
        pxor            %xmm0,%xmm12
        pshufb          %xmm2,%xmm12
        # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
        movdqa          0x20(%rsp),%xmm0
        paddd           %xmm7,%xmm0
        movdqa          %xmm0,0x20(%rsp)
        pxor            %xmm0,%xmm13
        pshufb          %xmm2,%xmm13
        # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
        movdqa          0x30(%rsp),%xmm0
        paddd           %xmm4,%xmm0
        movdqa          %xmm0,0x30(%rsp)
        pxor            %xmm0,%xmm14
        pshufb          %xmm2,%xmm14

        # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
        paddd           %xmm15,%xmm10
        pxor            %xmm10,%xmm5
        movdqa          %xmm5,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm5
        por             %xmm0,%xmm5
        # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
        paddd           %xmm12,%xmm11
        pxor            %xmm11,%xmm6
        movdqa          %xmm6,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm6
        por             %xmm0,%xmm6
        # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
        paddd           %xmm13,%xmm8
        pxor            %xmm8,%xmm7
        movdqa          %xmm7,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm7
        por             %xmm0,%xmm7
        # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
        paddd           %xmm14,%xmm9
        pxor            %xmm9,%xmm4
        movdqa          %xmm4,%xmm0
        pslld           $7,%xmm0
        psrld           $25,%xmm4
        por             %xmm0,%xmm4

        sub             $2,%r8d
        jnz             .Ldoubleround4

        # x0[0-3] += s0[0]
        # x1[0-3] += s0[1]
        movq            0x00(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           0x00(%rsp),%xmm2
        movdqa          %xmm2,0x00(%rsp)
        paddd           0x10(%rsp),%xmm3
        movdqa          %xmm3,0x10(%rsp)
        # x2[0-3] += s0[2]
        # x3[0-3] += s0[3]
        movq            0x08(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           0x20(%rsp),%xmm2
        movdqa          %xmm2,0x20(%rsp)
        paddd           0x30(%rsp),%xmm3
        movdqa          %xmm3,0x30(%rsp)

        # x4[0-3] += s1[0]
        # x5[0-3] += s1[1]
        movq            0x10(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm4
        paddd           %xmm3,%xmm5
        # x6[0-3] += s1[2]
        # x7[0-3] += s1[3]
        movq            0x18(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm6
        paddd           %xmm3,%xmm7

        # x8[0-3] += s2[0]
        # x9[0-3] += s2[1]
        movq            0x20(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm8
        paddd           %xmm3,%xmm9
        # x10[0-3] += s2[2]
        # x11[0-3] += s2[3]
        movq            0x28(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm10
        paddd           %xmm3,%xmm11

        # x12[0-3] += s3[0]
        # x13[0-3] += s3[1]
        movq            0x30(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm12
        paddd           %xmm3,%xmm13
        # x14[0-3] += s3[2]
        # x15[0-3] += s3[3]
        movq            0x38(%rdi),%xmm3
        pshufd          $0x00,%xmm3,%xmm2
        pshufd          $0x55,%xmm3,%xmm3
        paddd           %xmm2,%xmm14
        paddd           %xmm3,%xmm15

        # x12 += counter values 0-3
        paddd           %xmm1,%xmm12

        # interleave 32-bit words in state n, n+1
        movdqa          0x00(%rsp),%xmm0
        movdqa          0x10(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpckldq       %xmm1,%xmm2
        punpckhdq       %xmm1,%xmm0
        movdqa          %xmm2,0x00(%rsp)
        movdqa          %xmm0,0x10(%rsp)
        movdqa          0x20(%rsp),%xmm0
        movdqa          0x30(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpckldq       %xmm1,%xmm2
        punpckhdq       %xmm1,%xmm0
        movdqa          %xmm2,0x20(%rsp)
        movdqa          %xmm0,0x30(%rsp)
        movdqa          %xmm4,%xmm0
        punpckldq       %xmm5,%xmm4
        punpckhdq       %xmm5,%xmm0
        movdqa          %xmm0,%xmm5
        movdqa          %xmm6,%xmm0
        punpckldq       %xmm7,%xmm6
        punpckhdq       %xmm7,%xmm0
        movdqa          %xmm0,%xmm7
        movdqa          %xmm8,%xmm0
        punpckldq       %xmm9,%xmm8
        punpckhdq       %xmm9,%xmm0
        movdqa          %xmm0,%xmm9
        movdqa          %xmm10,%xmm0
        punpckldq       %xmm11,%xmm10
        punpckhdq       %xmm11,%xmm0
        movdqa          %xmm0,%xmm11
        movdqa          %xmm12,%xmm0
        punpckldq       %xmm13,%xmm12
        punpckhdq       %xmm13,%xmm0
        movdqa          %xmm0,%xmm13
        movdqa          %xmm14,%xmm0
        punpckldq       %xmm15,%xmm14
        punpckhdq       %xmm15,%xmm0
        movdqa          %xmm0,%xmm15

        # interleave 64-bit words in state n, n+2
        movdqa          0x00(%rsp),%xmm0
        movdqa          0x20(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpcklqdq      %xmm1,%xmm2
        punpckhqdq      %xmm1,%xmm0
        movdqa          %xmm2,0x00(%rsp)
        movdqa          %xmm0,0x20(%rsp)
        movdqa          0x10(%rsp),%xmm0
        movdqa          0x30(%rsp),%xmm1
        movdqa          %xmm0,%xmm2
        punpcklqdq      %xmm1,%xmm2
        punpckhqdq      %xmm1,%xmm0
        movdqa          %xmm2,0x10(%rsp)
        movdqa          %xmm0,0x30(%rsp)
        movdqa          %xmm4,%xmm0
        punpcklqdq      %xmm6,%xmm4
        punpckhqdq      %xmm6,%xmm0
        movdqa          %xmm0,%xmm6
        movdqa          %xmm5,%xmm0
        punpcklqdq      %xmm7,%xmm5
        punpckhqdq      %xmm7,%xmm0
        movdqa          %xmm0,%xmm7
        movdqa          %xmm8,%xmm0
        punpcklqdq      %xmm10,%xmm8
        punpckhqdq      %xmm10,%xmm0
        movdqa          %xmm0,%xmm10
        movdqa          %xmm9,%xmm0
        punpcklqdq      %xmm11,%xmm9
        punpckhqdq      %xmm11,%xmm0
        movdqa          %xmm0,%xmm11
        movdqa          %xmm12,%xmm0
        punpcklqdq      %xmm14,%xmm12
        punpckhqdq      %xmm14,%xmm0
        movdqa          %xmm0,%xmm14
        movdqa          %xmm13,%xmm0
        punpcklqdq      %xmm15,%xmm13
        punpckhqdq      %xmm15,%xmm0
        movdqa          %xmm0,%xmm15

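        # After the transpose, the four output blocks are assembled from:
        #   block 0: 0x00(%rsp), %xmm4,  %xmm8,  %xmm12
        #   block 1: 0x20(%rsp), %xmm6,  %xmm10, %xmm14
        #   block 2: 0x10(%rsp), %xmm5,  %xmm9,  %xmm13
        #   block 3: 0x30(%rsp), %xmm7,  %xmm11, %xmm15
        # which is the order the stores below follow.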
        # xor with corresponding input, write to output
        movdqa          0x00(%rsp),%xmm0
        cmp             $0x10,%rax
        jl              .Lxorpart4
        movdqu          0x00(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x00(%rsi)

        movdqu          %xmm4,%xmm0
        cmp             $0x20,%rax
        jl              .Lxorpart4
        movdqu          0x10(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x10(%rsi)

        movdqu          %xmm8,%xmm0
        cmp             $0x30,%rax
        jl              .Lxorpart4
        movdqu          0x20(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x20(%rsi)

        movdqu          %xmm12,%xmm0
        cmp             $0x40,%rax
        jl              .Lxorpart4
        movdqu          0x30(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x30(%rsi)

        movdqa          0x20(%rsp),%xmm0
        cmp             $0x50,%rax
        jl              .Lxorpart4
        movdqu          0x40(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x40(%rsi)

        movdqu          %xmm6,%xmm0
        cmp             $0x60,%rax
        jl              .Lxorpart4
        movdqu          0x50(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x50(%rsi)

        movdqu          %xmm10,%xmm0
        cmp             $0x70,%rax
        jl              .Lxorpart4
        movdqu          0x60(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x60(%rsi)

        movdqu          %xmm14,%xmm0
        cmp             $0x80,%rax
        jl              .Lxorpart4
        movdqu          0x70(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x70(%rsi)

        movdqa          0x10(%rsp),%xmm0
        cmp             $0x90,%rax
        jl              .Lxorpart4
        movdqu          0x80(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x80(%rsi)

        movdqu          %xmm5,%xmm0
        cmp             $0xa0,%rax
        jl              .Lxorpart4
        movdqu          0x90(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0x90(%rsi)

        movdqu          %xmm9,%xmm0
        cmp             $0xb0,%rax
        jl              .Lxorpart4
        movdqu          0xa0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xa0(%rsi)

        movdqu          %xmm13,%xmm0
        cmp             $0xc0,%rax
        jl              .Lxorpart4
        movdqu          0xb0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xb0(%rsi)

        movdqa          0x30(%rsp),%xmm0
        cmp             $0xd0,%rax
        jl              .Lxorpart4
        movdqu          0xc0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xc0(%rsi)

        movdqu          %xmm7,%xmm0
        cmp             $0xe0,%rax
        jl              .Lxorpart4
        movdqu          0xd0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xd0(%rsi)

        movdqu          %xmm11,%xmm0
        cmp             $0xf0,%rax
        jl              .Lxorpart4
        movdqu          0xe0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xe0(%rsi)

        movdqu          %xmm15,%xmm0
        cmp             $0x100,%rax
        jl              .Lxorpart4
        movdqu          0xf0(%rdx),%xmm1
        pxor            %xmm1,%xmm0
        movdqu          %xmm0,0xf0(%rsi)

.Ldone4:
        lea             -8(%r10),%rsp
        ret

.Lxorpart4:
        # xor remaining bytes from partial register into output
        mov             %rax,%r9
        and             $0x0f,%r9
        jz              .Ldone4
        and             $~0x0f,%rax

        mov             %rsi,%r11

        lea             (%rdx,%rax),%rsi
        mov             %rsp,%rdi
        mov             %r9,%rcx
        rep movsb

        pxor            0x00(%rsp),%xmm0
        movdqa          %xmm0,0x00(%rsp)

        mov             %rsp,%rsi
        lea             (%r11,%rax),%rdi
        mov             %r9,%rcx
        rep movsb

        jmp             .Ldone4

ENDPROC(chacha_4block_xor_ssse3)