/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

#include <linux/linkage.h>
#include <asm/frame.h>
.section	.rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
.section	.rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000

.text
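
# ROT8 and ROT16 are pshufb masks: rotating a 32-bit word left by 8 or 16
# bits is a pure byte permutation within each lane, so these shuffles replace
# the shift+OR sequence for those two rotate amounts.  CTRINC holds the
# per-lane block counter increments {0, 1, 2, 3} used by the 4-block function.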
/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
 * function performs matrix operations on four words in parallel, but requires
 * shuffling to rearrange the words after each round. 8/16-bit word rotation is
 * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
 * rotation uses traditional shift+OR.
 *
 * The round count is given in %r8d.
 *
 * Clobbers: %r8d, %xmm4-%xmm7
 */
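
# For reference, a minimal C sketch of the quarter-round implemented below
# (illustrative only; ROTL32 and the variable names are not part of this file):
#
#	#define ROTL32(v, n)	(((v) << (n)) | ((v) >> (32 - (n))))
#
#	a += b;  d = ROTL32(d ^ a, 16);
#	c += d;  b = ROTL32(b ^ c, 12);
#	a += b;  d = ROTL32(d ^ a,  8);
#	c += d;  b = ROTL32(b ^ c,  7);
#
# Here a, b, c, d correspond to x0..x3 in %xmm0-%xmm3, each register carrying
# four words so that all four quarter-rounds of a round run in parallel.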
chacha_permute:

	movdqa		ROT8(%rip),%xmm4
	movdqa		ROT16(%rip),%xmm5

.Ldoubleround:
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm3,%xmm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm3,%xmm3
	sub		$2,%r8d
	jnz		.Ldoubleround

	ret
ENDPROC(chacha_permute)
ENTRY(chacha_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 1 data block output, o
	# %rdx: up to 1 data block input, i
	# %rcx: input/output length in bytes

	movdqa		0x00(%rdi),%xmm0
	movdqa		0x10(%rdi),%xmm1
	movdqa		0x20(%rdi),%xmm2
	movdqa		0x30(%rdi),%xmm3
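
	# The 64-byte state now sits in %xmm0-%xmm3 as the four rows of the
	# ChaCha 4x4 matrix, ready for chacha_permute; afterwards the original
	# state s is added back and the result is XORed into the input, per the
	# o_n = i_n ^ (x_n + s_n) steps below.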
	# o0 = i0 ^ (x0 + s0)
	movdqu		0x00(%rdx),%xmm4
	movdqu		%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	movdqu		0x10(%rdx),%xmm0
	movdqu		%xmm0,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	movdqu		0x20(%rdx),%xmm0
	movdqu		%xmm0,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	movdqu		0x30(%rdx),%xmm0
	movdqu		%xmm0,0x30(%rsi)
	# xor remaining bytes from partial register into output
	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

ENDPROC(chacha_block_xor_ssse3)
ENTRY(hchacha_block_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: output (8 32-bit words)

	movdqa		0x00(%rdi),%xmm0
	movdqa		0x10(%rdi),%xmm1
	movdqa		0x20(%rdi),%xmm2
	movdqa		0x30(%rdi),%xmm3
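
	# HChaCha: unlike the XOR routines, the output is taken directly from
	# the permuted state without adding the original state back in; only
	# rows 0 and 3 (state words 0..3 and 12..15) are written out.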
	movdqu		%xmm0,0x00(%rsi)
	movdqu		%xmm3,0x10(%rsi)

ENDPROC(hchacha_block_ssse3)
ENTRY(chacha_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes

	# This function encrypts four consecutive ChaCha blocks by loading
	# the state matrix into SSE registers four times. As we need some
	# scratch registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For the final XOR
	# step we transpose the matrix by interleaving 32- and then 64-bit
	# words, which allows us to do the XOR in SSE registers. 8/16-bit word
	# rotation is done with the slightly better performing SSSE3 byte
	# shuffling, 7/12-bit word rotation uses traditional shift+OR.
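
	# Data layout sketch (illustrative): after the broadcasts below, "xN"
	# holds word N of all four blocks, one block per 32-bit lane:
	#
	#	xN = { block0.word[N], block1.word[N], block2.word[N], block3.word[N] }
	#
	# so a single paddd/pxor/pshufb acts on word N of four blocks at once,
	# and the final transpose regroups the lanes into four contiguous
	# 64-byte blocks before the XOR with the input.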
	# x0..15[0-3] = s0..3[0..3]
	movq		0x00(%rdi),%xmm1
	pshufd		$0x00,%xmm1,%xmm0
	pshufd		$0x55,%xmm1,%xmm1
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	movq		0x10(%rdi),%xmm5
	pshufd		$0x00,%xmm5,%xmm4
	pshufd		$0x55,%xmm5,%xmm5
	movq		0x18(%rdi),%xmm7
	pshufd		$0x00,%xmm7,%xmm6
	pshufd		$0x55,%xmm7,%xmm7
	movq		0x20(%rdi),%xmm9
	pshufd		$0x00,%xmm9,%xmm8
	pshufd		$0x55,%xmm9,%xmm9
	movq		0x28(%rdi),%xmm11
	pshufd		$0x00,%xmm11,%xmm10
	pshufd		$0x55,%xmm11,%xmm11
	movq		0x30(%rdi),%xmm13
	pshufd		$0x00,%xmm13,%xmm12
	pshufd		$0x55,%xmm13,%xmm13
	movq		0x38(%rdi),%xmm15
	pshufd		$0x00,%xmm15,%xmm14
	pshufd		$0x55,%xmm15,%xmm15

	movdqa		%xmm0,0x00(%rsp)
	movdqa		%xmm1,0x10(%rsp)
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm3,0x30(%rsp)

	movdqa		CTRINC(%rip),%xmm1
	movdqa		ROT8(%rip),%xmm2
	movdqa		ROT16(%rip),%xmm3
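
	# x0..x3 are kept in their stack slots so that %xmm0 can serve as
	# scratch and %xmm1-%xmm3 can hold the CTRINC/ROT8/ROT16 constants;
	# x4..x15 stay in %xmm4-%xmm15. The column rounds below operate on
	# (x0,x4,x8,x12) etc., the diagonal rounds on (x0,x5,x10,x15) etc.,
	# simply by choice of operands, so no shuffling between rounds is needed.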
	# x12 += counter values 0-3

	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	movdqa		%xmm0,0x10(%rsp)
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	movdqa		%xmm0,0x20(%rsp)
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	movdqa		%xmm0,0x30(%rsp)

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	movdqa		%xmm0,0x10(%rsp)
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	movdqa		%xmm0,0x20(%rsp)
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	movdqa		%xmm0,0x30(%rsp)

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	movdqa		%xmm0,0x10(%rsp)
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	movdqa		%xmm0,0x20(%rsp)
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	movdqa		%xmm0,0x30(%rsp)

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	movdqa		%xmm0,0x10(%rsp)
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	movdqa		%xmm0,0x20(%rsp)
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	movdqa		%xmm0,0x30(%rsp)

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
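
	# Add the original state back in: re-load s from %rdi, broadcast each
	# 32-bit word across a register as before, and add it to the
	# corresponding x0..x15 (x0..x3 accumulate in their stack slots).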
	movq		0x00(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x00(%rsp),%xmm2
	movdqa		%xmm2,0x00(%rsp)
	paddd		0x10(%rsp),%xmm3
	movdqa		%xmm3,0x10(%rsp)

	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x20(%rsp),%xmm2
	movdqa		%xmm2,0x20(%rsp)
	paddd		0x30(%rsp),%xmm3
	movdqa		%xmm3,0x30(%rsp)

	movq		0x10(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3

	movq		0x18(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3

	movq		0x20(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3

	movq		0x28(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3

	movq		0x30(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3

	movq		0x38(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	# x12 += counter values 0-3
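
	# Transpose sketch (illustrative): let a,b,c,d be four registers holding
	# state words n..n+3, where e.g. a = { a0, a1, a2, a3 } and aj is word n
	# of block j. Then:
	#
	#	punpckldq/punpckhdq:   {a0,b0,a1,b1} {a2,b2,a3,b3}
	#	                       {c0,d0,c1,d1} {c2,d2,c3,d3}
	#	punpcklqdq/punpckhqdq: {a0,b0,c0,d0} {a1,b1,c1,d1}
	#	                       {a2,b2,c2,d2} {a3,b3,c3,d3}
	#
	# after which each register holds four consecutive words of one block.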
	# interleave 32-bit words in state n, n+1
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x10(%rsp),%xmm1
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x10(%rsp)
	movdqa		0x20(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0

	# interleave 64-bit words in state n, n+2
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x20(%rsp),%xmm1
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x20(%rsp)
	movdqa		0x10(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x10(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	# xor with corresponding input, write to output
	movdqa		0x00(%rsp),%xmm0
	movdqu		0x00(%rdx),%xmm1
	movdqu		%xmm0,0x00(%rsi)
	movdqu		0x10(%rdx),%xmm1
	movdqu		%xmm0,0x10(%rsi)
	movdqu		0x20(%rdx),%xmm1
	movdqu		%xmm0,0x20(%rsi)
	movdqu		0x30(%rdx),%xmm1
	movdqu		%xmm0,0x30(%rsi)

	movdqa		0x20(%rsp),%xmm0
	movdqu		0x40(%rdx),%xmm1
	movdqu		%xmm0,0x40(%rsi)
	movdqu		0x50(%rdx),%xmm1
	movdqu		%xmm0,0x50(%rsi)
	movdqu		0x60(%rdx),%xmm1
	movdqu		%xmm0,0x60(%rsi)
	movdqu		0x70(%rdx),%xmm1
	movdqu		%xmm0,0x70(%rsi)

	movdqa		0x10(%rsp),%xmm0
	movdqu		0x80(%rdx),%xmm1
	movdqu		%xmm0,0x80(%rsi)
	movdqu		0x90(%rdx),%xmm1
	movdqu		%xmm0,0x90(%rsi)
	movdqu		0xa0(%rdx),%xmm1
	movdqu		%xmm0,0xa0(%rsi)
	movdqu		0xb0(%rdx),%xmm1
	movdqu		%xmm0,0xb0(%rsi)

	movdqa		0x30(%rsp),%xmm0
	movdqu		0xc0(%rdx),%xmm1
	movdqu		%xmm0,0xc0(%rsi)
	movdqu		0xd0(%rdx),%xmm1
	movdqu		%xmm0,0xd0(%rsi)
	movdqu		0xe0(%rdx),%xmm1
	movdqu		%xmm0,0xe0(%rsi)
	movdqu		0xf0(%rdx),%xmm1
	movdqu		%xmm0,0xf0(%rsi)
	# xor remaining bytes from partial register into output
	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

ENDPROC(chacha_4block_xor_ssse3)