1 // Copyright 2016 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
7 // +build go1.7,amd64,!gccgo,!appengine
10 // General register allocation
// NOTE(review): adp and itr1 both alias CX, so they must never be live at
// the same time — adp is only needed while the additional data is hashed.
14 #define adp CX // free to reuse, after we hash the additional data
15 #define keyp R8 // free to reuse, when we copy the key to stack
16 #define itr2 R9 // general iterator
17 #define itr1 CX // general iterator
25 // Register and stack allocation for the SSE code
// Nine 16-byte BP-relative slots (144 bytes of scratch). rStore/sStore hold
// the two halves of the Poly1305 key (r clamped, s as-is) once derived;
// state1Store/state2Store keep the fixed key rows of the ChaCha state, and
// ctr0Store..ctr3Store keep the per-block counter rows.
26 #define rStore (0*16)(BP)
27 #define sStore (1*16)(BP)
28 #define state1Store (2*16)(BP)
29 #define state2Store (3*16)(BP)
30 #define tmpStore (4*16)(BP)
31 #define ctr0Store (5*16)(BP)
32 #define ctr1Store (6*16)(BP)
33 #define ctr2Store (7*16)(BP)
34 #define ctr3Store (8*16)(BP)
55 // Register and stack allocation for the AVX2 code
// Eight 32-byte BP-relative slots. rsStoreAVX2 holds the full 32-byte
// Poly1305 key (r|s); the remaining slots mirror the SSE layout at
// double width for the two-blocks-per-register AVX2 path.
56 #define rsStoreAVX2 (0*32)(BP)
57 #define state1StoreAVX2 (1*32)(BP)
58 #define state2StoreAVX2 (2*32)(BP)
59 #define ctr0StoreAVX2 (3*32)(BP)
60 #define ctr1StoreAVX2 (4*32)(BP)
61 #define ctr2StoreAVX2 (5*32)(BP)
62 #define ctr3StoreAVX2 (6*32)(BP)
63 #define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
// ChaCha20 "expand 32-byte k" constant (the four sigma words, little-endian
// ASCII), stored twice so a single 256-bit load fills both 128-bit lanes of
// an AVX2 register.
85 DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
86 DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
87 DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
88 DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
89 DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
90 DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
91 DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
92 DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
// PSHUFB/VPSHUFB mask rotating each 32-bit lane left by 16 bits
// (same pattern repeated for both 128-bit lanes).
94 DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
95 DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
96 DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
97 DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
// PSHUFB/VPSHUFB mask rotating each 32-bit lane left by 8 bits.
99 DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
100 DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
101 DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
102 DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
// Initial counter offsets for an AVX2 double block: +0 in the low
// 128-bit lane, +1 in the high lane.
104 DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
105 DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
106 DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
107 DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
// Adds 2 to the counter in both 128-bit lanes (advance by one double block).
109 DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
110 DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
111 DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
112 DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
113 // Poly1305 key clamp
// The low 16 bytes clamp r (clear the top 4 bits of each 32-bit word and the
// low 2 bits of words 1..3); the all-ones high 16 bytes leave s unchanged.
114 DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
115 DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
116 DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
117 DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
// PADDL mask that increments the 32-bit block counter of one SSE block by 1.
119 DATA ·sseIncMask<>+0x00(SB)/8, $0x1
120 DATA ·sseIncMask<>+0x08(SB)/8, $0x0
121 // To load/store the last < 16 bytes in a buffer
// Fifteen 16-byte entries; entry i (0-based) keeps the low i+1 bytes and
// zeroes the rest.
122 DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
123 DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
124 DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
125 DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
126 DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
127 DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
128 DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
129 DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
130 DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
131 DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
132 DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
133 DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
134 DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
135 DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
136 DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
137 DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
138 DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
139 DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
140 DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
141 DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
142 DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
143 DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
144 DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
145 DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
146 DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
147 DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
148 DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
149 DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
150 DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
151 DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
// Declare all of the above as read-only, pointer-free data.
153 GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
154 GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
155 GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
156 GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
157 GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
158 GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
159 GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
160 GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
161 // No PALIGNR in Go ASM yet (but VPALIGNR is present).
// Hand-assembled SSSE3 PALIGNR encodings that rotate the B/C/D state rows by
// one, two, or three 32-bit lanes for the diagonalize/undiagonalize steps.
// Registers X8-X15 require a REX prefix — the extra 0x45 byte in those
// encodings.
162 #define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
163 #define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
164 #define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
165 #define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
166 #define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
167 #define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
168 #define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
169 #define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
170 #define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
171 #define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
172 #define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
173 #define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
174 #define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
175 #define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
176 #define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
177 #define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13
// An 8-byte rotate of a 16-byte register is its own inverse, so the C-row
// right shifts reuse the left-shift encodings.
178 #define shiftC0Right shiftC0Left
179 #define shiftC1Right shiftC1Left
180 #define shiftC2Right shiftC2Left
181 #define shiftC3Right shiftC3Left
182 #define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9
183 #define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10
184 #define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11
185 #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
// chachaQR performs one ChaCha20 quarter round on the SSE column registers
// A, B, C, D, clobbering T as scratch. The 16- and 8-bit rotates use PSHUFB
// byte shuffles (·rol16/·rol8); the 12- and 7-bit rotates are built from a
// left/right shift pair combined with PXOR.
187 #define chachaQR(A, B, C, D, T) \
188 PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \
189 PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
190 PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \
191 PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
// chachaQR_AVX2 is the 256-bit variant of chachaQR (two ChaCha blocks per
// register). The three-operand AVX forms remove the MOVO copies; T is still
// needed as scratch for the shift-based 12- and 7-bit rotates.
193 #define chachaQR_AVX2(A, B, C, D, T) \
194 VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \
195 VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
196 VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \
197 VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
// polyAdd adds the 16 bytes at S into the Poly1305 accumulator
// acc0|acc1|acc2; the trailing ADCQ $1 sets the 2^128 padding bit of the
// message block.
199 #define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
// polyMulStage1..3 multiply the accumulator by r, whose two 64-bit halves
// live at 0(BP) and 8(BP) (i.e. rStore). The stages are split so callers can
// interleave them with vector work; the product is gathered into t0..t3.
200 #define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2
201 #define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX
202 #define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3
// polyMulReduceStage folds the bits at or above 2^130 back into the
// accumulator using 2^130 ≡ 5 (mod 2^130 - 5): the high part H is added as
// 4*H + H (the ANDQ $-4 / SHRQ $2 sequence), leaving acc0|acc1|acc2
// partially reduced.
203 #define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t2:t3; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2
// BMI2 variants of the multiply stages. MULXQ takes its implicit
// multiplicand in DX (hence each stage loads a half of r into DX first) and
// leaves the flags untouched, so the ADCQ carry chains can span the
// multiplies.
205 #define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2
206 #define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3
207 #define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3
// Complete Poly1305 multiply-and-reduce, scalar and BMI2 flavors.
209 #define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage
210 #define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage
211 // ----------------------------------------------------------------------------
// polyHashADInternal absorbs the additional data into the Poly1305
// accumulator acc0|acc1|acc2. Callers must have stored the clamped key at
// rStore/sStore before calling.
212 TEXT polyHashADInternal<>(SB), NOSPLIT, $0
213 // adp points to beginning of additional data
214 // itr2 holds ad length
222 // Special treatment for the TLS case of 13 bytes
231 // Hash in 16 byte chunks
235 LEAQ (1*16)(adp), adp
244 // Hash last < 16 byte tail
// Add the zero-padded final block; ADCQ $1 sets the 2^128 padding bit
// (same pattern as the polyAdd macro).
260 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
267 // ----------------------------------------------------------------------------
268 // func chacha20Poly1305Open(dst, key, src, ad []byte) bool
269 TEXT ·chacha20Poly1305Open(SB), 0, $288-97
270 // For aligned stack access
275 MOVQ key+24(FP), keyp
277 MOVQ src_len+56(FP), inl
280 // Check for AVX2 support
281 CMPB ·useAVX2(SB), $1
282 JE chacha20Poly1305Open_AVX2
284 // Special optimization, for very short buffers
286 JBE openSSE128 // About 16% faster
288 // For long buffers, prepare the poly key first
289 MOVOU ·chacha20Constants<>(SB), A0
290 MOVOU (1*16)(keyp), B0
291 MOVOU (2*16)(keyp), C0
292 MOVOU (3*16)(keyp), D0
295 // Store state on stack for future use
301 openSSEPreparePolyKey:
302 chachaQR(A0, B0, C0, D0, T0)
303 shiftB0Left; shiftC0Left; shiftD0Left
304 chachaQR(A0, B0, C0, D0, T0)
305 shiftB0Right; shiftC0Right; shiftD0Right
307 JNE openSSEPreparePolyKey
309 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
310 PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
312 // Clamp and store the key
313 PAND ·polyClampMask<>(SB), A0
314 MOVO A0, rStore; MOVO B0, sStore
317 MOVQ ad_len+80(FP), itr2
318 CALL polyHashADInternal<>(SB)
322 JB openSSEMainLoopDone
324 // Load state, increment counter blocks
325 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
326 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
327 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
328 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
331 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
333 // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
339 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
342 chachaQR(A3, B3, C3, D3, C1)
345 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
346 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
347 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
350 LEAQ (2*8)(itr2), itr2
352 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
356 chachaQR(A3, B3, C3, D3, C1)
359 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
360 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
361 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
363 JGE openSSEInternalLoop
367 LEAQ (2*8)(itr2), itr2
370 JG openSSEInternalLoop
373 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
374 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
375 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
376 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
378 // Load - xor - store
380 MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup)
381 MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup)
382 MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup)
383 MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup)
384 MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup)
385 MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup)
386 MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup)
387 MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup)
388 MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup)
389 MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup)
390 MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup)
391 MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup)
392 MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup)
393 MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup)
394 MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup)
395 MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup)
402 // Handle the various tail sizes efficiently
414 // Hash in the PT, AAD lengths
415 ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2
429 // Add in the "s" part of the key
433 // Finally, constant time compare to the tag at the end of the message
436 XORQ (0*8)(inp), acc0
437 XORQ (1*8)(inp), acc1
441 // Return true iff tags are equal
445 // ----------------------------------------------------------------------------
446 // Special optimization for buffers smaller than 129 bytes
448 // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
449 MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
450 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
451 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
452 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
455 openSSE128InnerCipherLoop:
456 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
457 shiftB0Left; shiftB1Left; shiftB2Left
458 shiftC0Left; shiftC1Left; shiftC2Left
459 shiftD0Left; shiftD1Left; shiftD2Left
460 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
461 shiftB0Right; shiftB1Right; shiftB2Right
462 shiftC0Right; shiftC1Right; shiftC2Right
463 shiftD0Right; shiftD1Right; shiftD2Right
465 JNE openSSE128InnerCipherLoop
467 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
468 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
469 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
470 PADDL T2, C1; PADDL T2, C2
471 PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
473 // Clamp and store the key
474 PAND ·polyClampMask<>(SB), A0
475 MOVOU A0, rStore; MOVOU B0, sStore
478 MOVQ ad_len+80(FP), itr2
479 CALL polyHashADInternal<>(SB)
489 // Load for decryption
490 MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup)
491 LEAQ (1*16)(inp), inp
492 LEAQ (1*16)(oup), oup
495 // Shift the stream "left"
509 // We can safely load the CT from the end, because it is padded with the MAC
512 LEAQ ·andMask<>(SB), t0
515 PAND -16(t0)(itr2*1), T0
521 // We can only store one byte at a time, since plaintext can be shorter than 16 bytes
528 JNE openSSETail16Store
529 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
533 // ----------------------------------------------------------------------------
534 // Special optimization for the last 64 bytes of ciphertext
536 // Need to decrypt up to 64 bytes - prepare single block
537 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
541 JB openSSETail64LoopB
544 // Perform ChaCha rounds, while hashing the remaining input
545 polyAdd(0(inp)(itr2*1))
551 chachaQR(A0, B0, C0, D0, T0)
552 shiftB0Left; shiftC0Left; shiftD0Left
553 chachaQR(A0, B0, C0, D0, T0)
554 shiftB0Right; shiftC0Right; shiftD0Right
557 JAE openSSETail64LoopA
560 JNE openSSETail64LoopB
562 PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
564 openSSETail64DecLoop:
566 JB openSSETail64DecLoopDone
576 JMP openSSETail64DecLoop
578 openSSETail64DecLoopDone:
582 // ----------------------------------------------------------------------------
583 // Special optimization for the last 128 bytes of ciphertext
585 // Need to decrypt up to 128 bytes - prepare two blocks
586 MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
587 MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
593 // Perform ChaCha rounds, while hashing the remaining input
594 polyAdd(0(inp)(itr2*1))
599 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
600 shiftB0Left; shiftC0Left; shiftD0Left
601 shiftB1Left; shiftC1Left; shiftD1Left
602 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
603 shiftB0Right; shiftC0Right; shiftD0Right
604 shiftB1Right; shiftC1Right; shiftD1Right
607 JB openSSETail128LoopA
610 JNE openSSETail128LoopB
612 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
613 PADDL state1Store, B0; PADDL state1Store, B1
614 PADDL state2Store, C0; PADDL state2Store, C1
615 PADDL ctr1Store, D0; PADDL ctr0Store, D1
617 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
618 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
619 MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
624 JMP openSSETail64DecLoop
626 // ----------------------------------------------------------------------------
627 // Special optimization for the last 192 bytes of ciphertext
629 // Need to decrypt up to 192 bytes - prepare three blocks
630 MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
631 MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
632 MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
642 // Perform ChaCha rounds, while hashing the remaining input
643 polyAdd(0(inp)(itr2*1))
648 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
649 shiftB0Left; shiftC0Left; shiftD0Left
650 shiftB1Left; shiftC1Left; shiftD1Left
651 shiftB2Left; shiftC2Left; shiftD2Left
653 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
654 shiftB0Right; shiftC0Right; shiftD0Right
655 shiftB1Right; shiftC1Right; shiftD1Right
656 shiftB2Right; shiftC2Right; shiftD2Right
659 JB openSSLTail192LoopA
662 JNE openSSLTail192LoopB
665 JB openSSLTail192Store
671 JB openSSLTail192Store
677 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
678 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
679 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
680 PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
682 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
683 PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2
684 MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup)
686 MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
687 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
688 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
693 JMP openSSETail64DecLoop
695 // ----------------------------------------------------------------------------
696 // Special optimization for the last 256 bytes of ciphertext
698 // Need to decrypt up to 256 bytes - prepare four blocks
699 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
700 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
701 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
702 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
705 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
709 // This loop interleaves 8 ChaCha quarter rounds with 1 poly multiplication
710 polyAdd(0(inp)(itr2*1))
712 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
715 chachaQR(A3, B3, C3, D3, C1)
717 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
718 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
719 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
723 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
726 chachaQR(A3, B3, C3, D3, C1)
730 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
731 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
732 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
735 JB openSSETail256Loop
739 openSSETail256HashLoop:
740 polyAdd(0(inp)(itr2*1))
744 JB openSSETail256HashLoop
747 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
748 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
749 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
750 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
753 // Load - xor - store
754 MOVOU (0*16)(inp), D3; PXOR D3, A0
755 MOVOU (1*16)(inp), D3; PXOR D3, B0
756 MOVOU (2*16)(inp), D3; PXOR D3, C0
757 MOVOU (3*16)(inp), D3; PXOR D3, D0
758 MOVOU A0, (0*16)(oup)
759 MOVOU B0, (1*16)(oup)
760 MOVOU C0, (2*16)(oup)
761 MOVOU D0, (3*16)(oup)
762 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
763 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
764 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
765 MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
766 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
767 MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
776 JMP openSSETail64DecLoop
778 // ----------------------------------------------------------------------------
779 // ------------------------- AVX2 Code ----------------------------------------
780 chacha20Poly1305Open_AVX2:
782 VMOVDQU ·chacha20Constants<>(SB), AA0
783 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
784 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
785 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
786 VPADDD ·avx2InitMask<>(SB), DD0, DD0
788 // Special optimization, for very short buffers
794 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
795 VMOVDQA BB0, state1StoreAVX2
796 VMOVDQA CC0, state2StoreAVX2
797 VMOVDQA DD0, ctr3StoreAVX2
800 openAVX2PreparePolyKey:
801 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
802 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
803 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
804 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
806 JNE openAVX2PreparePolyKey
808 VPADDD ·chacha20Constants<>(SB), AA0, AA0
809 VPADDD state1StoreAVX2, BB0, BB0
810 VPADDD state2StoreAVX2, CC0, CC0
811 VPADDD ctr3StoreAVX2, DD0, DD0
813 VPERM2I128 $0x02, AA0, BB0, TT0
815 // Clamp and store poly key
816 VPAND ·polyClampMask<>(SB), TT0, TT0
817 VMOVDQA TT0, rsStoreAVX2
819 // Stream for the first 64 bytes
820 VPERM2I128 $0x13, AA0, BB0, AA0
821 VPERM2I128 $0x13, CC0, DD0, BB0
823 // Hash AD + first 64 bytes
824 MOVQ ad_len+80(FP), itr2
825 CALL polyHashADInternal<>(SB)
828 openAVX2InitialHash64:
829 polyAdd(0(inp)(itr1*1))
833 JNE openAVX2InitialHash64
835 // Decrypt the first 64 bytes
836 VPXOR (0*32)(inp), AA0, AA0
837 VPXOR (1*32)(inp), BB0, BB0
838 VMOVDQU AA0, (0*32)(oup)
839 VMOVDQU BB0, (1*32)(oup)
840 LEAQ (2*32)(inp), inp
841 LEAQ (2*32)(oup), oup
846 JB openAVX2MainLoopDone
848 // Load state, increment counter blocks, store the incremented counters
849 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
850 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
851 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
852 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
853 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
856 openAVX2InternalLoop:
857 // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications
858 // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext
859 polyAdd(0*8(inp)(itr1*1))
860 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
862 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
863 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
865 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
866 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
868 VMOVDQA CC3, tmpStoreAVX2
869 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
870 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
871 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
872 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
873 VMOVDQA tmpStoreAVX2, CC3
875 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
876 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
877 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
878 polyAdd(2*8(inp)(itr1*1))
879 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
881 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
882 VMOVDQA CC3, tmpStoreAVX2
883 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
884 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
885 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
886 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
887 VMOVDQA tmpStoreAVX2, CC3
889 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
890 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
891 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
892 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
894 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
895 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
897 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
898 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
899 polyAdd(4*8(inp)(itr1*1))
900 LEAQ (6*8)(itr1), itr1
901 VMOVDQA CC3, tmpStoreAVX2
902 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
903 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
904 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
905 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
906 VMOVDQA tmpStoreAVX2, CC3
908 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
909 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
911 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
912 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
914 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
915 VMOVDQA CC3, tmpStoreAVX2
916 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
917 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
918 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
919 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
920 VMOVDQA tmpStoreAVX2, CC3
922 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
923 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
// Tail of the AVX2 main open loop: second half of the double round rotates
// the B/C/D rows back from diagonal to column form, then loops.
924 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
926 JNE openAVX2InternalLoop
// Rounds complete: add the saved initial state back in to turn the working
// registers into keystream (constants row, key rows from stack, per-block
// counters from their stack slots).
928 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
929 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
930 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
931 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
932 VMOVDQA CC3, tmpStoreAVX2 // spill CC3 so it can serve as scratch for the lane shuffles below
934 // We only hashed 480 of the 512 bytes available - hash the remaining 32 here
// Interleave the 128-bit halves of each state quartet (VPERM2I128 $0x02/$0x13)
// to reassemble sequential 64-byte keystream blocks, XOR against the input
// ciphertext, and store the plaintext, 128 bytes per group.
937 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
938 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
939 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
940 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
941 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
942 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
947 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
948 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
949 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
// Fourth quartet uses the spilled CC3 value from tmpStoreAVX2.
950 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
951 VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
952 VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
// Advance past the 512 bytes just processed.
953 LEAQ (32*16)(inp), inp
954 LEAQ (32*16)(oup), oup
958 openAVX2MainLoopDone:
959 // Handle the various tail sizes efficiently
970 // ----------------------------------------------------------------------------
971 // Special optimization for buffers smaller than 193 bytes
973 // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
977 VPADDD ·avx2IncMask<>(SB), DD0, DD1
// Two-block (2x ymm) ChaCha double-round loop; no hashing interleaved here
// since the buffer is short.
985 openAVX2192InnerCipherLoop:
986 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
987 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
988 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
989 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
990 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
991 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
992 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
993 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
995 JNE openAVX2192InnerCipherLoop
// Feed-forward: AA2/BB2/CC2/DD2 and TT3 hold copies of the initial state.
996 VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
997 VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
998 VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
999 VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
1000 VPERM2I128 $0x02, AA0, BB0, TT0
1002 // Clamp and store poly key
1003 VPAND ·polyClampMask<>(SB), TT0, TT0
1004 VMOVDQA TT0, rsStoreAVX2
1006 // Stream for up to 192 bytes
1007 VPERM2I128 $0x13, AA0, BB0, AA0
1008 VPERM2I128 $0x13, CC0, DD0, BB0
1009 VPERM2I128 $0x02, AA1, BB1, CC0
1010 VPERM2I128 $0x02, CC1, DD1, DD0
1011 VPERM2I128 $0x13, AA1, BB1, AA1
1012 VPERM2I128 $0x13, CC1, DD1, BB1
// Hash the additional data before decrypting.
1016 MOVQ ad_len+80(FP), itr2
1017 CALL polyHashADInternal<>(SB)
// Decrypt 32 bytes per iteration while at least 32 bytes remain.
1019 openAVX2ShortOpenLoop:
1021 JB openAVX2ShortTail32
1030 // Load for decryption
1031 VPXOR (inp), AA0, AA0
1033 LEAQ (1*32)(inp), inp
1034 LEAQ (1*32)(oup), oup
1036 // Shift stream left
1046 JMP openAVX2ShortOpenLoop
// Handle a remaining 16-byte chunk, if any.
1048 openAVX2ShortTail32:
1051 JB openAVX2ShortDone
1059 // Load for decryption
1062 LEAQ (1*16)(inp), inp
1063 LEAQ (1*16)(oup), oup
// Move the upper 128-bit lane down so the next 16 bytes of stream are in the low half.
1064 VPERM2I128 $0x11, AA0, AA0, AA0
1071 // ----------------------------------------------------------------------------
1072 // Special optimization for buffers smaller than 321 bytes
1074 // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
1075 VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
1076 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
1077 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
1080 openAVX2320InnerCipherLoop:
1081 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1082 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
1083 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1084 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
1085 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1086 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
1087 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1088 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
1090 JNE openAVX2320InnerCipherLoop
1092 VMOVDQA ·chacha20Constants<>(SB), TT0
1093 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
1094 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
1095 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
1096 VMOVDQA ·avx2IncMask<>(SB), TT0
1097 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
1098 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
1099 VPADDD TT3, DD2, DD2
1101 // Clamp and store poly key
1102 VPERM2I128 $0x02, AA0, BB0, TT0
1103 VPAND ·polyClampMask<>(SB), TT0, TT0
1104 VMOVDQA TT0, rsStoreAVX2
1106 // Stream for up to 320 bytes
1107 VPERM2I128 $0x13, AA0, BB0, AA0
1108 VPERM2I128 $0x13, CC0, DD0, BB0
1109 VPERM2I128 $0x02, AA1, BB1, CC0
1110 VPERM2I128 $0x02, CC1, DD1, DD0
1111 VPERM2I128 $0x13, AA1, BB1, AA1
1112 VPERM2I128 $0x13, CC1, DD1, BB1
1113 VPERM2I128 $0x02, AA2, BB2, CC1
1114 VPERM2I128 $0x02, CC2, DD2, DD1
1115 VPERM2I128 $0x13, AA2, BB2, AA2
1116 VPERM2I128 $0x13, CC2, DD2, BB2
1117 JMP openAVX2ShortOpen
1119 // ----------------------------------------------------------------------------
1120 // Special optimization for the last 128 bytes of ciphertext
1122 // Need to decrypt up to 128 bytes - prepare two blocks
1123 VMOVDQA ·chacha20Constants<>(SB), AA1
1124 VMOVDQA state1StoreAVX2, BB1
1125 VMOVDQA state2StoreAVX2, CC1
1126 VMOVDQA ctr3StoreAVX2, DD1
1127 VPADDD ·avx2IncMask<>(SB), DD1, DD1
1134 JE openAVX2Tail128LoopB
1136 openAVX2Tail128LoopA:
1137 // Perform ChaCha rounds, while hashing the remaining input
1138 polyAdd(0(inp)(itr2*1))
1141 openAVX2Tail128LoopB:
1143 chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1144 VPALIGNR $4, BB1, BB1, BB1
1145 VPALIGNR $8, CC1, CC1, CC1
1146 VPALIGNR $12, DD1, DD1, DD1
1147 chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1148 VPALIGNR $12, BB1, BB1, BB1
1149 VPALIGNR $8, CC1, CC1, CC1
1150 VPALIGNR $4, DD1, DD1, DD1
1152 JB openAVX2Tail128LoopA
1154 JNE openAVX2Tail128LoopB
1156 VPADDD ·chacha20Constants<>(SB), AA1, AA1
1157 VPADDD state1StoreAVX2, BB1, BB1
1158 VPADDD state2StoreAVX2, CC1, CC1
1159 VPADDD DD0, DD1, DD1
1160 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1167 // Load for decryption
1168 VPXOR (inp), AA0, AA0
1170 LEAQ (1*32)(inp), inp
1171 LEAQ (1*32)(oup), oup
1175 JMP openAVX2TailLoop
1183 // Load for decryption
1186 LEAQ (1*16)(inp), inp
1187 LEAQ (1*16)(oup), oup
1188 VPERM2I128 $0x11, AA0, AA0, AA0
1195 // ----------------------------------------------------------------------------
1196 // Special optimization for the last 256 bytes of ciphertext
1198 // Need to decrypt up to 256 bytes - prepare four blocks
1199 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
1200 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
1201 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
1202 VMOVDQA ctr3StoreAVX2, DD0
1203 VPADDD ·avx2IncMask<>(SB), DD0, DD0
1204 VPADDD ·avx2IncMask<>(SB), DD0, DD1
1208 // Compute the number of iterations that will hash data
1209 MOVQ inl, tmpStoreAVX2
1219 openAVX2Tail256LoopA:
1224 // Perform ChaCha rounds, while hashing the remaining input
1225 openAVX2Tail256LoopB:
1226 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1227 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
1228 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1229 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
1231 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
1232 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
1233 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
1234 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
1236 JB openAVX2Tail256LoopA
1239 JNE openAVX2Tail256LoopB
1244 MOVQ tmpStoreAVX2, inl
1246 // Hash the remainder of data (if any)
1247 openAVX2Tail256Hash:
1250 JGT openAVX2Tail256HashEnd
1254 JMP openAVX2Tail256Hash
1256 // Store 128 bytes safely, then go to store loop
1257 openAVX2Tail256HashEnd:
1258 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
1259 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
1260 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
1261 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
1262 VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2
1263 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1265 VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2
1266 VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup)
1267 LEAQ (4*32)(inp), inp
1268 LEAQ (4*32)(oup), oup
1271 JMP openAVX2TailLoop
1273 // ----------------------------------------------------------------------------
1274 // Special optimization for the last 384 bytes of ciphertext
1276 // Need to decrypt up to 384 bytes - prepare six blocks
1277 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
1278 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
1279 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
1280 VMOVDQA ctr3StoreAVX2, DD0
1281 VPADDD ·avx2IncMask<>(SB), DD0, DD0
1282 VPADDD ·avx2IncMask<>(SB), DD0, DD1
1283 VPADDD ·avx2IncMask<>(SB), DD1, DD2
1284 VMOVDQA DD0, ctr0StoreAVX2
1285 VMOVDQA DD1, ctr1StoreAVX2
1286 VMOVDQA DD2, ctr2StoreAVX2
1288 // Compute the number of iterations that will hash two blocks of data
1289 MOVQ inl, tmpStoreAVX2
1300 // Perform ChaCha rounds, while hashing the remaining input
1301 openAVX2Tail384LoopB:
1306 openAVX2Tail384LoopA:
1307 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1308 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
1309 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1310 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
1315 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
1316 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
1317 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
1318 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
1321 JB openAVX2Tail384LoopB
1324 JNE openAVX2Tail384LoopA
1329 MOVQ tmpStoreAVX2, inl
1331 openAVX2Tail384Hash:
1334 JGT openAVX2Tail384HashEnd
1338 JMP openAVX2Tail384Hash
1340 // Store 256 bytes safely, then go to store loop
1341 openAVX2Tail384HashEnd:
1342 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
1343 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
1344 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
1345 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
1346 VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3
1347 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
1348 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
1349 VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3
1350 VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
1351 VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
1352 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
1353 LEAQ (8*32)(inp), inp
1354 LEAQ (8*32)(oup), oup
1356 JMP openAVX2TailLoop
1358 // ----------------------------------------------------------------------------
1359 // Special optimization for the last 512 bytes of ciphertext
1361 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
1362 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
1363 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
1364 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
1365 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
1369 openAVX2Tail512LoopB:
1372 LEAQ (2*8)(itr2), itr2
1374 openAVX2Tail512LoopA:
1375 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1376 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1377 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
1378 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1379 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1380 VMOVDQA CC3, tmpStoreAVX2
1381 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
1382 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
1383 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
1384 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
1385 VMOVDQA tmpStoreAVX2, CC3
1388 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1389 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1390 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
1391 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1392 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1393 VMOVDQA CC3, tmpStoreAVX2
1394 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
1395 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
1396 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
1397 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
1398 VMOVDQA tmpStoreAVX2, CC3
1399 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
1400 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
1401 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
1402 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1403 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1404 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
1405 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1406 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1409 LEAQ (4*8)(itr2), itr2
1410 VMOVDQA CC3, tmpStoreAVX2
1411 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
1412 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
1413 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
1414 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
1415 VMOVDQA tmpStoreAVX2, CC3
1416 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
1417 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
1418 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
1419 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
1420 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
1421 VMOVDQA CC3, tmpStoreAVX2
1422 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
1423 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
1424 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
1425 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
1426 VMOVDQA tmpStoreAVX2, CC3
1427 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
1428 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
1429 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
1432 JLT openAVX2Tail512LoopB
1435 JNE openAVX2Tail512LoopA
1441 openAVX2Tail512HashLoop:
1443 JE openAVX2Tail512HashEnd
1448 JMP openAVX2Tail512HashLoop
1450 openAVX2Tail512HashEnd:
1451 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
1452 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
1453 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
1454 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
1455 VMOVDQA CC3, tmpStoreAVX2
1456 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
1457 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
1458 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
1459 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
1460 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
1461 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
1462 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
1463 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
1464 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
1465 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
1467 LEAQ (12*32)(inp), inp
1468 LEAQ (12*32)(oup), oup
1471 JMP openAVX2TailLoop
1473 // ----------------------------------------------------------------------------
1474 // ----------------------------------------------------------------------------
1475 // func chacha20Poly1305Seal(dst, key, src, ad []byte)
1476 TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
1477 // For aligned stack access
1482 MOVQ key+24(FP), keyp
1483 MOVQ src+48(FP), inp
1484 MOVQ src_len+56(FP), inl
1487 CMPB ·useAVX2(SB), $1
1488 JE chacha20Poly1305Seal_AVX2
1490 // Special optimization, for very short buffers
1492 JBE sealSSE128 // About 15% faster
1494 // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
1495 MOVOU ·chacha20Constants<>(SB), A0
1496 MOVOU (1*16)(keyp), B0
1497 MOVOU (2*16)(keyp), C0
1498 MOVOU (3*16)(keyp), D0
1500 // Store state on stack for future use
1501 MOVO B0, state1Store
1502 MOVO C0, state2Store
1504 // Load state, increment counter blocks
1505 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1506 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1507 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
1510 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
1515 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1518 chachaQR(A3, B3, C3, D3, C1)
1520 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
1521 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
1522 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
1525 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1528 chachaQR(A3, B3, C3, D3, C1)
1530 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
1531 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
1532 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
1534 JNE sealSSEIntroLoop
1537 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
1538 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
1539 PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
1540 PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
1542 // Clamp and store the key
1543 PAND ·polyClampMask<>(SB), A0
1548 MOVQ ad_len+80(FP), itr2
1549 CALL polyHashADInternal<>(SB)
1551 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1552 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
1553 MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup)
1554 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
1555 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
1556 MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup)
1562 MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1
1565 JBE sealSSE128SealHash
1567 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1568 PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
1569 MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup)
1586 // Load state, increment counter blocks
// Main SSE seal loop: regenerate four blocks from the saved state, with
// the counter continuing from ctr3Store.
1587 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
1588 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1589 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1590 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
1593 MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
// Inner rounds, interleaved with Poly1305 hashing of the ciphertext
// already written to oup (oup advances 16 bytes per half-round group).
1597 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1600 chachaQR(A3, B3, C3, D3, C1)
1603 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left
1604 shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left
1605 shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left
1608 LEAQ (2*8)(oup), oup
1610 chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3)
1614 chachaQR(A3, B3, C3, D3, C1)
1617 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right
1618 shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right
1619 shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right
1621 JGE sealSSEInnerLoop
1624 LEAQ (2*8)(oup), oup
// Feed-forward all four blocks to obtain 256 bytes of keystream.
1629 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
1630 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
1631 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
1632 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
1635 // Load - xor - store
// Block 0 is encrypted one register at a time, reusing D3 as scratch.
1636 MOVOU (0*16)(inp), D3; PXOR D3, A0
1637 MOVOU (1*16)(inp), D3; PXOR D3, B0
1638 MOVOU (2*16)(inp), D3; PXOR D3, C0
1639 MOVOU (3*16)(inp), D3; PXOR D3, D0
1640 MOVOU A0, (0*16)(oup)
1641 MOVOU B0, (1*16)(oup)
1642 MOVOU C0, (2*16)(oup)
1643 MOVOU D0, (3*16)(oup)
// Blocks 1 and 2 encrypt the next 128 bytes.
1646 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0
1647 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1
1648 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
1649 MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0
1650 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2
1651 MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup)
// Block 3: only used when at least 64 more input bytes remain.
1660 JBE sealSSE128SealHash
1661 MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0
1662 PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3
1663 MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup)
1673 JE sealSSE128SealHash
1681 // ----------------------------------------------------------------------------
1682 // Special optimization for the last 64 bytes of plaintext
1684 // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
1685 MOVO ·chacha20Constants<>(SB), A1
1686 MOVO state1Store, B1
1687 MOVO state2Store, C1
1689 PADDL ·sseIncMask<>(SB), D1
1693 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1699 chachaQR(A1, B1, C1, D1, T1)
1700 shiftB1Left; shiftC1Left; shiftD1Left
1701 chachaQR(A1, B1, C1, D1, T1)
1702 shiftB1Right; shiftC1Right; shiftD1Right
1708 JG sealSSETail64LoopA
1711 JGE sealSSETail64LoopB
1712 PADDL ·chacha20Constants<>(SB), A1
1713 PADDL state1Store, B1
1714 PADDL state2Store, C1
1719 // ----------------------------------------------------------------------------
1720 // Special optimization for the last 128 bytes of plaintext
1722 // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
1723 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
1724 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
1726 sealSSETail128LoopA:
1727 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1732 sealSSETail128LoopB:
1733 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
1734 shiftB0Left; shiftC0Left; shiftD0Left
1735 shiftB1Left; shiftC1Left; shiftD1Left
1739 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0)
1740 shiftB0Right; shiftC0Right; shiftD0Right
1741 shiftB1Right; shiftC1Right; shiftD1Right
1744 JG sealSSETail128LoopA
1747 JGE sealSSETail128LoopB
1749 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
1750 PADDL state1Store, B0; PADDL state1Store, B1
1751 PADDL state2Store, C0; PADDL state2Store, C1
1752 PADDL ctr0Store, D0; PADDL ctr1Store, D1
1754 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
1755 PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
1756 MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
1762 JMP sealSSE128SealHash
1764 // ----------------------------------------------------------------------------
1765 // Special optimization for the last 192 bytes of plaintext
1767 // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
1768 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
1769 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
1770 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
1772 sealSSETail192LoopA:
1773 // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
1778 sealSSETail192LoopB:
1779 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1780 shiftB0Left; shiftC0Left; shiftD0Left
1781 shiftB1Left; shiftC1Left; shiftD1Left
1782 shiftB2Left; shiftC2Left; shiftD2Left
1788 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1789 shiftB0Right; shiftC0Right; shiftD0Right
1790 shiftB1Right; shiftC1Right; shiftD1Right
1791 shiftB2Right; shiftC2Right; shiftD2Right
1794 JG sealSSETail192LoopA
1797 JGE sealSSETail192LoopB
1799 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
1800 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
1801 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
1802 PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
1804 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3
1805 PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0
1806 MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup)
1807 MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3
1808 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1
1809 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup)
1819 JMP sealSSE128SealHash
1821 // ----------------------------------------------------------------------------
1822 // Special seal optimization for buffers smaller than 129 bytes
1824 // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
1825 MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
1826 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
1827 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
1828 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
1831 sealSSE128InnerCipherLoop:
1832 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1833 shiftB0Left; shiftB1Left; shiftB2Left
1834 shiftC0Left; shiftC1Left; shiftC2Left
1835 shiftD0Left; shiftD1Left; shiftD2Left
1836 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0)
1837 shiftB0Right; shiftB1Right; shiftB2Right
1838 shiftC0Right; shiftC1Right; shiftC2Right
1839 shiftD0Right; shiftD1Right; shiftD2Right
1841 JNE sealSSE128InnerCipherLoop
1843 // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
1844 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
1845 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
1846 PADDL T2, C1; PADDL T2, C2
1847 PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
1848 PAND ·polyClampMask<>(SB), A0
1853 MOVQ ad_len+80(FP), itr2
1854 CALL polyHashADInternal<>(SB)
1858 // itr1 holds the number of bytes encrypted but not yet hashed
1867 JMP sealSSE128SealHash
// 16-bytes-at-a-time seal of the remaining full chunks, hashing each
// ciphertext block into the Poly1305 accumulator as it is produced.
1874 // Load for decryption
1878 LEAQ (1*16)(inp), inp
1879 LEAQ (1*16)(oup), oup
1881 // Extract for hashing
1885 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
1888 // Shift the stream "left"
1902 // We can only load the PT one byte at a time to avoid read after end of buffer
// Byte-by-byte load of the final partial block; andMask<> then zeroes the
// bytes beyond the message end before the XOR.
1905 LEAQ ·andMask<>(SB), t0
1907 LEAQ -1(inp)(inl*1), inp
1912 sealSSETailLoadLoop:
1919 JNE sealSSETailLoadLoop
1924 MOVOU -16(t0)(itr2*1), T0
// Hash the final ciphertext block (the high bit add is the Poly1305 pad).
1929 ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
1935 // Hash in the buffer lengths
1936 ADDQ ad_len+80(FP), acc0
1937 ADCQ src_len+56(FP), acc1
1952 // Add in the "s" part of the key
1956 // Finally store the tag at the end of the message
1957 MOVQ acc0, (0*8)(oup)
1958 MOVQ acc1, (1*8)(oup)
1961 // ----------------------------------------------------------------------------
1962 // ------------------------- AVX2 Code ----------------------------------------
1963 chacha20Poly1305Seal_AVX2:
1965 VMOVDQU ·chacha20Constants<>(SB), AA0
1966 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
1967 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
1968 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
1969 VPADDD ·avx2InitMask<>(SB), DD0, DD0
1971 // Special optimizations, for very short buffers
1973 JBE seal192AVX2 // 33% faster
1975 JBE seal320AVX2 // 17% faster
1977 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream
1978 VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
1979 VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
1980 VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
1981 VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
1982 VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
1983 VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
1984 VMOVDQA DD3, ctr3StoreAVX2
1988 VMOVDQA CC3, tmpStoreAVX2
1989 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
1990 VMOVDQA tmpStoreAVX2, CC3
1991 VMOVDQA CC1, tmpStoreAVX2
1992 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
1993 VMOVDQA tmpStoreAVX2, CC1
1995 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
1996 VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
1997 VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
1998 VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
2000 VMOVDQA CC3, tmpStoreAVX2
2001 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2002 VMOVDQA tmpStoreAVX2, CC3
2003 VMOVDQA CC1, tmpStoreAVX2
2004 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2005 VMOVDQA tmpStoreAVX2, CC1
2007 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
2008 VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
2009 VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
2010 VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
2012 JNE sealAVX2IntroLoop
2014 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2015 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2016 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2017 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2019 VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127
2020 VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key
2021 VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
2023 // Clamp and store poly key
2024 VPAND ·polyClampMask<>(SB), DD0, DD0
2025 VMOVDQA DD0, rsStoreAVX2
2028 MOVQ ad_len+80(FP), itr2
2029 CALL polyHashADInternal<>(SB)
2031 // Can store at least 320 bytes
2032 VPXOR (0*32)(inp), AA0, AA0
2033 VPXOR (1*32)(inp), CC0, CC0
2034 VMOVDQU AA0, (0*32)(oup)
2035 VMOVDQU CC0, (1*32)(oup)
2037 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
2038 VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0
2039 VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup)
2040 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
2041 VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0
2042 VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup)
2048 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0
2050 JBE sealAVX2SealHash
2052 VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0
2053 VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup)
2069 // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
2070 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2071 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2072 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2073 VMOVDQA ctr3StoreAVX2, DD0
2074 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2075 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2077 VMOVDQA CC3, tmpStoreAVX2
2078 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2079 VMOVDQA tmpStoreAVX2, CC3
2080 VMOVDQA CC1, tmpStoreAVX2
2081 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2082 VMOVDQA tmpStoreAVX2, CC1
2084 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0
2085 VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1
2086 VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2
2087 VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3
2089 VMOVDQA CC3, tmpStoreAVX2
2090 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3)
2091 VMOVDQA tmpStoreAVX2, CC3
2092 VMOVDQA CC1, tmpStoreAVX2
2093 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1)
2094 VMOVDQA tmpStoreAVX2, CC1
2096 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0
2097 VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1
2098 VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2
2099 VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
2100 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2101 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2102 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2103 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2104 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2105 VMOVDQA CC3, tmpStoreAVX2
2106 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2107 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2108 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2109 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2110 VMOVDQA tmpStoreAVX2, CC3
2112 SUBQ $16, oup // Adjust the pointer
2114 JMP sealAVX2InternalLoopStart
2117 // Load state, increment counter blocks, store the incremented counters
2118 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2119 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2120 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2121 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2122 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2125 sealAVX2InternalLoop:
2127 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2129 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2130 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2132 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2133 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2135 VMOVDQA CC3, tmpStoreAVX2
2136 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2137 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2138 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2139 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2140 VMOVDQA tmpStoreAVX2, CC3
2143 sealAVX2InternalLoopStart:
2144 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2145 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2146 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2148 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2150 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2151 VMOVDQA CC3, tmpStoreAVX2
2152 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2153 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2154 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2155 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2156 VMOVDQA tmpStoreAVX2, CC3
2158 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
2159 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2160 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
2161 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2163 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2164 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2166 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2167 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2169 LEAQ (6*8)(oup), oup
2170 VMOVDQA CC3, tmpStoreAVX2
2171 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2172 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2173 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2174 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2175 VMOVDQA tmpStoreAVX2, CC3
2177 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2178 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2180 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2181 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2183 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2184 VMOVDQA CC3, tmpStoreAVX2
2185 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2186 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2187 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2188 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2189 VMOVDQA tmpStoreAVX2, CC3
2191 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
2192 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2193 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
2195 JNE sealAVX2InternalLoop
2197 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2198 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2199 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2200 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2201 VMOVDQA CC3, tmpStoreAVX2
2203 // We only hashed 480 of the 512 bytes available - hash the remaining 32 here
2206 LEAQ (4*8)(oup), oup
2207 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0
2208 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0
2209 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup)
2210 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0
2211 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
2212 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
2217 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0
2218 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
2219 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
2220 VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
2221 VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0
2222 VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup)
2223 LEAQ (32*16)(inp), inp
2228 // Tail can only hash 480 bytes
2245 // ----------------------------------------------------------------------------
2246 // Special optimization for buffers smaller than 193 bytes
2248 // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks
2252 VPADDD ·avx2IncMask<>(SB), DD0, DD1
2260 sealAVX2192InnerCipherLoop:
2261 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2262 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
2263 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2264 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
2265 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2266 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
2267 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2268 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
2270 JNE sealAVX2192InnerCipherLoop
2271 VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1
2272 VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1
2273 VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1
2274 VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1
2275 VPERM2I128 $0x02, AA0, BB0, TT0
2277 // Clamp and store poly key
2278 VPAND ·polyClampMask<>(SB), TT0, TT0
2279 VMOVDQA TT0, rsStoreAVX2
2281 // Stream for up to 192 bytes
2282 VPERM2I128 $0x13, AA0, BB0, AA0
2283 VPERM2I128 $0x13, CC0, DD0, BB0
2284 VPERM2I128 $0x02, AA1, BB1, CC0
2285 VPERM2I128 $0x02, CC1, DD1, DD0
2286 VPERM2I128 $0x13, AA1, BB1, AA1
2287 VPERM2I128 $0x13, CC1, DD1, BB1
2291 MOVQ ad_len+80(FP), itr2
2292 CALL polyHashADInternal<>(SB)
2296 // itr1 holds the number of bytes encrypted but not yet hashed
2298 JB sealAVX2ShortSealLoop
2303 JMP sealAVX2SealHash
2305 sealAVX2ShortSealLoop:
2307 JB sealAVX2ShortTail32
2310 // Load for encryption
2311 VPXOR (inp), AA0, AA0
2313 LEAQ (1*32)(inp), inp
2320 LEAQ (1*32)(oup), oup
2322 // Shift stream left
2332 JMP sealAVX2ShortSealLoop
2334 sealAVX2ShortTail32:
2337 JB sealAVX2ShortDone
2341 // Load for encryption
2344 LEAQ (1*16)(inp), inp
2349 LEAQ (1*16)(oup), oup
2350 VPERM2I128 $0x11, AA0, AA0, AA0
2357 // ----------------------------------------------------------------------------
2358 // Special optimization for buffers smaller than 321 bytes
2360 // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
2361 VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
2362 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
2363 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
2366 sealAVX2320InnerCipherLoop:
2367 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2368 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
2369 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2370 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
2371 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2372 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
2373 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2374 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
2376 JNE sealAVX2320InnerCipherLoop
2378 VMOVDQA ·chacha20Constants<>(SB), TT0
2379 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
2380 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
2381 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
2382 VMOVDQA ·avx2IncMask<>(SB), TT0
2383 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
2384 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
2385 VPADDD TT3, DD2, DD2
2387 // Clamp and store poly key
2388 VPERM2I128 $0x02, AA0, BB0, TT0
2389 VPAND ·polyClampMask<>(SB), TT0, TT0
2390 VMOVDQA TT0, rsStoreAVX2
2392 // Stream for up to 320 bytes
2393 VPERM2I128 $0x13, AA0, BB0, AA0
2394 VPERM2I128 $0x13, CC0, DD0, BB0
2395 VPERM2I128 $0x02, AA1, BB1, CC0
2396 VPERM2I128 $0x02, CC1, DD1, DD0
2397 VPERM2I128 $0x13, AA1, BB1, AA1
2398 VPERM2I128 $0x13, CC1, DD1, BB1
2399 VPERM2I128 $0x02, AA2, BB2, CC1
2400 VPERM2I128 $0x02, CC2, DD2, DD1
2401 VPERM2I128 $0x13, AA2, BB2, AA2
2402 VPERM2I128 $0x13, CC2, DD2, BB2
2403 JMP sealAVX2ShortSeal
2405 // ----------------------------------------------------------------------------
2406 // Special optimization for the last 128 bytes of ciphertext
2408 // Need to encrypt up to 128 bytes - prepare two blocks
2409 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2410 // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2411 VMOVDQA ·chacha20Constants<>(SB), AA0
2412 VMOVDQA state1StoreAVX2, BB0
2413 VMOVDQA state2StoreAVX2, CC0
2414 VMOVDQA ctr3StoreAVX2, DD0
2415 VPADDD ·avx2IncMask<>(SB), DD0, DD0
2418 sealAVX2Tail128LoopA:
2423 sealAVX2Tail128LoopB:
2424 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
2427 VPALIGNR $4, BB0, BB0, BB0
2428 VPALIGNR $8, CC0, CC0, CC0
2429 VPALIGNR $12, DD0, DD0, DD0
2430 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
2434 VPALIGNR $12, BB0, BB0, BB0
2435 VPALIGNR $8, CC0, CC0, CC0
2436 VPALIGNR $4, DD0, DD0, DD0
2438 JG sealAVX2Tail128LoopA
2440 JGE sealAVX2Tail128LoopB
2442 VPADDD ·chacha20Constants<>(SB), AA0, AA1
2443 VPADDD state1StoreAVX2, BB0, BB1
2444 VPADDD state2StoreAVX2, CC0, CC1
2445 VPADDD DD1, DD0, DD1
2447 VPERM2I128 $0x02, AA1, BB1, AA0
2448 VPERM2I128 $0x02, CC1, DD1, BB0
2449 VPERM2I128 $0x13, AA1, BB1, CC0
2450 VPERM2I128 $0x13, CC1, DD1, DD0
2451 JMP sealAVX2ShortSealLoop
2453 // ----------------------------------------------------------------------------
2454 // Special optimization for the last 256 bytes of ciphertext
2456 // Need to encrypt up to 256 bytes - prepare two blocks
2457 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2458 // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2459 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
2460 VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
2461 VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
2462 VMOVDQA ctr3StoreAVX2, DD0
2463 VPADDD ·avx2IncMask<>(SB), DD0, DD0
2464 VPADDD ·avx2IncMask<>(SB), DD0, DD1
2468 sealAVX2Tail256LoopA:
2473 sealAVX2Tail256LoopB:
2474 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2477 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
2478 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2479 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
2480 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
2484 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
2485 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
2486 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
2488 JG sealAVX2Tail256LoopA
2490 JGE sealAVX2Tail256LoopB
2492 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
2493 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
2494 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
2495 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
2496 VPERM2I128 $0x02, AA0, BB0, TT0
2497 VPERM2I128 $0x02, CC0, DD0, TT1
2498 VPERM2I128 $0x13, AA0, BB0, TT2
2499 VPERM2I128 $0x13, CC0, DD0, TT3
2500 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
2501 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
2505 VPERM2I128 $0x02, AA1, BB1, AA0
2506 VPERM2I128 $0x02, CC1, DD1, BB0
2507 VPERM2I128 $0x13, AA1, BB1, CC0
2508 VPERM2I128 $0x13, CC1, DD1, DD0
2510 JMP sealAVX2SealHash
2512 // ----------------------------------------------------------------------------
2513 // Special optimization for the last 384 bytes of ciphertext
2515 // Need to encrypt up to 384 bytes - prepare two blocks
2516 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2517 // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2518 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
2519 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
2520 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
2521 VMOVDQA ctr3StoreAVX2, DD0
2522 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
2523 VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
2525 sealAVX2Tail384LoopA:
2530 sealAVX2Tail384LoopB:
2531 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2534 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
2535 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2536 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
2537 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
2541 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
2542 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
2543 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
2545 JG sealAVX2Tail384LoopA
2547 JGE sealAVX2Tail384LoopB
2549 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
2550 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
2551 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
2552 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
2553 VPERM2I128 $0x02, AA0, BB0, TT0
2554 VPERM2I128 $0x02, CC0, DD0, TT1
2555 VPERM2I128 $0x13, AA0, BB0, TT2
2556 VPERM2I128 $0x13, CC0, DD0, TT3
2557 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
2558 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
2559 VPERM2I128 $0x02, AA1, BB1, TT0
2560 VPERM2I128 $0x02, CC1, DD1, TT1
2561 VPERM2I128 $0x13, AA1, BB1, TT2
2562 VPERM2I128 $0x13, CC1, DD1, TT3
2563 VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
2564 VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
2568 VPERM2I128 $0x02, AA2, BB2, AA0
2569 VPERM2I128 $0x02, CC2, DD2, BB0
2570 VPERM2I128 $0x13, AA2, BB2, CC0
2571 VPERM2I128 $0x13, CC2, DD2, DD0
2573 JMP sealAVX2SealHash
2575 // ----------------------------------------------------------------------------
2576 // Special optimization for the last 512 bytes of ciphertext
2578 // Need to encrypt up to 512 bytes - prepare two blocks
2579 // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
2580 // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
2581 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
2582 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
2583 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
2584 VMOVDQA ctr3StoreAVX2, DD0
2585 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
2586 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
2588 sealAVX2Tail512LoopA:
2593 sealAVX2Tail512LoopB:
2594 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2595 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2596 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2597 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2598 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2599 VMOVDQA CC3, tmpStoreAVX2
2600 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2601 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2602 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2603 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2604 VMOVDQA tmpStoreAVX2, CC3
2607 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2608 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2609 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2610 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2611 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2612 VMOVDQA CC3, tmpStoreAVX2
2613 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2614 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2615 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2616 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2617 VMOVDQA tmpStoreAVX2, CC3
2618 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
2619 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2620 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
2621 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2622 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2623 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
2624 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2625 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2628 LEAQ (4*8)(oup), oup
2629 VMOVDQA CC3, tmpStoreAVX2
2630 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
2631 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
2632 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
2633 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
2634 VMOVDQA tmpStoreAVX2, CC3
2635 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
2636 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
2637 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
2638 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
2639 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
2640 VMOVDQA CC3, tmpStoreAVX2
2641 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
2642 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
2643 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
2644 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
2645 VMOVDQA tmpStoreAVX2, CC3
2646 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
2647 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
2648 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
2651 JG sealAVX2Tail512LoopA
2653 JGE sealAVX2Tail512LoopB
2655 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
2656 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
2657 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
2658 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
2659 VMOVDQA CC3, tmpStoreAVX2
2660 VPERM2I128 $0x02, AA0, BB0, CC3
2661 VPXOR (0*32)(inp), CC3, CC3
2662 VMOVDQU CC3, (0*32)(oup)
2663 VPERM2I128 $0x02, CC0, DD0, CC3
2664 VPXOR (1*32)(inp), CC3, CC3
2665 VMOVDQU CC3, (1*32)(oup)
2666 VPERM2I128 $0x13, AA0, BB0, CC3
2667 VPXOR (2*32)(inp), CC3, CC3
2668 VMOVDQU CC3, (2*32)(oup)
2669 VPERM2I128 $0x13, CC0, DD0, CC3
2670 VPXOR (3*32)(inp), CC3, CC3
2671 VMOVDQU CC3, (3*32)(oup)
2673 VPERM2I128 $0x02, AA1, BB1, AA0
2674 VPERM2I128 $0x02, CC1, DD1, BB0
2675 VPERM2I128 $0x13, AA1, BB1, CC0
2676 VPERM2I128 $0x13, CC1, DD1, DD0
2677 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
2678 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)
2680 VPERM2I128 $0x02, AA2, BB2, AA0
2681 VPERM2I128 $0x02, CC2, DD2, BB0
2682 VPERM2I128 $0x13, AA2, BB2, CC0
2683 VPERM2I128 $0x13, CC2, DD2, DD0
2684 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
2685 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)
2690 VPERM2I128 $0x02, AA3, BB3, AA0
2691 VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
2692 VPERM2I128 $0x13, AA3, BB3, CC0
2693 VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
2695 JMP sealAVX2SealHash
2697 // func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
2698 TEXT ·cpuid(SB), NOSPLIT, $0-24
2699 MOVL eaxArg+0(FP), AX
2700 MOVL ecxArg+4(FP), CX
2708 // func xgetbv() (eax, edx uint32)
2709 TEXT ·xgetbv(SB),NOSPLIT,$0-8