// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.

// +build go1.7,amd64,!gccgo,!appengine

#include "textflag.h"

// General register allocation
#define oup DI
#define inp SI
#define inl BX
#define adp CX // free to reuse, after we hash the additional data
#define keyp R8 // free to reuse, when we copy the key to stack
#define itr2 R9 // general iterator
#define itr1 CX // general iterator
#define acc0 R10
#define acc1 R11
#define acc2 R12
#define t0 R13
#define t1 R14
#define t2 R15
#define t3 R8

// Register and stack allocation for the SSE code
#define rStore (0*16)(BP)
#define sStore (1*16)(BP)
#define state1Store (2*16)(BP)
#define state2Store (3*16)(BP)
#define tmpStore (4*16)(BP)
#define ctr0Store (5*16)(BP)
#define ctr1Store (6*16)(BP)
#define ctr2Store (7*16)(BP)
#define ctr3Store (8*16)(BP)
#define A0 X0
#define A1 X1
#define A2 X2
#define B0 X3
#define B1 X4
#define B2 X5
#define C0 X6
#define C1 X7
#define C2 X8
#define D0 X9
#define D1 X10
#define D2 X11
#define T0 X12
#define T1 X13
#define T2 X14
#define T3 X15
#define A3 T0
#define B3 T1
#define C3 T2
#define D3 T3

// Register and stack allocation for the AVX2 code
#define rsStoreAVX2 (0*32)(BP)
#define state1StoreAVX2 (1*32)(BP)
#define state2StoreAVX2 (2*32)(BP)
#define ctr0StoreAVX2 (3*32)(BP)
#define ctr1StoreAVX2 (4*32)(BP)
#define ctr2StoreAVX2 (5*32)(BP)
#define ctr3StoreAVX2 (6*32)(BP)
#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack
#define AA0 Y0
#define AA1 Y5
#define AA2 Y6
#define AA3 Y7
#define BB0 Y14
#define BB1 Y9
#define BB2 Y10
#define BB3 Y11
#define CC0 Y12
#define CC1 Y13
#define CC2 Y8
#define CC3 Y15
#define DD0 Y4
#define DD1 Y1
#define DD2 Y2
#define DD3 Y3
#define TT0 DD3
#define TT1 AA3
#define TT2 BB3
#define TT3 CC3

// ChaCha20 constants
DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574

// <<< 16 with PSHUFB
DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A

// <<< 8 with PSHUFB
DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B

DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
DATA ·avx2InitMask<>+0x18(SB)/8, $0x0

DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
DATA ·avx2IncMask<>+0x18(SB)/8, $0x0

// Poly1305 key clamp
DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF

DATA ·sseIncMask<>+0x00(SB)/8, $0x1
DATA ·sseIncMask<>+0x08(SB)/8, $0x0

// To load/store the last < 16 bytes in a buffer
DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff

GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240

// No PALIGNR in Go ASM yet (but VPALIGNR is present).
#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5
#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13
#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6
#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7
#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8
#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14
#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9
#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10
#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11
#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15
#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3
#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4
#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5
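// The quarter-round macros defined further below (chachaQR and chachaQR_AVX2)
// implement the standard ChaCha20 quarter round: the 16- and 8-bit rotations
// are done with PSHUFB/VPSHUFB against the rol16/rol8 tables above, while the
// 12- and 7-bit rotations use a shift/shift/xor pair. As a plain-Go reference
// for what a single macro expansion computes (a sketch for readers only, not
// code used by this file; quarterRound is an illustrative name):
//
//	// import "math/bits"
//	func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
//		a += b; d ^= a; d = bits.RotateLeft32(d, 16)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 12)
//		a += b; d ^= a; d = bits.RotateLeft32(d, 8)
//		c += d; b ^= c; b = bits.RotateLeft32(b, 7)
//		return a, b, c, d
//	}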
#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X13, X13 #define shiftC0Right shiftC0Left #define shiftC1Right shiftC1Left #define shiftC2Right shiftC2Left #define shiftC3Right shiftC3Left #define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9 #define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10 #define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11 #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 // Some macros #define chachaQR(A, B, C, D, T) \ PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \ PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \ PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B #define chachaQR_AVX2(A, B, C, D, T) \ VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \ VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \ VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \ VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B #define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2 #define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2 #define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX #define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3 #define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t2:t3; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2 #define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2 #define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3 #define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3 #define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage #define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage // ---------------------------------------------------------------------------- TEXT polyHashADInternal<>(SB), NOSPLIT, $0 // adp points to beginning of additional data // itr2 holds ad length XORQ acc0, acc0 XORQ acc1, acc1 XORQ acc2, acc2 CMPQ itr2, $13 JNE hashADLoop openFastTLSAD: // Special treatment for the TLS case of 13 bytes MOVQ (adp), acc0 MOVQ 5(adp), acc1 SHRQ $24, acc1 MOVQ $1, acc2 polyMul RET hashADLoop: // Hash in 16 byte chunks CMPQ itr2, $16 JB hashADTail polyAdd(0(adp)) LEAQ (1*16)(adp), adp SUBQ $16, itr2 polyMul JMP hashADLoop hashADTail: CMPQ itr2, $0 JE hashADDone // Hash last < 16 byte tail XORQ t0, t0 XORQ t1, t1 XORQ t2, t2 ADDQ itr2, adp hashADTailLoop: SHLQ $8, t1:t0 SHLQ $8, t0 MOVB -1(adp), t2 XORQ t2, t0 DECQ adp DECQ itr2 JNE hashADTailLoop hashADTailFinish: ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 polyMul // Finished AD hashADDone: RET // 
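// Poly1305 bookkeeping used throughout this file: the accumulator lives in
// acc0:acc1:acc2 (roughly 130 bits), the clamped key half r is kept at rStore
// and the final pad s at sStore. polyAdd adds a 16-byte block together with
// the implicit pad bit at 2^128 (the trailing ADCQ $1), and polyMul multiplies
// the accumulator by r and reduces modulo 2^130 - 5. The reduction relies on
// 2^130 ≡ 5 (mod 2^130 - 5), so the limbs above bit 130 are folded back in as
// x + 4*x, which is what the ANDQ $-4 / SHRQ $2 pair in polyMulReduceStage
// computes. polyHashADInternal above also keeps a fast path for the common
// TLS case of exactly 13 bytes of additional data.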
---------------------------------------------------------------------------- // func chacha20Poly1305Open(dst, key, src, ad []byte) bool TEXT ·chacha20Poly1305Open(SB), 0, $288-97 // For aligned stack access MOVQ SP, BP ADDQ $32, BP ANDQ $-32, BP MOVQ dst+0(FP), oup MOVQ key+24(FP), keyp MOVQ src+48(FP), inp MOVQ src_len+56(FP), inl MOVQ ad+72(FP), adp // Check for AVX2 support CMPB ·useAVX2(SB), $1 JE chacha20Poly1305Open_AVX2 // Special optimization, for very short buffers CMPQ inl, $128 JBE openSSE128 // About 16% faster // For long buffers, prepare the poly key first MOVOU ·chacha20Constants<>(SB), A0 MOVOU (1*16)(keyp), B0 MOVOU (2*16)(keyp), C0 MOVOU (3*16)(keyp), D0 MOVO D0, T1 // Store state on stack for future use MOVO B0, state1Store MOVO C0, state2Store MOVO D0, ctr3Store MOVQ $10, itr2 openSSEPreparePolyKey: chachaQR(A0, B0, C0, D0, T0) shiftB0Left; shiftC0Left; shiftD0Left chachaQR(A0, B0, C0, D0, T0) shiftB0Right; shiftC0Right; shiftD0Right DECQ itr2 JNE openSSEPreparePolyKey // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0 // Clamp and store the key PAND ·polyClampMask<>(SB), A0 MOVO A0, rStore; MOVO B0, sStore // Hash AAD MOVQ ad_len+80(FP), itr2 CALL polyHashADInternal<>(SB) openSSEMainLoop: CMPQ inl, $256 JB openSSEMainLoopDone // Load state, increment counter blocks MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 // Store counters MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 MOVQ $4, itr1 MOVQ inp, itr2 openSSEInternalLoop: MOVO C3, tmpStore chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) MOVO tmpStore, C3 MOVO C1, tmpStore chachaQR(A3, B3, C3, D3, C1) MOVO tmpStore, C1 polyAdd(0(itr2)) shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left polyMulStage1 polyMulStage2 LEAQ (2*8)(itr2), itr2 MOVO C3, tmpStore chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) MOVO tmpStore, C3 MOVO C1, tmpStore polyMulStage3 chachaQR(A3, B3, C3, D3, C1) MOVO tmpStore, C1 polyMulReduceStage shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right DECQ itr1 JGE openSSEInternalLoop polyAdd(0(itr2)) polyMul LEAQ (2*8)(itr2), itr2 CMPQ itr1, $-6 JG openSSEInternalLoop // Add in the state PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 // Load - xor - store MOVO D3, tmpStore MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup) MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup) MOVOU 
(2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup) MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup) MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup) MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup) MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup) MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup) MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup) MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup) MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup) MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup) MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup) MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup) MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup) MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup) LEAQ 256(inp), inp LEAQ 256(oup), oup SUBQ $256, inl JMP openSSEMainLoop openSSEMainLoopDone: // Handle the various tail sizes efficiently TESTQ inl, inl JE openSSEFinalize CMPQ inl, $64 JBE openSSETail64 CMPQ inl, $128 JBE openSSETail128 CMPQ inl, $192 JBE openSSETail192 JMP openSSETail256 openSSEFinalize: // Hash in the PT, AAD lengths ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2 polyMul // Final reduce MOVQ acc0, t0 MOVQ acc1, t1 MOVQ acc2, t2 SUBQ $-5, acc0 SBBQ $-1, acc1 SBBQ $3, acc2 CMOVQCS t0, acc0 CMOVQCS t1, acc1 CMOVQCS t2, acc2 // Add in the "s" part of the key ADDQ 0+sStore, acc0 ADCQ 8+sStore, acc1 // Finally, constant time compare to the tag at the end of the message XORQ AX, AX MOVQ $1, DX XORQ (0*8)(inp), acc0 XORQ (1*8)(inp), acc1 ORQ acc1, acc0 CMOVQEQ DX, AX // Return true iff tags are equal MOVB AX, ret+96(FP) RET // ---------------------------------------------------------------------------- // Special optimization for buffers smaller than 129 bytes openSSE128: // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 MOVQ $10, itr2 openSSE128InnerCipherLoop: chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) shiftB0Left; shiftB1Left; shiftB2Left shiftC0Left; shiftC1Left; shiftC2Left shiftD0Left; shiftD1Left; shiftD2Left chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) shiftB0Right; shiftB1Right; shiftB2Right shiftC0Right; shiftC1Right; shiftC2Right shiftD0Right; shiftD1Right; shiftD2Right DECQ itr2 JNE openSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 PADDL T2, C1; PADDL T2, C2 PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 // Clamp and store the key PAND ·polyClampMask<>(SB), A0 MOVOU A0, rStore; MOVOU B0, sStore // Hash MOVQ ad_len+80(FP), itr2 CALL polyHashADInternal<>(SB) openSSE128Open: CMPQ inl, $16 JB openSSETail16 SUBQ $16, inl // Load for hashing polyAdd(0(inp)) // Load for decryption MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup) LEAQ (1*16)(inp), inp LEAQ (1*16)(oup), oup polyMul // Shift the stream "left" MOVO B1, A1 MOVO C1, B1 MOVO D1, C1 MOVO A2, D1 MOVO B2, A2 MOVO C2, B2 MOVO D2, C2 JMP openSSE128Open 
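// openSSETail16 below decrypts a final block of fewer than 16 bytes. Loading
// a full 16 bytes is safe even when fewer remain, because the ciphertext is
// always followed by the 16-byte Poly1305 tag; the bytes that do not belong
// to the message are then cleared with an entry of the andMask table before
// hashing. The tag itself is verified in openSSEFinalize by hashing the AD
// and ciphertext lengths, performing the final reduction modulo 2^130 - 5,
// adding the "s" half of the key, and comparing against the received tag
// without branching on secret data. In Go terms the comparison is equivalent
// to the sketch below (expected and received are illustrative names, not
// identifiers used in this file):
//
//	// import "crypto/subtle"
//	if subtle.ConstantTimeCompare(expected[:16], received[:16]) != 1 {
//		// authentication failed; the caller rejects the message
//	}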
openSSETail16: TESTQ inl, inl JE openSSEFinalize // We can safely load the CT from the end, because it is padded with the MAC MOVQ inl, itr2 SHLQ $4, itr2 LEAQ ·andMask<>(SB), t0 MOVOU (inp), T0 ADDQ inl, inp PAND -16(t0)(itr2*1), T0 MOVO T0, 0+tmpStore MOVQ T0, t0 MOVQ 8+tmpStore, t1 PXOR A1, T0 // We can only store one byte at a time, since plaintext can be shorter than 16 bytes openSSETail16Store: MOVQ T0, t3 MOVB t3, (oup) PSRLDQ $1, T0 INCQ oup DECQ inl JNE openSSETail16Store ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 polyMul JMP openSSEFinalize // ---------------------------------------------------------------------------- // Special optimization for the last 64 bytes of ciphertext openSSETail64: // Need to decrypt up to 64 bytes - prepare single block MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store XORQ itr2, itr2 MOVQ inl, itr1 CMPQ itr1, $16 JB openSSETail64LoopB openSSETail64LoopA: // Perform ChaCha rounds, while hashing the remaining input polyAdd(0(inp)(itr2*1)) polyMul SUBQ $16, itr1 openSSETail64LoopB: ADDQ $16, itr2 chachaQR(A0, B0, C0, D0, T0) shiftB0Left; shiftC0Left; shiftD0Left chachaQR(A0, B0, C0, D0, T0) shiftB0Right; shiftC0Right; shiftD0Right CMPQ itr1, $16 JAE openSSETail64LoopA CMPQ itr2, $160 JNE openSSETail64LoopB PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 openSSETail64DecLoop: CMPQ inl, $16 JB openSSETail64DecLoopDone SUBQ $16, inl MOVOU (inp), T0 PXOR T0, A0 MOVOU A0, (oup) LEAQ 16(inp), inp LEAQ 16(oup), oup MOVO B0, A0 MOVO C0, B0 MOVO D0, C0 JMP openSSETail64DecLoop openSSETail64DecLoopDone: MOVO A0, A1 JMP openSSETail16 // ---------------------------------------------------------------------------- // Special optimization for the last 128 bytes of ciphertext openSSETail128: // Need to decrypt up to 128 bytes - prepare two blocks MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store XORQ itr2, itr2 MOVQ inl, itr1 ANDQ $-16, itr1 openSSETail128LoopA: // Perform ChaCha rounds, while hashing the remaining input polyAdd(0(inp)(itr2*1)) polyMul openSSETail128LoopB: ADDQ $16, itr2 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) shiftB0Left; shiftC0Left; shiftD0Left shiftB1Left; shiftC1Left; shiftD1Left chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) shiftB0Right; shiftC0Right; shiftD0Right shiftB1Right; shiftC1Right; shiftD1Right CMPQ itr2, itr1 JB openSSETail128LoopA CMPQ itr2, $160 JNE openSSETail128LoopB PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 PADDL state1Store, B0; PADDL state1Store, B1 PADDL state2Store, C0; PADDL state2Store, C1 PADDL ctr1Store, D0; PADDL ctr0Store, D1 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) SUBQ $64, inl LEAQ 64(inp), inp LEAQ 64(oup), oup JMP openSSETail64DecLoop // ---------------------------------------------------------------------------- // Special optimization for the last 192 bytes of ciphertext openSSETail192: // Need to decrypt up to 192 bytes - prepare three blocks MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; 
MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store MOVQ inl, itr1 MOVQ $160, itr2 CMPQ itr1, $160 CMOVQGT itr2, itr1 ANDQ $-16, itr1 XORQ itr2, itr2 openSSLTail192LoopA: // Perform ChaCha rounds, while hashing the remaining input polyAdd(0(inp)(itr2*1)) polyMul openSSLTail192LoopB: ADDQ $16, itr2 chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) shiftB0Left; shiftC0Left; shiftD0Left shiftB1Left; shiftC1Left; shiftD1Left shiftB2Left; shiftC2Left; shiftD2Left chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) shiftB0Right; shiftC0Right; shiftD0Right shiftB1Right; shiftC1Right; shiftD1Right shiftB2Right; shiftC2Right; shiftD2Right CMPQ itr2, itr1 JB openSSLTail192LoopA CMPQ itr2, $160 JNE openSSLTail192LoopB CMPQ inl, $176 JB openSSLTail192Store polyAdd(160(inp)) polyMul CMPQ inl, $192 JB openSSLTail192Store polyAdd(176(inp)) polyMul openSSLTail192Store: PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2 MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup) MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) SUBQ $128, inl LEAQ 128(inp), inp LEAQ 128(oup), oup JMP openSSETail64DecLoop // ---------------------------------------------------------------------------- // Special optimization for the last 256 bytes of ciphertext openSSETail256: // Need to decrypt up to 256 bytes - prepare four blocks MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 // Store counters MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store XORQ itr2, itr2 openSSETail256Loop: // This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication polyAdd(0(inp)(itr2*1)) MOVO C3, tmpStore chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) MOVO tmpStore, C3 MOVO C1, tmpStore chachaQR(A3, B3, C3, D3, C1) MOVO tmpStore, C1 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left polyMulStage1 polyMulStage2 MOVO C3, tmpStore chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) MOVO tmpStore, C3 MOVO C1, tmpStore chachaQR(A3, B3, C3, D3, C1) MOVO tmpStore, C1 polyMulStage3 polyMulReduceStage shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right shiftD0Right; shiftD1Right; 
shiftD2Right; shiftD3Right ADDQ $2*8, itr2 CMPQ itr2, $160 JB openSSETail256Loop MOVQ inl, itr1 ANDQ $-16, itr1 openSSETail256HashLoop: polyAdd(0(inp)(itr2*1)) polyMul ADDQ $2*8, itr2 CMPQ itr2, itr1 JB openSSETail256HashLoop // Add in the state PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 MOVO D3, tmpStore // Load - xor - store MOVOU (0*16)(inp), D3; PXOR D3, A0 MOVOU (1*16)(inp), D3; PXOR D3, B0 MOVOU (2*16)(inp), D3; PXOR D3, C0 MOVOU (3*16)(inp), D3; PXOR D3, D0 MOVOU A0, (0*16)(oup) MOVOU B0, (1*16)(oup) MOVOU C0, (2*16)(oup) MOVOU D0, (3*16)(oup) MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) LEAQ 192(inp), inp LEAQ 192(oup), oup SUBQ $192, inl MOVO A3, A0 MOVO B3, B0 MOVO C3, C0 MOVO tmpStore, D0 JMP openSSETail64DecLoop // ---------------------------------------------------------------------------- // ------------------------- AVX2 Code ---------------------------------------- chacha20Poly1305Open_AVX2: VZEROUPPER VMOVDQU ·chacha20Constants<>(SB), AA0 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 VPADDD ·avx2InitMask<>(SB), DD0, DD0 // Special optimization, for very short buffers CMPQ inl, $192 JBE openAVX2192 CMPQ inl, $320 JBE openAVX2320 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream VMOVDQA BB0, state1StoreAVX2 VMOVDQA CC0, state2StoreAVX2 VMOVDQA DD0, ctr3StoreAVX2 MOVQ $10, itr2 openAVX2PreparePolyKey: chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 DECQ itr2 JNE openAVX2PreparePolyKey VPADDD ·chacha20Constants<>(SB), AA0, AA0 VPADDD state1StoreAVX2, BB0, BB0 VPADDD state2StoreAVX2, CC0, CC0 VPADDD ctr3StoreAVX2, DD0, DD0 VPERM2I128 $0x02, AA0, BB0, TT0 // Clamp and store poly key VPAND ·polyClampMask<>(SB), TT0, TT0 VMOVDQA TT0, rsStoreAVX2 // Stream for the first 64 bytes VPERM2I128 $0x13, AA0, BB0, AA0 VPERM2I128 $0x13, CC0, DD0, BB0 // Hash AD + first 64 bytes MOVQ ad_len+80(FP), itr2 CALL polyHashADInternal<>(SB) XORQ itr1, itr1 openAVX2InitialHash64: polyAdd(0(inp)(itr1*1)) polyMulAVX2 ADDQ $16, itr1 CMPQ itr1, $64 JNE openAVX2InitialHash64 // Decrypt the first 64 bytes VPXOR (0*32)(inp), AA0, AA0 VPXOR (1*32)(inp), BB0, BB0 VMOVDQU AA0, (0*32)(oup) VMOVDQU BB0, (1*32)(oup) LEAQ (2*32)(inp), inp LEAQ (2*32)(oup), oup SUBQ $64, inl openAVX2MainLoop: CMPQ inl, $512 JB openAVX2MainLoopDone // Load state, 
increment counter blocks, store the incremented counters VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 XORQ itr1, itr1 openAVX2InternalLoop: // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext polyAdd(0*8(inp)(itr1*1)) VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 polyMulStage1_AVX2 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 polyMulStage2_AVX2 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 polyMulStage3_AVX2 VMOVDQA CC3, tmpStoreAVX2 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 VMOVDQA tmpStoreAVX2, CC3 polyMulReduceStage VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 polyAdd(2*8(inp)(itr1*1)) VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 polyMulStage1_AVX2 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VMOVDQA CC3, tmpStoreAVX2 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 VMOVDQA tmpStoreAVX2, CC3 polyMulStage2_AVX2 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 polyMulStage3_AVX2 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 polyMulReduceStage VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 polyAdd(4*8(inp)(itr1*1)) LEAQ (6*8)(itr1), itr1 VMOVDQA CC3, tmpStoreAVX2 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, 
BB1 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 VMOVDQA tmpStoreAVX2, CC3 polyMulStage1_AVX2 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 polyMulStage2_AVX2 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 polyMulStage3_AVX2 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VMOVDQA CC3, tmpStoreAVX2 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 VMOVDQA tmpStoreAVX2, CC3 polyMulReduceStage VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 CMPQ itr1, $480 JNE openAVX2InternalLoop VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 VMOVDQA CC3, tmpStoreAVX2 // We only hashed 480 of the 512 bytes available - hash the remaining 32 here polyAdd(480(inp)) polyMulAVX2 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) // and here polyAdd(496(inp)) polyMulAVX2 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) LEAQ 
(32*16)(inp), inp LEAQ (32*16)(oup), oup SUBQ $(32*16), inl JMP openAVX2MainLoop openAVX2MainLoopDone: // Handle the various tail sizes efficiently TESTQ inl, inl JE openSSEFinalize CMPQ inl, $128 JBE openAVX2Tail128 CMPQ inl, $256 JBE openAVX2Tail256 CMPQ inl, $384 JBE openAVX2Tail384 JMP openAVX2Tail512 // ---------------------------------------------------------------------------- // Special optimization for buffers smaller than 193 bytes openAVX2192: // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks VMOVDQA AA0, AA1 VMOVDQA BB0, BB1 VMOVDQA CC0, CC1 VPADDD ·avx2IncMask<>(SB), DD0, DD1 VMOVDQA AA0, AA2 VMOVDQA BB0, BB2 VMOVDQA CC0, CC2 VMOVDQA DD0, DD2 VMOVDQA DD1, TT3 MOVQ $10, itr2 openAVX2192InnerCipherLoop: chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 DECQ itr2 JNE openAVX2192InnerCipherLoop VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 VPERM2I128 $0x02, AA0, BB0, TT0 // Clamp and store poly key VPAND ·polyClampMask<>(SB), TT0, TT0 VMOVDQA TT0, rsStoreAVX2 // Stream for up to 192 bytes VPERM2I128 $0x13, AA0, BB0, AA0 VPERM2I128 $0x13, CC0, DD0, BB0 VPERM2I128 $0x02, AA1, BB1, CC0 VPERM2I128 $0x02, CC1, DD1, DD0 VPERM2I128 $0x13, AA1, BB1, AA1 VPERM2I128 $0x13, CC1, DD1, BB1 openAVX2ShortOpen: // Hash MOVQ ad_len+80(FP), itr2 CALL polyHashADInternal<>(SB) openAVX2ShortOpenLoop: CMPQ inl, $32 JB openAVX2ShortTail32 SUBQ $32, inl // Load for hashing polyAdd(0*8(inp)) polyMulAVX2 polyAdd(2*8(inp)) polyMulAVX2 // Load for decryption VPXOR (inp), AA0, AA0 VMOVDQU AA0, (oup) LEAQ (1*32)(inp), inp LEAQ (1*32)(oup), oup // Shift stream left VMOVDQA BB0, AA0 VMOVDQA CC0, BB0 VMOVDQA DD0, CC0 VMOVDQA AA1, DD0 VMOVDQA BB1, AA1 VMOVDQA CC1, BB1 VMOVDQA DD1, CC1 VMOVDQA AA2, DD1 VMOVDQA BB2, AA2 JMP openAVX2ShortOpenLoop openAVX2ShortTail32: CMPQ inl, $16 VMOVDQA A0, A1 JB openAVX2ShortDone SUBQ $16, inl // Load for hashing polyAdd(0*8(inp)) polyMulAVX2 // Load for decryption VPXOR (inp), A0, T0 VMOVDQU T0, (oup) LEAQ (1*16)(inp), inp LEAQ (1*16)(oup), oup VPERM2I128 $0x11, AA0, AA0, AA0 VMOVDQA A0, A1 openAVX2ShortDone: VZEROUPPER JMP openSSETail16 // ---------------------------------------------------------------------------- // Special optimization for buffers smaller than 321 bytes openAVX2320: // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 MOVQ $10, itr2 openAVX2320InnerCipherLoop: chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; 
VPALIGNR $12, DD2, DD2, DD2 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 DECQ itr2 JNE openAVX2320InnerCipherLoop VMOVDQA ·chacha20Constants<>(SB), TT0 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 VMOVDQA ·avx2IncMask<>(SB), TT0 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 VPADDD TT3, DD2, DD2 // Clamp and store poly key VPERM2I128 $0x02, AA0, BB0, TT0 VPAND ·polyClampMask<>(SB), TT0, TT0 VMOVDQA TT0, rsStoreAVX2 // Stream for up to 320 bytes VPERM2I128 $0x13, AA0, BB0, AA0 VPERM2I128 $0x13, CC0, DD0, BB0 VPERM2I128 $0x02, AA1, BB1, CC0 VPERM2I128 $0x02, CC1, DD1, DD0 VPERM2I128 $0x13, AA1, BB1, AA1 VPERM2I128 $0x13, CC1, DD1, BB1 VPERM2I128 $0x02, AA2, BB2, CC1 VPERM2I128 $0x02, CC2, DD2, DD1 VPERM2I128 $0x13, AA2, BB2, AA2 VPERM2I128 $0x13, CC2, DD2, BB2 JMP openAVX2ShortOpen // ---------------------------------------------------------------------------- // Special optimization for the last 128 bytes of ciphertext openAVX2Tail128: // Need to decrypt up to 128 bytes - prepare two blocks VMOVDQA ·chacha20Constants<>(SB), AA1 VMOVDQA state1StoreAVX2, BB1 VMOVDQA state2StoreAVX2, CC1 VMOVDQA ctr3StoreAVX2, DD1 VPADDD ·avx2IncMask<>(SB), DD1, DD1 VMOVDQA DD1, DD0 XORQ itr2, itr2 MOVQ inl, itr1 ANDQ $-16, itr1 TESTQ itr1, itr1 JE openAVX2Tail128LoopB openAVX2Tail128LoopA: // Perform ChaCha rounds, while hashing the remaining input polyAdd(0(inp)(itr2*1)) polyMulAVX2 openAVX2Tail128LoopB: ADDQ $16, itr2 chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) VPALIGNR $4, BB1, BB1, BB1 VPALIGNR $8, CC1, CC1, CC1 VPALIGNR $12, DD1, DD1, DD1 chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) VPALIGNR $12, BB1, BB1, BB1 VPALIGNR $8, CC1, CC1, CC1 VPALIGNR $4, DD1, DD1, DD1 CMPQ itr2, itr1 JB openAVX2Tail128LoopA CMPQ itr2, $160 JNE openAVX2Tail128LoopB VPADDD ·chacha20Constants<>(SB), AA1, AA1 VPADDD state1StoreAVX2, BB1, BB1 VPADDD state2StoreAVX2, CC1, CC1 VPADDD DD0, DD1, DD1 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 openAVX2TailLoop: CMPQ inl, $32 JB openAVX2Tail SUBQ $32, inl // Load for decryption VPXOR (inp), AA0, AA0 VMOVDQU AA0, (oup) LEAQ (1*32)(inp), inp LEAQ (1*32)(oup), oup VMOVDQA BB0, AA0 VMOVDQA CC0, BB0 VMOVDQA DD0, CC0 JMP openAVX2TailLoop openAVX2Tail: CMPQ inl, $16 VMOVDQA A0, A1 JB openAVX2TailDone SUBQ $16, inl // Load for decryption VPXOR (inp), A0, T0 VMOVDQU T0, (oup) LEAQ (1*16)(inp), inp LEAQ (1*16)(oup), oup VPERM2I128 $0x11, AA0, AA0, AA0 VMOVDQA A0, A1 openAVX2TailDone: VZEROUPPER JMP openSSETail16 // ---------------------------------------------------------------------------- // Special optimization for the last 256 bytes of ciphertext openAVX2Tail256: // Need to decrypt up to 256 bytes - prepare four blocks VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 VMOVDQA ctr3StoreAVX2, DD0 VPADDD ·avx2IncMask<>(SB), DD0, DD0 VPADDD ·avx2IncMask<>(SB), DD0, DD1 VMOVDQA DD0, TT1 VMOVDQA DD1, TT2 // Compute the 
number of iterations that will hash data MOVQ inl, tmpStoreAVX2 MOVQ inl, itr1 SUBQ $128, itr1 SHRQ $4, itr1 MOVQ $10, itr2 CMPQ itr1, $10 CMOVQGT itr2, itr1 MOVQ inp, inl XORQ itr2, itr2 openAVX2Tail256LoopA: polyAdd(0(inl)) polyMulAVX2 LEAQ 16(inl), inl // Perform ChaCha rounds, while hashing the remaining input openAVX2Tail256LoopB: chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 INCQ itr2 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 CMPQ itr2, itr1 JB openAVX2Tail256LoopA CMPQ itr2, $10 JNE openAVX2Tail256LoopB MOVQ inl, itr2 SUBQ inp, inl MOVQ inl, itr1 MOVQ tmpStoreAVX2, inl // Hash the remainder of data (if any) openAVX2Tail256Hash: ADDQ $16, itr1 CMPQ itr1, inl JGT openAVX2Tail256HashEnd polyAdd (0(itr2)) polyMulAVX2 LEAQ 16(itr2), itr2 JMP openAVX2Tail256Hash // Store 128 bytes safely, then go to store loop openAVX2Tail256HashEnd: VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2 VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2 VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup) LEAQ (4*32)(inp), inp LEAQ (4*32)(oup), oup SUBQ $4*32, inl JMP openAVX2TailLoop // ---------------------------------------------------------------------------- // Special optimization for the last 384 bytes of ciphertext openAVX2Tail384: // Need to decrypt up to 384 bytes - prepare six blocks VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 VMOVDQA ctr3StoreAVX2, DD0 VPADDD ·avx2IncMask<>(SB), DD0, DD0 VPADDD ·avx2IncMask<>(SB), DD0, DD1 VPADDD ·avx2IncMask<>(SB), DD1, DD2 VMOVDQA DD0, ctr0StoreAVX2 VMOVDQA DD1, ctr1StoreAVX2 VMOVDQA DD2, ctr2StoreAVX2 // Compute the number of iterations that will hash two blocks of data MOVQ inl, tmpStoreAVX2 MOVQ inl, itr1 SUBQ $256, itr1 SHRQ $4, itr1 ADDQ $6, itr1 MOVQ $10, itr2 CMPQ itr1, $10 CMOVQGT itr2, itr1 MOVQ inp, inl XORQ itr2, itr2 // Perform ChaCha rounds, while hashing the remaining input openAVX2Tail384LoopB: polyAdd(0(inl)) polyMulAVX2 LEAQ 16(inl), inl openAVX2Tail384LoopA: chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 polyAdd(0(inl)) polyMulAVX2 LEAQ 16(inl), inl INCQ itr2 chachaQR_AVX2(AA0, BB0, CC0, DD0, 
TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 CMPQ itr2, itr1 JB openAVX2Tail384LoopB CMPQ itr2, $10 JNE openAVX2Tail384LoopA MOVQ inl, itr2 SUBQ inp, inl MOVQ inl, itr1 MOVQ tmpStoreAVX2, inl openAVX2Tail384Hash: ADDQ $16, itr1 CMPQ itr1, inl JGT openAVX2Tail384HashEnd polyAdd(0(itr2)) polyMulAVX2 LEAQ 16(itr2), itr2 JMP openAVX2Tail384Hash // Store 256 bytes safely, then go to store loop openAVX2Tail384HashEnd: VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3 VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3 VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 LEAQ (8*32)(inp), inp LEAQ (8*32)(oup), oup SUBQ $8*32, inl JMP openAVX2TailLoop // ---------------------------------------------------------------------------- // Special optimization for the last 512 bytes of ciphertext openAVX2Tail512: VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 XORQ itr1, itr1 MOVQ inp, itr2 openAVX2Tail512LoopB: polyAdd(0(itr2)) polyMulAVX2 LEAQ (2*8)(itr2), itr2 openAVX2Tail512LoopA: VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VMOVDQA CC3, tmpStoreAVX2 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, 
BB2 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 VMOVDQA tmpStoreAVX2, CC3 polyAdd(0*8(itr2)) polyMulAVX2 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VMOVDQA CC3, tmpStoreAVX2 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 VMOVDQA tmpStoreAVX2, CC3 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 polyAdd(2*8(itr2)) polyMulAVX2 LEAQ (4*8)(itr2), itr2 VMOVDQA CC3, tmpStoreAVX2 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 VMOVDQA tmpStoreAVX2, CC3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VMOVDQA CC3, tmpStoreAVX2 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 VMOVDQA tmpStoreAVX2, CC3 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 INCQ itr1 CMPQ itr1, $4 JLT openAVX2Tail512LoopB CMPQ itr1, $10 JNE openAVX2Tail512LoopA MOVQ inl, itr1 SUBQ $384, itr1 ANDQ $-16, itr1 openAVX2Tail512HashLoop: TESTQ itr1, itr1 JE openAVX2Tail512HashEnd polyAdd(0(itr2)) polyMulAVX2 LEAQ 16(itr2), itr2 SUBQ $16, itr1 JMP openAVX2Tail512HashLoop openAVX2Tail512HashEnd: VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD 
·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 VMOVDQA CC3, tmpStoreAVX2 VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 LEAQ (12*32)(inp), inp LEAQ (12*32)(oup), oup SUBQ $12*32, inl JMP openAVX2TailLoop // ---------------------------------------------------------------------------- // ---------------------------------------------------------------------------- // func chacha20Poly1305Seal(dst, key, src, ad []byte) TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 // For aligned stack access MOVQ SP, BP ADDQ $32, BP ANDQ $-32, BP MOVQ dst+0(FP), oup MOVQ key+24(FP), keyp MOVQ src+48(FP), inp MOVQ src_len+56(FP), inl MOVQ ad+72(FP), adp CMPB ·useAVX2(SB), $1 JE chacha20Poly1305Seal_AVX2 // Special optimization, for very short buffers CMPQ inl, $128 JBE sealSSE128 // About 15% faster // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration MOVOU ·chacha20Constants<>(SB), A0 MOVOU (1*16)(keyp), B0 MOVOU (2*16)(keyp), C0 MOVOU (3*16)(keyp), D0 // Store state on stack for future use MOVO B0, state1Store MOVO C0, state2Store // Load state, increment counter blocks MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 // Store counters MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store MOVQ $10, itr2 sealSSEIntroLoop: MOVO C3, tmpStore chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) MOVO tmpStore, C3 MOVO C1, tmpStore chachaQR(A3, B3, C3, D3, C1) MOVO tmpStore, C1 shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left MOVO C3, tmpStore chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) MOVO 
tmpStore, C3 MOVO C1, tmpStore chachaQR(A3, B3, C3, D3, C1) MOVO tmpStore, C1 shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right DECQ itr2 JNE sealSSEIntroLoop // Add in the state PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 // Clamp and store the key PAND ·polyClampMask<>(SB), A0 MOVO A0, rStore MOVO B0, sStore // Hash AAD MOVQ ad_len+80(FP), itr2 CALL polyHashADInternal<>(SB) MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup) MOVQ $128, itr1 SUBQ $128, inl LEAQ 128(inp), inp MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1 CMPQ inl, $64 JBE sealSSE128SealHash MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup) ADDQ $64, itr1 SUBQ $64, inl LEAQ 64(inp), inp MOVQ $2, itr1 MOVQ $8, itr2 CMPQ inl, $64 JBE sealSSETail64 CMPQ inl, $128 JBE sealSSETail128 CMPQ inl, $192 JBE sealSSETail192 sealSSEMainLoop: // Load state, increment counter blocks MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 // Store counters MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store sealSSEInnerLoop: MOVO C3, tmpStore chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) MOVO tmpStore, C3 MOVO C1, tmpStore chachaQR(A3, B3, C3, D3, C1) MOVO tmpStore, C1 polyAdd(0(oup)) shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left polyMulStage1 polyMulStage2 LEAQ (2*8)(oup), oup MOVO C3, tmpStore chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) MOVO tmpStore, C3 MOVO C1, tmpStore polyMulStage3 chachaQR(A3, B3, C3, D3, C1) MOVO tmpStore, C1 polyMulReduceStage shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right DECQ itr2 JGE sealSSEInnerLoop polyAdd(0(oup)) polyMul LEAQ (2*8)(oup), oup DECQ itr1 JG sealSSEInnerLoop // Add in the state PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 PADDD state2Store, C0; PADDD 
state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 MOVO D3, tmpStore // Load - xor - store MOVOU (0*16)(inp), D3; PXOR D3, A0 MOVOU (1*16)(inp), D3; PXOR D3, B0 MOVOU (2*16)(inp), D3; PXOR D3, C0 MOVOU (3*16)(inp), D3; PXOR D3, D0 MOVOU A0, (0*16)(oup) MOVOU B0, (1*16)(oup) MOVOU C0, (2*16)(oup) MOVOU D0, (3*16)(oup) MOVO tmpStore, D3 MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) ADDQ $192, inp MOVQ $192, itr1 SUBQ $192, inl MOVO A3, A1 MOVO B3, B1 MOVO C3, C1 MOVO D3, D1 CMPQ inl, $64 JBE sealSSE128SealHash MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup) LEAQ 64(inp), inp SUBQ $64, inl MOVQ $6, itr1 MOVQ $4, itr2 CMPQ inl, $192 JG sealSSEMainLoop MOVQ inl, itr1 TESTQ inl, inl JE sealSSE128SealHash MOVQ $6, itr1 CMPQ inl, $64 JBE sealSSETail64 CMPQ inl, $128 JBE sealSSETail128 JMP sealSSETail192 // ---------------------------------------------------------------------------- // Special optimization for the last 64 bytes of plaintext sealSSETail64: // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes MOVO ·chacha20Constants<>(SB), A1 MOVO state1Store, B1 MOVO state2Store, C1 MOVO ctr3Store, D1 PADDL ·sseIncMask<>(SB), D1 MOVO D1, ctr0Store sealSSETail64LoopA: // Perform ChaCha rounds, while hashing the previously encrypted ciphertext polyAdd(0(oup)) polyMul LEAQ 16(oup), oup sealSSETail64LoopB: chachaQR(A1, B1, C1, D1, T1) shiftB1Left; shiftC1Left; shiftD1Left chachaQR(A1, B1, C1, D1, T1) shiftB1Right; shiftC1Right; shiftD1Right polyAdd(0(oup)) polyMul LEAQ 16(oup), oup DECQ itr1 JG sealSSETail64LoopA DECQ itr2 JGE sealSSETail64LoopB PADDL ·chacha20Constants<>(SB), A1 PADDL state1Store, B1 PADDL state2Store, C1 PADDL ctr0Store, D1 JMP sealSSE128Seal // ---------------------------------------------------------------------------- // Special optimization for the last 128 bytes of plaintext sealSSETail128: // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store sealSSETail128LoopA: // Perform ChaCha rounds, while hashing the previously encrypted ciphertext polyAdd(0(oup)) polyMul LEAQ 16(oup), oup sealSSETail128LoopB: chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) shiftB0Left; shiftC0Left; shiftD0Left shiftB1Left; shiftC1Left; shiftD1Left polyAdd(0(oup)) polyMul LEAQ 16(oup), oup chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) shiftB0Right; shiftC0Right; shiftD0Right shiftB1Right; shiftC1Right; shiftD1Right DECQ itr1 JG sealSSETail128LoopA DECQ itr2 JGE sealSSETail128LoopB PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 PADDL state1Store, B0; PADDL state1Store, B1 
PADDL state2Store, C0; PADDL state2Store, C1 PADDL ctr0Store, D0; PADDL ctr1Store, D1 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) MOVQ $64, itr1 LEAQ 64(inp), inp SUBQ $64, inl JMP sealSSE128SealHash // ---------------------------------------------------------------------------- // Special optimization for the last 192 bytes of plaintext sealSSETail192: // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store sealSSETail192LoopA: // Perform ChaCha rounds, while hashing the previously encrypted ciphertext polyAdd(0(oup)) polyMul LEAQ 16(oup), oup sealSSETail192LoopB: chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) shiftB0Left; shiftC0Left; shiftD0Left shiftB1Left; shiftC1Left; shiftD1Left shiftB2Left; shiftC2Left; shiftD2Left polyAdd(0(oup)) polyMul LEAQ 16(oup), oup chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) shiftB0Right; shiftC0Right; shiftD0Right shiftB1Right; shiftC1Right; shiftD1Right shiftB2Right; shiftC2Right; shiftD2Right DECQ itr1 JG sealSSETail192LoopA DECQ itr2 JGE sealSSETail192LoopB PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2 MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) MOVO A2, A1 MOVO B2, B1 MOVO C2, C1 MOVO D2, D1 MOVQ $128, itr1 LEAQ 128(inp), inp SUBQ $128, inl JMP sealSSE128SealHash // ---------------------------------------------------------------------------- // Special seal optimization for buffers smaller than 129 bytes sealSSE128: // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 MOVQ $10, itr2 sealSSE128InnerCipherLoop: chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) shiftB0Left; shiftB1Left; shiftB2Left shiftC0Left; shiftC1Left; shiftC2Left shiftD0Left; shiftD1Left; shiftD2Left chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) shiftB0Right; shiftB1Right; shiftB2Right shiftC0Right; shiftC1Right; shiftC2Right shiftD0Right; 
	shiftD1Right; shiftD2Right
	DECQ itr2
	JNE  sealSSE128InnerCipherLoop

	// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
	PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
	PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
	PADDL T2, C1; PADDL T2, C2
	PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
	PAND  ·polyClampMask<>(SB), A0
	MOVOU A0, rStore
	MOVOU B0, sStore

	// Hash
	MOVQ ad_len+80(FP), itr2
	CALL polyHashADInternal<>(SB)
	XORQ itr1, itr1

sealSSE128SealHash:
	// itr1 holds the number of bytes encrypted but not yet hashed
	CMPQ itr1, $16
	JB   sealSSE128Seal
	polyAdd(0(oup))
	polyMul

	SUBQ $16, itr1
	ADDQ $16, oup

	JMP sealSSE128SealHash

sealSSE128Seal:
	CMPQ inl, $16
	JB   sealSSETail
	SUBQ $16, inl

	// Load for encryption
	MOVOU (inp), T0
	PXOR  T0, A1
	MOVOU A1, (oup)
	LEAQ  (1*16)(inp), inp
	LEAQ  (1*16)(oup), oup

	// Extract for hashing
	MOVQ   A1, t0
	PSRLDQ $8, A1
	MOVQ   A1, t1
	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
	polyMul

	// Shift the stream "left"
	MOVO B1, A1
	MOVO C1, B1
	MOVO D1, C1
	MOVO A2, D1
	MOVO B2, A2
	MOVO C2, B2
	MOVO D2, C2
	JMP  sealSSE128Seal

sealSSETail:
	TESTQ inl, inl
	JE    sealSSEFinalize

	// We can only load the plaintext one byte at a time, to avoid reading past the end of the buffer
	MOVQ inl, itr2
	SHLQ $4, itr2
	LEAQ ·andMask<>(SB), t0
	MOVQ inl, itr1
	LEAQ -1(inp)(inl*1), inp
	XORQ t2, t2
	XORQ t3, t3
	XORQ AX, AX

sealSSETailLoadLoop:
	SHLQ $8, t2, t3
	SHLQ $8, t2
	MOVB (inp), AX
	XORQ AX, t2
	LEAQ -1(inp), inp
	DECQ itr1
	JNE  sealSSETailLoadLoop
	MOVQ t2, 0+tmpStore
	MOVQ t3, 8+tmpStore
	PXOR 0+tmpStore, A1
	MOVOU A1, (oup)
	MOVOU -16(t0)(itr2*1), T0
	PAND  T0, A1
	MOVQ   A1, t0
	PSRLDQ $8, A1
	MOVQ   A1, t1
	ADDQ   t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2
	polyMul

	ADDQ inl, oup

sealSSEFinalize:
	// Hash in the buffer lengths
	ADDQ ad_len+80(FP), acc0
	ADCQ src_len+56(FP), acc1
	ADCQ $1, acc2
	polyMul

	// Final reduce
	MOVQ    acc0, t0
	MOVQ    acc1, t1
	MOVQ    acc2, t2
	SUBQ    $-5, acc0
	SBBQ    $-1, acc1
	SBBQ    $3, acc2
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1
	CMOVQCS t2, acc2

	// Add in the "s" part of the key
	ADDQ 0+sStore, acc0
	ADCQ 8+sStore, acc1

	// Finally store the tag at the end of the message
	MOVQ acc0, (0*8)(oup)
	MOVQ acc1, (1*8)(oup)
	RET

// ----------------------------------------------------------------------------
// ------------------------- AVX2 Code ----------------------------------------
chacha20Poly1305Seal_AVX2:
	VZEROUPPER
	VMOVDQU ·chacha20Constants<>(SB), AA0
	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
	BYTE    $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
	BYTE    $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
	VPADDD  ·avx2InitMask<>(SB), DD0, DD0

	// Special optimizations for very short buffers
	CMPQ inl, $192
	JBE  seal192AVX2 // 33% faster
	CMPQ inl, $320
	JBE  seal320AVX2 // 17% faster

	// For the general case, prepare the Poly1305 key first - as a byproduct we have 64 bytes of cipher stream
	VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
	VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
	VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
	VPADDD  ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
	VPADDD  ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
	VPADDD  ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
	VMOVDQA DD3, ctr3StoreAVX2
	MOVQ    $10, itr2

sealAVX2IntroLoop:
	VMOVDQA CC3, tmpStoreAVX2
	chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3);
chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) VMOVDQA tmpStoreAVX2, CC3 VMOVDQA CC1, tmpStoreAVX2 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) VMOVDQA tmpStoreAVX2, CC1 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 VMOVDQA CC3, tmpStoreAVX2 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) VMOVDQA tmpStoreAVX2, CC3 VMOVDQA CC1, tmpStoreAVX2 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) VMOVDQA tmpStoreAVX2, CC1 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 DECQ itr2 JNE sealAVX2IntroLoop VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127 VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 // Clamp and store poly key VPAND ·polyClampMask<>(SB), DD0, DD0 VMOVDQA DD0, rsStoreAVX2 // Hash AD MOVQ ad_len+80(FP), itr2 CALL polyHashADInternal<>(SB) // Can store at least 320 bytes VPXOR (0*32)(inp), AA0, AA0 VPXOR (1*32)(inp), CC0, CC0 VMOVDQU AA0, (0*32)(oup) VMOVDQU CC0, (1*32)(oup) VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0 VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup) VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0 VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup) MOVQ $320, itr1 SUBQ $320, inl LEAQ 320(inp), inp VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0 CMPQ inl, $128 JBE sealAVX2SealHash VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0 VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup) SUBQ $128, inl LEAQ 128(inp), inp MOVQ $8, itr1 MOVQ $2, itr2 CMPQ inl, $128 JBE sealAVX2Tail128 CMPQ inl, $256 JBE sealAVX2Tail256 CMPQ inl, $384 JBE sealAVX2Tail384 CMPQ inl, $512 JBE sealAVX2Tail512 // We have 448 bytes to hash, but main loop hashes 512 bytes at a 
time - perform some rounds, before the main loop VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 VMOVDQA ctr3StoreAVX2, DD0 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 VMOVDQA CC3, tmpStoreAVX2 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) VMOVDQA tmpStoreAVX2, CC3 VMOVDQA CC1, tmpStoreAVX2 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) VMOVDQA tmpStoreAVX2, CC1 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 VMOVDQA CC3, tmpStoreAVX2 chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) VMOVDQA tmpStoreAVX2, CC3 VMOVDQA CC1, tmpStoreAVX2 chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) VMOVDQA tmpStoreAVX2, CC1 VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VMOVDQA CC3, tmpStoreAVX2 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 VMOVDQA tmpStoreAVX2, CC3 SUBQ $16, oup // Adjust the pointer MOVQ $9, itr1 JMP sealAVX2InternalLoopStart sealAVX2MainLoop: // Load state, increment counter blocks, store the incremented counters VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 MOVQ $10, itr1 sealAVX2InternalLoop: polyAdd(0*8(oup)) VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 polyMulStage1_AVX2 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; 
VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 polyMulStage2_AVX2 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 polyMulStage3_AVX2 VMOVDQA CC3, tmpStoreAVX2 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 VMOVDQA tmpStoreAVX2, CC3 polyMulReduceStage sealAVX2InternalLoopStart: VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 polyAdd(2*8(oup)) VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 polyMulStage1_AVX2 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VMOVDQA CC3, tmpStoreAVX2 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 VMOVDQA tmpStoreAVX2, CC3 polyMulStage2_AVX2 VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 polyMulStage3_AVX2 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 polyMulReduceStage VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 polyAdd(4*8(oup)) LEAQ (6*8)(oup), oup VMOVDQA CC3, tmpStoreAVX2 VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 VMOVDQA tmpStoreAVX2, CC3 polyMulStage1_AVX2 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 polyMulStage2_AVX2 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 polyMulStage3_AVX2 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VMOVDQA CC3, tmpStoreAVX2 VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 VMOVDQA tmpStoreAVX2, CC3 polyMulReduceStage VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, 
BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 DECQ itr1 JNE sealAVX2InternalLoop VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 VMOVDQA CC3, tmpStoreAVX2 // We only hashed 480 of the 512 bytes available - hash the remaining 32 here polyAdd(0*8(oup)) polyMulAVX2 LEAQ (4*8)(oup), oup VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) // and here polyAdd(-2*8(oup)) polyMulAVX2 VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) LEAQ (32*16)(inp), inp SUBQ $(32*16), inl CMPQ inl, $512 JG sealAVX2MainLoop // Tail can only hash 480 bytes polyAdd(0*8(oup)) polyMulAVX2 polyAdd(2*8(oup)) polyMulAVX2 LEAQ 32(oup), oup MOVQ $10, itr1 MOVQ $0, itr2 CMPQ inl, $128 JBE sealAVX2Tail128 CMPQ inl, $256 JBE sealAVX2Tail256 CMPQ inl, $384 JBE sealAVX2Tail384 JMP sealAVX2Tail512 // ---------------------------------------------------------------------------- // Special optimization for buffers smaller than 193 bytes seal192AVX2: // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks VMOVDQA AA0, AA1 VMOVDQA BB0, BB1 VMOVDQA CC0, CC1 VPADDD ·avx2IncMask<>(SB), DD0, DD1 VMOVDQA AA0, AA2 VMOVDQA BB0, BB2 VMOVDQA CC0, CC2 VMOVDQA DD0, DD2 VMOVDQA DD1, TT3 MOVQ $10, itr2 sealAVX2192InnerCipherLoop: chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, 
DD1, DD1, DD1 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 DECQ itr2 JNE sealAVX2192InnerCipherLoop VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 VPERM2I128 $0x02, AA0, BB0, TT0 // Clamp and store poly key VPAND ·polyClampMask<>(SB), TT0, TT0 VMOVDQA TT0, rsStoreAVX2 // Stream for up to 192 bytes VPERM2I128 $0x13, AA0, BB0, AA0 VPERM2I128 $0x13, CC0, DD0, BB0 VPERM2I128 $0x02, AA1, BB1, CC0 VPERM2I128 $0x02, CC1, DD1, DD0 VPERM2I128 $0x13, AA1, BB1, AA1 VPERM2I128 $0x13, CC1, DD1, BB1 sealAVX2ShortSeal: // Hash aad MOVQ ad_len+80(FP), itr2 CALL polyHashADInternal<>(SB) XORQ itr1, itr1 sealAVX2SealHash: // itr1 holds the number of bytes encrypted but not yet hashed CMPQ itr1, $16 JB sealAVX2ShortSealLoop polyAdd(0(oup)) polyMul SUBQ $16, itr1 ADDQ $16, oup JMP sealAVX2SealHash sealAVX2ShortSealLoop: CMPQ inl, $32 JB sealAVX2ShortTail32 SUBQ $32, inl // Load for encryption VPXOR (inp), AA0, AA0 VMOVDQU AA0, (oup) LEAQ (1*32)(inp), inp // Now can hash polyAdd(0*8(oup)) polyMulAVX2 polyAdd(2*8(oup)) polyMulAVX2 LEAQ (1*32)(oup), oup // Shift stream left VMOVDQA BB0, AA0 VMOVDQA CC0, BB0 VMOVDQA DD0, CC0 VMOVDQA AA1, DD0 VMOVDQA BB1, AA1 VMOVDQA CC1, BB1 VMOVDQA DD1, CC1 VMOVDQA AA2, DD1 VMOVDQA BB2, AA2 JMP sealAVX2ShortSealLoop sealAVX2ShortTail32: CMPQ inl, $16 VMOVDQA A0, A1 JB sealAVX2ShortDone SUBQ $16, inl // Load for encryption VPXOR (inp), A0, T0 VMOVDQU T0, (oup) LEAQ (1*16)(inp), inp // Hash polyAdd(0*8(oup)) polyMulAVX2 LEAQ (1*16)(oup), oup VPERM2I128 $0x11, AA0, AA0, AA0 VMOVDQA A0, A1 sealAVX2ShortDone: VZEROUPPER JMP sealSSETail // ---------------------------------------------------------------------------- // Special optimization for buffers smaller than 321 bytes seal320AVX2: // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 MOVQ $10, itr2 sealAVX2320InnerCipherLoop: chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 DECQ itr2 JNE sealAVX2320InnerCipherLoop VMOVDQA ·chacha20Constants<>(SB), TT0 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 VMOVDQA ·avx2IncMask<>(SB), TT0 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 VPADDD 
	TT3, DD2, DD2

	// Clamp and store poly key
	VPERM2I128 $0x02, AA0, BB0, TT0
	VPAND      ·polyClampMask<>(SB), TT0, TT0
	VMOVDQA    TT0, rsStoreAVX2

	// Stream for up to 320 bytes
	VPERM2I128 $0x13, AA0, BB0, AA0
	VPERM2I128 $0x13, CC0, DD0, BB0
	VPERM2I128 $0x02, AA1, BB1, CC0
	VPERM2I128 $0x02, CC1, DD1, DD0
	VPERM2I128 $0x13, AA1, BB1, AA1
	VPERM2I128 $0x13, CC1, DD1, BB1
	VPERM2I128 $0x02, AA2, BB2, CC1
	VPERM2I128 $0x02, CC2, DD2, DD1
	VPERM2I128 $0x13, AA2, BB2, AA2
	VPERM2I128 $0x13, CC2, DD2, BB2
	JMP        sealAVX2ShortSeal

// ----------------------------------------------------------------------------
// Special optimization for the last 128 bytes of plaintext
sealAVX2Tail128:
	// Need to encrypt up to 128 bytes - prepare two blocks
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
	VMOVDQA ·chacha20Constants<>(SB), AA0
	VMOVDQA state1StoreAVX2, BB0
	VMOVDQA state2StoreAVX2, CC0
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
	VMOVDQA DD0, DD1

sealAVX2Tail128LoopA:
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

sealAVX2Tail128LoopB:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
	polyAdd(0(oup))
	polyMul
	VPALIGNR $4, BB0, BB0, BB0
	VPALIGNR $8, CC0, CC0, CC0
	VPALIGNR $12, DD0, DD0, DD0
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0)
	polyAdd(16(oup))
	polyMul
	LEAQ 32(oup), oup
	VPALIGNR $12, BB0, BB0, BB0
	VPALIGNR $8, CC0, CC0, CC0
	VPALIGNR $4, DD0, DD0, DD0
	DECQ itr1
	JG   sealAVX2Tail128LoopA
	DECQ itr2
	JGE  sealAVX2Tail128LoopB
	VPADDD ·chacha20Constants<>(SB), AA0, AA1
	VPADDD state1StoreAVX2, BB0, BB1
	VPADDD state2StoreAVX2, CC0, CC1
	VPADDD DD1, DD0, DD1

	VPERM2I128 $0x02, AA1, BB1, AA0
	VPERM2I128 $0x02, CC1, DD1, BB0
	VPERM2I128 $0x13, AA1, BB1, CC0
	VPERM2I128 $0x13, CC1, DD1, DD0
	JMP        sealAVX2ShortSealLoop

// ----------------------------------------------------------------------------
// Special optimization for the last 256 bytes of plaintext
sealAVX2Tail256:
	// Need to encrypt up to 256 bytes - prepare four blocks
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD  ·avx2IncMask<>(SB), DD0, DD0
	VPADDD  ·avx2IncMask<>(SB), DD0, DD1
	VMOVDQA DD0, TT1
	VMOVDQA DD1, TT2

sealAVX2Tail256LoopA:
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

sealAVX2Tail256LoopB:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	polyAdd(0(oup))
	polyMul
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0)
	polyAdd(16(oup))
	polyMul
	LEAQ 32(oup), oup
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1
	DECQ itr1
	JG   sealAVX2Tail256LoopA
	DECQ itr2
	JGE  sealAVX2Tail256LoopB
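	// Done with the ChaCha rounds for this tail: add the original state back in,
	// write the first 128 bytes of ciphertext below, and let sealAVX2SealHash
	// catch up on hashing before the remainder is sealed.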
	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
	VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1

	VPERM2I128 $0x02, AA0, BB0, TT0
	VPERM2I128 $0x02, CC0, DD0, TT1
	VPERM2I128 $0x13, AA0, BB0, TT2
	VPERM2I128 $0x13, CC0, DD0, TT3
	VPXOR   (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
	VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
	MOVQ    $128, itr1
	LEAQ    128(inp), inp
	SUBQ    $128, inl

	VPERM2I128 $0x02, AA1, BB1, AA0
	VPERM2I128 $0x02, CC1, DD1, BB0
	VPERM2I128 $0x13, AA1, BB1, CC0
	VPERM2I128 $0x13, CC1, DD1, DD0
	JMP        sealAVX2SealHash

// ----------------------------------------------------------------------------
// Special optimization for the last 384 bytes of plaintext
sealAVX2Tail384:
	// Need to encrypt up to 384 bytes - prepare six blocks
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
	VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3

sealAVX2Tail384LoopA:
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

sealAVX2Tail384LoopB:
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
	polyAdd(0(oup))
	polyMul
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2
	chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0)
	polyAdd(16(oup))
	polyMul
	LEAQ 32(oup), oup
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2
	DECQ itr1
	JG   sealAVX2Tail384LoopA
	DECQ itr2
	JGE  sealAVX2Tail384LoopB
	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
	VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2

	VPERM2I128 $0x02, AA0, BB0, TT0
	VPERM2I128 $0x02, CC0, DD0, TT1
	VPERM2I128 $0x13, AA0, BB0, TT2
	VPERM2I128 $0x13, CC0, DD0, TT3
	VPXOR   (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3
	VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup)
	VPERM2I128 $0x02, AA1, BB1, TT0
	VPERM2I128 $0x02, CC1, DD1, TT1
	VPERM2I128 $0x13, AA1, BB1, TT2
	VPERM2I128 $0x13, CC1, DD1, TT3
	VPXOR   (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3
	VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup)
	MOVQ    $256, itr1
	LEAQ    256(inp), inp
	SUBQ    $256, inl

	VPERM2I128 $0x02, AA2, BB2, AA0
	VPERM2I128 $0x02, CC2, DD2, BB0
	VPERM2I128 $0x13, AA2, BB2, CC0
	VPERM2I128 $0x13, CC2, DD2, DD0
	JMP        sealAVX2SealHash
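// Note: the seal tails above and sealAVX2Tail512 below share one structure.
// LoopA hashes 16 bytes of already-written ciphertext per pass, LoopB performs
// one ChaCha double round while hashing another 32 bytes, and itr1/itr2 are
// preset by the caller so that the pending ciphertext is absorbed over exactly
// ten double rounds.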
// ----------------------------------------------------------------------------
// Special optimization for the last 512 bytes of plaintext
sealAVX2Tail512:
	// Need to encrypt up to 512 bytes - prepare eight blocks
	// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
	// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
	VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
	VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
	VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
	VMOVDQA ctr3StoreAVX2, DD0
	VPADDD  ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
	VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2

sealAVX2Tail512LoopA:
	polyAdd(0(oup))
	polyMul
	LEAQ 16(oup), oup

sealAVX2Tail512LoopB:
	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	polyAdd(0*8(oup))
	polyMulAVX2
	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD  $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD  $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD  $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD  $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	polyAdd(2*8(oup))
	polyMulAVX2
	LEAQ    (4*8)(oup), oup
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD  $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD  $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD  $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD  $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	VPADDD  BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
	VPXOR   AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
	VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
	VPADDD  DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
	VPXOR   CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
	VMOVDQA CC3, tmpStoreAVX2
	VPSLLD  $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0
	VPSLLD  $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1
	VPSLLD  $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2
	VPSLLD  $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3
	VMOVDQA tmpStoreAVX2, CC3
	VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3
	VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3
	VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3
	DECQ itr1
	JG   sealAVX2Tail512LoopA
	DECQ itr2
	JGE  sealAVX2Tail512LoopB
	VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
	VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
	VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
	VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
	VMOVDQA CC3, tmpStoreAVX2

	VPERM2I128 $0x02, AA0, BB0, CC3
	VPXOR      (0*32)(inp), CC3, CC3
	VMOVDQU    CC3, (0*32)(oup)
	VPERM2I128 $0x02, CC0, DD0, CC3
	VPXOR      (1*32)(inp), CC3, CC3
	VMOVDQU    CC3, (1*32)(oup)
	VPERM2I128 $0x13, AA0, BB0, CC3
	VPXOR      (2*32)(inp), CC3, CC3
	VMOVDQU    CC3, (2*32)(oup)
	VPERM2I128 $0x13, CC0, DD0, CC3
	VPXOR      (3*32)(inp), CC3, CC3
	VMOVDQU    CC3, (3*32)(oup)

	VPERM2I128 $0x02, AA1, BB1, AA0
	VPERM2I128 $0x02, CC1, DD1, BB0
	VPERM2I128 $0x13, AA1, BB1, CC0
	VPERM2I128 $0x13, CC1, DD1, DD0
	VPXOR   (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0
	VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup)

	VPERM2I128 $0x02, AA2, BB2, AA0
	VPERM2I128 $0x02, CC2, DD2, BB0
	VPERM2I128 $0x13, AA2, BB2, CC0
	VPERM2I128 $0x13, CC2, DD2, DD0
	VPXOR   (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0
	VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup)

	MOVQ $384, itr1
	LEAQ 384(inp), inp
	SUBQ $384, inl

	VPERM2I128 $0x02, AA3, BB3, AA0
	VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0
	VPERM2I128 $0x13, AA3, BB3, CC0
	VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0
	JMP        sealAVX2SealHash

// func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
TEXT ·cpuid(SB), NOSPLIT, $0-24
	MOVL eaxArg+0(FP), AX
	MOVL ecxArg+4(FP), CX
	CPUID
	MOVL AX, eax+8(FP)
	MOVL BX, ebx+12(FP)
	MOVL CX, ecx+16(FP)
	MOVL DX, edx+20(FP)
	RET

// func xgetbv() (eax, edx uint32)
TEXT ·xgetbv(SB), NOSPLIT, $0-8
	MOVL $0, CX
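	// CX = 0 selects XCR0, so the Go caller can check that the OS saves the SSE
	// (bit 1) and AVX (bit 2) register state before enabling the ·useAVX2 path.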
	XGETBV
	MOVL AX, eax+0(FP)
	MOVL DX, edx+4(FP)
	RET
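// ----------------------------------------------------------------------------
// Go-side declarations (not part of this file): as a rough sketch, assuming the
// usual layout of such a package, the accompanying .go file would simply mirror
// the "// func" comments above, e.g.:
//
//	//go:noescape
//	func chacha20Poly1305Seal(dst, key, src, ad []byte)
//
//	func cpuid(eaxArg, ecxArg uint32) (eax, ebx, ecx, edx uint32)
//	func xgetbv() (eax, edx uint32)
//
// with a package-level useAVX2 flag (referenced above as ·useAVX2(SB)) set at
// init time from the cpuid/xgetbv results.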